Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_CrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_CRSMATRIX_DEF_HPP
11 #define TPETRA_CRSMATRIX_DEF_HPP
12 
20 
21 #include "Tpetra_Import_Util.hpp"
22 #include "Tpetra_Import_Util2.hpp"
23 #include "Tpetra_RowMatrix.hpp"
24 #include "Tpetra_LocalCrsMatrixOperator.hpp"
25 #include "Tpetra_computeRowAndColumnOneNorms.hpp"
27 
34 #include "Tpetra_Details_getDiagCopyWithoutOffsets.hpp"
42 #include "Tpetra_Details_packCrsMatrix.hpp"
43 #include "Tpetra_Details_unpackCrsMatrixAndCombine.hpp"
45 #include "Teuchos_FancyOStream.hpp"
46 #include "Teuchos_RCP.hpp"
47 #include "Teuchos_DataAccess.hpp"
48 #include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
49 #include "KokkosBlas1_scal.hpp"
50 #include "KokkosSparse_getDiagCopy.hpp"
51 #include "KokkosSparse_spmv.hpp"
53 
54 #include <memory>
55 #include <sstream>
56 #include <typeinfo>
57 #include <utility>
58 #include <vector>
59 
60 namespace Tpetra {
61 
62 namespace { // (anonymous)
63 
64  template<class T, class BinaryFunction>
65  T atomic_binary_function_update (T* const dest,
66  const T& inputVal,
67  BinaryFunction f)
68  {
69  T oldVal = *dest;
70  T assume;
71 
72  // NOTE (mfh 30 Nov 2015) I do NOT need a fence here for IBM
73  // POWER architectures, because 'newval' depends on 'assume',
74  // which depends on 'oldVal', which depends on '*dest'. This
75  // sets up a chain of read dependencies that should ensure
76  // correct behavior given a sane memory model.
77  do {
78  assume = oldVal;
79  T newVal = f (assume, inputVal);
80  oldVal = Kokkos::atomic_compare_exchange (dest, assume, newVal);
81  } while (assume != oldVal);
82 
83  return oldVal;
84  }
85 } // namespace (anonymous)
86 
87 //
88 // Users must never rely on anything in the Details namespace.
89 //
90 namespace Details {
91 
101 template<class Scalar>
102 struct AbsMax {
104  Scalar operator() (const Scalar& x, const Scalar& y) {
105  typedef Teuchos::ScalarTraits<Scalar> STS;
106  return std::max (STS::magnitude (x), STS::magnitude (y));
107  }
108 };
109 
110 } // namespace Details
111 } // namespace Tpetra
112 
113 namespace Tpetra {
114 
// Constructor: (rowMap, maxNumEntriesPerRow[, params]).
// Creates a new crs_graph_type from the same three arguments, stores it in
// both myGraph_ and staticGraph_, and then calls resumeFill(params).
// A std::exception escaping the graph constructor is caught and re-reported
// through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC with std::runtime_error and
// the original what() text appended to the message.
// NOTE(review): the embedded numbering jumps from 140 to 142 just before the
// closing brace, so a source line may be absent from this listing.
115  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
116  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
117  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
118  size_t maxNumEntriesPerRow,
119  const Teuchos::RCP<Teuchos::ParameterList>& params) :
120  dist_object_type (rowMap)
121  {
122  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, size_t "
123  "[, RCP<ParameterList>]): ";
124  Teuchos::RCP<crs_graph_type> graph;
125  try {
126  graph = Teuchos::rcp (new crs_graph_type (rowMap, maxNumEntriesPerRow,
127  params));
128  }
129  catch (std::exception& e) {
130  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
131  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
132  "size_t [, RCP<ParameterList>]) threw an exception: "
133  << e.what ());
134  }
135  // myGraph_ not null means that the matrix owns the graph. That's
136  // different than the const CrsGraph constructor, where the matrix
137  // does _not_ own the graph.
138  myGraph_ = graph;
139  staticGraph_ = myGraph_;
140  resumeFill (params);
142  }
143 
// Constructor: (rowMap, numEntPerRowToAlloc[, params]).
// Same shape as the single-bound constructor, but the graph is created from
// an ArrayView of per-row counts.  The new graph is stored in both myGraph_
// and staticGraph_, and resumeFill(params) is called afterwards.
// A std::exception from the graph constructor is caught and re-reported
// through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC with std::runtime_error.
// NOTE(review): the embedded numbering skips 145 and 172, so this listing
// may be missing lines of the definition (presumably the class qualifier
// after the template header, and one statement before the closing brace).
144  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
146  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
147  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
148  const Teuchos::RCP<Teuchos::ParameterList>& params) :
149  dist_object_type (rowMap)
150  {
151  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
152  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
153  Teuchos::RCP<crs_graph_type> graph;
154  try {
155  using Teuchos::rcp;
156  graph = rcp(new crs_graph_type(rowMap, numEntPerRowToAlloc,
157  params));
158  }
159  catch (std::exception& e) {
160  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
161  (true, std::runtime_error, "CrsGraph constructor "
162  "(RCP<const Map>, ArrayView<const size_t>"
163  "[, RCP<ParameterList>]) threw an exception: "
164  << e.what ());
165  }
166  // myGraph_ not null means that the matrix owns the graph. That's
167  // different than the const CrsGraph constructor, where the matrix
168  // does _not_ own the graph.
169  myGraph_ = graph;
170  staticGraph_ = graph;
171  resumeFill (params);
173  }
174 
// Constructor: (rowMap, colMap, maxNumEntPerRow[, params]).
// First verifies that staticGraph_ and myGraph_ are both null (a leftover
// debugging check, per the comment below), then creates a new crs_graph_type
// from (rowMap, colMap, maxNumEntPerRow, params), stores it in both
// myGraph_ and staticGraph_, and calls resumeFill(params).
// A std::exception from the graph constructor is caught and re-reported
// through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC with std::runtime_error.
// NOTE(review): the embedded numbering skips 176 and 215, so this listing
// may be missing lines of the definition.
175  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
177  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
178  const Teuchos::RCP<const map_type>& colMap,
179  const size_t maxNumEntPerRow,
180  const Teuchos::RCP<Teuchos::ParameterList>& params) :
181  dist_object_type (rowMap)
182  {
183  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
184  "RCP<const Map>, size_t[, RCP<ParameterList>]): ";
185  const char suffix[] =
186  " Please report this bug to the Tpetra developers.";
187 
188  // An artifact of debugging something a while back.
189  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
190  (! staticGraph_.is_null (), std::logic_error,
191  "staticGraph_ is not null at the beginning of the constructor."
192  << suffix);
193  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
194  (! myGraph_.is_null (), std::logic_error,
195  "myGraph_ is not null at the beginning of the constructor."
196  << suffix);
197  Teuchos::RCP<crs_graph_type> graph;
198  try {
199  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
200  maxNumEntPerRow,
201  params));
202  }
203  catch (std::exception& e) {
204  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
205  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
206  "RCP<const Map>, size_t[, RCP<ParameterList>]) threw an "
207  "exception: " << e.what ());
208  }
209  // myGraph_ not null means that the matrix owns the graph. That's
210  // different than the const CrsGraph constructor, where the matrix
211  // does _not_ own the graph.
212  myGraph_ = graph;
213  staticGraph_ = myGraph_;
214  resumeFill (params);
216  }
217 
// Constructor: (rowMap, colMap, numEntPerRowToAlloc[, params]).
// Creates a new crs_graph_type from (rowMap, colMap, numEntPerRowToAlloc,
// params), stores it in both myGraph_ and staticGraph_, and calls
// resumeFill(params).  A std::exception from the graph constructor is caught
// and re-reported through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC with
// std::runtime_error.
// NOTE(review): the embedded numbering skips 219 and 247, so this listing
// may be missing lines of the definition.
218  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
220  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
221  const Teuchos::RCP<const map_type>& colMap,
222  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
223  const Teuchos::RCP<Teuchos::ParameterList>& params) :
224  dist_object_type (rowMap)
225  {
226  const char tfecfFuncName[] =
227  "CrsMatrix(RCP<const Map>, RCP<const Map>, "
228  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
229  Teuchos::RCP<crs_graph_type> graph;
230  try {
231  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
232  numEntPerRowToAlloc,
233  params));
234  }
235  catch (std::exception& e) {
236  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
237  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
238  "RCP<const Map>, ArrayView<const size_t>[, "
239  "RCP<ParameterList>]) threw an exception: " << e.what ());
240  }
241  // myGraph_ not null means that the matrix owns the graph. That's
242  // different than the const CrsGraph constructor, where the matrix
243  // does _not_ own the graph.
244  myGraph_ = graph;
245  staticGraph_ = graph;
246  resumeFill (params);
248  }
249 
250 
// Constructor: (graph[, params]) -- the params argument is unnamed and unused.
// Stores the caller's graph in staticGraph_ (myGraph_ is not assigned in the
// visible lines), sets storageStatus_ to Details::STORAGE_1D_PACKED, and
// allocates a values_type array with graph->lclIndsPacked_wdv.extent(0)
// entries.  valuesPacked_wdv wraps that array and valuesUnpacked_wdv is set
// to the same wrapper.
// Requires a non-null, fill-complete graph; otherwise a std::runtime_error
// is raised through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
// When Details::Behavior::verbose("CrsMatrix") is true, progress lines
// ("Start", "Allocate values: N", "Done") are written to std::cerr.
// NOTE(review): the member initializer evaluates graph->getRowMap() before
// the body's graph.is_null() test runs, so a null graph is dereferenced
// before that check can report it -- confirm how RCP::operator-> behaves on
// null in the intended build configuration.
// NOTE(review): the embedded numbering skips 252 and 300, so this listing
// may be missing lines of the definition.
251  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
253  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
254  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
255  dist_object_type (graph->getRowMap ()),
256  staticGraph_ (graph),
257  storageStatus_ (Details::STORAGE_1D_PACKED)
258  {
259  using std::endl;
260  typedef typename local_matrix_device_type::values_type values_type;
261  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>[, "
262  "RCP<ParameterList>]): ";
263  const bool verbose = Details::Behavior::verbose("CrsMatrix");
264 
265  std::unique_ptr<std::string> prefix;
266  if (verbose) {
267  prefix = this->createPrefix("CrsMatrix", "CrsMatrix(graph,params)");
268  std::ostringstream os;
269  os << *prefix << "Start" << endl;
270  std::cerr << os.str ();
271  }
272 
273  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
274  (graph.is_null (), std::runtime_error, "Input graph is null.");
275  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
276  (! graph->isFillComplete (), std::runtime_error, "Input graph "
277  "is not fill complete. You must call fillComplete on the "
278  "graph before using it to construct a CrsMatrix. Note that "
279  "calling resumeFill on the graph makes it not fill complete, "
280  "even if you had previously called fillComplete. In that "
281  "case, you must call fillComplete on the graph again.");
282 
283  // The graph is fill complete, so it is locally indexed and has a
284  // fixed structure. This means we can allocate the (1-D) array of
285  // values and build the local matrix right now. Note that the
286  // local matrix's number of columns comes from the column Map, not
287  // the domain Map.
288 
289  const size_t numEnt = graph->lclIndsPacked_wdv.extent (0);
290  if (verbose) {
291  std::ostringstream os;
292  os << *prefix << "Allocate values: " << numEnt << endl;
293  std::cerr << os.str ();
294  }
295 
296  values_type val ("Tpetra::CrsMatrix::values", numEnt);
297  valuesPacked_wdv = values_wdv_type(val);
298  valuesUnpacked_wdv = valuesPacked_wdv;
299 
301 
302  if (verbose) {
303  std::ostringstream os;
304  os << *prefix << "Done" << endl;
305  std::cerr << os.str ();
306  }
307  }
308 
// Constructor that takes its values from an existing object named `matrix`
// together with a caller-supplied graph.
// Stores the graph in staticGraph_ and copies matrix.storageStatus_.
// valuesPacked_wdv is built from matrix.valuesPacked_wdv with the arguments
// (0, graph->lclIndsPacked_wdv.extent(0)); valuesUnpacked_wdv is built the
// same way from matrix.valuesUnpacked_wdv and
// graph->lclIndsUnpacked_wdv.extent(0) -- presumably a sub-range of the
// source storage; confirm against the values_wdv_type constructor.
// Requires a non-null, fill-complete graph; otherwise a std::runtime_error
// is raised through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
// NOTE(review): the embedded numbering skips 310-311 and 336-337; the
// declaration of `matrix` is among the lines absent from this listing.
// NOTE(review): graph->getRowMap() in the initializer list is evaluated
// before the body's graph.is_null() test.
// NOTE(review): tfecfFuncName names a values_type argument, which does not
// appear in the visible parameter list -- looks copied from the
// (graph, values) constructor; confirm.
309  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
312  const Teuchos::RCP<const crs_graph_type>& graph,
313  const Teuchos::RCP<Teuchos::ParameterList>& params) :
314  dist_object_type (graph->getRowMap ()),
315  staticGraph_ (graph),
316  storageStatus_ (matrix.storageStatus_)
317  {
318  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
319  "local_matrix_device_type::values_type, "
320  "[,RCP<ParameterList>]): ";
321  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
322  (graph.is_null (), std::runtime_error, "Input graph is null.");
323  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
324  (! graph->isFillComplete (), std::runtime_error, "Input graph "
325  "is not fill complete. You must call fillComplete on the "
326  "graph before using it to construct a CrsMatrix. Note that "
327  "calling resumeFill on the graph makes it not fill complete, "
328  "even if you had previously called fillComplete. In that "
329  "case, you must call fillComplete on the graph again.");
330 
331  size_t numValuesPacked = graph->lclIndsPacked_wdv.extent(0);
332  valuesPacked_wdv = values_wdv_type(matrix.valuesPacked_wdv, 0, numValuesPacked);
333 
334  size_t numValuesUnpacked = graph->lclIndsUnpacked_wdv.extent(0);
335  valuesUnpacked_wdv = values_wdv_type(matrix.valuesUnpacked_wdv, 0, numValuesUnpacked);
336 
338  }
339 
340 
// Constructor: (graph, values[, params]) -- the params argument is unnamed
// and unused.
// Stores the caller's graph in staticGraph_, sets storageStatus_ to
// Details::STORAGE_1D_PACKED, wraps the supplied `values` array in
// valuesPacked_wdv, and sets valuesUnpacked_wdv to the same wrapper.
// Requires a non-null, fill-complete graph; otherwise a std::runtime_error
// is raised through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
// NOTE(review): graph->getRowMap() in the initializer list is evaluated
// before the body's graph.is_null() test.
// NOTE(review): the visible lines do not compare values.extent(0) with the
// graph's entry count -- confirm whether that is checked elsewhere.
// NOTE(review): the embedded numbering skips 342 and 378, so this listing
// may be missing lines of the definition.
341  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
343  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
344  const typename local_matrix_device_type::values_type& values,
345  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
346  dist_object_type (graph->getRowMap ()),
347  staticGraph_ (graph),
348  storageStatus_ (Details::STORAGE_1D_PACKED)
349  {
350  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
351  "local_matrix_device_type::values_type, "
352  "[,RCP<ParameterList>]): ";
353  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
354  (graph.is_null (), std::runtime_error, "Input graph is null.");
355  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
356  (! graph->isFillComplete (), std::runtime_error, "Input graph "
357  "is not fill complete. You must call fillComplete on the "
358  "graph before using it to construct a CrsMatrix. Note that "
359  "calling resumeFill on the graph makes it not fill complete, "
360  "even if you had previously called fillComplete. In that "
361  "case, you must call fillComplete on the graph again.");
362 
363  // The graph is fill complete, so it is locally indexed and has a
364  // fixed structure. This means we can allocate the (1-D) array of
365  // values and build the local matrix right now. Note that the
366  // local matrix's number of columns comes from the column Map, not
367  // the domain Map.
368 
369  valuesPacked_wdv = values_wdv_type(values);
370  valuesUnpacked_wdv = valuesPacked_wdv;
371 
372  // FIXME (22 Jun 2016) I would very much like to get rid of
373  // k_values1D_ at some point. I find it confusing to have all
374  // these extra references lying around.
375  // KDDKDD ALMOST THERE, MARK!
376 // k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
377 
379  }
380 
// Constructor: (rowMap, colMap, rowPointers, columnIndices, values[, params]).
// Validates the three input arrays, creates a new crs_graph_type from
// (rowMap, colMap, rowPointers, columnIndices, params), stores it in both
// myGraph_ and staticGraph_, and wraps `values` in valuesPacked_wdv
// (valuesUnpacked_wdv is set to the same wrapper).  storageStatus_ is
// Details::STORAGE_1D_PACKED.
//
// Checks performed, all through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC:
//  - std::invalid_argument if values.extent(0) != columnIndices.extent(0);
//  - only when Details::Behavior::debug("CrsMatrix") is true and rowPointers
//    is non-empty: std::invalid_argument if the last entry of rowPointers
//    differs from the length of columnIndices or values;
//  - std::runtime_error wrapping any std::exception from the graph
//    constructor;
//  - std::logic_error if the new graph's local row_map / entries extents do
//    not match the inputs.
// When Details::Behavior::verbose("CrsMatrix") is true, "Start" and "Done"
// lines are written to std::cerr.
// NOTE(review): the embedded numbering skips 382 and 484, so this listing
// may be missing lines of the definition.
381  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
383  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
384  const Teuchos::RCP<const map_type>& colMap,
385  const typename local_graph_device_type::row_map_type& rowPointers,
386  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
387  const typename local_matrix_device_type::values_type& values,
388  const Teuchos::RCP<Teuchos::ParameterList>& params) :
389  dist_object_type (rowMap),
390  storageStatus_ (Details::STORAGE_1D_PACKED)
391  {
392  using Details::getEntryOnHost;
393  using Teuchos::RCP;
394  using std::endl;
395  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
396  "RCP<const Map>, ptr, ind, val[, params]): ";
397  const char suffix[] =
398  ". Please report this bug to the Tpetra developers.";
399  const bool debug = Details::Behavior::debug("CrsMatrix");
400  const bool verbose = Details::Behavior::verbose("CrsMatrix");
401 
402  std::unique_ptr<std::string> prefix;
403  if (verbose) {
404  prefix = this->createPrefix(
405  "CrsMatrix", "CrsMatrix(rowMap,colMap,ptr,ind,val[,params])");
406  std::ostringstream os;
407  os << *prefix << "Start" << endl;
408  std::cerr << os.str ();
409  }
410 
411  // Check the user's input. Note that this might throw only on
412  // some processes but not others, causing deadlock. We prefer
413  // deadlock due to exceptions to segfaults, because users can
414  // catch exceptions.
415  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
416  (values.extent(0) != columnIndices.extent(0),
417  std::invalid_argument, "values.extent(0)=" << values.extent(0)
418  << " != columnIndices.extent(0) = " << columnIndices.extent(0)
419  << ".");
420  if (debug && rowPointers.extent(0) != 0) {
421  const size_t numEnt =
422  getEntryOnHost(rowPointers, rowPointers.extent(0) - 1);
423  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
424  (numEnt != size_t(columnIndices.extent(0)) ||
425  numEnt != size_t(values.extent(0)),
426  std::invalid_argument, "Last entry of rowPointers says that "
427  "the matrix has " << numEnt << " entr"
428  << (numEnt != 1 ? "ies" : "y") << ", but the dimensions of "
429  "columnIndices and values don't match this. "
430  "columnIndices.extent(0)=" << columnIndices.extent (0)
431  << " and values.extent(0)=" << values.extent (0) << ".");
432  }
433 
434  RCP<crs_graph_type> graph;
435  try {
436  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, rowPointers,
437  columnIndices, params));
438  }
439  catch (std::exception& e) {
440  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
441  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
442  "RCP<const Map>, ptr, ind[, params]) threw an exception: "
443  << e.what ());
444  }
445  // The newly created CrsGraph _must_ have a local graph at this
446  // point. We don't really care whether CrsGraph's constructor
447  // deep-copies or shallow-copies the input, but the dimensions
448  // have to be right. That's how we tell whether the CrsGraph has
449  // a local graph.
450  auto lclGraph = graph->getLocalGraphDevice ();
451  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
452  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
453  lclGraph.entries.extent (0) != columnIndices.extent (0),
454  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, "
455  "ind[, params]) did not set the local graph correctly." << suffix);
456  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
457  (lclGraph.entries.extent (0) != values.extent (0),
458  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, ind[, "
459  "params]) did not set the local graph correctly. "
460  "lclGraph.entries.extent(0) = " << lclGraph.entries.extent (0)
461  << " != values.extent(0) = " << values.extent (0) << suffix);
462 
463  // myGraph_ not null means that the matrix owns the graph. This
464  // is true because the column indices come in as nonconst,
465  // implying shared ownership.
466  myGraph_ = graph;
467  staticGraph_ = graph;
468 
469  // The graph may not be fill complete yet. However, it is locally
470  // indexed (since we have a column Map) and has a fixed structure
471  // (due to the input arrays). This means we can allocate the
472  // (1-D) array of values and build the local matrix right now.
473  // Note that the local matrix's number of columns comes from the
474  // column Map, not the domain Map.
475 
476  valuesPacked_wdv = values_wdv_type(values);
477  valuesUnpacked_wdv = valuesPacked_wdv;
478 
479  // FIXME (22 Jun 2016) I would very much like to get rid of
480  // k_values1D_ at some point. I find it confusing to have all
481  // these extra references lying around.
482 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
483 
485  if (verbose) {
486  std::ostringstream os;
487  os << *prefix << "Done" << endl;
488  std::cerr << os.str();
489  }
490  }
491 
// Constructor: (rowMap, colMap, ptr, ind, val[, params]) taking
// Teuchos::ArrayRCP inputs.
// Creates a new crs_graph_type from (rowMap, colMap, ptr, ind, params) and
// stores it in both myGraph_ and staticGraph_.  The values are obtained by
// passing val, reinterpreted as impl_scalar_type, to
// getKokkosViewDeepCopy<device_type>; the result is wrapped in
// valuesPacked_wdv and valuesUnpacked_wdv is set to the same wrapper.
// storageStatus_ is Details::STORAGE_1D_PACKED.
// A std::exception from the graph constructor is re-reported with
// std::runtime_error; a mismatch between the new graph's local extents and
// ptr.size()/ind.size() is reported with std::logic_error.
// NOTE(review): unlike the Kokkos-view overload, the visible lines do not
// compare val.size() with ind.size() -- confirm whether that is intended.
// NOTE(review): the embedded numbering skips 493 and 557, so this listing
// may be missing lines of the definition.
492  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
494  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
495  const Teuchos::RCP<const map_type>& colMap,
496  const Teuchos::ArrayRCP<size_t>& ptr,
497  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
498  const Teuchos::ArrayRCP<Scalar>& val,
499  const Teuchos::RCP<Teuchos::ParameterList>& params) :
500  dist_object_type (rowMap),
501  storageStatus_ (Details::STORAGE_1D_PACKED)
502  {
503  using Kokkos::Compat::getKokkosViewDeepCopy;
504  using Teuchos::av_reinterpret_cast;
505  using Teuchos::RCP;
506  using values_type = typename local_matrix_device_type::values_type;
507  using IST = impl_scalar_type;
508  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
509  "RCP<const Map>, ptr, ind, val[, params]): ";
510 
511  RCP<crs_graph_type> graph;
512  try {
513  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, ptr,
514  ind, params));
515  }
516  catch (std::exception& e) {
517  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
518  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
519  "RCP<const Map>, ArrayRCP<size_t>, ArrayRCP<LocalOrdinal>[, "
520  "RCP<ParameterList>]) threw an exception: " << e.what ());
521  }
522  // myGraph_ not null means that the matrix owns the graph. This
523  // is true because the column indices come in as nonconst,
524  // implying shared ownership.
525  myGraph_ = graph;
526  staticGraph_ = graph;
527 
528  // The graph may not be fill complete yet. However, it is locally
529  // indexed (since we have a column Map) and has a fixed structure
530  // (due to the input arrays). This means we can allocate the
531  // (1-D) array of values and build the local matrix right now.
532  // Note that the local matrix's number of columns comes from the
533  // column Map, not the domain Map.
534 
535  // The graph _must_ have a local graph at this point. We don't
536  // really care whether CrsGraph's constructor deep-copies or
537  // shallow-copies the input, but the dimensions have to be right.
538  // That's how we tell whether the CrsGraph has a local graph.
539  auto lclGraph = staticGraph_->getLocalGraphDevice ();
540  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
541  (size_t (lclGraph.row_map.extent (0)) != size_t (ptr.size ()) ||
542  size_t (lclGraph.entries.extent (0)) != size_t (ind.size ()),
543  std::logic_error, "CrsGraph's constructor (rowMap, colMap, "
544  "ptr, ind[, params]) did not set the local graph correctly. "
545  "Please report this bug to the Tpetra developers.");
546 
547  values_type valIn =
548  getKokkosViewDeepCopy<device_type> (av_reinterpret_cast<IST> (val ()));
549  valuesPacked_wdv = values_wdv_type(valIn);
550  valuesUnpacked_wdv = valuesPacked_wdv;
551 
552  // FIXME (22 Jun 2016) I would very much like to get rid of
553  // k_values1D_ at some point. I find it confusing to have all
554  // these extra references lying around.
555 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
556 
558  }
559 
// Constructor: (rowMap, colMap, lclMatrix[, params]).
// Initializes fillComplete_ to true and storageStatus_ to
// Details::STORAGE_1D_PACKED, creates a new crs_graph_type from
// (rowMap, colMap, lclMatrix.graph, params), stores it in both myGraph_ and
// staticGraph_, and wraps lclMatrix.values in valuesPacked_wdv
// (valuesUnpacked_wdv is set to the same wrapper).
// A std::exception from the graph constructor is re-reported with
// std::runtime_error.  std::logic_error is reported if the new graph is not
// fill complete, or if at the end isFillActive() is true or
// isFillComplete() is false.
// NOTE(review): the embedded numbering skips 561 and 608, so this listing
// may be missing lines of the definition.
560  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
562  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
563  const Teuchos::RCP<const map_type>& colMap,
564  const local_matrix_device_type& lclMatrix,
565  const Teuchos::RCP<Teuchos::ParameterList>& params) :
566  dist_object_type (rowMap),
567  storageStatus_ (Details::STORAGE_1D_PACKED),
568  fillComplete_ (true)
569  {
570  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
571  "RCP<const Map>, local_matrix_device_type[, RCP<ParameterList>]): ";
572  const char suffix[] =
573  " Please report this bug to the Tpetra developers.";
574 
575  Teuchos::RCP<crs_graph_type> graph;
576  try {
577  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
578  lclMatrix.graph, params));
579  }
580  catch (std::exception& e) {
581  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
582  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
583  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) threw an "
584  "exception: " << e.what ());
585  }
586  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
587  (!graph->isFillComplete (), std::logic_error, "CrsGraph constructor (RCP"
588  "<const Map>, RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) "
589  "did not produce a fill-complete graph. Please report this bug to the "
590  "Tpetra developers.");
591  // myGraph_ not null means that the matrix owns the graph. This
592  // is true because the column indices come in as nonconst through
593  // the matrix, implying shared ownership.
594  myGraph_ = graph;
595  staticGraph_ = graph;
596 
597  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
598  valuesUnpacked_wdv = valuesPacked_wdv;
599 
600  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
601  (isFillActive (), std::logic_error,
602  "At the end of a CrsMatrix constructor that should produce "
603  "a fillComplete matrix, isFillActive() is true." << suffix);
604  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
605  (! isFillComplete (), std::logic_error, "At the end of a "
606  "CrsMatrix constructor that should produce a fillComplete "
607  "matrix, isFillComplete() is false." << suffix);
609  }
610 
// Constructor taking a local matrix plus row, column, domain and range Maps.
// Initializes fillComplete_ to true and storageStatus_ to
// Details::STORAGE_1D_PACKED, creates a new crs_graph_type from
// (lclMatrix.graph, rowMap, colMap, domainMap, rangeMap, params), stores it
// in both myGraph_ and staticGraph_, and wraps lclMatrix.values in
// valuesPacked_wdv (valuesUnpacked_wdv is set to the same wrapper).
// A std::exception from the graph constructor is re-reported with
// std::runtime_error.  std::logic_error is reported if the new graph is not
// fill complete, or if at the end isFillActive() is true or
// isFillComplete() is false.
// NOTE(review): the embedded numbering skips 612-613 and 662; the
// declaration of `lclMatrix` is among the lines absent from this listing.
611  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
614  const Teuchos::RCP<const map_type>& rowMap,
615  const Teuchos::RCP<const map_type>& colMap,
616  const Teuchos::RCP<const map_type>& domainMap,
617  const Teuchos::RCP<const map_type>& rangeMap,
618  const Teuchos::RCP<Teuchos::ParameterList>& params) :
619  dist_object_type (rowMap),
620  storageStatus_ (Details::STORAGE_1D_PACKED),
621  fillComplete_ (true)
622  {
623  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
624  "RCP<const Map>, RCP<const Map>, RCP<const Map>, "
625  "local_matrix_device_type[, RCP<ParameterList>]): ";
626  const char suffix[] =
627  " Please report this bug to the Tpetra developers.";
628 
629  Teuchos::RCP<crs_graph_type> graph;
630  try {
631  graph = Teuchos::rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
632  domainMap, rangeMap, params));
633  }
634  catch (std::exception& e) {
635  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
636  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
637  "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_device_type[, "
638  "RCP<ParameterList>]) threw an exception: " << e.what ());
639  }
640  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
641  (! graph->isFillComplete (), std::logic_error, "CrsGraph "
642  "constructor (RCP<const Map>, RCP<const Map>, RCP<const Map>, "
643  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) did "
644  "not produce a fillComplete graph." << suffix);
645  // myGraph_ not null means that the matrix owns the graph. This
646  // is true because the column indices come in as nonconst through
647  // the matrix, implying shared ownership.
648  myGraph_ = graph;
649  staticGraph_ = graph;
650 
651  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
652  valuesUnpacked_wdv = valuesPacked_wdv;
653 
654  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
655  (isFillActive (), std::logic_error,
656  "At the end of a CrsMatrix constructor that should produce "
657  "a fillComplete matrix, isFillActive() is true." << suffix);
658  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
659  (! isFillComplete (), std::logic_error, "At the end of a "
660  "CrsMatrix constructor that should produce a fillComplete "
661  "matrix, isFillComplete() is false." << suffix);
663  }
664 
// Constructor taking a local matrix, the four Maps, and caller-supplied
// Import and Export objects.
// Initializes fillComplete_ to true and storageStatus_ to
// Details::STORAGE_1D_PACKED, creates a new crs_graph_type from
// (lclMatrix.graph, rowMap, colMap, domainMap, rangeMap, importer, exporter,
// params), stores it in both myGraph_ and staticGraph_, and wraps
// lclMatrix.values in valuesPacked_wdv (valuesUnpacked_wdv is set to the
// same wrapper).
// A std::exception from the graph constructor is re-reported with
// std::runtime_error.  std::logic_error is reported if the new graph is not
// fill complete, or if at the end isFillActive() is true or
// isFillComplete() is false.
// NOTE(review): the embedded numbering skips 666-667 and 719; the
// declaration of `lclMatrix` is among the lines absent from this listing.
665  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
668  const Teuchos::RCP<const map_type>& rowMap,
669  const Teuchos::RCP<const map_type>& colMap,
670  const Teuchos::RCP<const map_type>& domainMap,
671  const Teuchos::RCP<const map_type>& rangeMap,
672  const Teuchos::RCP<const import_type>& importer,
673  const Teuchos::RCP<const export_type>& exporter,
674  const Teuchos::RCP<Teuchos::ParameterList>& params) :
675  dist_object_type (rowMap),
676  storageStatus_ (Details::STORAGE_1D_PACKED),
677  fillComplete_ (true)
678  {
679  using Teuchos::rcp;
680  const char tfecfFuncName[] = "Tpetra::CrsMatrix"
681  "(lclMat,Map,Map,Map,Map,Import,Export,params): ";
682  const char suffix[] =
683  " Please report this bug to the Tpetra developers.";
684 
685  Teuchos::RCP<crs_graph_type> graph;
686  try {
687  graph = rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
688  domainMap, rangeMap, importer,
689  exporter, params));
690  }
691  catch (std::exception& e) {
692  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
693  (true, std::runtime_error, "CrsGraph constructor "
694  "(local_graph_device_type, Map, Map, Map, Map, Import, Export, "
695  "params) threw: " << e.what ());
696  }
697  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
698  (!graph->isFillComplete (), std::logic_error, "CrsGraph "
699  "constructor (local_graph_device_type, Map, Map, Map, Map, Import, "
700  "Export, params) did not produce a fill-complete graph. "
701  "Please report this bug to the Tpetra developers.");
702  // myGraph_ not null means that the matrix owns the graph. This
703  // is true because the column indices come in as nonconst through
704  // the matrix, implying shared ownership.
705  myGraph_ = graph;
706  staticGraph_ = graph;
707 
708  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
709  valuesUnpacked_wdv = valuesPacked_wdv;
710 
711  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
712  (isFillActive (), std::logic_error,
713  "At the end of a CrsMatrix constructor that should produce "
714  "a fillComplete matrix, isFillActive() is true." << suffix);
715  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
716  (! isFillComplete (), std::logic_error, "At the end of a "
717  "CrsMatrix constructor that should produce a fillComplete "
718  "matrix, isFillComplete() is false." << suffix);
720  }
721 
// Constructor that builds a matrix from an existing one (`source`), sharing
// its graph and either copying or aliasing its values.
// staticGraph_ is set to source.getCrsGraph() and storageStatus_ is copied
// from source.  The source must report isFillComplete(); otherwise
// std::invalid_argument is raised.
//  - copyOrView == Teuchos::Copy: allocates a new values array of the same
//    length without initializing it, Kokkos::deep_copy's the source's device
//    values into it, and wraps it in valuesPacked_wdv / valuesUnpacked_wdv.
//  - copyOrView == Teuchos::View: wraps source.valuesPacked_wdv and
//    source.valuesUnpacked_wdv directly.
//  - any other value: std::invalid_argument.
// Both valid branches finish by calling fillComplete with the source's
// domain and range Maps.
// NOTE(review): the first check tests source.isFillComplete() (the matrix)
// but its message says "Source graph" -- confirm the intended wording.
// NOTE(review): the embedded numbering skips 723-724 and 761; the
// declaration of `source` is among the lines absent from this listing.
722  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
725  const Teuchos::DataAccess copyOrView):
726  dist_object_type (source.getCrsGraph()->getRowMap ()),
727  staticGraph_ (source.getCrsGraph()),
728  storageStatus_ (source.storageStatus_)
729  {
730  const char tfecfFuncName[] = "Tpetra::CrsMatrix("
731  "const CrsMatrix&, const Teuchos::DataAccess): ";
732  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
733  (! source.isFillComplete (), std::invalid_argument,
734  "Source graph must be fillComplete().");
735 
736  if (copyOrView == Teuchos::Copy) {
737  using values_type = typename local_matrix_device_type::values_type;
738  auto vals = source.getLocalValuesDevice (Access::ReadOnly);
739  using Kokkos::view_alloc;
740  using Kokkos::WithoutInitializing;
741  values_type newvals (view_alloc ("val", WithoutInitializing),
742  vals.extent (0));
743  // DEEP_COPY REVIEW - DEVICE-TO_DEVICE
744  Kokkos::deep_copy (newvals, vals);
745  valuesPacked_wdv = values_wdv_type(newvals);
746  valuesUnpacked_wdv = valuesPacked_wdv;
747  fillComplete (source.getDomainMap (), source.getRangeMap ());
748  }
749  else if (copyOrView == Teuchos::View) {
750  valuesPacked_wdv = values_wdv_type(source.valuesPacked_wdv);
751  valuesUnpacked_wdv = values_wdv_type(source.valuesUnpacked_wdv);
752  fillComplete (source.getDomainMap (), source.getRangeMap ());
753  }
754  else {
755  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
756  (true, std::invalid_argument, "Second argument 'copyOrView' "
757  "has an invalid value " << copyOrView << ". Valid values "
758  "include Teuchos::Copy = " << Teuchos::Copy << " and "
759  "Teuchos::View = " << Teuchos::View << ".");
760  }
762  }
763 
764  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
765  void
768  {
769  std::swap(crs_matrix.importMV_, this->importMV_);
770  std::swap(crs_matrix.exportMV_, this->exportMV_);
771  std::swap(crs_matrix.staticGraph_, this->staticGraph_);
772  std::swap(crs_matrix.myGraph_, this->myGraph_);
773  std::swap(crs_matrix.valuesPacked_wdv, this->valuesPacked_wdv);
774  std::swap(crs_matrix.valuesUnpacked_wdv, this->valuesUnpacked_wdv);
775  std::swap(crs_matrix.storageStatus_, this->storageStatus_);
776  std::swap(crs_matrix.fillComplete_, this->fillComplete_);
777  std::swap(crs_matrix.nonlocals_, this->nonlocals_);
778  }
779 
// Returns the communicator reported by the graph, i.e.
// getCrsGraphRef().getComm().
// NOTE(review): embedded line 782 is absent from this listing.
780  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
781  Teuchos::RCP<const Teuchos::Comm<int> >
783  getComm () const {
784  return getCrsGraphRef ().getComm ();
785  }
786 
// Returns the current value of the fillComplete_ flag.
// NOTE(review): embedded line 789 is absent from this listing.
787  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
788  bool
790  isFillComplete () const {
791  return fillComplete_;
792  }
793 
// Returns the negation of the fillComplete_ flag, so it is always the
// opposite of isFillComplete().
// NOTE(review): embedded line 796 is absent from this listing.
794  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
795  bool
797  isFillActive () const {
798  return ! fillComplete_;
799  }
800 
801  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
802  bool
805  return this->getCrsGraphRef ().isStorageOptimized ();
806  }
807 
808  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
809  bool
812  return getCrsGraphRef ().isLocallyIndexed ();
813  }
814 
815  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
816  bool
819  return getCrsGraphRef ().isGloballyIndexed ();
820  }
821 
// Forwards to the graph: returns getCrsGraphRef().hasColMap().
// NOTE(review): embedded line 824 is absent from this listing.
822  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
823  bool
825  hasColMap () const {
826  return getCrsGraphRef ().hasColMap ();
827  }
828 
829  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
833  return getCrsGraphRef ().getGlobalNumEntries ();
834  }
835 
836  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
837  size_t
840  return getCrsGraphRef ().getLocalNumEntries ();
841  }
842 
843  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
847  return getCrsGraphRef ().getGlobalNumRows ();
848  }
849 
850  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
854  return getCrsGraphRef ().getGlobalNumCols ();
855  }
856 
// Forwards to the graph's getLocalNumRows() via getCrsGraphRef(); the
// result is returned as size_t.
// NOTE(review): listing lines 859-860 (presumably the class qualifier and
// this function's name/signature) are absent from this extract; the name
// is inferred from the forwarded call -- confirm against the real source.
857  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
858  size_t
861  return getCrsGraphRef ().getLocalNumRows ();
862  }
863 
864 
// Forwards to the graph's getLocalNumCols() via getCrsGraphRef(); the
// result is returned as size_t.
// NOTE(review): listing lines 867-868 (presumably the class qualifier and
// this function's name/signature) are absent from this extract; the name
// is inferred from the forwarded call -- confirm against the real source.
865  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
866  size_t
869  return getCrsGraphRef ().getLocalNumCols ();
870  }
871 
872 
// Returns what the graph reports as the number of entries in the row
// identified by the global index globalRow; a pure forwarder to
// getCrsGraphRef().getNumEntriesInGlobalRow(globalRow).
873  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
874  size_t
876  getNumEntriesInGlobalRow (GlobalOrdinal globalRow) const {
877  return getCrsGraphRef ().getNumEntriesInGlobalRow (globalRow);
878  }
879 
// Returns what the graph reports as the number of entries in the row
// identified by the local index localRow; a pure forwarder to
// getCrsGraphRef().getNumEntriesInLocalRow(localRow).
880  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
881  size_t
883  getNumEntriesInLocalRow (LocalOrdinal localRow) const {
884  return getCrsGraphRef ().getNumEntriesInLocalRow (localRow);
885  }
886 
// Forwards to the graph's getGlobalMaxNumRowEntries() via
// getCrsGraphRef(); the result is returned as size_t.
// NOTE(review): listing lines 889-890 (presumably the class qualifier and
// this function's name/signature) are absent from this extract; the name
// is inferred from the forwarded call -- confirm against the real source.
887  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
888  size_t
891  return getCrsGraphRef ().getGlobalMaxNumRowEntries ();
892  }
893 
// Forwards to the graph's getLocalMaxNumRowEntries() via
// getCrsGraphRef(); the result is returned as size_t.
// NOTE(review): listing lines 896-897 (presumably the class qualifier and
// this function's name/signature) are absent from this extract; the name
// is inferred from the forwarded call -- confirm against the real source.
894  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
895  size_t
898  return getCrsGraphRef ().getLocalMaxNumRowEntries ();
899  }
900 
// Returns the index base of the row Map, i.e.
// getRowMap()->getIndexBase().  The RCP returned by getRowMap() is
// dereferenced without a null check.
901  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
902  GlobalOrdinal
904  getIndexBase () const {
905  return getRowMap ()->getIndexBase ();
906  }
907 
// Returns the graph's row Map; a pure forwarder to
// getCrsGraphRef().getRowMap().
908  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
909  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
911  getRowMap () const {
912  return getCrsGraphRef ().getRowMap ();
913  }
914 
// Returns the graph's column Map; a pure forwarder to
// getCrsGraphRef().getColMap().
915  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
916  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
918  getColMap () const {
919  return getCrsGraphRef ().getColMap ();
920  }
921 
// Returns the graph's domain Map; a pure forwarder to
// getCrsGraphRef().getDomainMap().
922  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
923  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
925  getDomainMap () const {
926  return getCrsGraphRef ().getDomainMap ();
927  }
928 
// Returns the graph's range Map; a pure forwarder to
// getCrsGraphRef().getRangeMap().
929  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
930  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
932  getRangeMap () const {
933  return getCrsGraphRef ().getRangeMap ();
934  }
935 
// Returns the matrix's graph as an RCP to const RowGraph: staticGraph_
// when it is non-null, otherwise myGraph_.  No check is made that
// myGraph_ is non-null, so the returned RCP may be null if both are.
936  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
937  Teuchos::RCP<const RowGraph<LocalOrdinal, GlobalOrdinal, Node> >
939  getGraph () const {
940  if (staticGraph_ != Teuchos::null) {
941  return staticGraph_;
942  }
943  return myGraph_;
944  }
945 
// Returns the matrix's graph as an RCP to const CrsGraph: staticGraph_
// when it is non-null, otherwise myGraph_.  No check is made that
// myGraph_ is non-null, so the returned RCP may be null if both are.
946  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
947  Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
949  getCrsGraph () const {
950  if (staticGraph_ != Teuchos::null) {
951  return staticGraph_;
952  }
953  return myGraph_;
954  }
955 
// Returns the matrix's graph by dereferencing an RCP: *staticGraph_ when
// staticGraph_ is non-null, otherwise *myGraph_.
//
// Only when HAVE_TPETRA_DEBUG is defined is myGraph_ tested for null
// before it is dereferenced; in that case the
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC check below reports
// std::logic_error.  Without HAVE_TPETRA_DEBUG the fallback dereference
// is unchecked.
//
// NOTE(review): listing lines 957-958 (presumably the return type and the
// class qualifier) are absent from this extract -- restore them from the
// real source.
956  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
959  getCrsGraphRef () const
960  {
// Compile-time switch: the null check on myGraph_ is only active in
// HAVE_TPETRA_DEBUG builds.
961 #ifdef HAVE_TPETRA_DEBUG
962  constexpr bool debug = true;
963 #else
964  constexpr bool debug = false;
965 #endif // HAVE_TPETRA_DEBUG
966 
967  if (! this->staticGraph_.is_null ()) {
968  return * (this->staticGraph_);
969  }
970  else {
971  if (debug) {
// tfecfFuncName is not referenced by name in this function; presumably
// the exception macro below picks it up -- confirm in the Teuchos docs.
972  const char tfecfFuncName[] = "getCrsGraphRef: ";
973  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
974  (this->myGraph_.is_null (), std::logic_error,
975  "Both staticGraph_ and myGraph_ are null. "
976  "Please report this bug to the Tpetra developers.");
977  }
978  return * (this->myGraph_);
979  }
980  }
981 
// Builds and returns a local_matrix_device_type object labeled
// "Tpetra::CrsMatrix::lclMatrixDevice" from three pieces:
//   - the number of local elements of staticGraph_'s column Map,
//   - the device view of valuesPacked_wdv, requested with
//     Access::ReadWrite,
//   - staticGraph_->getLocalGraphDevice().
// staticGraph_ and its column Map are dereferenced without null checks.
//
// NOTE(review): listing lines 984-985 (presumably the class qualifier and
// this function's name/signature) are absent from this extract; the name
// getLocalMatrixDevice is inferred from the label and the sibling
// getLocalMatrixHost -- confirm against the real source.
982  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
983  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type
986  {
987  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
988  return local_matrix_device_type("Tpetra::CrsMatrix::lclMatrixDevice",
989  numCols,
990  valuesPacked_wdv.getDeviceView(Access::ReadWrite),
991  staticGraph_->getLocalGraphDevice());
992  }
993 
// Builds and returns a local_matrix_host_type object labeled
// "Tpetra::CrsMatrix::lclMatrixHost" from three pieces:
//   - the number of local elements of staticGraph_'s column Map,
//   - the host view of valuesPacked_wdv, requested with
//     Access::ReadWrite,
//   - staticGraph_->getLocalGraphHost().
// staticGraph_ and its column Map are dereferenced without null checks.
994  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
995  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type
997  getLocalMatrixHost () const
998  {
999  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
1000  return local_matrix_host_type("Tpetra::CrsMatrix::lclMatrixHost", numCols,
1001  valuesPacked_wdv.getHostView(Access::ReadWrite),
1002  staticGraph_->getLocalGraphHost());
1003  }
1004 
// Returns true exactly when myGraph_ is null.
1005  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1006  bool
1008  isStaticGraph () const {
1009  return myGraph_.is_null ();
1010  }
1011 
// Unconditionally returns true.
// NOTE(review): listing lines 1014-1015 (presumably the class qualifier
// and this function's name/signature) are absent from this extract, so
// which predicate this is cannot be determined from here.  It may be
// hasTransposeApply -- confirm against the real source.
1012  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1013  bool
1016  return true;
1017  }
1018 
// Unconditionally returns true.
// NOTE(review): listing lines 1021-1022 (presumably the class qualifier
// and this function's name/signature) are absent from this extract, so
// which predicate this is cannot be determined from here.  It may be
// supportsRowViews -- confirm against the real source.
1019  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1020  bool
1023  return true;
1024  }
1025 
// Allocates the matrix's unpacked values array and, when asked to, the
// graph's index storage first.
//
// Parameters:
//   lg      - passed through to myGraph_->allocateIndices(); in this
//             function it is otherwise only compared against
//             LocalIndices to build the verbose message.
//   gas     - GraphNotYetAllocated makes this function call
//             myGraph_->allocateIndices(lg, verbose) before allocating
//             values; any other value skips that call.
//   verbose - when true, status lines are written to std::cerr.
//
// Effect: valuesUnpacked_wdv is replaced by a values_wdv_type wrapping a
// new view labeled "Tpetra::CrsMatrix::values" with
// staticGraph_->getLocalAllocationSize() entries.
//
// Errors: any exception escaping allocateIndices() is caught and reported
// as std::runtime_error.  When Behavior::debug("CrsMatrix") is true,
// inconsistencies between gas, staticGraph_ and myGraph_ are reported as
// std::logic_error.
//
// NOTE(review): this listing skips its line 1028 (presumably the
// CrsMatrix<...>:: qualifier) and its line 1033 (presumably the
// using-declaration that lets the unqualified ProfilingRegion below
// resolve) -- restore both from the real source.
1026  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1027  void
1029  allocateValues (ELocalGlobal lg, GraphAllocationStatus gas,
1030  const bool verbose)
1031  {
1032  using Details::Behavior;
1034  using std::endl;
1035  const char tfecfFuncName[] = "allocateValues: ";
1036  const char suffix[] =
1037  " Please report this bug to the Tpetra developers.";
1038  ProfilingRegion region("Tpetra::CrsMatrix::allocateValues");
1039 
// prefix stays null unless verbose is true; every later dereference of
// it is guarded by the same verbose test.
1040  std::unique_ptr<std::string> prefix;
1041  if (verbose) {
1042  prefix = this->createPrefix("CrsMatrix", "allocateValues");
1043  std::ostringstream os;
1044  os << *prefix << "lg: "
1045  << (lg == LocalIndices ? "Local" : "Global") << "Indices"
1046  << ", gas: Graph"
1047  << (gas == GraphAlreadyAllocated ? "Already" : "NotYet")
1048  << "Allocated" << endl;
1049  std::cerr << os.str();
1050  }
1051 
1052  const bool debug = Behavior::debug("CrsMatrix");
1053  if (debug) {
1054  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1055  (this->staticGraph_.is_null (), std::logic_error,
1056  "staticGraph_ is null." << suffix);
1057 
1058  // If the graph indices are already allocated, then gas should be
1059  // GraphAlreadyAllocated. Otherwise, gas should be
1060  // GraphNotYetAllocated.
1061  if ((gas == GraphAlreadyAllocated) !=
1062  staticGraph_->indicesAreAllocated ()) {
// The three fragments below are combined, with "not " inserted at
// different positions, to produce the two possible mismatch messages.
1063  const char err1[] = "The caller has asserted that the graph "
1064  "is ";
1065  const char err2[] = "already allocated, but the static graph "
1066  "says that its indices are ";
1067  const char err3[] = "already allocated. ";
1068  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1069  (gas == GraphAlreadyAllocated &&
1070  ! staticGraph_->indicesAreAllocated (), std::logic_error,
1071  err1 << err2 << "not " << err3 << suffix);
1072  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1073  (gas != GraphAlreadyAllocated &&
1074  staticGraph_->indicesAreAllocated (), std::logic_error,
1075  err1 << "not " << err2 << err3 << suffix);
1076  }
1077 
1078  // If the graph is unallocated, then it had better be a
1079  // matrix-owned graph. ("Matrix-owned graph" means that the
1080  // matrix gets to define the graph structure. If the CrsMatrix
1081  // constructor that takes an RCP<const CrsGraph> was used, then
1082  // the matrix does _not_ own the graph.)
1083  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1084  (! this->staticGraph_->indicesAreAllocated () &&
1085  this->myGraph_.is_null (), std::logic_error,
1086  "The static graph says that its indices are not allocated, "
1087  "but the graph is not owned by the matrix." << suffix);
1088  }
1089 
1090  if (gas == GraphNotYetAllocated) {
1091  if (debug) {
1092  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1093  (this->myGraph_.is_null (), std::logic_error,
1094  "gas = GraphNotYetAllocated, but myGraph_ is null." << suffix);
1095  }
// Whatever allocateIndices() throws is reported as std::runtime_error
// with this function's name prepended; the original exception type is
// not preserved.
1096  try {
1097  this->myGraph_->allocateIndices (lg, verbose);
1098  }
1099  catch (std::exception& e) {
1100  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1101  (true, std::runtime_error, "CrsGraph::allocateIndices "
1102  "threw an exception: " << e.what ());
1103  }
1104  catch (...) {
1105  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1106  (true, std::runtime_error, "CrsGraph::allocateIndices "
1107  "threw an exception not a subclass of std::exception.");
1108  }
1109  }
1110 
1111  // Allocate matrix values.
1112  const size_t lclTotalNumEntries = this->staticGraph_->getLocalAllocationSize();
1113  if (debug) {
// Debug check: the last unpacked row offset must equal the allocation
// size used for the values array.
// NOTE(review): the message below spells the member as
// "rowPtrsUnapcked_host" (sic).  It is a runtime string, so it is left
// untouched here; consider correcting the spelling in a code change.
1114  const size_t lclNumRows = this->staticGraph_->getLocalNumRows ();
1115  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1116  (this->staticGraph_->getRowPtrsUnpackedHost()(lclNumRows) != lclTotalNumEntries, std::logic_error,
1117  "length of staticGraph's lclIndsUnpacked does not match final entry of rowPtrsUnapcked_host." << suffix);
1118  }
1119 
1120  // Allocate array of (packed???) matrix values.
// The array assigned below is valuesUnpacked_wdv, sized by the graph's
// allocation size rather than by its number of entries.
1121  using values_type = typename local_matrix_device_type::values_type;
1122  if (verbose) {
1123  std::ostringstream os;
1124  os << *prefix << "Allocate values_wdv: Pre "
1125  << valuesUnpacked_wdv.extent(0) << ", post "
1126  << lclTotalNumEntries << endl;
1127  std::cerr << os.str();
1128  }
1129 // this->k_values1D_ =
1130  valuesUnpacked_wdv = values_wdv_type(
1131  values_type("Tpetra::CrsMatrix::values",
1132  lclTotalNumEntries));
1133  }
1134 
1135 
// Finalizes the local compressed-sparse-row storage of both the graph
// (myGraph_) and the matrix values.
//
// Parameter:
//   params - may be null.  When non-null, its "Optimize Storage" entry is
//            read (default: ! isStaticGraph() ||
//            staticGraph_->isStorageOptimized()); when null, that default
//            is used directly.
//
// Steps, in the order performed below:
//   1. If myGraph_->getLocalNumEntries() differs from
//      myGraph_->getLocalAllocationSize(), the storage is treated as
//      unpacked: packed row offsets are computed from
//      myGraph_->k_numRowEntries_ with computeOffsetsFromCounts, packed
//      index and value arrays are allocated, and two Kokkos::parallel_for
//      launches over the local rows copy indices and values with
//      pack_functor.  The results become the graph's packed row pointers,
//      myGraph_->lclIndsPacked_wdv and valuesPacked_wdv.
//   2. Otherwise the packed members are assigned from the unpacked ones.
//   3. If optimized storage is requested, the unpacked members are
//      overwritten with the packed ones, k_numRowEntries_ is reset to a
//      default-constructed view, and storageStatus_ of both the graph and
//      the matrix is set to Details::STORAGE_1D_PACKED.
//
// Diagnostics: consistency checks run only when
// Details::Behavior::debug("CrsMatrix") is true and report
// std::logic_error; status lines go to std::cerr only when
// Details::Behavior::verbose("CrsMatrix") is true.
//
// NOTE(review): this listing skips its line 1138 (presumably the
// CrsMatrix<...>:: qualifier) and its line 1141 (presumably a
// using-declaration; computeOffsetsFromCounts is called unqualified
// further down) -- restore both from the real source.
1136  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1137  void
1139  fillLocalGraphAndMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1140  {
1142  using ::Tpetra::Details::getEntryOnHost;
1143  using Teuchos::arcp_const_cast;
1144  using Teuchos::Array;
1145  using Teuchos::ArrayRCP;
1146  using Teuchos::null;
1147  using Teuchos::RCP;
1148  using Teuchos::rcp;
1149  using std::endl;
1150  using row_map_type = typename local_graph_device_type::row_map_type;
1151  using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
1152  using values_type = typename local_matrix_device_type::values_type;
1153  Details::ProfilingRegion regionFLGAM
1154  ("Tpetra::CrsMatrix::fillLocalGraphAndMatrix");
1155 
1156  const char tfecfFuncName[] = "fillLocalGraphAndMatrix (called from "
1157  "fillComplete or expertStaticFillComplete): ";
1158  const char suffix[] =
1159  " Please report this bug to the Tpetra developers.";
1160  const bool debug = Details::Behavior::debug("CrsMatrix");
1161  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1162 
// prefix stays null unless verbose is true; every later dereference of
// it is guarded by the same verbose test.
1163  std::unique_ptr<std::string> prefix;
1164  if (verbose) {
1165  prefix = this->createPrefix("CrsMatrix", "fillLocalGraphAndMatrix");
1166  std::ostringstream os;
1167  os << *prefix << endl;
1168  std::cerr << os.str ();
1169  }
1170 
1171  if (debug) {
1172  // fillComplete() only calls fillLocalGraphAndMatrix() if the
1173  // matrix owns the graph, which means myGraph_ is not null.
1174  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1175  (myGraph_.is_null (), std::logic_error, "The nonconst graph "
1176  "(myGraph_) is null. This means that the matrix has a "
1177  "const (a.k.a. \"static\") graph. fillComplete or "
1178  "expertStaticFillComplete should never call "
1179  "fillLocalGraphAndMatrix in that case." << suffix);
1180  }
1181 
1182  const size_t lclNumRows = this->getLocalNumRows ();
1183 
1184  // This method's goal is to fill in the three arrays (compressed
1185  // sparse row format) that define the sparse graph's and matrix's
1186  // structure, and the sparse matrix's values.
1187  //
1188  // Get references to the data in myGraph_, so we can modify them
1189  // as well. Note that we only call fillLocalGraphAndMatrix() if
1190  // the matrix owns the graph, which means myGraph_ is not null.
1191 
1192  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1193  // See: https://github.com/trilinos/Trilinos/issues/12237
1194  //using row_entries_type = decltype (myGraph_->k_numRowEntries_);
1195  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1196 
// curRowOffsets is the graph's current (unpacked) device row-offset
// array; it is the "from" offsets for both packing functors below.
1197  typename Graph::local_graph_device_type::row_map_type curRowOffsets =
1198  myGraph_->rowPtrsUnpacked_dev_;
1199 
1200  if (debug) {
1201  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1202  (curRowOffsets.extent (0) == 0, std::logic_error,
1203  "curRowOffsets.extent(0) == 0.");
1204  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1205  (curRowOffsets.extent (0) != lclNumRows + 1, std::logic_error,
1206  "curRowOffsets.extent(0) = "
1207  << curRowOffsets.extent (0) << " != lclNumRows + 1 = "
1208  << (lclNumRows + 1) << ".");
// NOTE(review): valToCheck is read at index numOffsets - 1, but the
// message below prints the index as numOffsets.  Also, the
// "numOffsets != 0" term looks redundant here because a zero extent was
// already rejected by the first check in this block.  Both are runtime
// behavior, so left untouched -- confirm and fix in a code change.
1209  const size_t numOffsets = curRowOffsets.extent (0);
1210  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1211  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1212  (numOffsets != 0 &&
1213  myGraph_->lclIndsUnpacked_wdv.extent (0) != valToCheck,
1214  std::logic_error, "numOffsets = " <<
1215  numOffsets << " != 0 and myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1216  << myGraph_->lclIndsUnpacked_wdv.extent (0) << " != curRowOffsets("
1217  << numOffsets << ") = " << valToCheck << ".");
1218  }
1219 
// Packing is needed only when the number of stored entries differs from
// the allocated size.
1220  if (myGraph_->getLocalNumEntries() !=
1221  myGraph_->getLocalAllocationSize()) {
1222 
1223  // Use the nonconst version of row_map_type for k_ptrs,
1224  // because row_map_type is const and we need to modify k_ptrs here.
1225  typename row_map_type::non_const_type k_ptrs;
1226  row_map_type k_ptrs_const;
1227  lclinds_1d_type k_inds;
1228  values_type k_vals;
1229 
1230  if (verbose) {
1231  std::ostringstream os;
1232  const auto numEnt = myGraph_->getLocalNumEntries();
1233  const auto allocSize = myGraph_->getLocalAllocationSize();
1234  os << *prefix << "Unpacked 1-D storage: numEnt=" << numEnt
1235  << ", allocSize=" << allocSize << endl;
1236  std::cerr << os.str ();
1237  }
1238  // The matrix's current 1-D storage is "unpacked." This means
1239  // the row offsets may differ from what the final row offsets
1240  // should be. This could happen, for example, if the user
1241  // set an upper
1242  // bound on the number of entries per row, but didn't fill all
1243  // those entries.
1244  if (debug && curRowOffsets.extent (0) != 0) {
1245  const size_t numOffsets =
1246  static_cast<size_t> (curRowOffsets.extent (0));
1247  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1248  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1249  (static_cast<size_t> (valToCheck) !=
1250  static_cast<size_t> (valuesUnpacked_wdv.extent (0)),
1251  std::logic_error, "(unpacked branch) Before "
1252  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1253  << ") = " << valToCheck << " != valuesUnpacked_wdv.extent(0)"
1254  " = " << valuesUnpacked_wdv.extent (0) << ".");
1255  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1256  (static_cast<size_t> (valToCheck) !=
1257  static_cast<size_t> (myGraph_->lclIndsUnpacked_wdv.extent (0)),
1258  std::logic_error, "(unpacked branch) Before "
1259  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1260  << ") = " << valToCheck
1261  << " != myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1262  << myGraph_->lclIndsUnpacked_wdv.extent (0) << ".");
1263  }
1264  // Pack the row offsets into k_ptrs, by doing a sum-scan of
1265  // the array of valid entry counts per row.
1266 
1267  // Total number of entries in the matrix on the calling
1268  // process. We will compute this in the loop below. It's
1269  // cheap to compute and useful as a sanity check.
1270  size_t lclTotalNumEntries = 0;
1271  {
// NOTE(review): the comments in this scope say k_ptrs is const and
// refer to a "loop above", but k_ptrs is declared with
// row_map_type::non_const_type and no loop is visible here; the
// comments look stale.  k_ptrs_const is the const-typed handle that is
// later handed to setRowPtrsPacked().
1272  // Allocate the packed row offsets array. We use a nonconst
1273  // temporary (packedRowOffsets) here, because k_ptrs is
1274  // const. We will assign packedRowOffsets to k_ptrs below.
1275  if (verbose) {
1276  std::ostringstream os;
1277  os << *prefix << "Allocate packed row offsets: "
1278  << (lclNumRows+1) << endl;
1279  std::cerr << os.str ();
1280  }
1281  typename row_map_type::non_const_type
1282  packedRowOffsets ("Tpetra::CrsGraph::ptr", lclNumRows + 1);
1283  typename row_entries_type::const_type numRowEnt_h =
1284  myGraph_->k_numRowEntries_;
1285  // We're computing offsets on device. This function can
1286  // handle numRowEnt_h being a host View.
1287  lclTotalNumEntries =
1288  computeOffsetsFromCounts (packedRowOffsets, numRowEnt_h);
1289  // packedRowOffsets is modifiable; k_ptrs isn't, so we have
1290  // to use packedRowOffsets in the loop above and assign here.
1291  k_ptrs = packedRowOffsets;
1292  k_ptrs_const = k_ptrs;
1293  }
1294 
1295  if (debug) {
1296  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1297  (static_cast<size_t> (k_ptrs.extent (0)) != lclNumRows + 1,
1298  std::logic_error,
1299  "(unpacked branch) After packing k_ptrs, "
1300  "k_ptrs.extent(0) = " << k_ptrs.extent (0) << " != "
1301  "lclNumRows+1 = " << (lclNumRows+1) << ".");
1302  const auto valToCheck = getEntryOnHost (k_ptrs, lclNumRows);
1303  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1304  (valToCheck != lclTotalNumEntries, std::logic_error,
1305  "(unpacked branch) After filling k_ptrs, "
1306  "k_ptrs(lclNumRows=" << lclNumRows << ") = " << valToCheck
1307  << " != total number of entries on the calling process = "
1308  << lclTotalNumEntries << ".");
1309  }
1310 
1311  // Allocate the arrays of packed column indices and values.
1312  if (verbose) {
1313  std::ostringstream os;
1314  os << *prefix << "Allocate packed local column indices: "
1315  << lclTotalNumEntries << endl;
1316  std::cerr << os.str ();
1317  }
1318  k_inds = lclinds_1d_type ("Tpetra::CrsGraph::lclInds", lclTotalNumEntries);
1319  if (verbose) {
1320  std::ostringstream os;
1321  os << *prefix << "Allocate packed values: "
1322  << lclTotalNumEntries << endl;
1323  std::cerr << os.str ();
1324  }
1325  k_vals = values_type ("Tpetra::CrsMatrix::values", lclTotalNumEntries);
1326 
1327  // curRowOffsets (myGraph_->rowPtrsUnpacked_) (???), lclIndsUnpacked_wdv,
1328  // and valuesUnpacked_wdv are currently unpacked. Pack them, using
1329  // the packed row offsets array k_ptrs that we created above.
1330  //
1331  // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
1332  // need to keep around the unpacked row offsets, column
1333  // indices, and values arrays.
1334 
1335  // Pack the column indices from unpacked lclIndsUnpacked_wdv into
1336  // packed k_inds. We will replace lclIndsUnpacked_wdv below.
// pack_functor arguments, in order: destination array, source array,
// destination (packed) offsets, source (unpacked) offsets.
1337  using inds_packer_type = pack_functor<
1338  typename Graph::local_graph_device_type::entries_type::non_const_type,
1339  typename Graph::local_inds_dualv_type::t_dev::const_type,
1340  typename Graph::local_graph_device_type::row_map_type::non_const_type,
1341  typename Graph::local_graph_device_type::row_map_type>;
1342  inds_packer_type indsPacker (
1343  k_inds,
1344  myGraph_->lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
1345  k_ptrs, curRowOffsets);
1346  using exec_space = typename decltype (k_inds)::execution_space;
1347  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1348  Kokkos::parallel_for
1349  ("Tpetra::CrsMatrix pack column indices",
1350  range_type (0, lclNumRows), indsPacker);
1351 
1352  // Pack the values from unpacked valuesUnpacked_wdv into packed
1353  // k_vals. We will replace valuesPacked_wdv below.
1354  using vals_packer_type = pack_functor<
1355  typename values_type::non_const_type,
1356  typename values_type::const_type,
1357  typename row_map_type::non_const_type,
1358  typename row_map_type::const_type>;
1359  vals_packer_type valsPacker (
1360  k_vals,
1361  this->valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1362  k_ptrs, curRowOffsets);
1363  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1364  range_type (0, lclNumRows), valsPacker);
1365 
1366  if (debug) {
// NOTE(review): this label mentions "Optimize Storage"=true, but the
// enclosing branch is selected by numEntries != allocationSize, not by
// that parameter -- the wording may mislead; confirm before relying on
// it when reading error output.
1367  const char myPrefix[] = "(\"Optimize Storage\""
1368  "=true branch) After packing, ";
1369  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1370  (k_ptrs.extent (0) == 0, std::logic_error, myPrefix
1371  << "k_ptrs.extent(0) = 0. This probably means that "
1372  "rowPtrsUnpacked_ was never allocated.");
1373  if (k_ptrs.extent (0) != 0) {
1374  const size_t numOffsets (k_ptrs.extent (0));
1375  const auto valToCheck =
1376  getEntryOnHost (k_ptrs, numOffsets - 1);
1377  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1378  (size_t (valToCheck) != k_vals.extent (0),
1379  std::logic_error, myPrefix <<
1380  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1381  " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
1382  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1383  (size_t (valToCheck) != k_inds.extent (0),
1384  std::logic_error, myPrefix <<
1385  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1386  " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
1387  }
1388  }
1389  // Build the local graph.
1390  myGraph_->setRowPtrsPacked(k_ptrs_const);
1391  myGraph_->lclIndsPacked_wdv =
1392  typename crs_graph_type::local_inds_wdv_type(k_inds);
1393  valuesPacked_wdv = values_wdv_type(k_vals);
1394  }
1395  else { // We don't have to pack, so just set the pointers.
1396  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1397  // FIXME? This is already done in the graph fill call - need to avoid the memcpy to host
// The packed members are assigned from the unpacked ones here; no new
// arrays are allocated in this branch.
1398  myGraph_->rowPtrsPacked_dev_ = myGraph_->rowPtrsUnpacked_dev_;
1399  myGraph_->rowPtrsPacked_host_ = myGraph_->rowPtrsUnpacked_host_;
1400  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1401  myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
1402  valuesPacked_wdv = valuesUnpacked_wdv;
1403 
1404  if (verbose) {
1405  std::ostringstream os;
1406  os << *prefix << "Storage already packed: rowPtrsUnpacked_: "
1407  << myGraph_->getRowPtrsUnpackedHost().extent(0) << ", lclIndsUnpacked_wdv: "
1408  << myGraph_->lclIndsUnpacked_wdv.extent(0) << ", valuesUnpacked_wdv: "
1409  << valuesUnpacked_wdv.extent(0) << endl;
1410  std::cerr << os.str();
1411  }
1412 
1413  if (debug) {
// NOTE(review): this label mentions "Optimize Storage"=false, but the
// enclosing branch is the "already packed" case (numEntries ==
// allocationSize), and the messages below print "k_ptrs_const" although
// the value checked comes from rowPtrsUnpackedHost -- confirm wording.
1414  const char myPrefix[] =
1415  "(\"Optimize Storage\"=false branch) ";
1416  auto rowPtrsUnpackedHost = myGraph_->getRowPtrsUnpackedHost();
1417  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1418  (myGraph_->rowPtrsUnpacked_dev_.extent (0) == 0, std::logic_error, myPrefix
1419  << "myGraph->rowPtrsUnpacked_dev_.extent(0) = 0. This probably means "
1420  "that rowPtrsUnpacked_ was never allocated.");
1421  if (myGraph_->rowPtrsUnpacked_dev_.extent (0) != 0) {
1422  const size_t numOffsets = rowPtrsUnpackedHost.extent (0);
1423  const auto valToCheck = rowPtrsUnpackedHost(numOffsets - 1);
1424  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1425  (size_t (valToCheck) != valuesPacked_wdv.extent (0),
1426  std::logic_error, myPrefix <<
1427  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1428  << " != valuesPacked_wdv.extent(0) = "
1429  << valuesPacked_wdv.extent (0) << ".");
1430  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1431  (size_t (valToCheck) != myGraph_->lclIndsPacked_wdv.extent (0),
1432  std::logic_error, myPrefix <<
1433  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1434  << " != myGraph_->lclIndsPacked.extent(0) = "
1435  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1436  }
1437  }
1438  }
1439 
// Common post-condition check for both branches: the packed row
// pointers must have lclNumRows + 1 entries, and their last entry must
// equal the lengths of the packed values and packed indices.
1440  if (debug) {
1441  const char myPrefix[] = "After packing, ";
1442  auto rowPtrsPackedHost = myGraph_->getRowPtrsPackedHost();
1443  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1444  (size_t (rowPtrsPackedHost.extent (0)) != size_t (lclNumRows + 1),
1445  std::logic_error, myPrefix << "myGraph_->rowPtrsPacked_host_.extent(0) = "
1446  << rowPtrsPackedHost.extent (0) << " != lclNumRows+1 = " <<
1447  (lclNumRows+1) << ".");
1448  if (rowPtrsPackedHost.extent (0) != 0) {
1449  const size_t numOffsets (rowPtrsPackedHost.extent (0));
1450  const size_t valToCheck = rowPtrsPackedHost(numOffsets-1);
1451  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1452  (valToCheck != size_t (valuesPacked_wdv.extent (0)),
1453  std::logic_error, myPrefix << "k_ptrs_const(" <<
1454  (numOffsets-1) << ") = " << valToCheck
1455  << " != valuesPacked_wdv.extent(0) = "
1456  << valuesPacked_wdv.extent (0) << ".");
1457  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1458  (valToCheck != size_t (myGraph_->lclIndsPacked_wdv.extent (0)),
1459  std::logic_error, myPrefix << "k_ptrs_const(" <<
1460  (numOffsets-1) << ") = " << valToCheck
1461  << " != myGraph_->lclIndsPacked_wdvk_inds.extent(0) = "
1462  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1463  }
1464  }
1465 
1466  // May we ditch the old allocations for the packed (and otherwise
1467  // "optimized") allocations, later in this routine? Optimize
1468  // storage if the graph is not static, or if the graph already has
1469  // optimized storage.
1470  const bool defaultOptStorage =
1471  ! isStaticGraph () || staticGraph_->isStorageOptimized ();
// Equivalent to: use params' "Optimize Storage" value when params is
// non-null (falling back to defaultOptStorage as the get() default),
// and defaultOptStorage itself when params is null.
1472  const bool requestOptimizedStorage =
1473  (! params.is_null () &&
1474  params->get ("Optimize Storage", defaultOptStorage)) ||
1475  (params.is_null () && defaultOptStorage);
1476 
1477  // The graph has optimized storage when indices are allocated,
1478  // myGraph_->k_numRowEntries_ is empty, and there are more than
1479  // zero rows on this process.
1480  if (requestOptimizedStorage) {
1481  // Free the old, unpacked, unoptimized allocations.
1482  // Free graph data structures that are only needed for
1483  // unpacked 1-D storage.
1484  if (verbose) {
1485  std::ostringstream os;
1486  os << *prefix << "Optimizing storage: free k_numRowEntries_: "
1487  << myGraph_->k_numRowEntries_.extent(0) << endl;
1488  std::cerr << os.str();
1489  }
1490 
1491  myGraph_->k_numRowEntries_ = row_entries_type ();
1492 
1493  // Keep the new 1-D packed allocations.
1494  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1495  // We directly set the memory spaces to avoid a memcpy from device to host
1496  myGraph_->rowPtrsUnpacked_dev_ = myGraph_->rowPtrsPacked_dev_;
1497  myGraph_->rowPtrsUnpacked_host_ = myGraph_->rowPtrsPacked_host_;
1498  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1499  myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
1500  valuesUnpacked_wdv = valuesPacked_wdv;
1501 
1502  myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
1503  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1504  }
1505  else {
1506  if (verbose) {
1507  std::ostringstream os;
1508  os << *prefix << "User requested NOT to optimize storage"
1509  << endl;
1510  std::cerr << os.str();
1511  }
1512  }
1513  }
1514 
// Finalizes the matrix's values storage using the structure held by
// staticGraph_.  Only matrix members (valuesPacked_wdv,
// valuesUnpacked_wdv, storageStatus_) are assigned here; staticGraph_ is
// only read.
//
// Parameter:
//   params - may be null.  When non-null and its "Optimize Storage" entry
//            (default: ! isStaticGraph() ||
//            staticGraph_->isStorageOptimized()) is false, optimized
//            storage is not requested.
//
// Steps, in the order performed below:
//   1. Decide requestOptimizedStorage; it is forced to false when
//      staticGraph_->isStorageOptimized() is false.
//   2. If staticGraph_->getLocalNumEntries() differs from
//      staticGraph_->getLocalAllocationSize(), compute packed row offsets
//      from staticGraph_->k_numRowEntries_, allocate a packed values
//      array, copy values row by row with pack_functor in a
//      Kokkos::parallel_for, and store the result in valuesPacked_wdv.
//      Otherwise valuesPacked_wdv is assigned from valuesUnpacked_wdv.
//   3. If requestOptimizedStorage, valuesUnpacked_wdv is overwritten with
//      valuesPacked_wdv and storageStatus_ becomes
//      Details::STORAGE_1D_PACKED.
//
// Status lines go to std::cerr only when
// Details::Behavior::verbose("CrsMatrix") is true.
//
// NOTE(review): this listing skips its line 1517 (presumably the
// CrsMatrix<...>:: qualifier) -- restore it from the real source.
1515  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1516  void
1518  fillLocalMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1519  {
1520  using ::Tpetra::Details::ProfilingRegion;
1521  using Teuchos::ArrayRCP;
1522  using Teuchos::Array;
1523  using Teuchos::null;
1524  using Teuchos::RCP;
1525  using Teuchos::rcp;
1526  using std::endl;
1527  using row_map_type = typename Graph::local_graph_device_type::row_map_type;
1528  using non_const_row_map_type = typename row_map_type::non_const_type;
1529  using values_type = typename local_matrix_device_type::values_type;
1530  ProfilingRegion regionFLM("Tpetra::CrsMatrix::fillLocalMatrix");
1531  const size_t lclNumRows = getLocalNumRows();
1532 
1533  const bool verbose = Details::Behavior::verbose("CrsMatrix");
// prefix stays null unless verbose is true; every later dereference of
// it is guarded by the same verbose test.
1534  std::unique_ptr<std::string> prefix;
1535  if (verbose) {
1536  prefix = this->createPrefix("CrsMatrix", "fillLocalMatrix");
1537  std::ostringstream os;
1538  os << *prefix << "lclNumRows: " << lclNumRows << endl;
1539  std::cerr << os.str ();
1540  }
1541 
1542  // The goals of this routine are first, to allocate and fill
1543  // packed 1-D storage (see below for an explanation) in the vals
1544  // array, and second, to give vals to the local matrix and
1545  // finalize the local matrix. We only need k_ptrs, the packed 1-D
1546  // row offsets, within the scope of this routine, since we're only
1547  // filling the local matrix here (use fillLocalGraphAndMatrix() to
1548  // fill both the graph and the matrix at the same time).
1549 
1550  // get data from staticGraph_
// NOTE(review): k_rowPtrs is taken from rowPtrsPacked_dev_ and is later
// passed to pack_functor in the position where fillLocalGraphAndMatrix
// passes the graph's rowPtrsUnpacked_dev_ (the source offsets).  Whether
// the packed array is the intended source offsets here cannot be
// determined from this extract -- confirm.
1551  size_t nodeNumEntries = staticGraph_->getLocalNumEntries ();
1552  size_t nodeNumAllocated = staticGraph_->getLocalAllocationSize ();
1553  row_map_type k_rowPtrs = staticGraph_->rowPtrsPacked_dev_;
1554 
// NOTE(review): k_ptrs is assigned once below but never read afterwards
// in this function.
1555  row_map_type k_ptrs; // "packed" row offsets array
1556  values_type k_vals; // "packed" values array
1557 
1558  // May we ditch the old allocations for the packed (and otherwise
1559  // "optimized") allocations, later in this routine? Request
1560  // optimized storage by default.
1561  bool requestOptimizedStorage = true;
1562  const bool default_OptimizeStorage =
1563  ! isStaticGraph() || staticGraph_->isStorageOptimized();
1564  if (! params.is_null() &&
1565  ! params->get("Optimize Storage", default_OptimizeStorage)) {
1566  requestOptimizedStorage = false;
1567  }
1568  // If we're not allowed to change a static graph, then we can't
1569  // change the storage of the matrix, either. This means that if
1570  // the graph's storage isn't already optimized, we can't optimize
1571  // the matrix's storage either. Check and give warning, as
1572  // appropriate.
// NOTE(review): listing line 1575 -- the name of the macro or function
// that takes the parenthesized (true, std::runtime_error, "...")
// argument list below -- is absent from this extract.  The comment above
// says "give warning", so it is presumably a warning macro rather than a
// throwing one (the assignment after it would otherwise be unreachable)
// -- confirm against the real source.
1573  if (! staticGraph_->isStorageOptimized () &&
1574  requestOptimizedStorage) {
1576  (true, std::runtime_error, "You requested optimized storage "
1577  "by setting the \"Optimize Storage\" flag to \"true\" in "
1578  "the ParameterList, or by virtue of default behavior. "
1579  "However, the associated CrsGraph was filled separately and "
1580  "requested not to optimize storage. Therefore, the "
1581  "CrsMatrix cannot optimize storage.");
1582  requestOptimizedStorage = false;
1583  }
1584 
1585  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1586  // See: https://github.com/trilinos/Trilinos/issues/12237
1587  //using row_entries_type = decltype (staticGraph_->k_numRowEntries_);
1588  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1589 
1590  // The matrix's values are currently
1591  // stored in a 1-D format. However, this format is "unpacked";
1592  // it doesn't necessarily have the same row offsets as indicated
1593  // by the ptrs array returned by allocRowPtrs. This could
1594  // happen, for example, if the user
1595  // fixed the number of matrix entries in
1596  // each row, but didn't fill all those entries.
1597  //
1598  // As above, we don't need to keep the "packed" row offsets
1599  // array ptrs here, but we do need it here temporarily, so we
1600  // have to allocate it. We'll free ptrs later in this method.
1601  //
1602  // Note that this routine checks whether storage has already
1603  // been packed. This is a common case for solution of nonlinear
1604  // PDEs using the finite element method, as long as the
1605  // structure of the sparse matrix does not change between linear
1606  // solves.
1607  if (nodeNumEntries != nodeNumAllocated) {
1608  if (verbose) {
1609  std::ostringstream os;
1610  os << *prefix << "Unpacked 1-D storage: numEnt="
1611  << nodeNumEntries << ", allocSize=" << nodeNumAllocated
1612  << endl;
1613  std::cerr << os.str();
1614  }
1615  // We have to pack the 1-D storage, since the user didn't fill
1616  // up all requested storage.
1617  if (verbose) {
1618  std::ostringstream os;
1619  os << *prefix << "Allocate packed row offsets: "
1620  << (lclNumRows+1) << endl;
1621  std::cerr << os.str();
1622  }
1623  non_const_row_map_type tmpk_ptrs ("Tpetra::CrsGraph::ptr",
1624  lclNumRows+1);
1625  // Total number of entries in the matrix on the calling
1626  // process. We will compute this in the loop below. It's
1627  // cheap to compute and useful as a sanity check.
1628  size_t lclTotalNumEntries = 0;
1629  k_ptrs = tmpk_ptrs;
1630  {
1631  typename row_entries_type::const_type numRowEnt_h =
1632  staticGraph_->k_numRowEntries_;
1633  // This function can handle the counts being a host View.
1634  lclTotalNumEntries =
1635  Details::computeOffsetsFromCounts (tmpk_ptrs, numRowEnt_h);
1636  }
1637 
1638  // Allocate the "packed" values array.
1639  // It has exactly the right number of entries.
1640  if (verbose) {
1641  std::ostringstream os;
1642  os << *prefix << "Allocate packed values: "
1643  << lclTotalNumEntries << endl;
1644  std::cerr << os.str ();
1645  }
1646  k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1647 
1648  // Pack values_wdv into k_vals. We will replace values_wdv below.
// pack_functor arguments, in order: destination values, source values
// (read-only device view), destination offsets (tmpk_ptrs), source
// offsets (k_rowPtrs).
1649  pack_functor<
1650  typename values_type::non_const_type,
1651  typename values_type::const_type,
1652  typename row_map_type::non_const_type,
1653  typename row_map_type::const_type> valsPacker
1654  (k_vals, valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1655  tmpk_ptrs, k_rowPtrs);
1656 
1657  using exec_space = typename decltype (k_vals)::execution_space;
1658  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1659  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1660  range_type (0, lclNumRows), valsPacker);
1661  valuesPacked_wdv = values_wdv_type(k_vals);
1662  }
1663  else { // We don't have to pack, so just set the pointer.
1664  valuesPacked_wdv = valuesUnpacked_wdv;
1665  if (verbose) {
1666  std::ostringstream os;
1667  os << *prefix << "Storage already packed: "
1668  << "valuesUnpacked_wdv: " << valuesUnpacked_wdv.extent(0) << endl;
1669  std::cerr << os.str();
1670  }
1671  }
1672 
1673  // May we ditch the old allocations for the packed one?
1674  if (requestOptimizedStorage) {
1675  // The user requested optimized storage, so we can dump the
1676  // unpacked 1-D storage, and keep the packed storage.
1677  valuesUnpacked_wdv = valuesPacked_wdv;
1678 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
1679  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1680  }
1681  }
1682 
// Insert new column indices into one row of `graph`, then append the
// corresponding values to that row's value storage.
//
// NOTE(review): this listing skips source line 1685 between the return type
// and the function name -- presumably the
// `CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::` qualifier; confirm
// against the real file.
//
// graph       Graph that receives the indices via graph.insertIndices().
// rowInfo     Row descriptor; rowInfo.numEntries is read BEFORE the graph
//             insert and used as the write offset into oldRowVals.
// newInds     Indices to insert (forwarded unchanged to the graph).
// oldRowVals  The row's value storage; new values are written starting at
//             the row's pre-insertion entry count.
//             NOTE(review): assumes it has room for the inserted entries.
// newRowVals  Values to append; the first `numInserted` entries are copied.
// lg, I       Forwarded unchanged to graph.insertIndices().
1683  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1684  void
1686  insertIndicesAndValues (crs_graph_type& graph,
1687  RowInfo& rowInfo,
1688  const typename crs_graph_type::SLocalGlobalViews& newInds,
1689  const Teuchos::ArrayView<impl_scalar_type>& oldRowVals,
1690  const Teuchos::ArrayView<const impl_scalar_type>& newRowVals,
1691  const ELocalGlobal lg,
1692  const ELocalGlobal I)
1693  {
  // Remember the entry count before the graph changes it; it is the offset
  // at which the new values must land.
1694  const size_t oldNumEnt = rowInfo.numEntries;
1695  const size_t numInserted = graph.insertIndices (rowInfo, newInds, lg, I);
1696 
1697  // Use of memcpy here works around an issue with GCC >= 4.9.0,
1698  // that probably relates to scalar_type vs. impl_scalar_type
1699  // aliasing. See history of Tpetra_CrsGraph_def.hpp for
1700  // details; look for GCC_WORKAROUND macro definition.
  // The guard also keeps &newRowVals[0] from being evaluated when nothing
  // was inserted.
1701  if (numInserted > 0) {
1702  const size_t startOffset = oldNumEnt;
1703  memcpy ((void*) &oldRowVals[startOffset], &newRowVals[0],
1704  numInserted * sizeof (impl_scalar_type));
1705  }
1706  }
1707 
1708  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1709  void
1711  insertLocalValues (const LocalOrdinal lclRow,
1712  const Teuchos::ArrayView<const LocalOrdinal>& indices,
1713  const Teuchos::ArrayView<const Scalar>& values,
1714  const CombineMode CM)
1715  {
1716  using std::endl;
1717  const char tfecfFuncName[] = "insertLocalValues: ";
1718 
1719  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1720  (! this->isFillActive (), std::runtime_error,
1721  "Fill is not active. After calling fillComplete, you must call "
1722  "resumeFill before you may insert entries into the matrix again.");
1723  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1724  (this->isStaticGraph (), std::runtime_error,
1725  "Cannot insert indices with static graph; use replaceLocalValues() "
1726  "instead.");
1727  // At this point, we know that myGraph_ is nonnull.
1728  crs_graph_type& graph = * (this->myGraph_);
1729  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1730  (graph.colMap_.is_null (), std::runtime_error,
1731  "Cannot insert local indices without a column map.");
1732  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1733  (graph.isGloballyIndexed (),
1734  std::runtime_error, "Graph indices are global; use "
1735  "insertGlobalValues().");
1736  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1737  (values.size () != indices.size (), std::runtime_error,
1738  "values.size() = " << values.size ()
1739  << " != indices.size() = " << indices.size () << ".");
1740  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1741  ! graph.rowMap_->isNodeLocalElement (lclRow), std::runtime_error,
1742  "Local row index " << lclRow << " does not belong to this process.");
1743 
1744  if (! graph.indicesAreAllocated ()) {
1745  // We only allocate values at most once per process, so it's OK
1746  // to check TPETRA_VERBOSE here.
1747  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1748  this->allocateValues (LocalIndices, GraphNotYetAllocated, verbose);
1749  }
1750 
1751 #ifdef HAVE_TPETRA_DEBUG
1752  const size_t numEntriesToAdd = static_cast<size_t> (indices.size ());
1753  // In a debug build, test whether any of the given column indices
1754  // are not in the column Map. Keep track of the invalid column
1755  // indices so we can tell the user about them.
1756  {
1757  using Teuchos::toString;
1758 
1759  const map_type& colMap = * (graph.colMap_);
1760  Teuchos::Array<LocalOrdinal> badColInds;
1761  bool allInColMap = true;
1762  for (size_t k = 0; k < numEntriesToAdd; ++k) {
1763  if (! colMap.isNodeLocalElement (indices[k])) {
1764  allInColMap = false;
1765  badColInds.push_back (indices[k]);
1766  }
1767  }
1768  if (! allInColMap) {
1769  std::ostringstream os;
1770  os << "You attempted to insert entries in owned row " << lclRow
1771  << ", at the following column indices: " << toString (indices)
1772  << "." << endl;
1773  os << "Of those, the following indices are not in the column Map on "
1774  "this process: " << toString (badColInds) << "." << endl << "Since "
1775  "the matrix has a column Map already, it is invalid to insert "
1776  "entries at those locations.";
1777  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1778  (true, std::invalid_argument, os.str ());
1779  }
1780  }
1781 #endif // HAVE_TPETRA_DEBUG
1782 
1783  RowInfo rowInfo = graph.getRowInfo (lclRow);
1784 
1785  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1786  if (CM == ADD) {
1787  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1788  valsView[offset] += values[k]; };
1789  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1790  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1791  } else if (CM == INSERT) {
1792  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1793  valsView[offset] = values[k]; };
1794  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1795  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1796  } else {
1797  std::ostringstream os;
1798  os << "You attempted to use insertLocalValues with CombineMode " << combineModeToString(CM)
1799  << "but this has not been implemented." << endl;
1800  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1801  (true, std::invalid_argument, os.str ());
1802  }
1803  }
1804 
// Raw-array overload of insertLocalValues: wraps `cols` and `vals` (each of
// length `numEnt`) in non-owning Teuchos::ArrayView objects and forwards to
// the ArrayView overload with the same row and CombineMode.
//
// NOTE(review): this listing skips source line 1807 -- presumably the
// `CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::` qualifier; confirm.
// Note the argument order here is (vals, cols), while the ArrayView overload
// takes (indices, values).
1805  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1806  void
1808  insertLocalValues (const LocalOrdinal localRow,
1809  const LocalOrdinal numEnt,
1810  const Scalar vals[],
1811  const LocalOrdinal cols[],
1812  const CombineMode CM)
1813  {
1814  Teuchos::ArrayView<const LocalOrdinal> colsT (cols, numEnt);
1815  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
1816  this->insertLocalValues (localRow, colsT, valsT, CM);
1817  }
1818 
// Insert entries, addressed by global column index, into the row described
// by `rowInfo`, accumulating (+=) each input value into the slot the graph
// reports for it.
//
// NOTE(review): this listing skips source lines 1821-1822 -- presumably the
// `CrsMatrix<...>::` qualifier and `insertGlobalValuesImpl (crs_graph_type&
// graph,`.  The body uses `graph`, and insertGlobalValues() calls
// `this->insertGlobalValuesImpl (graph, rowInfo, ...)`.  Confirm against the
// real file.
//
// rowInfo      Row descriptor; refreshed in place if this call triggers the
//              lazy allocation of values.
// gblColInds   Global column indices of the entries (length numInputEnt).
// vals         Values of the entries (length numInputEnt).
// numInputEnt  Number of input entries.
//
// In debug builds, throws std::logic_error if the row's entry count after
// the insert does not equal the prior count plus the number inserted.
1819  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1820  void
1823  RowInfo& rowInfo,
1824  const GlobalOrdinal gblColInds[],
1825  const impl_scalar_type vals[],
1826  const size_t numInputEnt)
1827  {
1828 #ifdef HAVE_TPETRA_DEBUG
1829  const char tfecfFuncName[] = "insertGlobalValuesImpl: ";
1830  const size_t origNumEnt = graph.getNumEntriesInLocalRow (rowInfo.localRow);
1831  const size_t curNumEnt = rowInfo.numEntries;
1832 #endif // HAVE_TPETRA_DEBUG
1833 
1834  if (! graph.indicesAreAllocated ()) {
1835  // We only allocate values at most once per process, so it's OK
1836  // to check TPETRA_VERBOSE here.
1837  using ::Tpetra::Details::Behavior;
1838  const bool verbose = Behavior::verbose("CrsMatrix");
1839  this->allocateValues (GlobalIndices, GraphNotYetAllocated, verbose);
1840  // mfh 23 Jul 2017: allocateValues invalidates existing
1841  // getRowInfo results. Once we get rid of lazy graph
1842  // allocation, we'll be able to move the getRowInfo call outside
1843  // of this method.
1844  rowInfo = graph.getRowInfo (rowInfo.localRow);
1845  }
1846 
  // The graph invokes the callback with (k, start, offset); the callback
  // accumulates vals[k] into the row's value slot at `offset`.
1847  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1848  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset){
1849  valsView[offset] += vals[k];
1850  };
1851  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
  // The preprocessor splices "auto numInserted =" in front of the call below
  // only in debug builds; in release builds the return value is discarded.
1852 #ifdef HAVE_TPETRA_DEBUG
1853  //numInserted is only used inside the debug code below.
1854  auto numInserted =
1855 #endif
1856  graph.insertGlobalIndicesImpl(rowInfo, gblColInds, numInputEnt, cb);
1857 
1858 #ifdef HAVE_TPETRA_DEBUG
  // Sanity check: the graph's entry count for the row must have grown by
  // exactly numInserted.  On mismatch, build a diagnostic with the inputs
  // and the row's current contents, then throw.
1859  size_t newNumEnt = curNumEnt + numInserted;
1860  const size_t chkNewNumEnt =
1861  graph.getNumEntriesInLocalRow (rowInfo.localRow);
1862  if (chkNewNumEnt != newNumEnt) {
1863  std::ostringstream os;
1864  os << std::endl << "newNumEnt = " << newNumEnt
1865  << " != graph.getNumEntriesInLocalRow(" << rowInfo.localRow
1866  << ") = " << chkNewNumEnt << "." << std::endl
1867  << "\torigNumEnt: " << origNumEnt << std::endl
1868  << "\tnumInputEnt: " << numInputEnt << std::endl
1869  << "\tgblColInds: [";
1870  for (size_t k = 0; k < numInputEnt; ++k) {
1871  os << gblColInds[k];
1872  if (k + size_t (1) < numInputEnt) {
1873  os << ",";
1874  }
1875  }
1876  os << "]" << std::endl
1877  << "\tvals: [";
1878  for (size_t k = 0; k < numInputEnt; ++k) {
1879  os << vals[k];
1880  if (k + size_t (1) < numInputEnt) {
1881  os << ",";
1882  }
1883  }
1884  os << "]" << std::endl;
1885 
  // If row views are available, also print what the row holds now.
1886  if (this->supportsRowViews ()) {
1887  values_host_view_type vals2;
1888  if (this->isGloballyIndexed ()) {
1889  global_inds_host_view_type gblColInds2;
1890  const GlobalOrdinal gblRow =
1891  graph.rowMap_->getGlobalElement (rowInfo.localRow);
1892  if (gblRow ==
1893  Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid ()) {
1894  os << "Local row index " << rowInfo.localRow << " is invalid!"
1895  << std::endl;
1896  }
1897  else {
  // Fetching the view may itself throw; record that in the report
  // instead of losing the original diagnostic.
1898  bool getViewThrew = false;
1899  try {
1900  this->getGlobalRowView (gblRow, gblColInds2, vals2);
1901  }
1902  catch (std::exception& e) {
1903  getViewThrew = true;
1904  os << "getGlobalRowView threw exception:" << std::endl
1905  << e.what () << std::endl;
1906  }
1907  if (! getViewThrew) {
1908  os << "\tNew global column indices: ";
1909  for (size_t jjj = 0; jjj < gblColInds2.extent(0); jjj++)
1910  os << gblColInds2[jjj] << " ";
1911  os << std::endl;
1912  os << "\tNew values: ";
1913  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1914  os << vals2[jjj] << " ";
1915  os << std::endl;
1916  }
1917  }
1918  }
1919  else if (this->isLocallyIndexed ()) {
1920  local_inds_host_view_type lclColInds2;
1921  this->getLocalRowView (rowInfo.localRow, lclColInds2, vals2);
1922  os << "\tNew local column indices: ";
1923  for (size_t jjj = 0; jjj < lclColInds2.extent(0); jjj++)
1924  os << lclColInds2[jjj] << " ";
1925  os << std::endl;
1926  os << "\tNew values: ";
1927  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1928  os << vals2[jjj] << " ";
1929  os << std::endl;
1930  }
1931  }
1932 
1933  os << "Please report this bug to the Tpetra developers.";
1934  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1935  (true, std::logic_error, os.str ());
1936  }
1937 #endif // HAVE_TPETRA_DEBUG
1938  }
1939 
// Insert entries into the row with global index `gblRow`, addressed by
// global column index.
//
// If gblRow is not in the row Map on this process, the data are handed to
// insertNonownedGlobalValues().  Otherwise the row is owned: a static graph
// is rejected, every column index must be in the column Map (if there is
// one), and the entries go to insertGlobalValuesImpl().
//
// Throws std::runtime_error for an owned row with a static graph, and
// std::invalid_argument if an owned row's column index is not in the column
// Map.
//
// NOTE(review): this listing skips source line 1942 -- presumably the
// `CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::` qualifier; confirm.
// NOTE(review): the values/indices size check below runs only in debug
// builds, whereas insertLocalValues() checks unconditionally.  In a release
// build a shorter `values` could be read past its end through `inputVals`.
// Consider validating always.
1940  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1941  void
1943  insertGlobalValues (const GlobalOrdinal gblRow,
1944  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
1945  const Teuchos::ArrayView<const Scalar>& values)
1946  {
1947  using Teuchos::toString;
1948  using std::endl;
1949  typedef impl_scalar_type IST;
1950  typedef LocalOrdinal LO;
1951  typedef GlobalOrdinal GO;
1952  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
1953  typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
1954  const char tfecfFuncName[] = "insertGlobalValues: ";
1955 
1956 #ifdef HAVE_TPETRA_DEBUG
1957  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1958  (values.size () != indices.size (), std::runtime_error,
1959  "values.size() = " << values.size () << " != indices.size() = "
1960  << indices.size () << ".");
1961 #endif // HAVE_TPETRA_DEBUG
1962 
1963  // getRowMap() is not thread safe, because it increments RCP's
1964  // reference count. getCrsGraphRef() is thread safe.
1965  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
1966  const LO lclRow = rowMap.getLocalElement (gblRow);
1967 
1968  if (lclRow == OTLO::invalid ()) {
1969  // Input row is _not_ owned by the calling process.
1970  //
1971  // See a note (now deleted) from mfh 14 Dec 2012: If input row
1972  // is not in the row Map, it doesn't matter whether or not the
1973  // graph is static; the data just get stashed for later use by
1974  // globalAssemble().
1975  this->insertNonownedGlobalValues (gblRow, indices, values);
1976  }
1977  else { // Input row _is_ owned by the calling process
1978  if (this->isStaticGraph ()) {
1979  // Uh oh! Not allowed to insert into owned rows in that case.
1980  const int myRank = rowMap.getComm ()->getRank ();
1981  const int numProcs = rowMap.getComm ()->getSize ();
1982  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1983  (true, std::runtime_error,
1984  "The matrix was constructed with a constant (\"static\") graph, "
1985  "yet the given global row index " << gblRow << " is in the row "
1986  "Map on the calling process (with rank " << myRank << ", of " <<
1987  numProcs << " process(es)). In this case, you may not insert "
1988  "new entries into rows owned by the calling process.");
1989  }
1990 
  // The implementation works on impl_scalar_type, so reinterpret the
  // caller's Scalar array accordingly.
1991  crs_graph_type& graph = * (this->myGraph_);
1992  const IST* const inputVals =
1993  reinterpret_cast<const IST*> (values.getRawPtr ());
1994  const GO* const inputGblColInds = indices.getRawPtr ();
1995  const size_t numInputEnt = indices.size ();
1996  RowInfo rowInfo = graph.getRowInfo (lclRow);
1997 
1998  // If the matrix has a column Map, check at this point whether
1999  // the column indices belong to the column Map.
2000  //
2001  // FIXME (mfh 16 May 2013) We may want to consider deferring the
2002  // test to the CrsGraph method, since it may have to do this
2003  // anyway.
2004  if (! graph.colMap_.is_null ()) {
2005  const map_type& colMap = * (graph.colMap_);
2006  // In a debug build, keep track of the nonowned ("bad") column
2007  // indices, so that we can display them in the exception
2008  // message. In a release build, just ditch the loop early if
2009  // we encounter a nonowned column index.
2010 #ifdef HAVE_TPETRA_DEBUG
2011  Teuchos::Array<GO> badColInds;
2012 #endif // HAVE_TPETRA_DEBUG
2013  const size_type numEntriesToInsert = indices.size ();
2014  bool allInColMap = true;
2015  for (size_type k = 0; k < numEntriesToInsert; ++k) {
2016  if (! colMap.isNodeGlobalElement (indices[k])) {
2017  allInColMap = false;
2018 #ifdef HAVE_TPETRA_DEBUG
2019  badColInds.push_back (indices[k]);
2020 #else
2021  break;
2022 #endif // HAVE_TPETRA_DEBUG
2023  }
2024  }
2025  if (! allInColMap) {
2026  std::ostringstream os;
2027  os << "You attempted to insert entries in owned row " << gblRow
2028  << ", at the following column indices: " << toString (indices)
2029  << "." << endl;
2030 #ifdef HAVE_TPETRA_DEBUG
2031  os << "Of those, the following indices are not in the column Map "
2032  "on this process: " << toString (badColInds) << "." << endl
2033  << "Since the matrix has a column Map already, it is invalid "
2034  "to insert entries at those locations.";
2035 #else
2036  os << "At least one of those indices is not in the column Map "
2037  "on this process." << endl << "It is invalid to insert into "
2038  "columns not in the column Map on the process that owns the "
2039  "row.";
2040 #endif // HAVE_TPETRA_DEBUG
2041  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2042  (true, std::invalid_argument, os.str ());
2043  }
2044  }
2045 
2046  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2047  inputVals, numInputEnt);
2048  }
2049  }
2050 
2051 
// Raw-array overload of insertGlobalValues: wraps `inds` and `vals` (each of
// length `numEnt`) in non-owning Teuchos::ArrayView objects and forwards to
// the ArrayView overload.
//
// NOTE(review): this listing skips source line 2054 -- presumably the
// `CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::` qualifier; confirm.
// Note the argument order here is (vals, inds), while the ArrayView overload
// takes (indices, values).
2052  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2053  void
2055  insertGlobalValues (const GlobalOrdinal globalRow,
2056  const LocalOrdinal numEnt,
2057  const Scalar vals[],
2058  const GlobalOrdinal inds[])
2059  {
2060  Teuchos::ArrayView<const GlobalOrdinal> indsT (inds, numEnt);
2061  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
2062  this->insertGlobalValues (globalRow, indsT, valsT);
2063  }
2064 
2065 
// Like insertGlobalValues(), but for an owned row with a column Map, input
// entries whose global column index is not in the column Map are silently
// skipped instead of causing an exception.
//
// NOTE(review): this listing skips source lines 2068-2069 -- presumably the
// `CrsMatrix<...>::` qualifier and `insertGlobalValuesFiltered(`.  The name
// comes from tfecfFuncName below and the call in
// insertGlobalValuesFilteredChecked().  Confirm against the real file.
//
// gblRow   Global index of the target row.
// indices  Global column indices of the entries.
// values   Values of the entries.
// debug    If true, check that the two arrays have equal size and verify the
//          loop invariant after each run of indices.
//
// Throws std::runtime_error for an owned row with a static graph (and, when
// `debug` is set, for mismatched array sizes).
2066  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2067  void
2070  const GlobalOrdinal gblRow,
2071  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2072  const Teuchos::ArrayView<const Scalar>& values,
2073  const bool debug)
2074  {
2075  typedef impl_scalar_type IST;
2076  typedef LocalOrdinal LO;
2077  typedef GlobalOrdinal GO;
2078  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2079  const char tfecfFuncName[] = "insertGlobalValuesFiltered: ";
2080 
2081  if (debug) {
2082  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2083  (values.size () != indices.size (), std::runtime_error,
2084  "values.size() = " << values.size () << " != indices.size() = "
2085  << indices.size () << ".");
2086  }
2087 
2088  // getRowMap() is not thread safe, because it increments RCP's
2089  // reference count. getCrsGraphRef() is thread safe.
2090  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2091  const LO lclRow = rowMap.getLocalElement (gblRow);
2092  if (lclRow == OTLO::invalid ()) {
2093  // Input row is _not_ owned by the calling process.
2094  //
2095  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2096  // is not in the row Map, it doesn't matter whether or not the
2097  // graph is static; the data just get stashed for later use by
2098  // globalAssemble().
2099  this->insertNonownedGlobalValues (gblRow, indices, values);
2100  }
2101  else { // Input row _is_ owned by the calling process
2102  if (this->isStaticGraph ()) {
2103  // Uh oh! Not allowed to insert into owned rows in that case.
2104  const int myRank = rowMap.getComm ()->getRank ();
2105  const int numProcs = rowMap.getComm ()->getSize ();
2106  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2107  (true, std::runtime_error,
2108  "The matrix was constructed with a constant (\"static\") graph, "
2109  "yet the given global row index " << gblRow << " is in the row "
2110  "Map on the calling process (with rank " << myRank << ", of " <<
2111  numProcs << " process(es)). In this case, you may not insert "
2112  "new entries into rows owned by the calling process.");
2113  }
2114 
2115  crs_graph_type& graph = * (this->myGraph_);
2116  const IST* const inputVals =
2117  reinterpret_cast<const IST*> (values.getRawPtr ());
2118  const GO* const inputGblColInds = indices.getRawPtr ();
2119  const size_t numInputEnt = indices.size ();
2120  RowInfo rowInfo = graph.getRowInfo (lclRow);
2121 
2122  if (!graph.colMap_.is_null() && graph.isLocallyIndexed()) {
2123  // This branch is similar in function to the following branch, but for
2124  // the special case that the target graph is locally indexed.
2125  // In this case, we cannot simply filter
2126  // out global indices that don't exist on the receiving process and
2127  // insert the remaining (global) indices, but we must convert them (the
2128  // remaining global indices) to local and call `insertLocalValues`.
2129  const map_type& colMap = * (graph.colMap_);
2130  size_t curOffset = 0;
2131  while (curOffset < numInputEnt) {
2132  // Find a sequence of input indices that are in the column Map on the
2133  // calling process. Doing a sequence at a time, instead of one at a
2134  // time, amortizes some overhead.
2135  Teuchos::Array<LO> lclIndices;
2136  size_t endOffset = curOffset;
2137  for ( ; endOffset < numInputEnt; ++endOffset) {
2138  auto lclIndex = colMap.getLocalElement(inputGblColInds[endOffset]);
2139  if (lclIndex != OTLO::invalid())
2140  lclIndices.push_back(lclIndex);
2141  else
2142  break;
2143  }
2144  // curOffset, endOffset: half-exclusive range of indices in the column
2145  // Map on the calling process. If endOffset == curOffset, the range is
2146  // empty.
2147  const LO numIndInSeq = (endOffset - curOffset);
2148  if (numIndInSeq != 0) {
  // values(curOffset, numIndInSeq) is the sub-view of the input values
  // matching the run of converted indices.
2149  this->insertLocalValues(lclRow, lclIndices(), values(curOffset, numIndInSeq));
2150  }
2151  // Invariant before the increment line: Either endOffset ==
2152  // numInputEnt, or inputGblColInds[endOffset] is not in the column Map
2153  // on the calling process.
2154  if (debug) {
2155  const bool invariant = endOffset == numInputEnt ||
2156  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2157  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2158  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2159  }
  // Step over the index that ended the run; it is the filtered-out entry.
2160  curOffset = endOffset + 1;
2161  }
2162  }
2163  else if (! graph.colMap_.is_null ()) { // We have a column Map.
2164  const map_type& colMap = * (graph.colMap_);
2165  size_t curOffset = 0;
2166  while (curOffset < numInputEnt) {
2167  // Find a sequence of input indices that are in the column
2168  // Map on the calling process. Doing a sequence at a time,
2169  // instead of one at a time, amortizes some overhead.
2170  size_t endOffset = curOffset;
2171  for ( ; endOffset < numInputEnt &&
2172  colMap.getLocalElement (inputGblColInds[endOffset]) != OTLO::invalid ();
2173  ++endOffset)
2174  {}
2175  // curOffset, endOffset: half-exclusive range of indices in
2176  // the column Map on the calling process. If endOffset ==
2177  // curOffset, the range is empty.
2178  const LO numIndInSeq = (endOffset - curOffset);
2179  if (numIndInSeq != 0) {
2180  rowInfo = graph.getRowInfo(lclRow); // KDD 5/19 Need fresh RowInfo in each loop iteration
2181  this->insertGlobalValuesImpl (graph, rowInfo,
2182  inputGblColInds + curOffset,
2183  inputVals + curOffset,
2184  numIndInSeq);
2185  }
2186  // Invariant before the increment line: Either endOffset ==
2187  // numInputEnt, or inputGblColInds[endOffset] is not in the
2188  // column Map on the calling process.
2189  if (debug) {
2190  const bool invariant = endOffset == numInputEnt ||
2191  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2192  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2193  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2194  }
  // Step over the index that ended the run; it is the filtered-out entry.
2195  curOffset = endOffset + 1;
2196  }
2197  }
2198  else { // we don't have a column Map.
  // Nothing to filter against, so insert every input entry.
2199  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2200  inputVals, numInputEnt);
2201  }
2202  }
2203  }
2204 
2205  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2206  void
2207  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2208  insertGlobalValuesFilteredChecked(
2209  const GlobalOrdinal gblRow,
2210  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2211  const Teuchos::ArrayView<const Scalar>& values,
2212  const char* const prefix,
2213  const bool debug,
2214  const bool verbose)
2215  {
2217  using std::endl;
2218 
2219  try {
2220  insertGlobalValuesFiltered(gblRow, indices, values, debug);
2221  }
2222  catch(std::exception& e) {
2223  std::ostringstream os;
2224  if (verbose) {
2225  const size_t maxNumToPrint =
2227  os << *prefix << ": insertGlobalValuesFiltered threw an "
2228  "exception: " << e.what() << endl
2229  << "Global row index: " << gblRow << endl;
2230  verbosePrintArray(os, indices, "Global column indices",
2231  maxNumToPrint);
2232  os << endl;
2233  verbosePrintArray(os, values, "Values", maxNumToPrint);
2234  os << endl;
2235  }
2236  else {
2237  os << ": insertGlobalValuesFiltered threw an exception: "
2238  << e.what();
2239  }
2240  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, os.str());
2241  }
2242  }
2243 
2244  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2245  LocalOrdinal
2248  const crs_graph_type& graph,
2249  const RowInfo& rowInfo,
2250  const LocalOrdinal inds[],
2251  const impl_scalar_type newVals[],
2252  const LocalOrdinal numElts)
2253  {
2254  typedef LocalOrdinal LO;
2255  typedef GlobalOrdinal GO;
2256  const bool sorted = graph.isSorted ();
2257 
2258  size_t hint = 0; // Guess for the current index k into rowVals
2259  LO numValid = 0; // number of valid local column indices
2260 
2261  if (graph.isLocallyIndexed ()) {
2262  // Get a view of the column indices in the row. This amortizes
2263  // the cost of getting the view over all the entries of inds.
2264  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2265 
2266  for (LO j = 0; j < numElts; ++j) {
2267  const LO lclColInd = inds[j];
2268  const size_t offset =
2269  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2270  lclColInd, hint, sorted);
2271  if (offset != rowInfo.numEntries) {
2272  rowVals[offset] = newVals[j];
2273  hint = offset + 1;
2274  ++numValid;
2275  }
2276  }
2277  }
2278  else if (graph.isGloballyIndexed ()) {
2279  if (graph.colMap_.is_null ()) {
2280  return Teuchos::OrdinalTraits<LO>::invalid ();
2281  }
2282  const map_type colMap = * (graph.colMap_);
2283 
2284  // Get a view of the column indices in the row. This amortizes
2285  // the cost of getting the view over all the entries of inds.
2286  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2287 
2288  for (LO j = 0; j < numElts; ++j) {
2289  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2290  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
2291  const size_t offset =
2292  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2293  gblColInd, hint, sorted);
2294  if (offset != rowInfo.numEntries) {
2295  rowVals[offset] = newVals[j];
2296  hint = offset + 1;
2297  ++numValid;
2298  }
2299  }
2300  }
2301  }
2302  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2303  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2304  // to be neither locally nor globally indexed on a process.
2305  // This means that the graph or matrix has no entries on that
2306  // process. Epetra also works like this. It's related to lazy
2307  // allocation (on first insertion, not at graph / matrix
2308  // construction). Lazy allocation will go away because it is
2309  // not thread scalable.
2310 
2311  return numValid;
2312  }
2313 
// ArrayView overload of replaceLocalValues: checks that the two arrays have
// the same size, then forwards their raw pointers to the raw-array overload.
//
// Returns Teuchos::OrdinalTraits<LocalOrdinal>::invalid() if the sizes
// differ; otherwise returns whatever the raw-array overload returns.
//
// NOTE(review): this listing skips source line 2316 -- presumably the
// `CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::` qualifier; confirm.
2314  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2315  LocalOrdinal
2317  replaceLocalValues (const LocalOrdinal localRow,
2318  const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
2319  const Teuchos::ArrayView<const Scalar>& vals)
2320  {
2321  typedef LocalOrdinal LO;
2322 
2323  const LO numInputEnt = static_cast<LO> (lclCols.size ());
2324  if (static_cast<LO> (vals.size ()) != numInputEnt) {
2325  return Teuchos::OrdinalTraits<LO>::invalid ();
2326  }
2327  const LO* const inputInds = lclCols.getRawPtr ();
2328  const Scalar* const inputVals = vals.getRawPtr ();
  // The raw-array overload takes values before indices.
2329  return this->replaceLocalValues (localRow, numInputEnt,
2330  inputVals, inputInds);
2331  }
2332 
// Kokkos::View overload: checks that the two views have the same extent,
// reinterprets the impl_scalar_type data as Scalar, and forwards to the
// raw-array replaceLocalValues overload.
//
// Returns Teuchos::OrdinalTraits<local_ordinal_type>::invalid() if the
// extents differ; otherwise returns whatever the raw-array overload returns.
//
// NOTE(review): this listing skips source lines 2334 and 2336-2337 --
// presumably the return type's `typename CrsMatrix<...>::` prefix, the class
// qualifier, and `replaceLocalValues (` (the name the body forwards to);
// confirm against the real file.
2333  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2335  local_ordinal_type
2338  const local_ordinal_type localRow,
2339  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2340  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2341  {
2342  using LO = local_ordinal_type;
  // NOTE(review): extent(0) is narrowed to LO implicitly here, whereas the
  // global-index sibling overload uses an explicit static_cast.
2343  const LO numInputEnt = inputInds.extent(0);
2344  if (numInputEnt != static_cast<LO>(inputVals.extent(0))) {
2345  return Teuchos::OrdinalTraits<LO>::invalid();
2346  }
2347  const Scalar* const inVals =
2348  reinterpret_cast<const Scalar*>(inputVals.data());
2349  return this->replaceLocalValues(localRow, numInputEnt,
2350  inVals, inputInds.data());
2351  }
2352 
// Raw-array overload of replaceLocalValues: replaces existing entries of
// local row `localRow` by delegating to replaceLocalValuesImpl().
//
// Returns Teuchos::OrdinalTraits<LocalOrdinal>::invalid() if fill is not
// active or staticGraph_ is null; 0 if getRowInfo() reports an invalid local
// row; otherwise the value returned by replaceLocalValuesImpl().
//
// NOTE(review): this listing skips source line 2355 -- presumably the
// `CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::` qualifier; confirm.
2353  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2354  LocalOrdinal
2356  replaceLocalValues (const LocalOrdinal localRow,
2357  const LocalOrdinal numEnt,
2358  const Scalar inputVals[],
2359  const LocalOrdinal inputCols[])
2360  {
2361  typedef impl_scalar_type IST;
2362  typedef LocalOrdinal LO;
2363 
2364  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2365  // Fill must be active and the graph (staticGraph_) must exist.
2366  return Teuchos::OrdinalTraits<LO>::invalid ();
2367  }
2368  const crs_graph_type& graph = * (this->staticGraph_);
2369  const RowInfo rowInfo = graph.getRowInfo (localRow);
2370 
2371  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2372  // The calling process does not own this row, so it is not
2373  // allowed to modify its values.
2374  return static_cast<LO> (0);
2375  }
2376  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
  // The implementation works on impl_scalar_type, so reinterpret the
  // caller's Scalar array accordingly.
2377  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2378  return this->replaceLocalValuesImpl (curRowVals.data (), graph, rowInfo,
2379  inputCols, inVals, numEnt);
2380  }
2381 
// Overwrite existing entries of one row, addressed by global column index.
// Asks the graph to locate the given indices in the row; for each one the
// graph reports through the callback, newVals[k] is stored in
// rowVals[offset].  Returns whatever graph.findGlobalIndices() returns.
//
// NOTE(review): this listing skips source lines 2384-2385 -- presumably the
// `CrsMatrix<...>::` qualifier and `replaceGlobalValuesImpl (impl_scalar_type
// rowVals[],`.  The body writes `rowVals`, and replaceGlobalValues() calls
// `this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo, ...)`.
// Confirm against the real file.
2382  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2383  LocalOrdinal
2386  const crs_graph_type& graph,
2387  const RowInfo& rowInfo,
2388  const GlobalOrdinal inds[],
2389  const impl_scalar_type newVals[],
2390  const LocalOrdinal numElts)
2391  {
2392  Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
  // Callback arguments are (k, start, offset): k indexes the input arrays,
  // offset indexes the row's value storage.
2393  auto fun =
2394  [&](size_t const k, size_t const /*start*/, size_t const offset) {
2395  rowVals[offset] = newVals[k];
2396  };
2397  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2398  return graph.findGlobalIndices(rowInfo, indsT, cb);
2399  }
2400 
// ArrayView overload of replaceGlobalValues: checks that the two arrays have
// the same size, then forwards their raw pointers to the raw-array overload.
//
// Returns Teuchos::OrdinalTraits<LocalOrdinal>::invalid() if the sizes
// differ; otherwise returns whatever the raw-array overload returns.
//
// NOTE(review): this listing skips source line 2403 -- presumably the
// `CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::` qualifier; confirm.
2401  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2402  LocalOrdinal
2404  replaceGlobalValues (const GlobalOrdinal globalRow,
2405  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2406  const Teuchos::ArrayView<const Scalar>& inputVals)
2407  {
2408  typedef LocalOrdinal LO;
2409 
2410  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2411  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2412  return Teuchos::OrdinalTraits<LO>::invalid ();
2413  }
  // The raw-array overload takes values before indices.
2414  return this->replaceGlobalValues (globalRow, numInputEnt,
2415  inputVals.getRawPtr (),
2416  inputGblColInds.getRawPtr ());
2417  }
2418 
// Raw-array overload of replaceGlobalValues: replaces existing entries of
// the row with global index `globalRow` by delegating to
// replaceGlobalValuesImpl().
//
// Returns Teuchos::OrdinalTraits<LocalOrdinal>::invalid() if fill is not
// active or staticGraph_ is null; 0 if the global row does not resolve to a
// valid local row; otherwise the value returned by replaceGlobalValuesImpl().
//
// NOTE(review): this listing skips source line 2421 -- presumably the
// `CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::` qualifier; confirm.
2419  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2420  LocalOrdinal
2422  replaceGlobalValues (const GlobalOrdinal globalRow,
2423  const LocalOrdinal numEnt,
2424  const Scalar inputVals[],
2425  const GlobalOrdinal inputGblColInds[])
2426  {
2427  typedef impl_scalar_type IST;
2428  typedef LocalOrdinal LO;
2429 
2430  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2431  // Fill must be active and the graph (staticGraph_) must exist.
2432  return Teuchos::OrdinalTraits<LO>::invalid ();
2433  }
2434  const crs_graph_type& graph = * (this->staticGraph_);
2435 
2436  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow);
2437  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2438  // The input global row has no valid local row on the calling
2439  // process, so the calling process replaced 0 entries.
2440  return static_cast<LO> (0);
2441  }
2442 
2443  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
  // The implementation works on impl_scalar_type, so reinterpret the
  // caller's Scalar array accordingly.
2444  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2445  return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2446  inputGblColInds, inVals, numEnt);
2447  }
2448 
// Kokkos::View overload: checks that the two views have the same extent,
// reinterprets the impl_scalar_type data as Scalar, and forwards to the
// raw-array replaceGlobalValues overload.
//
// Returns Teuchos::OrdinalTraits<local_ordinal_type>::invalid() if the
// extents differ; otherwise returns whatever the raw-array overload returns.
//
// NOTE(review): this listing skips source lines 2450 and 2452-2453 --
// presumably the return type's `typename CrsMatrix<...>::` prefix, the class
// qualifier, and `replaceGlobalValues (` (the name the body forwards to);
// confirm against the real file.
2449  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2451  local_ordinal_type
2454  const global_ordinal_type globalRow,
2455  const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2456  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2457  {
2458  // We use static_assert here to check the template parameters,
2459  // rather than std::enable_if (e.g., on the return value, to
2460  // enable compilation only if the template parameters match the
2461  // desired attributes). This turns obscure link errors into
2462  // clear compilation errors. It also makes the return value a
2463  // lot easier to see.
  // NOTE(review): no static_assert appears in this body, so the comment
  // above looks stale -- confirm and remove if so.
2464  using LO = local_ordinal_type;
2465  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2466  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2467  return Teuchos::OrdinalTraits<LO>::invalid();
2468  }
2469  const Scalar* const inVals =
2470  reinterpret_cast<const Scalar*>(inputVals.data());
2471  return this->replaceGlobalValues(globalRow, numInputEnt, inVals,
2472  inputInds.data());
2473  }
2474 
2475  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2476  LocalOrdinal
2479  const crs_graph_type& graph,
2480  const RowInfo& rowInfo,
2481  const GlobalOrdinal inds[],
2482  const impl_scalar_type newVals[],
2483  const LocalOrdinal numElts,
2484  const bool atomic)
2485  {
2486  typedef LocalOrdinal LO;
2487  typedef GlobalOrdinal GO;
2488 
2489  const bool sorted = graph.isSorted ();
2490 
2491  size_t hint = 0; // guess at the index's relative offset in the row
2492  LO numValid = 0; // number of valid input column indices
2493 
2494  if (graph.isLocallyIndexed ()) {
2495  // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2496  // pointer does NOT change its reference count. Thus, this
2497  // code is still thread safe.
2498  if (graph.colMap_.is_null ()) {
2499  // NO input column indices are valid in this case, since if
2500  // the column Map is null on the calling process, then the
2501  // calling process owns no graph entries.
2502  return numValid;
2503  }
2504  const map_type& colMap = * (graph.colMap_);
2505 
2506  // Get a view of the column indices in the row. This amortizes
2507  // the cost of getting the view over all the entries of inds.
2508  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2509  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2510 
2511  for (LO j = 0; j < numElts; ++j) {
2512  const LO lclColInd = colMap.getLocalElement (inds[j]);
2513  if (lclColInd != LINV) {
2514  const size_t offset =
2515  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2516  lclColInd, hint, sorted);
2517  if (offset != rowInfo.numEntries) {
2518  if (atomic) {
2519  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2520  }
2521  else {
2522  rowVals[offset] += newVals[j];
2523  }
2524  hint = offset + 1;
2525  numValid++;
2526  }
2527  }
2528  }
2529  }
2530  else if (graph.isGloballyIndexed ()) {
2531  // Get a view of the column indices in the row. This amortizes
2532  // the cost of getting the view over all the entries of inds.
2533  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2534 
2535  for (LO j = 0; j < numElts; ++j) {
2536  const GO gblColInd = inds[j];
2537  const size_t offset =
2538  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2539  gblColInd, hint, sorted);
2540  if (offset != rowInfo.numEntries) {
2541  if (atomic) {
2542  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2543  }
2544  else {
2545  rowVals[offset] += newVals[j];
2546  }
2547  hint = offset + 1;
2548  numValid++;
2549  }
2550  }
2551  }
2552  // If the graph is neither locally nor globally indexed on the
2553  // calling process, that means the calling process has no graph
2554  // entries. Thus, none of the input column indices are valid.
2555 
2556  return numValid;
2557  }
2558 
2559  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2560  LocalOrdinal
2562  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2563  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2564  const Teuchos::ArrayView<const Scalar>& inputVals,
2565  const bool atomic)
2566  {
2567  typedef LocalOrdinal LO;
2568 
2569  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2570  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2571  return Teuchos::OrdinalTraits<LO>::invalid ();
2572  }
2573  return this->sumIntoGlobalValues (gblRow, numInputEnt,
2574  inputVals.getRawPtr (),
2575  inputGblColInds.getRawPtr (),
2576  atomic);
2577  }
2578 
2579  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2580  LocalOrdinal
2582  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2583  const LocalOrdinal numInputEnt,
2584  const Scalar inputVals[],
2585  const GlobalOrdinal inputGblColInds[],
2586  const bool atomic)
2587  {
2588  typedef impl_scalar_type IST;
2589  typedef LocalOrdinal LO;
2590  typedef GlobalOrdinal GO;
2591 
2592  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2593  // Fill must be active and the "nonconst" graph must exist.
2594  return Teuchos::OrdinalTraits<LO>::invalid ();
2595  }
2596  const crs_graph_type& graph = * (this->staticGraph_);
2597 
2598  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2599  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2600  // mfh 23 Mar 2017, 26 Jul 2017: This branch may not be not
2601  // thread safe in a debug build, in part because it uses
2602  // Teuchos::ArrayView, and in part because of the data structure
2603  // used to stash outgoing entries.
2604  using Teuchos::ArrayView;
2605  ArrayView<const GO> inputGblColInds_av(
2606  numInputEnt == 0 ? nullptr : inputGblColInds,
2607  numInputEnt);
2608  ArrayView<const Scalar> inputVals_av(
2609  numInputEnt == 0 ? nullptr :
2610  inputVals, numInputEnt);
2611  // gblRow is not in the row Map on the calling process, so stash
2612  // the given entries away in a separate data structure.
2613  // globalAssemble() (called during fillComplete()) will exchange
2614  // that data and sum it in using sumIntoGlobalValues().
2615  this->insertNonownedGlobalValues (gblRow, inputGblColInds_av,
2616  inputVals_av);
2617  // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
2618  // since we won't know whether the given indices were valid
2619  // until globalAssemble (called in fillComplete) is called.
2620  // That's why insertNonownedGlobalValues doesn't return
2621  // anything. Just for consistency, I'll return the number of
2622  // entries that the user gave us.
2623  return numInputEnt;
2624  }
2625  else { // input row is in the row Map on the calling process
2626  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2627  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2628  return this->sumIntoGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2629  inputGblColInds, inVals,
2630  numInputEnt, atomic);
2631  }
2632  }
2633 
2634  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2635  LocalOrdinal
2637  transformLocalValues (const LocalOrdinal lclRow,
2638  const LocalOrdinal numInputEnt,
2639  const impl_scalar_type inputVals[],
2640  const LocalOrdinal inputCols[],
2641  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2642  const bool atomic)
2643  {
2644  using Tpetra::Details::OrdinalTraits;
2645  typedef LocalOrdinal LO;
2646 
2647  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2648  // Fill must be active and the "nonconst" graph must exist.
2649  return Teuchos::OrdinalTraits<LO>::invalid ();
2650  }
2651  const crs_graph_type& graph = * (this->staticGraph_);
2652  const RowInfo rowInfo = graph.getRowInfo (lclRow);
2653 
2654  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2655  // The calling process does not own this row, so it is not
2656  // allowed to modify its values.
2657  return static_cast<LO> (0);
2658  }
2659  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2660  return this->transformLocalValues (curRowVals.data (), graph,
2661  rowInfo, inputCols, inputVals,
2662  numInputEnt, f, atomic);
2663  }
2664 
2665  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2666  LocalOrdinal
2667  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2668  transformGlobalValues (const GlobalOrdinal gblRow,
2669  const LocalOrdinal numInputEnt,
2670  const impl_scalar_type inputVals[],
2671  const GlobalOrdinal inputCols[],
2672  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2673  const bool atomic)
2674  {
2675  using Tpetra::Details::OrdinalTraits;
2676  typedef LocalOrdinal LO;
2677 
2678  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2679  // Fill must be active and the "nonconst" graph must exist.
2680  return OrdinalTraits<LO>::invalid ();
2681  }
2682  const crs_graph_type& graph = * (this->staticGraph_);
2683  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2684 
2685  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2686  // The calling process does not own this row, so it is not
2687  // allowed to modify its values.
2688  return static_cast<LO> (0);
2689  }
2690  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2691  return this->transformGlobalValues (curRowVals.data (), graph,
2692  rowInfo, inputCols, inputVals,
2693  numInputEnt, f, atomic);
2694  }
2695 
2696  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2697  LocalOrdinal
2698  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2699  transformLocalValues (impl_scalar_type rowVals[],
2700  const crs_graph_type& graph,
2701  const RowInfo& rowInfo,
2702  const LocalOrdinal inds[],
2703  const impl_scalar_type newVals[],
2704  const LocalOrdinal numElts,
2705  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2706  const bool atomic)
2707  {
2708  typedef impl_scalar_type ST;
2709  typedef LocalOrdinal LO;
2710  typedef GlobalOrdinal GO;
2711 
2712  //if (newVals.extent (0) != inds.extent (0)) {
2713  // The sizes of the input arrays must match.
2714  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2715  //}
2716  //const LO numElts = static_cast<LO> (inds.extent (0));
2717  const bool sorted = graph.isSorted ();
2718 
2719  LO numValid = 0; // number of valid input column indices
2720  size_t hint = 0; // Guess for the current index k into rowVals
2721 
2722  if (graph.isLocallyIndexed ()) {
2723  // Get a view of the column indices in the row. This amortizes
2724  // the cost of getting the view over all the entries of inds.
2725  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2726 
2727  for (LO j = 0; j < numElts; ++j) {
2728  const LO lclColInd = inds[j];
2729  const size_t offset =
2730  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2731  lclColInd, hint, sorted);
2732  if (offset != rowInfo.numEntries) {
2733  if (atomic) {
2734  // NOTE (mfh 30 Nov 2015) The commented-out code is
2735  // wrong because another thread may have changed
2736  // rowVals[offset] between those two lines of code.
2737  //
2738  //const ST newVal = f (rowVals[offset], newVals[j]);
2739  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2740 
2741  ST* const dest = &rowVals[offset];
2742  (void) atomic_binary_function_update (dest, newVals[j], f);
2743  }
2744  else {
2745  // use binary function f
2746  rowVals[offset] = f (rowVals[offset], newVals[j]);
2747  }
2748  hint = offset + 1;
2749  ++numValid;
2750  }
2751  }
2752  }
2753  else if (graph.isGloballyIndexed ()) {
2754  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2755  // pointer does NOT change its reference count. Thus, this
2756  // code is still thread safe.
2757  if (graph.colMap_.is_null ()) {
2758  // NO input column indices are valid in this case. Either
2759  // the column Map hasn't been set yet (so local indices
2760  // don't exist yet), or the calling process owns no graph
2761  // entries.
2762  return numValid;
2763  }
2764  const map_type& colMap = * (graph.colMap_);
2765  // Get a view of the column indices in the row. This amortizes
2766  // the cost of getting the view over all the entries of inds.
2767  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2768 
2769  const GO GINV = Teuchos::OrdinalTraits<GO>::invalid ();
2770  for (LO j = 0; j < numElts; ++j) {
2771  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2772  if (gblColInd != GINV) {
2773  const size_t offset =
2774  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2775  gblColInd, hint, sorted);
2776  if (offset != rowInfo.numEntries) {
2777  if (atomic) {
2778  // NOTE (mfh 30 Nov 2015) The commented-out code is
2779  // wrong because another thread may have changed
2780  // rowVals[offset] between those two lines of code.
2781  //
2782  //const ST newVal = f (rowVals[offset], newVals[j]);
2783  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2784 
2785  ST* const dest = &rowVals[offset];
2786  (void) atomic_binary_function_update (dest, newVals[j], f);
2787  }
2788  else {
2789  // use binary function f
2790  rowVals[offset] = f (rowVals[offset], newVals[j]);
2791  }
2792  hint = offset + 1;
2793  numValid++;
2794  }
2795  }
2796  }
2797  }
2798  // If the graph is neither locally nor globally indexed on the
2799  // calling process, that means the calling process has no graph
2800  // entries. Thus, none of the input column indices are valid.
2801 
2802  return numValid;
2803  }
2804 
2805  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2806  LocalOrdinal
2807  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2808  transformGlobalValues (impl_scalar_type rowVals[],
2809  const crs_graph_type& graph,
2810  const RowInfo& rowInfo,
2811  const GlobalOrdinal inds[],
2812  const impl_scalar_type newVals[],
2813  const LocalOrdinal numElts,
2814  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2815  const bool atomic)
2816  {
2817  typedef impl_scalar_type ST;
2818  typedef LocalOrdinal LO;
2819  typedef GlobalOrdinal GO;
2820 
2821  //if (newVals.extent (0) != inds.extent (0)) {
2822  // The sizes of the input arrays must match.
2823  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2824  //}
2825  //const LO numElts = static_cast<LO> (inds.extent (0));
2826  const bool sorted = graph.isSorted ();
2827 
2828  LO numValid = 0; // number of valid input column indices
2829  size_t hint = 0; // Guess for the current index k into rowVals
2830 
2831  if (graph.isGloballyIndexed ()) {
2832  // Get a view of the column indices in the row. This amortizes
2833  // the cost of getting the view over all the entries of inds.
2834  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2835 
2836  for (LO j = 0; j < numElts; ++j) {
2837  const GO gblColInd = inds[j];
2838  const size_t offset =
2839  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2840  gblColInd, hint, sorted);
2841  if (offset != rowInfo.numEntries) {
2842  if (atomic) {
2843  // NOTE (mfh 30 Nov 2015) The commented-out code is
2844  // wrong because another thread may have changed
2845  // rowVals[offset] between those two lines of code.
2846  //
2847  //const ST newVal = f (rowVals[offset], newVals[j]);
2848  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2849 
2850  ST* const dest = &rowVals[offset];
2851  (void) atomic_binary_function_update (dest, newVals[j], f);
2852  }
2853  else {
2854  // use binary function f
2855  rowVals[offset] = f (rowVals[offset], newVals[j]);
2856  }
2857  hint = offset + 1;
2858  ++numValid;
2859  }
2860  }
2861  }
2862  else if (graph.isLocallyIndexed ()) {
2863  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2864  // pointer does NOT change its reference count. Thus, this
2865  // code is still thread safe.
2866  if (graph.colMap_.is_null ()) {
2867  // NO input column indices are valid in this case. Either the
2868  // column Map hasn't been set yet (so local indices don't
2869  // exist yet), or the calling process owns no graph entries.
2870  return numValid;
2871  }
2872  const map_type& colMap = * (graph.colMap_);
2873  // Get a view of the column indices in the row. This amortizes
2874  // the cost of getting the view over all the entries of inds.
2875  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2876 
2877  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2878  for (LO j = 0; j < numElts; ++j) {
2879  const LO lclColInd = colMap.getLocalElement (inds[j]);
2880  if (lclColInd != LINV) {
2881  const size_t offset =
2882  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2883  lclColInd, hint, sorted);
2884  if (offset != rowInfo.numEntries) {
2885  if (atomic) {
2886  // NOTE (mfh 30 Nov 2015) The commented-out code is
2887  // wrong because another thread may have changed
2888  // rowVals[offset] between those two lines of code.
2889  //
2890  //const ST newVal = f (rowVals[offset], newVals[j]);
2891  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2892 
2893  ST* const dest = &rowVals[offset];
2894  (void) atomic_binary_function_update (dest, newVals[j], f);
2895  }
2896  else {
2897  // use binary function f
2898  rowVals[offset] = f (rowVals[offset], newVals[j]);
2899  }
2900  hint = offset + 1;
2901  numValid++;
2902  }
2903  }
2904  }
2905  }
2906  // If the graph is neither locally nor globally indexed on the
2907  // calling process, that means the calling process has no graph
2908  // entries. Thus, none of the input column indices are valid.
2909 
2910  return numValid;
2911  }
2912 
2913  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2914  LocalOrdinal
2917  const crs_graph_type& graph,
2918  const RowInfo& rowInfo,
2919  const LocalOrdinal inds[],
2920  const impl_scalar_type newVals[],
2921  const LocalOrdinal numElts,
2922  const bool atomic)
2923  {
2924  typedef LocalOrdinal LO;
2925  typedef GlobalOrdinal GO;
2926 
2927  const bool sorted = graph.isSorted ();
2928 
2929  size_t hint = 0; // Guess for the current index k into rowVals
2930  LO numValid = 0; // number of valid local column indices
2931 
2932  if (graph.isLocallyIndexed ()) {
2933  // Get a view of the column indices in the row. This amortizes
2934  // the cost of getting the view over all the entries of inds.
2935  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2936 
2937  for (LO j = 0; j < numElts; ++j) {
2938  const LO lclColInd = inds[j];
2939  const size_t offset =
2940  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2941  lclColInd, hint, sorted);
2942  if (offset != rowInfo.numEntries) {
2943  if (atomic) {
2944  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2945  }
2946  else {
2947  rowVals[offset] += newVals[j];
2948  }
2949  hint = offset + 1;
2950  ++numValid;
2951  }
2952  }
2953  }
2954  else if (graph.isGloballyIndexed ()) {
2955  if (graph.colMap_.is_null ()) {
2956  return Teuchos::OrdinalTraits<LO>::invalid ();
2957  }
2958  const map_type colMap = * (graph.colMap_);
2959 
2960  // Get a view of the column indices in the row. This amortizes
2961  // the cost of getting the view over all the entries of inds.
2962  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2963 
2964  for (LO j = 0; j < numElts; ++j) {
2965  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2966  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
2967  const size_t offset =
2968  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2969  gblColInd, hint, sorted);
2970  if (offset != rowInfo.numEntries) {
2971  if (atomic) {
2972  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2973  }
2974  else {
2975  rowVals[offset] += newVals[j];
2976  }
2977  hint = offset + 1;
2978  ++numValid;
2979  }
2980  }
2981  }
2982  }
2983  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2984  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2985  // to be neither locally nor globally indexed on a process.
2986  // This means that the graph or matrix has no entries on that
2987  // process. Epetra also works like this. It's related to lazy
2988  // allocation (on first insertion, not at graph / matrix
2989  // construction). Lazy allocation will go away because it is
2990  // not thread scalable.
2991 
2992  return numValid;
2993  }
2994 
2995  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2996  LocalOrdinal
2998  sumIntoLocalValues (const LocalOrdinal localRow,
2999  const Teuchos::ArrayView<const LocalOrdinal>& indices,
3000  const Teuchos::ArrayView<const Scalar>& values,
3001  const bool atomic)
3002  {
3003  using LO = local_ordinal_type;
3004  const LO numInputEnt = static_cast<LO>(indices.size());
3005  if (static_cast<LO>(values.size()) != numInputEnt) {
3006  return Teuchos::OrdinalTraits<LO>::invalid();
3007  }
3008  const LO* const inputInds = indices.getRawPtr();
3009  const scalar_type* const inputVals = values.getRawPtr();
3010  return this->sumIntoLocalValues(localRow, numInputEnt,
3011  inputVals, inputInds, atomic);
3012  }
3013 
3014  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3016  local_ordinal_type
3019  const local_ordinal_type localRow,
3020  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
3021  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
3022  const bool atomic)
3023  {
3024  using LO = local_ordinal_type;
3025  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
3026  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
3027  return Teuchos::OrdinalTraits<LO>::invalid();
3028  }
3029  const scalar_type* inVals =
3030  reinterpret_cast<const scalar_type*>(inputVals.data());
3031  return this->sumIntoLocalValues(localRow, numInputEnt, inVals,
3032  inputInds.data(), atomic);
3033  }
3034 
3035  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3036  LocalOrdinal
3038  sumIntoLocalValues (const LocalOrdinal localRow,
3039  const LocalOrdinal numEnt,
3040  const Scalar vals[],
3041  const LocalOrdinal cols[],
3042  const bool atomic)
3043  {
3044  typedef impl_scalar_type IST;
3045  typedef LocalOrdinal LO;
3046 
3047  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
3048  // Fill must be active and the "nonconst" graph must exist.
3049  return Teuchos::OrdinalTraits<LO>::invalid ();
3050  }
3051  const crs_graph_type& graph = * (this->staticGraph_);
3052  const RowInfo rowInfo = graph.getRowInfo (localRow);
3053 
3054  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
3055  // The calling process does not own this row, so it is not
3056  // allowed to modify its values.
3057  return static_cast<LO> (0);
3058  }
3059  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
3060  const IST* const inputVals = reinterpret_cast<const IST*> (vals);
3061  return this->sumIntoLocalValuesImpl (curRowVals.data (), graph, rowInfo,
3062  cols, inputVals, numEnt, atomic);
3063  }
3064 
3065  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3067  values_dualv_type::t_host::const_type
3069  getValuesViewHost (const RowInfo& rowinfo) const
3070  {
3071  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3072  return typename values_dualv_type::t_host::const_type ();
3073  else
3074  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3075  rowinfo.allocSize,
3076  Access::ReadOnly);
3077  }
3078 
3079  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3081  values_dualv_type::t_host
3084  {
3085  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3086  return typename values_dualv_type::t_host ();
3087  else
3088  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3089  rowinfo.allocSize,
3090  Access::ReadWrite);
3091  }
3092 
3093  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3095  values_dualv_type::t_dev::const_type
3097  getValuesViewDevice (const RowInfo& rowinfo) const
3098  {
3099  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3100  return typename values_dualv_type::t_dev::const_type ();
3101  else
3102  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3103  rowinfo.allocSize,
3104  Access::ReadOnly);
3105  }
3106 
3107  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3109  values_dualv_type::t_dev
3112  {
3113  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3114  return typename values_dualv_type::t_dev ();
3115  else
3116  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3117  rowinfo.allocSize,
3118  Access::ReadWrite);
3119  }
3120 
3121 
3122  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3123  void
3126  nonconst_local_inds_host_view_type &indices,
3127  nonconst_values_host_view_type &values,
3128  size_t& numEntries) const
3129  {
3130  using Teuchos::ArrayView;
3131  using Teuchos::av_reinterpret_cast;
3132  const char tfecfFuncName[] = "getLocalRowCopy: ";
3133 
3134  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3135  (! this->hasColMap (), std::runtime_error,
3136  "The matrix does not have a column Map yet. This means we don't have "
3137  "local indices for columns yet, so it doesn't make sense to call this "
3138  "method. If the matrix doesn't have a column Map yet, you should call "
3139  "fillComplete on it first.");
3140 
3141  const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3142  const size_t theNumEntries = rowinfo.numEntries;
3143  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3144  (static_cast<size_t> (indices.size ()) < theNumEntries ||
3145  static_cast<size_t> (values.size ()) < theNumEntries,
3146  std::runtime_error, "Row with local index " << localRow << " has " <<
3147  theNumEntries << " entry/ies, but indices.size() = " <<
3148  indices.size () << " and values.size() = " << values.size () << ".");
3149  numEntries = theNumEntries; // first side effect
3150 
3151  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3152  if (staticGraph_->isLocallyIndexed ()) {
3153  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3154  auto curVals = getValuesViewHost(rowinfo);
3155 
3156  for (size_t j = 0; j < theNumEntries; ++j) {
3157  values[j] = curVals[j];
3158  indices[j] = curLclInds(j);
3159  }
3160  }
3161  else if (staticGraph_->isGloballyIndexed ()) {
3162  // Don't call getColMap(), because it touches RCP's reference count.
3163  const map_type& colMap = * (staticGraph_->colMap_);
3164  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3165  auto curVals = getValuesViewHost(rowinfo);
3166 
3167  for (size_t j = 0; j < theNumEntries; ++j) {
3168  values[j] = curVals[j];
3169  indices[j] = colMap.getLocalElement (curGblInds(j));
3170  }
3171  }
3172  }
3173  }
3174 
3175 
3176 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3177 void
3180  nonconst_global_inds_host_view_type &indices,
3181  nonconst_values_host_view_type &values,
3182  size_t& numEntries) const
3183  {
3184  using Teuchos::ArrayView;
3185  using Teuchos::av_reinterpret_cast;
3186  const char tfecfFuncName[] = "getGlobalRowCopy: ";
3187 
3188  const RowInfo rowinfo =
3189  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3190  const size_t theNumEntries = rowinfo.numEntries;
3191  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3192  static_cast<size_t> (indices.size ()) < theNumEntries ||
3193  static_cast<size_t> (values.size ()) < theNumEntries,
3194  std::runtime_error, "Row with global index " << globalRow << " has "
3195  << theNumEntries << " entry/ies, but indices.size() = " <<
3196  indices.size () << " and values.size() = " << values.size () << ".");
3197  numEntries = theNumEntries; // first side effect
3198 
3199  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3200  if (staticGraph_->isLocallyIndexed ()) {
3201  const map_type& colMap = * (staticGraph_->colMap_);
3202  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3203  auto curVals = getValuesViewHost(rowinfo);
3204 
3205  for (size_t j = 0; j < theNumEntries; ++j) {
3206  values[j] = curVals[j];
3207  indices[j] = colMap.getGlobalElement (curLclInds(j));
3208  }
3209  }
3210  else if (staticGraph_->isGloballyIndexed ()) {
3211  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3212  auto curVals = getValuesViewHost(rowinfo);
3213 
3214  for (size_t j = 0; j < theNumEntries; ++j) {
3215  values[j] = curVals[j];
3216  indices[j] = curGblInds(j);
3217  }
3218  }
3219  }
3220  }
3221 
3222 
3223  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3224  void
3226  getLocalRowView(LocalOrdinal localRow,
3227  local_inds_host_view_type &indices,
3228  values_host_view_type &values) const
3229  {
3230  const char tfecfFuncName[] = "getLocalRowView: ";
3231 
3232  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3233  isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
3234  "its indices as global indices, so you cannot get a view with local "
3235  "column indices. If the matrix has a column Map, you may call "
3236  "getLocalRowCopy() to get local column indices; otherwise, you may get "
3237  "a view with global column indices by calling getGlobalRowCopy().");
3238 
3239  const RowInfo rowInfo = staticGraph_->getRowInfo (localRow);
3240  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3241  rowInfo.numEntries > 0) {
3242  indices = staticGraph_->lclIndsUnpacked_wdv.getHostSubview(
3243  rowInfo.offset1D,
3244  rowInfo.numEntries,
3245  Access::ReadOnly);
3246  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3247  rowInfo.numEntries,
3248  Access::ReadOnly);
3249  }
3250  else {
3251  // This does the right thing (reports an empty row) if the input
3252  // row is invalid.
3253  indices = local_inds_host_view_type();
3254  values = values_host_view_type();
3255  }
3256 
3257 #ifdef HAVE_TPETRA_DEBUG
3258  const char suffix[] = ". This should never happen. Please report this "
3259  "bug to the Tpetra developers.";
3260  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3261  (static_cast<size_t> (indices.size ()) !=
3262  static_cast<size_t> (values.size ()), std::logic_error,
3263  "At the end of this method, for local row " << localRow << ", "
3264  "indices.size() = " << indices.size () << " != values.size () = "
3265  << values.size () << suffix);
3266  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3267  (static_cast<size_t> (indices.size ()) !=
3268  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3269  "At the end of this method, for local row " << localRow << ", "
3270  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3271  << rowInfo.numEntries << suffix);
3272  const size_t expectedNumEntries = getNumEntriesInLocalRow (localRow);
3273  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3274  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3275  "of this method, for local row " << localRow << ", rowInfo.numEntries = "
3276  << rowInfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " <<
3277  expectedNumEntries << suffix);
3278 #endif // HAVE_TPETRA_DEBUG
3279  }
3280 
3281 
3282  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3283  void
3285  getGlobalRowView (GlobalOrdinal globalRow,
3286  global_inds_host_view_type &indices,
3287  values_host_view_type &values) const
3288  {
3289  const char tfecfFuncName[] = "getGlobalRowView: ";
3290 
3291  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3292  isLocallyIndexed (), std::runtime_error,
3293  "The matrix is locally indexed, so we cannot return a view of the row "
3294  "with global column indices. Use getGlobalRowCopy() instead.");
3295 
3296  // This does the right thing (reports an empty row) if the input
3297  // row is invalid.
3298  const RowInfo rowInfo =
3299  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3300  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3301  rowInfo.numEntries > 0) {
3302  indices = staticGraph_->gblInds_wdv.getHostSubview(rowInfo.offset1D,
3303  rowInfo.numEntries,
3304  Access::ReadOnly);
3305  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3306  rowInfo.numEntries,
3307  Access::ReadOnly);
3308  }
3309  else {
3310  indices = global_inds_host_view_type();
3311  values = values_host_view_type();
3312  }
3313 
3314 #ifdef HAVE_TPETRA_DEBUG
3315  const char suffix[] = ". This should never happen. Please report this "
3316  "bug to the Tpetra developers.";
3317  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3318  (static_cast<size_t> (indices.size ()) !=
3319  static_cast<size_t> (values.size ()), std::logic_error,
3320  "At the end of this method, for global row " << globalRow << ", "
3321  "indices.size() = " << indices.size () << " != values.size () = "
3322  << values.size () << suffix);
3323  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3324  (static_cast<size_t> (indices.size ()) !=
3325  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3326  "At the end of this method, for global row " << globalRow << ", "
3327  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3328  << rowInfo.numEntries << suffix);
3329  const size_t expectedNumEntries = getNumEntriesInGlobalRow (globalRow);
3330  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3331  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3332  "of this method, for global row " << globalRow << ", rowInfo.numEntries "
3333  "= " << rowInfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
3334  " " << expectedNumEntries << suffix);
3335 #endif // HAVE_TPETRA_DEBUG
3336  }
3337 
3338 
3339  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3340  void
3342  scale (const Scalar& alpha)
3343  {
3344  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3345 
3346  const size_t nlrs = staticGraph_->getLocalNumRows ();
3347  const size_t numEntries = staticGraph_->getLocalNumEntries ();
3348  if (! staticGraph_->indicesAreAllocated () ||
3349  nlrs == 0 || numEntries == 0) {
3350  // do nothing
3351  }
3352  else {
3353 
3354  auto vals = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
3355  KokkosBlas::scal(vals, theAlpha, vals);
3356 
3357  }
3358  }
3359 
3360  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3361  void
3363  setAllToScalar (const Scalar& alpha)
3364  {
3365  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3366 
3367  // replace all values in the matrix
3368  // it is easiest to replace all allocated values, instead of replacing only the ones with valid entries
3369  // however, if there are no valid entries, we can short-circuit
3370  // furthermore, if the values aren't allocated, we can short-circuit (no entry have been inserted so far)
3371  const size_t numEntries = staticGraph_->getLocalNumEntries();
3372  if (! staticGraph_->indicesAreAllocated () || numEntries == 0) {
3373  // do nothing
3374  }
3375  else {
3376  // DEEP_COPY REVIEW - VALUE-TO-DEVICE
3377  Kokkos::deep_copy (execution_space(), valuesUnpacked_wdv.getDeviceView(Access::OverwriteAll),
3378  theAlpha);
3379  // CAG: This fence was found to be required on Cuda with UVM=on.
3380  Kokkos::fence("CrsMatrix::setAllToScalar");
3381  }
3382  }
3383 
3384  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3385  void
3387  setAllValues (const typename local_graph_device_type::row_map_type& rowPointers,
3388  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
3389  const typename local_matrix_device_type::values_type& values)
3390  {
3391  using ProfilingRegion=Details::ProfilingRegion;
3392  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues");
3393  const char tfecfFuncName[] = "setAllValues: ";
3394  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3395  (columnIndices.size () != values.size (), std::invalid_argument,
3396  "columnIndices.size() = " << columnIndices.size () << " != values.size()"
3397  " = " << values.size () << ".");
3398  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3399  (myGraph_.is_null (), std::runtime_error, "myGraph_ must not be null.");
3400 
3401  try {
3402  myGraph_->setAllIndices (rowPointers, columnIndices);
3403  }
3404  catch (std::exception &e) {
3405  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3406  (true, std::runtime_error, "myGraph_->setAllIndices() threw an "
3407  "exception: " << e.what ());
3408  }
3409 
3410  // Make sure that myGraph_ now has a local graph. It may not be
3411  // fillComplete yet, so it's important to check. We don't care
3412  // whether setAllIndices() did a shallow copy or a deep copy, so a
3413  // good way to check is to compare dimensions.
3414  auto lclGraph = myGraph_->getLocalGraphDevice ();
3415  const size_t numEnt = lclGraph.entries.extent (0);
3416  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3417  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
3418  numEnt != static_cast<size_t> (columnIndices.extent (0)),
3419  std::logic_error, "myGraph_->setAllIndices() did not correctly create "
3420  "local graph. Please report this bug to the Tpetra developers.");
3421 
3422  valuesPacked_wdv = values_wdv_type(values);
3423  valuesUnpacked_wdv = valuesPacked_wdv;
3424 
3425  // Storage MUST be packed, since the interface doesn't give any
3426  // way to indicate any extra space at the end of each row.
3427  this->storageStatus_ = Details::STORAGE_1D_PACKED;
3428 
3429  checkInternalState ();
3430  }
3431 
3432  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3433  void
3435  setAllValues ( const local_matrix_device_type& localDeviceMatrix)
3436  {
3437  using ProfilingRegion=Details::ProfilingRegion;
3438  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix");
3439 
3440  auto graph = localDeviceMatrix.graph;
3441  //FIXME how to check whether graph is allocated
3442 
3443  auto rows = graph.row_map;
3444  auto columns = graph.entries;
3445  auto values = localDeviceMatrix.values;
3446 
3447  setAllValues(rows,columns,values);
3448  }
3449 
3450  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3451  void
3453  setAllValues (const Teuchos::ArrayRCP<size_t>& ptr,
3454  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
3455  const Teuchos::ArrayRCP<Scalar>& val)
3456  {
3457  using Kokkos::Compat::getKokkosViewDeepCopy;
3458  using Teuchos::ArrayRCP;
3459  using Teuchos::av_reinterpret_cast;
3460  typedef device_type DT;
3461  typedef impl_scalar_type IST;
3462  typedef typename local_graph_device_type::row_map_type row_map_type;
3463  //typedef typename row_map_type::non_const_value_type row_offset_type;
3464  const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
3465 
3466  // The row offset type may depend on the execution space. It may
3467  // not necessarily be size_t. If it's not, we need to make a deep
3468  // copy. We need to make a deep copy anyway so that Kokkos can
3469  // own the memory. Regardless, ptrIn gets the copy.
3470  typename row_map_type::non_const_type ptrNative ("ptr", ptr.size ());
3471  Kokkos::View<const size_t*,
3472  typename row_map_type::array_layout,
3473  Kokkos::HostSpace,
3474  Kokkos::MemoryUnmanaged> ptrSizeT (ptr.getRawPtr (), ptr.size ());
3475  ::Tpetra::Details::copyOffsets (ptrNative, ptrSizeT);
3476 
3477  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3478  (ptrNative.extent (0) != ptrSizeT.extent (0),
3479  std::logic_error, "ptrNative.extent(0) = " <<
3480  ptrNative.extent (0) << " != ptrSizeT.extent(0) = "
3481  << ptrSizeT.extent (0) << ". Please report this bug to the "
3482  "Tpetra developers.");
3483 
3484  auto indIn = getKokkosViewDeepCopy<DT> (ind ());
3485  auto valIn = getKokkosViewDeepCopy<DT> (av_reinterpret_cast<IST> (val ()));
3486  this->setAllValues (ptrNative, indIn, valIn);
3487  }
3488 
3489  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3490  void
3492  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
3493  {
3494  const char tfecfFuncName[] = "getLocalDiagOffsets: ";
3495  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3496  (staticGraph_.is_null (), std::runtime_error, "The matrix has no graph.");
3497 
3498  // mfh 11 May 2016: We plan to deprecate the ArrayRCP version of
3499  // this method in CrsGraph too, so don't call it (otherwise build
3500  // warnings will show up and annoy users). Instead, copy results
3501  // in and out, if the memory space requires it.
3502 
3503  const size_t lclNumRows = staticGraph_->getLocalNumRows ();
3504  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
3505  offsets.resize (lclNumRows);
3506  }
3507 
3508  // The input ArrayRCP must always be a host pointer. Thus, if
3509  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
3510  // to write to that allocation directly as a Kokkos::View.
3511  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
3512  // It is always syntactically correct to assign a raw host
3513  // pointer to a device View, so this code will compile correctly
3514  // even if this branch never runs.
3515  typedef Kokkos::View<size_t*, device_type,
3516  Kokkos::MemoryUnmanaged> output_type;
3517  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3518  staticGraph_->getLocalDiagOffsets (offsetsOut);
3519  }
3520  else {
3521  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
3522  staticGraph_->getLocalDiagOffsets (offsetsTmp);
3523  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
3524  Kokkos::MemoryUnmanaged> output_type;
3525  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3526  // DEEP_COPY REVIEW - DEVICE-TO-HOST
3527  Kokkos::deep_copy (execution_space(), offsetsOut, offsetsTmp);
3528  }
3529  }
3530 
// One-argument getLocalDiagCopy (per tfecfFuncName below): writes the
// matrix's diagonal into the Vector 'diag' without precomputed offsets.
//
// Throws std::runtime_error if the matrix has no graph or no column
// Map.  Returns without doing anything on processes whose row Map, or
// whose row Map's communicator, is null.
//
// NOTE(review): the qualified-name / signature lines of this definition
// are absent from this listing (gap in the embedded line numbers after
// 3532), as is the line just before the getDiagCopyWithoutOffsets call.
3531  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3532  void
3535  {
3536  using Teuchos::ArrayRCP;
3537  using Teuchos::ArrayView;
3538  using Teuchos::av_reinterpret_cast;
3539  const char tfecfFuncName[] = "getLocalDiagCopy (1-arg): ";
3540  typedef local_ordinal_type LO;
3541 
3542 
3543  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3544  staticGraph_.is_null (), std::runtime_error,
3545  "This method requires that the matrix have a graph.");
3546  auto rowMapPtr = this->getRowMap ();
3547  if (rowMapPtr.is_null () || rowMapPtr->getComm ().is_null ()) {
3548  // Processes on which the row Map or its communicator is null
3549  // don't participate. Users shouldn't even call this method on
3550  // those processes.
3551  return;
3552  }
3553  auto colMapPtr = this->getColMap ();
3554  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3555  (! this->hasColMap () || colMapPtr.is_null (), std::runtime_error,
3556  "This method requires that the matrix have a column Map.");
3557  const map_type& rowMap = * rowMapPtr;
3558  const map_type& colMap = * colMapPtr;
3559  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3560 
3561 #ifdef HAVE_TPETRA_DEBUG
3562  // isCompatible() requires an all-reduce, and thus this check
3563  // should only be done in debug mode.
3564  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3565  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3566  "The input Vector's Map must be compatible with the CrsMatrix's row "
3567  "Map. You may check this by using Map's isCompatible method: "
3568  "diag.getMap ()->isCompatible (A.getRowMap ());");
3569 #endif // HAVE_TPETRA_DEBUG
3570 
      // The device view is requested with OverwriteAll access.
3571  const auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
3572  // 1-D subview of the first (and only) column of D_lcl.
3573  const auto D_lcl_1d =
3574  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3575 
      // Local Maps are passed to the kernel together with the local
      // device matrix; the kernel's return value is deliberately ignored.
3576  const auto lclRowMap = rowMap.getLocalMap ();
3577  const auto lclColMap = colMap.getLocalMap ();
3579  (void) getDiagCopyWithoutOffsets (D_lcl_1d, lclRowMap,
3580  lclColMap,
3581  getLocalMatrixDevice ());
3582  }
3583 
// getLocalDiagCopy overload taking precomputed diagonal offsets as an
// unmanaged device View: writes the matrix's diagonal into 'diag' by
// calling KokkosSparse::getDiagCopy on the local device matrix.
//
// NOTE(review): the first signature lines (qualified name and the
// 'diag' parameter) are absent from this listing (gap in the embedded
// line numbers after 3585).
3584  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3585  void
3588  const Kokkos::View<const size_t*, device_type,
3589  Kokkos::MemoryUnmanaged>& offsets) const
3590  {
3591  typedef LocalOrdinal LO;
3592 
3593 #ifdef HAVE_TPETRA_DEBUG
3594  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3595  const map_type& rowMap = * (this->getRowMap ());
3596  // isCompatible() requires an all-reduce, and thus this check
3597  // should only be done in debug mode.
3598  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3599  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3600  "The input Vector's Map must be compatible with (in the sense of Map::"
3601  "isCompatible) the CrsMatrix's row Map.");
3602 #endif // HAVE_TPETRA_DEBUG
3603 
      // NOTE(review): the historical comment below talks about a host
      // kernel and UVM, but the code that follows takes a device view
      // of 'diag' and calls KokkosSparse::getDiagCopy with the local
      // device matrix.  The comment looks stale; confirm and remove.
3604  // For now, we fill the Vector on the host and sync to device.
3605  // Later, we may write a parallel kernel that works entirely on
3606  // device.
3607  //
3608  // NOTE (mfh 21 Jan 2016): The host kernel here assumes UVM. Once
3609  // we write a device kernel, it will not need to assume UVM.
3610 
3611  auto D_lcl = diag.getLocalViewDevice (Access::OverwriteAll);
3612  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3613  // Get 1-D subview of the first (and only) column of D_lcl.
3614  auto D_lcl_1d =
3615  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3616 
3617  KokkosSparse::getDiagCopy (D_lcl_1d, offsets,
3618  getLocalMatrixDevice ());
3619  }
3620 
// getLocalDiagCopy overload taking precomputed diagonal offsets as a
// host ArrayView: fills the host view of 'diag' row by row.  Row r gets
// zero if offsets[r] is the invalid size_t value; otherwise it gets the
// packed value at (packed row pointer of r) + offsets[r].
//
// NOTE(review): the first signature lines (qualified name and the
// 'diag' parameter) are absent from this listing (gap in the embedded
// line numbers after 3622).
3621  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3622  void
3625  const Teuchos::ArrayView<const size_t>& offsets) const
3626  {
3627  using LO = LocalOrdinal;
3628  using host_execution_space = Kokkos::DefaultHostExecutionSpace;
3629  using IST = impl_scalar_type;
3630 
3631 #ifdef HAVE_TPETRA_DEBUG
3632  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3633  const map_type& rowMap = * (this->getRowMap ());
3634  // isCompatible() requires an all-reduce, and thus this check
3635  // should only be done in debug mode.
3636  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3637  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3638  "The input Vector's Map must be compatible with (in the sense of Map::"
3639  "isCompatible) the CrsMatrix's row Map.");
3640 #endif // HAVE_TPETRA_DEBUG
3641 
3642  // See #1510. In case diag has already been marked modified on
3643  // device, we need to clear that flag, since the code below works
3644  // on host.
3645  //diag.clear_sync_state ();
3646 
3647  // For now, we fill the Vector on the host and sync to device.
3648  // Later, we may write a parallel kernel that works entirely on
3649  // device.
3650  auto lclVecHost = diag.getLocalViewHost(Access::OverwriteAll);
3651  // 1-D subview of the first (and only) column of lclVecHost.
3652  auto lclVecHost1d = Kokkos::subview (lclVecHost, Kokkos::ALL (), 0);
3653 
      // Wrap the caller's offsets array in an unmanaged host View.
3654  using host_offsets_view_type =
3655  Kokkos::View<const size_t*, Kokkos::HostSpace,
3656  Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
3657  host_offsets_view_type h_offsets (offsets.getRawPtr (), offsets.size ());
3658  // Find the diagonal entries and put them in lclVecHost1d.
3659  using range_type = Kokkos::RangePolicy<host_execution_space, LO>;
3660  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3661  const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid ();
3662 
      // Read-only host access to the packed row pointers and values.
3663  auto rowPtrsPackedHost = staticGraph_->getRowPtrsPackedHost();
3664  auto valuesPackedHost = valuesPacked_wdv.getHostView(Access::ReadOnly);
3665  Kokkos::parallel_for
3666  ("Tpetra::CrsMatrix::getLocalDiagCopy",
3667  range_type (0, myNumRows),
3668  [&, INV, h_offsets] (const LO lclRow) { // Value capture is a workaround for cuda + gcc-7.2 compiler bug w/c++14
3669  lclVecHost1d(lclRow) = STS::zero (); // default value if no diag entry
3670  if (h_offsets[lclRow] != INV) {
3671  auto curRowOffset = rowPtrsPackedHost (lclRow);
3672  lclVecHost1d(lclRow) =
3673  static_cast<IST> (valuesPackedHost(curRowOffset+h_offsets[lclRow]));
3674  }
3675  });
3676  //diag.sync_device ();
3677  }
3678 
3679 
3680  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3681  void
3684  {
3685  using ::Tpetra::Details::ProfilingRegion;
3686  using Teuchos::ArrayRCP;
3687  using Teuchos::ArrayView;
3688  using Teuchos::null;
3689  using Teuchos::RCP;
3690  using Teuchos::rcp;
3691  using Teuchos::rcpFromRef;
3693  const char tfecfFuncName[] = "leftScale: ";
3694 
3695  ProfilingRegion region ("Tpetra::CrsMatrix::leftScale");
3696 
3697  RCP<const vec_type> xp;
3698  if (this->getRangeMap ()->isSameAs (* (x.getMap ()))) {
3699  // Take from Epetra: If we have a non-trivial exporter, we must
3700  // import elements that are permuted or are on other processors.
3701  auto exporter = this->getCrsGraphRef ().getExporter ();
3702  if (exporter.get () != nullptr) {
3703  RCP<vec_type> tempVec (new vec_type (this->getRowMap ()));
3704  tempVec->doImport (x, *exporter, REPLACE); // reverse mode
3705  xp = tempVec;
3706  }
3707  else {
3708  xp = rcpFromRef (x);
3709  }
3710  }
3711  else if (this->getRowMap ()->isSameAs (* (x.getMap ()))) {
3712  xp = rcpFromRef (x);
3713  }
3714  else {
3715  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3716  (true, std::invalid_argument, "x's Map must be the same as "
3717  "either the row Map or the range Map of the CrsMatrix.");
3718  }
3719 
3720  if (this->isFillComplete()) {
3721  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3722  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3724  leftScaleLocalCrsMatrix (getLocalMatrixDevice (),
3725  x_lcl_1d, false, false);
3726  }
3727  else {
3728  // 6/2020 Disallow leftScale of non-fillComplete matrices #7446
3729  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3730  (true, std::runtime_error, "CrsMatrix::leftScale requires matrix to be"
3731  " fillComplete");
3732  }
3733  }
3734 
// rightScale (per tfecfFuncName below): scales the local matrix using
// the entries of the Vector 'x', via rightScaleLocalCrsMatrix.
//
// x's Map must be the same as either the domain Map or the column Map
// of the matrix; otherwise std::runtime_error is thrown.  The matrix
// must be fillComplete; otherwise std::runtime_error is thrown.
//
// NOTE(review): the signature lines, the vec_type alias, and the line
// just before the rightScaleLocalCrsMatrix call are absent from this
// listing (gaps in the embedded line numbers).
3735  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3736  void
3739  {
3740  using ::Tpetra::Details::ProfilingRegion;
3741  using Teuchos::ArrayRCP;
3742  using Teuchos::ArrayView;
3743  using Teuchos::null;
3744  using Teuchos::RCP;
3745  using Teuchos::rcp;
3746  using Teuchos::rcpFromRef;
3748  const char tfecfFuncName[] = "rightScale: ";
3749 
3750  ProfilingRegion region ("Tpetra::CrsMatrix::rightScale");
3751 
      // xp ends up pointing either at x itself (non-owning) or at a
      // temporary Vector distributed by the column Map.
3752  RCP<const vec_type> xp;
3753  if (this->getDomainMap ()->isSameAs (* (x.getMap ()))) {
3754  // Take from Epetra: If we have a non-trivial importer, we must
3755  // import elements that are permuted or are on other processors.
3756  auto importer = this->getCrsGraphRef ().getImporter ();
3757  if (importer.get () != nullptr) {
3758  RCP<vec_type> tempVec (new vec_type (this->getColMap ()));
3759  tempVec->doImport (x, *importer, REPLACE);
3760  xp = tempVec;
3761  }
3762  else {
3763  xp = rcpFromRef (x);
3764  }
3765  }
3766  else if (this->getColMap ()->isSameAs (* (x.getMap ()))) {
3767  xp = rcpFromRef (x);
3768  } else {
3769  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3770  (true, std::runtime_error, "x's Map must be the same as "
3771  "either the domain Map or the column Map of the CrsMatrix.");
3772  }
3773 
3774  if (this->isFillComplete()) {
      // Only the first column of xp's local device view is used.
3775  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3776  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3778  rightScaleLocalCrsMatrix (getLocalMatrixDevice (),
3779  x_lcl_1d, false, false);
3780  }
3781  else {
3782  // 6/2020 Disallow rightScale of non-fillComplete matrices #7446
3783  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3784  (true, std::runtime_error, "CrsMatrix::rightScale requires matrix to be"
3785  " fillComplete");
3786  }
3787  }
3788 
3789  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3792  auto equilInfo = computeRowOneNorms(*this);
3793  mag_type myMax;
3794  using range_type = Kokkos::RangePolicy<execution_space, local_ordinal_type>;
3795  Kokkos::parallel_reduce(
3796  "getNormInf", range_type(0, equilInfo.rowNorms.extent(0)),
3797  KOKKOS_LAMBDA(local_ordinal_type i, mag_type & max) {
3798  max = equilInfo.rowNorms(i);
3799  },
3800  Kokkos::Max<mag_type>(myMax));
3801  mag_type totalMax = STM::zero();
3802  Teuchos::reduceAll<int, mag_type>(*(getComm()), Teuchos::REDUCE_MAX, myMax,
3803  Teuchos::outArg(totalMax));
3804  return totalMax;
3805  }
3806 
3807  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3810  getNorm1 (const bool assumeSymmetric) const {
3811  if (assumeSymmetric)
3812  return getNormInf();
3813  auto equilInfo = computeRowAndColumnOneNorms(*this, false);
3814  mag_type myMax;
3815  using range_type = Kokkos::RangePolicy<execution_space, local_ordinal_type>;
3816  Kokkos::parallel_reduce(
3817  "getNorm1", range_type(0, equilInfo.colNorms.extent(0)),
3818  KOKKOS_LAMBDA(local_ordinal_type i, mag_type & max) {
3819  max = equilInfo.colNorms(i);
3820  },
3821  Kokkos::Max<mag_type>(myMax));
3822  mag_type totalMax = STM::zero();
3823  Teuchos::reduceAll<int, mag_type>(*(getComm()), Teuchos::REDUCE_MAX, myMax,
3824  Teuchos::outArg(totalMax));
3825  return totalMax;
3826  }
3827 
// Returns the square root of the global sum (all-reduce, REDUCE_SUM) of
// the squared magnitudes of this process' stored entries, i.e. a
// Frobenius-type norm.
//
// NOTE(review): the return-type / qualified-name lines of this
// definition are absent from this listing (gap in the embedded line
// numbers after 3828); presumably this is getFrobeniusNorm() -- confirm.
3828  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3832  {
3833  using Teuchos::ArrayView;
3834  using Teuchos::outArg;
3835  using Teuchos::REDUCE_SUM;
3836  using Teuchos::reduceAll;
3837 
3838  // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
3839  // local part of this computation. It could make sense to put
3840  // this operation in the Kokkos::CrsMatrix.
3841 
3842  // Accumulate this process' sum of squared magnitudes (no cache is consulted).
3843  mag_type mySum = STM::zero ();
3844  if (getLocalNumEntries() > 0) {
3845  if (isStorageOptimized ()) {
3846  // "Optimized" storage is packed storage. That means we can
3847  // iterate in one pass through the 1-D values array.
3848  const size_t numEntries = getLocalNumEntries ();
3849  auto values = valuesPacked_wdv.getHostView(Access::ReadOnly);
3850  for (size_t k = 0; k < numEntries; ++k) {
3851  auto val = values[k];
3852  // Note (etp 06 Jan 2015) We need abs() here for composite types
3853  // (in general, if mag_type is on the left-hand-side, we need
3854  // abs() on the right-hand-side)
3855  const mag_type val_abs = STS::abs (val);
3856  mySum += val_abs * val_abs;
3857  }
3858  }
3859  else {
      // Storage is not packed: walk each row and visit only its
      // rowInfo.numEntries entries.
      // NOTE(review): this branch dereferences myGraph_, while other
      // methods in this file test myGraph_.is_null() for the
      // const-graph case; confirm it cannot be null here.
3860  const LocalOrdinal numRows =
3861  static_cast<LocalOrdinal> (this->getLocalNumRows ());
3862  for (LocalOrdinal r = 0; r < numRows; ++r) {
3863  const RowInfo rowInfo = myGraph_->getRowInfo (r);
3864  const size_t numEntries = rowInfo.numEntries;
3865  auto A_r = this->getValuesViewHost(rowInfo);
3866  for (size_t k = 0; k < numEntries; ++k) {
3867  const impl_scalar_type val = A_r[k];
3868  const mag_type val_abs = STS::abs (val);
3869  mySum += val_abs * val_abs;
3870  }
3871  }
3872  }
3873  }
      // Sum the local contributions over all processes, then take the
      // square root.
3874  mag_type totalSum = STM::zero ();
3875  reduceAll<int, mag_type> (* (getComm ()), REDUCE_SUM,
3876  mySum, outArg (totalSum));
3877  return STM::sqrt (totalSum);
3878  }
3879 
3880  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3881  void
3883  replaceColMap (const Teuchos::RCP<const map_type>& newColMap)
3884  {
3885  const char tfecfFuncName[] = "replaceColMap: ";
3886  // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
3887  // Then replacing the column Map might mean that we need to
3888  // reindex the column indices.
3889  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3890  myGraph_.is_null (), std::runtime_error,
3891  "This method does not work if the matrix has a const graph. The whole "
3892  "idea of a const graph is that you are not allowed to change it, but "
3893  "this method necessarily must modify the graph, since the graph owns "
3894  "the matrix's column Map.");
3895  myGraph_->replaceColMap (newColMap);
3896  }
3897 
// reindexColumns (per tfecfFuncName below): reindexes the columns of
// either the given graph or, if 'graph' is null, the matrix's own
// graph, then optionally sorts each row's indices and values together.
//
// Throws std::invalid_argument if 'graph' is null and the matrix does
// not own its graph.
//
// NOTE(review): the qualified-name line and the first parameter line
// (declaring 'graph') are absent from this listing (gap in the embedded
// line numbers after 3899).
3898  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3899  void
3902  const Teuchos::RCP<const map_type>& newColMap,
3903  const Teuchos::RCP<const import_type>& newImport,
3904  const bool sortEachRow)
3905  {
3906  const char tfecfFuncName[] = "reindexColumns: ";
3907  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3908  graph == nullptr && myGraph_.is_null (), std::invalid_argument,
3909  "The input graph is null, but the matrix does not own its graph.");
3910 
3911  crs_graph_type& theGraph = (graph == nullptr) ? *myGraph_ : *graph;
3912  const bool sortGraph = false; // we'll sort graph & matrix together below
3913 
3914  theGraph.reindexColumns (newColMap, newImport, sortGraph);
3915 
      // Sort only when requested, and only if the graph is locally
      // indexed and not already sorted.
3916  if (sortEachRow && theGraph.isLocallyIndexed () && ! theGraph.isSorted ()) {
3917  const LocalOrdinal lclNumRows =
3918  static_cast<LocalOrdinal> (theGraph.getLocalNumRows ());
3919 
3920  for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
3921 
3922  const RowInfo rowInfo = theGraph.getRowInfo (row);
3923  auto lclColInds = theGraph.getLocalIndsViewHostNonConst (rowInfo);
3924  auto vals = this->getValuesViewHostNonConst (rowInfo);
3925 
      // Sort the first rowInfo.numEntries column indices of this row,
      // passing the row's values alongside as the paired array.
3926  sort2 (lclColInds.data (),
3927  lclColInds.data () + rowInfo.numEntries,
3928  vals.data ());
3929  }
      // Record on the graph that its indices are now sorted.
3930  theGraph.indicesAreSorted_ = true;
3931  }
3932  }
3933 
3934  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3935  void
3937  replaceDomainMap (const Teuchos::RCP<const map_type>& newDomainMap)
3938  {
3939  const char tfecfFuncName[] = "replaceDomainMap: ";
3940  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3941  myGraph_.is_null (), std::runtime_error,
3942  "This method does not work if the matrix has a const graph. The whole "
3943  "idea of a const graph is that you are not allowed to change it, but this"
3944  " method necessarily must modify the graph, since the graph owns the "
3945  "matrix's domain Map and Import objects.");
3946  myGraph_->replaceDomainMap (newDomainMap);
3947  }
3948 
3949  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3950  void
3952  replaceDomainMapAndImporter (const Teuchos::RCP<const map_type>& newDomainMap,
3953  Teuchos::RCP<const import_type>& newImporter)
3954  {
3955  const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
3956  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3957  myGraph_.is_null (), std::runtime_error,
3958  "This method does not work if the matrix has a const graph. The whole "
3959  "idea of a const graph is that you are not allowed to change it, but this"
3960  " method necessarily must modify the graph, since the graph owns the "
3961  "matrix's domain Map and Import objects.");
3962  myGraph_->replaceDomainMapAndImporter (newDomainMap, newImporter);
3963  }
3964 
3965  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3966  void
3968  replaceRangeMap (const Teuchos::RCP<const map_type>& newRangeMap)
3969  {
3970  const char tfecfFuncName[] = "replaceRangeMap: ";
3971  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3972  myGraph_.is_null (), std::runtime_error,
3973  "This method does not work if the matrix has a const graph. The whole "
3974  "idea of a const graph is that you are not allowed to change it, but this"
3975  " method necessarily must modify the graph, since the graph owns the "
3976  "matrix's domain Map and Import objects.");
3977  myGraph_->replaceRangeMap (newRangeMap);
3978  }
3979 
3980  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3981  void
3983  replaceRangeMapAndExporter (const Teuchos::RCP<const map_type>& newRangeMap,
3984  Teuchos::RCP<const export_type>& newExporter)
3985  {
3986  const char tfecfFuncName[] = "replaceRangeMapAndExporter: ";
3987  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3988  myGraph_.is_null (), std::runtime_error,
3989  "This method does not work if the matrix has a const graph. The whole "
3990  "idea of a const graph is that you are not allowed to change it, but this"
3991  " method necessarily must modify the graph, since the graph owns the "
3992  "matrix's domain Map and Import objects.");
3993  myGraph_->replaceRangeMapAndExporter (newRangeMap, newExporter);
3994  }
3995 
3996  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3997  void
3999  insertNonownedGlobalValues (const GlobalOrdinal globalRow,
4000  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
4001  const Teuchos::ArrayView<const Scalar>& values)
4002  {
4003  using Teuchos::Array;
4004  typedef GlobalOrdinal GO;
4005  typedef typename Array<GO>::size_type size_type;
4006 
4007  const size_type numToInsert = indices.size ();
4008  // Add the new data to the list of nonlocals.
4009  // This creates the arrays if they don't exist yet.
4010  std::pair<Array<GO>, Array<Scalar> >& curRow = nonlocals_[globalRow];
4011  Array<GO>& curRowInds = curRow.first;
4012  Array<Scalar>& curRowVals = curRow.second;
4013  const size_type newCapacity = curRowInds.size () + numToInsert;
4014  curRowInds.reserve (newCapacity);
4015  curRowVals.reserve (newCapacity);
4016  for (size_type k = 0; k < numToInsert; ++k) {
4017  curRowInds.push_back (indices[k]);
4018  curRowVals.push_back (values[k]);
4019  }
4020  }
4021 
4022  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4023  void
4026  {
4027  using Details::Behavior;
4029  using Teuchos::Comm;
4030  using Teuchos::outArg;
4031  using Teuchos::RCP;
4032  using Teuchos::rcp;
4033  using Teuchos::REDUCE_MAX;
4034  using Teuchos::REDUCE_MIN;
4035  using Teuchos::reduceAll;
4036  using std::endl;
4038  //typedef LocalOrdinal LO;
4039  typedef GlobalOrdinal GO;
4040  typedef typename Teuchos::Array<GO>::size_type size_type;
4041  const char tfecfFuncName[] = "globalAssemble: "; // for exception macro
4042  ProfilingRegion regionGlobalAssemble ("Tpetra::CrsMatrix::globalAssemble");
4043 
4044  const bool verbose = Behavior::verbose("CrsMatrix");
4045  std::unique_ptr<std::string> prefix;
4046  if (verbose) {
4047  prefix = this->createPrefix("CrsMatrix", "globalAssemble");
4048  std::ostringstream os;
4049  os << *prefix << "nonlocals_.size()=" << nonlocals_.size()
4050  << endl;
4051  std::cerr << os.str();
4052  }
4053  RCP<const Comm<int> > comm = getComm ();
4054 
4055  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4056  (! isFillActive (), std::runtime_error, "Fill must be active before "
4057  "you may call this method.");
4058 
4059  const size_t myNumNonlocalRows = nonlocals_.size ();
4060 
4061  // If no processes have nonlocal rows, then we don't have to do
4062  // anything. Checking this is probably cheaper than constructing
4063  // the Map of nonlocal rows (see below) and noticing that it has
4064  // zero global entries.
4065  {
4066  const int iHaveNonlocalRows = (myNumNonlocalRows == 0) ? 0 : 1;
4067  int someoneHasNonlocalRows = 0;
4068  reduceAll<int, int> (*comm, REDUCE_MAX, iHaveNonlocalRows,
4069  outArg (someoneHasNonlocalRows));
4070  if (someoneHasNonlocalRows == 0) {
4071  return; // no process has nonlocal rows, so nothing to do
4072  }
4073  }
4074 
4075  // 1. Create a list of the "nonlocal" rows on each process. this
4076  // requires iterating over nonlocals_, so while we do this,
4077  // deduplicate the entries and get a count for each nonlocal
4078  // row on this process.
4079  // 2. Construct a new row Map corresponding to those rows. This
4080  // Map is likely overlapping. We know that the Map is not
4081  // empty on all processes, because the above all-reduce and
4082  // return exclude that case.
4083 
4084  RCP<const map_type> nonlocalRowMap;
4085  Teuchos::Array<size_t> numEntPerNonlocalRow (myNumNonlocalRows);
4086  {
4087  Teuchos::Array<GO> myNonlocalGblRows (myNumNonlocalRows);
4088  size_type curPos = 0;
4089  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4090  ++mapIter, ++curPos) {
4091  myNonlocalGblRows[curPos] = mapIter->first;
4092  // Get the values and column indices by reference, since we
4093  // intend to change them in place (that's what "erase" does).
4094  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4095  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4096 
4097  // Sort both arrays jointly, using the column indices as keys,
4098  // then merge them jointly. "Merge" here adds values
4099  // corresponding to the same column indices. The first 2 args
4100  // of merge2 are output arguments that work just like the
4101  // return value of std::unique.
4102  sort2 (gblCols.begin (), gblCols.end (), vals.begin ());
4103  typename Teuchos::Array<GO>::iterator gblCols_newEnd;
4104  typename Teuchos::Array<Scalar>::iterator vals_newEnd;
4105  merge2 (gblCols_newEnd, vals_newEnd,
4106  gblCols.begin (), gblCols.end (),
4107  vals.begin (), vals.end ());
4108  gblCols.erase (gblCols_newEnd, gblCols.end ());
4109  vals.erase (vals_newEnd, vals.end ());
4110  numEntPerNonlocalRow[curPos] = gblCols.size ();
4111  }
4112 
4113  // Currently, Map requires that its indexBase be the global min
4114  // of all its global indices. Map won't compute this for us, so
4115  // we must do it. If our process has no nonlocal rows, set the
4116  // "min" to the max possible GO value. This ensures that if
4117  // some process has at least one nonlocal row, then it will pick
4118  // that up as the min. We know that at least one process has a
4119  // nonlocal row, since the all-reduce and return at the top of
4120  // this method excluded that case.
4121  GO myMinNonlocalGblRow = std::numeric_limits<GO>::max ();
4122  {
4123  auto iter = std::min_element (myNonlocalGblRows.begin (),
4124  myNonlocalGblRows.end ());
4125  if (iter != myNonlocalGblRows.end ()) {
4126  myMinNonlocalGblRow = *iter;
4127  }
4128  }
4129  GO gblMinNonlocalGblRow = 0;
4130  reduceAll<int, GO> (*comm, REDUCE_MIN, myMinNonlocalGblRow,
4131  outArg (gblMinNonlocalGblRow));
4132  const GO indexBase = gblMinNonlocalGblRow;
4133  const global_size_t INV = Teuchos::OrdinalTraits<global_size_t>::invalid ();
4134  nonlocalRowMap = rcp (new map_type (INV, myNonlocalGblRows (), indexBase, comm));
4135  }
4136 
4137  // 3. Use the values and column indices for each nonlocal row, as
4138  // stored in nonlocals_, to construct a CrsMatrix corresponding
4139  // to nonlocal rows. We have
4140  // exact counts of the number of entries in each nonlocal row.
4141 
4142  if (verbose) {
4143  std::ostringstream os;
4144  os << *prefix << "Create nonlocal matrix" << endl;
4145  std::cerr << os.str();
4146  }
4147  RCP<crs_matrix_type> nonlocalMatrix =
4148  rcp (new crs_matrix_type (nonlocalRowMap, numEntPerNonlocalRow ()));
4149  {
4150  size_type curPos = 0;
4151  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4152  ++mapIter, ++curPos) {
4153  const GO gblRow = mapIter->first;
4154  // Get values & column indices by ref, just to avoid copy.
4155  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4156  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4157  //const LO numEnt = static_cast<LO> (numEntPerNonlocalRow[curPos]);
4158  nonlocalMatrix->insertGlobalValues (gblRow, gblCols (), vals ());
4159  }
4160  }
4161  // There's no need to fill-complete the nonlocals matrix.
4162  // We just use it as a temporary container for the Export.
4163 
4164  // 4. If the original row Map is one to one, then we can Export
4165  // directly from nonlocalMatrix into this. Otherwise, we have
4166  // to create a temporary matrix with a one-to-one row Map,
4167  // Export into that, then Import from the temporary matrix into
4168  // *this.
4169 
4170  auto origRowMap = this->getRowMap ();
4171  const bool origRowMapIsOneToOne = origRowMap->isOneToOne ();
4172 
4173  int isLocallyComplete = 1; // true by default
4174 
4175  if (origRowMapIsOneToOne) {
4176  if (verbose) {
4177  std::ostringstream os;
4178  os << *prefix << "Original row Map is 1-to-1" << endl;
4179  std::cerr << os.str();
4180  }
4181  export_type exportToOrig (nonlocalRowMap, origRowMap);
4182  if (! exportToOrig.isLocallyComplete ()) {
4183  isLocallyComplete = 0;
4184  }
4185  if (verbose) {
4186  std::ostringstream os;
4187  os << *prefix << "doExport from nonlocalMatrix" << endl;
4188  std::cerr << os.str();
4189  }
4190  this->doExport (*nonlocalMatrix, exportToOrig, Tpetra::ADD);
4191  // We're done at this point!
4192  }
4193  else {
4194  if (verbose) {
4195  std::ostringstream os;
4196  os << *prefix << "Original row Map is NOT 1-to-1" << endl;
4197  std::cerr << os.str();
4198  }
4199  // If you ask a Map whether it is one to one, it does some
4200  // communication and stashes intermediate results for later use
4201  // by createOneToOne. Thus, calling createOneToOne doesn't cost
4202  // much more than the original cost of calling isOneToOne.
4203  auto oneToOneRowMap = Tpetra::createOneToOne (origRowMap);
4204  export_type exportToOneToOne (nonlocalRowMap, oneToOneRowMap);
4205  if (! exportToOneToOne.isLocallyComplete ()) {
4206  isLocallyComplete = 0;
4207  }
4208 
4209  // Create a temporary matrix with the one-to-one row Map.
4210  //
4211  // TODO (mfh 09 Sep 2016, 12 Sep 2016) Estimate # entries in
4212  // each row, to avoid reallocation during the Export operation.
4213  if (verbose) {
4214  std::ostringstream os;
4215  os << *prefix << "Create & doExport into 1-to-1 matrix"
4216  << endl;
4217  std::cerr << os.str();
4218  }
4219  crs_matrix_type oneToOneMatrix (oneToOneRowMap, 0);
4220  // Export from matrix of nonlocals into the temp one-to-one matrix.
4221  oneToOneMatrix.doExport(*nonlocalMatrix, exportToOneToOne,
4222  Tpetra::ADD);
4223 
4224  // We don't need the matrix of nonlocals anymore, so get rid of
4225  // it, to keep the memory high-water mark down.
4226  if (verbose) {
4227  std::ostringstream os;
4228  os << *prefix << "Free nonlocalMatrix" << endl;
4229  std::cerr << os.str();
4230  }
4231  nonlocalMatrix = Teuchos::null;
4232 
4233  // Import from the one-to-one matrix to the original matrix.
4234  if (verbose) {
4235  std::ostringstream os;
4236  os << *prefix << "doImport from 1-to-1 matrix" << endl;
4237  std::cerr << os.str();
4238  }
4239  import_type importToOrig (oneToOneRowMap, origRowMap);
4240  this->doImport (oneToOneMatrix, importToOrig, Tpetra::ADD);
4241  }
4242 
4243  // It's safe now to clear out nonlocals_, since we've already
4244  // committed side effects to *this. The standard idiom for
4245  // clearing a Container like std::map, is to swap it with an empty
4246  // Container and let the swapped Container fall out of scope.
4247  if (verbose) {
4248  std::ostringstream os;
4249  os << *prefix << "Free nonlocals_ (std::map)" << endl;
4250  std::cerr << os.str();
4251  }
4252  decltype (nonlocals_) newNonlocals;
4253  std::swap (nonlocals_, newNonlocals);
4254 
4255  // FIXME (mfh 12 Sep 2016) I don't like this all-reduce, and I
4256  // don't like throwing an exception here. A local return value
4257  // would likely be more useful to users. However, if users find
4258  // themselves exercising nonlocal inserts often, then they are
4259  // probably novice users who need the help. See GitHub Issues
4260  // #603 and #601 (esp. the latter) for discussion.
4261 
4262  int isGloballyComplete = 0; // output argument of reduceAll
4263  reduceAll<int, int> (*comm, REDUCE_MIN, isLocallyComplete,
4264  outArg (isGloballyComplete));
4265  TEUCHOS_TEST_FOR_EXCEPTION
4266  (isGloballyComplete != 1, std::runtime_error, "On at least one process, "
4267  "you called insertGlobalValues with a global row index which is not in "
4268  "the matrix's row Map on any process in its communicator.");
4269  }
4270 
4271  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4272  void
4274  resumeFill (const Teuchos::RCP<Teuchos::ParameterList>& params)
4275  {
4276  if (! isStaticGraph ()) { // Don't resume fill of a nonowned graph.
4277  myGraph_->resumeFill (params);
4278  }
4279  // Delete the apply helper (if it exists)
4280  applyHelper.reset();
4281  fillComplete_ = false;
4282  }
4283 
// NOTE(review): the two listing lines between `bool` and the `return`
// statement (presumably the class qualifier and the method name with its
// opening brace) are absent from this view. Judging only by the call
// below, this is presumably CrsMatrix::haveGlobalConstants -- confirm
// against the class declaration before relying on that.
4284  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4285  bool
// Forward the query to the matrix's graph and return its answer unchanged.
4288  return getCrsGraphRef ().haveGlobalConstants ();
4289  }
4290 
4291  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4292  void
4294  fillComplete (const Teuchos::RCP<Teuchos::ParameterList>& params)
4295  {
4296  const char tfecfFuncName[] = "fillComplete(params): ";
4297 
4298  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4299  (this->getCrsGraph ().is_null (), std::logic_error,
4300  "getCrsGraph() returns null. This should not happen at this point. "
4301  "Please report this bug to the Tpetra developers.");
4302 
4303  const crs_graph_type& graph = this->getCrsGraphRef ();
4304  if (this->isStaticGraph () && graph.isFillComplete ()) {
4305  // If this matrix's graph is fill complete and the user did not
4306  // supply a domain or range Map, use the graph's domain and
4307  // range Maps.
4308  this->fillComplete (graph.getDomainMap (), graph.getRangeMap (), params);
4309  }
4310  else { // assume that user's row Map is the domain and range Map
4311  Teuchos::RCP<const map_type> rangeMap = graph.getRowMap ();
4312  Teuchos::RCP<const map_type> domainMap = rangeMap;
4313  this->fillComplete (domainMap, rangeMap, params);
4314  }
4315  }
4316 
4317  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4318  void
4320  fillComplete (const Teuchos::RCP<const map_type>& domainMap,
4321  const Teuchos::RCP<const map_type>& rangeMap,
4322  const Teuchos::RCP<Teuchos::ParameterList>& params)
4323  {
4324  using Details::Behavior;
4326  using Teuchos::ArrayRCP;
4327  using Teuchos::RCP;
4328  using Teuchos::rcp;
4329  using std::endl;
4330  const char tfecfFuncName[] = "fillComplete: ";
4331  ProfilingRegion regionFillComplete
4332  ("Tpetra::CrsMatrix::fillComplete");
4333  const bool verbose = Behavior::verbose("CrsMatrix");
4334  std::unique_ptr<std::string> prefix;
4335  if (verbose) {
4336  prefix = this->createPrefix("CrsMatrix", "fillComplete(dom,ran,p)");
4337  std::ostringstream os;
4338  os << *prefix << endl;
4339  std::cerr << os.str ();
4340  }
4341  Details::ProfilingRegion region(
4342  "Tpetra::CrsMatrix::fillCompete",
4343  "fillCompete");
4344 
4345  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4346  (! this->isFillActive () || this->isFillComplete (), std::runtime_error,
4347  "Matrix fill state must be active (isFillActive() "
4348  "must be true) before you may call fillComplete().");
4349  const int numProcs = this->getComm ()->getSize ();
4350 
4351  //
4352  // Read parameters from the input ParameterList.
4353  //
4354  {
4355  Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");
4356 
4357  // If true, the caller promises that no process did nonlocal
4358  // changes since the last call to fillComplete.
4359  bool assertNoNonlocalInserts = false;
4360  // If true, makeColMap sorts remote GIDs (within each remote
4361  // process' group).
4362  bool sortGhosts = true;
4363 
4364  if (! params.is_null ()) {
4365  assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
4366  assertNoNonlocalInserts);
4367  if (params->isParameter ("sort column map ghost gids")) {
4368  sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
4369  }
4370  else if (params->isParameter ("Sort column Map ghost GIDs")) {
4371  sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
4372  }
4373  }
4374  // We also don't need to do global assembly if there is only one
4375  // process in the communicator.
4376  const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
4377  // This parameter only matters if this matrix owns its graph.
4378  if (! this->myGraph_.is_null ()) {
4379  this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
4380  }
4381 
4382  if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
4383  if (this->hasColMap ()) { // use local indices
4384  allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
4385  }
4386  else { // no column Map, so use global indices
4387  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
4388  }
4389  }
4390  // Global assemble, if we need to. This call only costs a single
4391  // all-reduce if we didn't need global assembly after all.
4392  if (needGlobalAssemble) {
4393  this->globalAssemble ();
4394  }
4395  else {
4396  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4397  (numProcs == 1 && nonlocals_.size() > 0,
4398  std::runtime_error, "Cannot have nonlocal entries on a serial run. "
4399  "An invalid entry (i.e., with row index not in the row Map) must have "
4400  "been submitted to the CrsMatrix.");
4401  }
4402  }
4403  if (this->isStaticGraph ()) {
4404  Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
4405  // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
4406  // checks below only in debug mode. It would be nicer to do a
4407  // local check, then propagate the error state in a deferred
4408  // way, whenever communication happens. That would reduce the
4409  // cost of checking, to the point where it may make sense to
4410  // enable it even in release mode.
4411 #ifdef HAVE_TPETRA_DEBUG
4412  // FIXME (mfh 18 Jun 2014) This check for correctness of the
4413  // input Maps incurs a penalty of two all-reduces for the
4414  // otherwise optimal const graph case.
4415  //
4416  // We could turn these (max) 2 all-reduces into (max) 1, by
4417  // fusing them. We could do this by adding a "locallySameAs"
4418  // method to Map, which would return one of four states:
4419  //
4420  // a. Certainly globally the same
4421  // b. Certainly globally not the same
4422  // c. Locally the same
4423  // d. Locally not the same
4424  //
4425  // The first two states don't require further communication.
4426  // The latter two states require an all-reduce to communicate
4427  // globally, but we only need one all-reduce, since we only need
4428  // to check whether at least one of the Maps is wrong.
4429  const bool domainMapsMatch =
4430  this->staticGraph_->getDomainMap ()->isSameAs (*domainMap);
4431  const bool rangeMapsMatch =
4432  this->staticGraph_->getRangeMap ()->isSameAs (*rangeMap);
4433 
4434  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4435  (! domainMapsMatch, std::runtime_error,
4436  "The CrsMatrix's domain Map does not match the graph's domain Map. "
4437  "The graph cannot be changed because it was given to the CrsMatrix "
4438  "constructor as const. You can fix this by passing in the graph's "
4439  "domain Map and range Map to the matrix's fillComplete call.");
4440 
4441  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4442  (! rangeMapsMatch, std::runtime_error,
4443  "The CrsMatrix's range Map does not match the graph's range Map. "
4444  "The graph cannot be changed because it was given to the CrsMatrix "
4445  "constructor as const. You can fix this by passing in the graph's "
4446  "domain Map and range Map to the matrix's fillComplete call.");
4447 #endif // HAVE_TPETRA_DEBUG
4448 
4449  // The matrix does _not_ own the graph, and the graph's
4450  // structure is already fixed, so just fill the local matrix.
4451  this->fillLocalMatrix (params);
4452  }
4453  else {
4454  Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
4455  // Set the graph's domain and range Maps. This will clear the
4456  // Import if the domain Map has changed (is a different
4457  // pointer), and the Export if the range Map has changed (is a
4458  // different pointer).
4459  this->myGraph_->setDomainRangeMaps (domainMap, rangeMap);
4460 
4461  // Make the graph's column Map, if necessary.
4462  Teuchos::Array<int> remotePIDs (0);
4463  const bool mustBuildColMap = ! this->hasColMap ();
4464  if (mustBuildColMap) {
4465  this->myGraph_->makeColMap (remotePIDs);
4466  }
4467 
4468  // Make indices local, if necessary. The method won't do
4469  // anything if the graph is already locally indexed.
4470  const std::pair<size_t, std::string> makeIndicesLocalResult =
4471  this->myGraph_->makeIndicesLocal(verbose);
4472  // TODO (mfh 20 Jul 2017) Instead of throwing here, pass along
4473  // the error state to makeImportExport
4474  // which may do all-reduces and thus may
4475  // have the opportunity to communicate that error state.
4476  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4477  (makeIndicesLocalResult.first != 0, std::runtime_error,
4478  makeIndicesLocalResult.second);
4479 
4480  const bool sorted = this->myGraph_->isSorted ();
4481  const bool merged = this->myGraph_->isMerged ();
4482  this->sortAndMergeIndicesAndValues (sorted, merged);
4483 
4484  // Make Import and Export objects, if they haven't been made
4485  // already. If we made a column Map above, reuse information
4486  // from that process to avoid communiation in the Import setup.
4487  this->myGraph_->makeImportExport (remotePIDs, mustBuildColMap);
4488 
4489  // The matrix _does_ own the graph, so fill the local graph at
4490  // the same time as the local matrix.
4491  this->fillLocalGraphAndMatrix (params);
4492 
4493  const bool callGraphComputeGlobalConstants = params.get () == nullptr ||
4494  params->get ("compute global constants", true);
4495  if (callGraphComputeGlobalConstants) {
4496  this->myGraph_->computeGlobalConstants ();
4497  }
4498  else {
4499  this->myGraph_->computeLocalConstants ();
4500  }
4501  this->myGraph_->fillComplete_ = true;
4502  this->myGraph_->checkInternalState ();
4503  }
4504 
4505  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4506 
4507  this->fillComplete_ = true; // Now we're fill complete!
4508  {
4509  Details::ProfilingRegion region_cis(
4510  "Tpetra::CrsMatrix::fillCompete", "checkInternalState"
4511  );
4512  this->checkInternalState ();
4513  }
4514  } //fillComplete(domainMap, rangeMap, params)
4515 
4516  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4517  void
4519  expertStaticFillComplete (const Teuchos::RCP<const map_type> & domainMap,
4520  const Teuchos::RCP<const map_type> & rangeMap,
4521  const Teuchos::RCP<const import_type>& importer,
4522  const Teuchos::RCP<const export_type>& exporter,
4523  const Teuchos::RCP<Teuchos::ParameterList> &params)
4524  {
4525 #ifdef HAVE_TPETRA_MMM_TIMINGS
4526  std::string label;
4527  if(!params.is_null())
4528  label = params->get("Timer Label",label);
4529  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
4530  using Teuchos::TimeMonitor;
4531 
4532  Teuchos::TimeMonitor all(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-all")));
4533 #endif
4534 
4535  const char tfecfFuncName[] = "expertStaticFillComplete: ";
4536  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( ! isFillActive() || isFillComplete(),
4537  std::runtime_error, "Matrix fill state must be active (isFillActive() "
4538  "must be true) before calling fillComplete().");
4539  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4540  myGraph_.is_null (), std::logic_error, "myGraph_ is null. This is not allowed.");
4541 
4542  {
4543 #ifdef HAVE_TPETRA_MMM_TIMINGS
4544  Teuchos::TimeMonitor graph(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-Graph")));
4545 #endif
4546  // We will presume globalAssemble is not needed, so we do the ESFC on the graph
4547  myGraph_->expertStaticFillComplete (domainMap, rangeMap, importer, exporter,params);
4548  }
4549 
4550  {
4551 #ifdef HAVE_TPETRA_MMM_TIMINGS
4552  TimeMonitor fLGAM(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-fLGAM")));
4553 #endif
4554  // Fill the local graph and matrix
4555  fillLocalGraphAndMatrix (params);
4556  }
4557  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4558 
4559  // Now we're fill complete!
4560  fillComplete_ = true;
4561 
4562  // Sanity checks at the end.
4563 #ifdef HAVE_TPETRA_DEBUG
4564  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
4565  ": We're at the end of fillComplete(), but isFillActive() is true. "
4566  "Please report this bug to the Tpetra developers.");
4567  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete(), std::logic_error,
4568  ": We're at the end of fillComplete(), but isFillActive() is true. "
4569  "Please report this bug to the Tpetra developers.");
4570 #endif // HAVE_TPETRA_DEBUG
4571  {
4572 #ifdef HAVE_TPETRA_MMM_TIMINGS
4573  Teuchos::TimeMonitor cIS(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-M-cIS")));
4574 #endif
4575 
4576  checkInternalState();
4577  }
4578  }
4579 
4580  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4582  mergeRowIndicesAndValues (size_t rowLen, LocalOrdinal* cols, impl_scalar_type* vals)
4583  {
4584  impl_scalar_type* rowValueIter = vals;
4585  // beg,end define a half-exclusive interval over which to iterate.
4586  LocalOrdinal* beg = cols;
4587  LocalOrdinal* end = cols + rowLen;
4588  LocalOrdinal* newend = beg;
4589  if (beg != end) {
4590  LocalOrdinal* cur = beg + 1;
4591  impl_scalar_type* vcur = rowValueIter + 1;
4592  impl_scalar_type* vend = rowValueIter;
4593  cur = beg+1;
4594  while (cur != end) {
4595  if (*cur != *newend) {
4596  // new entry; save it
4597  ++newend;
4598  ++vend;
4599  (*newend) = (*cur);
4600  (*vend) = (*vcur);
4601  }
4602  else {
4603  // old entry; merge it
4604  //(*vend) = f (*vend, *vcur);
4605  (*vend) += *vcur;
4606  }
4607  ++cur;
4608  ++vcur;
4609  }
4610  ++newend; // one past the last entry, per typical [beg,end) semantics
4611  }
4612  return newend - beg;
4613  }
4614 
4615  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4616  void
4618  sortAndMergeIndicesAndValues (const bool sorted, const bool merged)
4619  {
4620  using ::Tpetra::Details::ProfilingRegion;
4621  typedef LocalOrdinal LO;
4622  typedef typename Kokkos::View<LO*, device_type>::HostMirror::execution_space
4623  host_execution_space;
4624  typedef Kokkos::RangePolicy<host_execution_space, LO> range_type;
4625  const char tfecfFuncName[] = "sortAndMergeIndicesAndValues: ";
4626  ProfilingRegion regionSAM ("Tpetra::CrsMatrix::sortAndMergeIndicesAndValues");
4627 
4628  if (! sorted || ! merged) {
4629  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4630  (this->isStaticGraph (), std::runtime_error, "Cannot sort or merge with "
4631  "\"static\" (const) graph, since the matrix does not own the graph.");
4632  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4633  (this->myGraph_.is_null (), std::logic_error, "myGraph_ is null, but "
4634  "this matrix claims ! isStaticGraph(). "
4635  "Please report this bug to the Tpetra developers.");
4636  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4637  (this->isStorageOptimized (), std::logic_error, "It is invalid to call "
4638  "this method if the graph's storage has already been optimized. "
4639  "Please report this bug to the Tpetra developers.");
4640 
4641  crs_graph_type& graph = * (this->myGraph_);
4642  const LO lclNumRows = static_cast<LO> (this->getLocalNumRows ());
4643  size_t totalNumDups = 0;
4644  {
4645  //Accessing host unpacked (4-array CRS) local matrix.
4646  auto rowBegins_ = graph.getRowPtrsUnpackedHost();
4647  auto rowLengths_ = graph.k_numRowEntries_;
4648  auto vals_ = this->valuesUnpacked_wdv.getHostView(Access::ReadWrite);
4649  auto cols_ = graph.lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
4650  Kokkos::parallel_reduce ("sortAndMergeIndicesAndValues", range_type (0, lclNumRows),
4651  [=] (const LO lclRow, size_t& numDups) {
4652  size_t rowBegin = rowBegins_(lclRow);
4653  size_t rowLen = rowLengths_(lclRow);
4654  LO* cols = cols_.data() + rowBegin;
4655  impl_scalar_type* vals = vals_.data() + rowBegin;
4656  if (! sorted) {
4657  sort2 (cols, cols + rowLen, vals);
4658  }
4659  if (! merged) {
4660  size_t newRowLength = mergeRowIndicesAndValues (rowLen, cols, vals);
4661  rowLengths_(lclRow) = newRowLength;
4662  numDups += rowLen - newRowLength;
4663  }
4664  }, totalNumDups);
4665  }
4666  if (! sorted) {
4667  graph.indicesAreSorted_ = true; // we just sorted every row
4668  }
4669  if (! merged) {
4670  graph.noRedundancies_ = true; // we just merged every row
4671  }
4672  }
4673  }
4674 
// Non-transposed sparse matrix times multivector, including any needed
// Import of the input and Export of the output. Per the comment before the
// Export below, the intended result is
// Y_in = beta*Y_in + alpha*A*X_in, with every local product done through
// localApply(..., Teuchos::NO_TRANS, ...).
//
// Outline:
//   * alpha == 0: only scale or zero Y_in, then return.
//   * Obtain a column-Map input MV: X_in itself, a constant-stride copy of
//     it, or the result of an Import when the graph has an importer.
//   * If the graph has an exporter, multiply into the row-Map MV and
//     Export (ADD_ASSIGN) into Y_in; otherwise multiply directly into Y_in,
//     or into the cached row-Map MV when Y_in is not constant stride or is
//     the same object as the input.
//   * If Y_in is replicated across more than one process, finish with
//     Y_in.reduce().
//
// `alpha` and `beta` are taken by value; `beta` is reassigned locally in
// the replicated-output case.
//
// NOTE(review): the listing lines that carry the class qualifier, the
// function name and the X_in / Y_in parameter declarations are absent from
// this view, as is one line right after the opening brace (presumably a
// using-declaration for ProfilingRegion, which is used unqualified below).
// Confirm the name and parameter types against the class declaration.
4675  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4676  void
4680  Scalar alpha,
4681  Scalar beta) const
4682  {
4684  using Teuchos::RCP;
4685  using Teuchos::rcp;
4686  using Teuchos::rcp_const_cast;
4687  using Teuchos::rcpFromRef;
4688  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4689  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one ();
4690 
4691  // mfh 05 Jun 2014: Special case for alpha == 0. I added this to
4692  // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
4693  // failing only for the Kokkos refactor version of Tpetra. It's a
4694  // good idea regardless to have the bypass.
4695  if (alpha == ZERO) {
4696  if (beta == ZERO) {
4697  Y_in.putScalar (ZERO);
4698  } else if (beta != ONE) {
4699  Y_in.scale (beta);
4700  }
4701  return;
4702  }
4703 
4704  // It's possible that X is a view of Y or vice versa. We don't
4705  // allow this (apply() requires that X and Y not alias one
4706  // another), but it's helpful to detect and work around this case.
4707  // We don't try to detect the more subtle cases (e.g., one is a
4708  // subview of the other, but their initial pointers differ). We
4709  // only need to do this if this matrix's Import is trivial;
4710  // otherwise, we don't actually apply the operator from X into Y.
4711 
4712  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4713  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4714 
4715  // If beta == 0, then the output MV will be overwritten; none of
4716  // its entries should be read. (Sparse BLAS semantics say that we
4717  // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
4718  // This matters if we need to do an Export operation; see below.
4719  const bool Y_is_overwritten = (beta == ZERO);
4720 
4721  // We treat the case of a replicated MV output specially.
4722  const bool Y_is_replicated =
4723  (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4724 
4725  // This is part of the special case for replicated MV output.
4726  // We'll let each process do its thing, but do an all-reduce at
4727  // the end to sum up the results. Setting beta=0 on all processes
4728  // but Proc 0 makes the math work out for the all-reduce. (This
4729  // assumes that the replicated data is correctly replicated, so
4730  // that the data are the same on all processes.)
4731  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4732  beta = ZERO;
4733  }
4734 
4735  // Temporary MV for Import operation. After the block of code
4736  // below, this will be an (Imported if necessary) column Map MV
4737  // ready to give to localApply(...).
4738  RCP<const MV> X_colMap;
4739  if (importer.is_null ()) {
4740  if (! X_in.isConstantStride ()) {
4741  // Not all sparse mat-vec kernels can handle an input MV with
4742  // nonconstant stride correctly, so we have to copy it in that
4743  // case into a constant stride MV. To make a constant stride
4744  // copy of X_in, we force creation of the column (== domain)
4745  // Map MV (if it hasn't already been created, else fetch the
4746  // cached copy). This avoids creating a new MV each time.
4747  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
4748  Tpetra::deep_copy (*X_colMapNonConst, X_in);
4749  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4750  }
4751  else {
4752  // The domain and column Maps are the same, so do the local
4753  // multiply using the domain Map input MV X_in.
4754  X_colMap = rcpFromRef (X_in);
4755  }
4756  }
4757  else { // need to Import source (multi)vector
4758  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply: Import");
4759 
4760  // We're doing an Import anyway, which will copy the relevant
4761  // elements of the domain Map MV X_in into a separate column Map
4762  // MV. Thus, we don't have to worry whether X_in is constant
4763  // stride.
4764  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
4765 
4766  // Import from the domain Map MV to the column Map MV.
4767  X_colMapNonConst->doImport (X_in, *importer, INSERT);
4768  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4769  }
4770 
4771  // Temporary MV for doExport (if needed), or for copying a
4772  // nonconstant stride output MV into a constant stride MV. This
4773  // is null if we don't need the temporary MV, that is, if the
4774  // Export is trivial (null).
4775  RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
4776 
4777  // If we have a nontrivial Export object, we must perform an
4778  // Export. In that case, the local multiply result will go into
4779  // the row Map multivector. We don't have to make a
4780  // constant-stride version of Y_in in this case, because we had to
4781  // make a constant stride Y_rowMap MV and do an Export anyway.
4782  if (! exporter.is_null ()) {
// The local product overwrites Y_rowMap (beta argument is ZERO here);
// the user's beta is applied to Y_in itself just before the Export.
4783  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, ZERO);
4784  {
4785  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply: Export");
4786 
4787  // If we're overwriting the output MV Y_in completely (beta ==
4788  // 0), then make sure that it is filled with zeros before we
4789  // do the Export. Otherwise, the ADD combine mode will use
4790  // data in Y_in, which is supposed to be zero.
4791  if (Y_is_overwritten) {
4792  Y_in.putScalar (ZERO);
4793  }
4794  else {
4795  // Scale output MV by beta, so that doExport sums in the
4796  // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
4797  Y_in.scale (beta);
4798  }
4799  // Do the Export operation.
4800  Y_in.doExport (*Y_rowMap, *exporter, ADD_ASSIGN);
4801  }
4802  }
4803  else { // Don't do an Export: row Map and range Map are the same.
4804  //
4805  // If Y_in does not have constant stride, or if the column Map
4806  // MV aliases Y_in, then we can't let the kernel write directly
4807  // to Y_in. Instead, we have to use the cached row (== range)
4808  // Map MV as temporary storage.
4809  //
4810  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4811  // the user passed in the same MultiVector for both X and Y. It
4812  // won't detect whether one MultiVector views the other. We
4813  // should also check the MultiVectors' raw data pointers.
4814  if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
4815  // Force creating the MV if it hasn't been created already.
4816  // This will reuse a previously created cached MV.
4817  Y_rowMap = getRowMapMultiVector (Y_in, true);
4818 
4819  // If beta == 0, we don't need to copy Y_in into Y_rowMap,
4820  // since we're overwriting it anyway.
4821  if (beta != ZERO) {
4822  Tpetra::deep_copy (*Y_rowMap, Y_in);
4823  }
4824  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, beta);
4825  Tpetra::deep_copy (Y_in, *Y_rowMap);
4826  }
4827  else {
4828  this->localApply (*X_colMap, Y_in, Teuchos::NO_TRANS, alpha, beta);
4829  }
4830  }
4831 
4832  // If the range Map is a locally replicated Map, sum up
4833  // contributions from each process. We set beta = 0 on all
4834  // processes but Proc 0 initially, so this will handle the scaling
4835  // factor beta correctly.
4836  if (Y_is_replicated) {
4837  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply: Reduce Y");
4838  Y_in.reduce ();
4839  }
4840  }
4841 
// Transpose / conjugate-transpose sparse matrix-multivector product,
// including the Import/Export communication around the local kernel.
// NOTE(review): the class qualifier, the method name and the X_in /
// Y_in parameter lines are absent from this listing; judging by the
// profiling-region labels below this is presumably applyTranspose,
// the routine apply() calls when mode != NO_TRANS -- confirm.
//
// Steps, as implemented below:
//   1. alpha == 0: Y_in is zeroed (beta == 0) or scaled by beta, then
//      the function returns without touching the matrix or X_in.
//   2. beta == 0 (alpha != 0): Y_in is zeroed up front.
//   3. If the graph has an Export object, X_in is imported through it
//      into the cached exportMV_, which becomes the multiply input.
//   4. If the graph has an Import object, the local product is written
//      into the cached importMV_ and then exported into Y_in with
//      ADD_ASSIGN; otherwise the local product is written into Y_in,
//      through a temporary copy when Y_in is not constant stride or is
//      the same object as X.
//   5. If Y_in is not distributed and the communicator has more than
//      one process, Y_in.reduce() is called at the end.
4842  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4843  void
4847  const Teuchos::ETransp mode,
4848  Scalar alpha,
4849  Scalar beta) const
4850  {
4852  using Teuchos::null;
4853  using Teuchos::RCP;
4854  using Teuchos::rcp;
4855  using Teuchos::rcp_const_cast;
4856  using Teuchos::rcpFromRef;
4857  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4858 
4859  // Take shortcuts for alpha == 0.
4860  if (alpha == ZERO) {
4861  // Follow the Sparse BLAS convention by ignoring both the matrix
4862  // and X_in, in this case.
4863  if (beta == ZERO) {
4864  // Follow the Sparse BLAS convention by overwriting any Inf or
4865  // NaN values in Y_in, in this case.
4866  Y_in.putScalar (ZERO);
4867  }
4868  else {
4869  Y_in.scale (beta);
4870  }
4871  return;
4872  }
4873  else if (beta == ZERO) {
4874  // Thyra was implicitly assuming that Y gets set to zero (or is
4875  // overwritten) when beta == 0. This was not the case with transpose
4876  // in a multithreaded environment, where a multiplication with
4877  // subsequent atomic_adds is used, since 0 is effectively not special
4878  // cased. Doing the explicit set to zero here catches cases where Y
      // is NaN or Inf.
4879  Y_in.putScalar (ZERO);
4880  }
4881 
4882  const size_t numVectors = X_in.getNumVectors ();
4883 
4884  // We don't allow X_in and Y_in to alias one another. It's hard
4885  // to check this, because advanced users could create views from
4886  // raw pointers. However, if X_in and Y_in reference the same
4887  // object, we will do the user a favor by copying X into new
4888  // storage (with a warning). We only need to do this if we have
4889  // trivial importers; otherwise, we don't actually apply the
4890  // operator from X into Y.
4891  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4892  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4893  // access X indirectly, in case we need to create temporary storage
4894  RCP<const MV> X;
4895 
4896  // some parameters for below
4897  const bool Y_is_replicated = (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
      // Captured before beta is (possibly) zeroed for ranks > 0 below,
      // so it reflects the caller's beta.
4898  const bool Y_is_overwritten = (beta == ZERO);
4899  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4900  beta = ZERO;
4901  }
4902 
4903  // The kernels do not allow input or output with nonconstant stride.
4904  if (! X_in.isConstantStride () && importer.is_null ()) {
4905  X = rcp (new MV (X_in, Teuchos::Copy)); // Constant-stride copy of X_in
4906  } else {
4907  X = rcpFromRef (X_in); // Reference to X_in
4908  }
4909 
4910  // Set up temporary multivectors for Import and/or Export.
      // A cached MV is discarded and reallocated when its number of
      // columns differs from numVectors.
4911  if (importer != Teuchos::null) {
4912  if (importMV_ != Teuchos::null && importMV_->getNumVectors() != numVectors) {
4913  importMV_ = null;
4914  }
4915  if (importMV_ == null) {
4916  importMV_ = rcp (new MV (this->getColMap (), numVectors));
4917  }
4918  }
4919  if (exporter != Teuchos::null) {
4920  if (exportMV_ != Teuchos::null && exportMV_->getNumVectors() != numVectors) {
4921  exportMV_ = null;
4922  }
4923  if (exportMV_ == null) {
4924  exportMV_ = rcp (new MV (this->getRowMap (), numVectors));
4925  }
4926  }
4927 
4928  // If we have a non-trivial exporter, we must import elements that
4929  // are permuted or are on other processors.
4930  if (! exporter.is_null ()) {
4931  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply (transpose): Import");
4932  exportMV_->doImport (X_in, *exporter, INSERT);
4933  X = exportMV_; // multiply out of exportMV_
4934  }
4935 
4936  // If we have a non-trivial importer, we must export elements that
4937  // are permuted or belong to other processors. We will compute
4938  // solution into the to-be-exported MV; get a view.
4939  if (importer != Teuchos::null) {
4940  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply (transpose): Export");
4941 
4942  // FIXME (mfh 18 Apr 2015) Temporary fix suggested by Clark
4943  // Dohrmann on Fri 17 Apr 2015. At some point, we need to go
4944  // back and figure out why this helps. importMV_ SHOULD be
4945  // completely overwritten in the localApply(...) call
4946  // below, because beta == ZERO there.
4947  importMV_->putScalar (ZERO);
4948  // Do the local computation.
4949  this->localApply (*X, *importMV_, mode, alpha, ZERO);
4950 
      // Prepare Y_in so that the ADD_ASSIGN export below accumulates
      // onto either zero or the beta-scaled old contents.
4951  if (Y_is_overwritten) {
4952  Y_in.putScalar (ZERO);
4953  } else {
4954  Y_in.scale (beta);
4955  }
4956  Y_in.doExport (*importMV_, *importer, ADD_ASSIGN);
4957  }
4958  // otherwise, multiply into Y
4959  else {
4960  // can't multiply in-situ; can't multiply into non-strided multivector
4961  //
4962  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4963  // the user passed in the same MultiVector for both X and Y. It
4964  // won't detect whether one MultiVector views the other. We
4965  // should also check the MultiVectors' raw data pointers.
4966  if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
4967  // Make a deep copy of Y_in, into which to write the multiply result.
4968  MV Y (Y_in, Teuchos::Copy);
4969  this->localApply (*X, Y, mode, alpha, beta);
4970  Tpetra::deep_copy (Y_in, Y);
4971  } else {
4972  this->localApply (*X, Y_in, mode, alpha, beta);
4973  }
4974  }
4975 
4976  // If the range Map is a locally replicated map, sum the
4977  // contributions from each process. (That's why we set beta=0
4978  // above for all processes but Proc 0.)
4979  if (Y_is_replicated) {
4980  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply (transpose): Reduce Y");
4981  Y_in.reduce ();
4982  }
4983  }
4984 
4985  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4986  void
4990  const Teuchos::ETransp mode,
4991  const Scalar& alpha,
4992  const Scalar& beta) const
4993  {
4995  using Teuchos::NO_TRANS;
4996  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localApply");
4997 
4998  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
4999  auto Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
5000 
5001  const bool debug = ::Tpetra::Details::Behavior::debug ();
5002  if (debug) {
5003  const char tfecfFuncName[] = "localApply: ";
5004  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5005  (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
5006  "X.getNumVectors() = " << X.getNumVectors () << " != "
5007  "Y.getNumVectors() = " << Y.getNumVectors () << ".");
5008  const bool transpose = (mode != Teuchos::NO_TRANS);
5009  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5010  (! transpose && X.getLocalLength () !=
5011  getColMap ()->getLocalNumElements (), std::runtime_error,
5012  "NO_TRANS case: X has the wrong number of local rows. "
5013  "X.getLocalLength() = " << X.getLocalLength () << " != "
5014  "getColMap()->getLocalNumElements() = " <<
5015  getColMap ()->getLocalNumElements () << ".");
5016  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5017  (! transpose && Y.getLocalLength () !=
5018  getRowMap ()->getLocalNumElements (), std::runtime_error,
5019  "NO_TRANS case: Y has the wrong number of local rows. "
5020  "Y.getLocalLength() = " << Y.getLocalLength () << " != "
5021  "getRowMap()->getLocalNumElements() = " <<
5022  getRowMap ()->getLocalNumElements () << ".");
5023  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5024  (transpose && X.getLocalLength () !=
5025  getRowMap ()->getLocalNumElements (), std::runtime_error,
5026  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5027  "rows. X.getLocalLength() = " << X.getLocalLength ()
5028  << " != getRowMap()->getLocalNumElements() = "
5029  << getRowMap ()->getLocalNumElements () << ".");
5030  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5031  (transpose && Y.getLocalLength () !=
5032  getColMap ()->getLocalNumElements (), std::runtime_error,
5033  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5034  "rows. Y.getLocalLength() = " << Y.getLocalLength ()
5035  << " != getColMap()->getLocalNumElements() = "
5036  << getColMap ()->getLocalNumElements () << ".");
5037  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5038  (! isFillComplete (), std::runtime_error, "The matrix is not "
5039  "fill complete. You must call fillComplete() (possibly with "
5040  "domain and range Map arguments) without an intervening "
5041  "resumeFill() call before you may call this method.");
5042  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5043  (! X.isConstantStride () || ! Y.isConstantStride (),
5044  std::runtime_error, "X and Y must be constant stride.");
5045  // If the two pointers are null, then they don't alias one
5046  // another, even though they are equal.
5047  // Kokkos does not guarantee that zero row-extent vectors
5048  // point to different places, so we have to check that too.
5049  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5050  (X_lcl.data () == Y_lcl.data () && X_lcl.data () != nullptr
5051  && X_lcl.extent(0) != 0,
5052  std::runtime_error, "X and Y may not alias one another.");
5053  }
5054 
5055  auto A_lcl = getLocalMatrixDevice();
5056 
5057  if(!applyHelper.get()) {
5058  // The apply helper does not exist, so create it.
5059  // Decide now whether to use the imbalanced row path, or the default.
5060  bool useMergePath = false;
5061 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
5062  //TODO: when https://github.com/kokkos/kokkos-kernels/issues/2166 is fixed and,
5063  //we can use SPMV_MERGE_PATH for the native spmv as well.
5064  //Take out this ifdef to enable that.
5065  //
5066  //Until then, only use SPMV_MERGE_PATH when calling cuSPARSE.
5067  if constexpr(std::is_same_v<execution_space, Kokkos::Cuda>) {
5068  LocalOrdinal nrows = getLocalNumRows();
5069  LocalOrdinal maxRowImbalance = 0;
5070  if(nrows != 0)
5071  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5072 
5073  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5074  useMergePath = true;
5075  }
5076 #endif
5077  applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map,
5078  useMergePath ? KokkosSparse::SPMV_MERGE_PATH : KokkosSparse::SPMV_DEFAULT);
5079  }
5080 
5081  // Translate mode (Teuchos enum) to KokkosKernels (1-character string)
5082  const char* modeKK = nullptr;
5083  switch(mode)
5084  {
5085  case Teuchos::NO_TRANS:
5086  modeKK = KokkosSparse::NoTranspose; break;
5087  case Teuchos::TRANS:
5088  modeKK = KokkosSparse::Transpose; break;
5089  case Teuchos::CONJ_TRANS:
5090  modeKK = KokkosSparse::ConjugateTranspose; break;
5091  default:
5092  throw std::invalid_argument("Tpetra::CrsMatrix::localApply: invalid mode");
5093  }
5094 
5095  if(applyHelper->shouldUseIntRowptrs())
5096  {
5097  auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
5098  KokkosSparse::spmv(
5099  &applyHelper->handle_int, modeKK,
5100  impl_scalar_type(alpha), A_lcl_int_rowptrs, X_lcl, impl_scalar_type(beta), Y_lcl);
5101  }
5102  else
5103  {
5104  KokkosSparse::spmv(
5105  &applyHelper->handle, modeKK,
5106  impl_scalar_type(alpha), A_lcl, X_lcl, impl_scalar_type(beta), Y_lcl);
5107  }
5108  }
5109 
// Public apply entry point: forwards to applyNonTranspose when
// mode == Teuchos::NO_TRANS and to applyTranspose for every other
// mode, each wrapped in its own profiling region.
// NOTE(review): the class qualifier, the method name and the X / Y
// parameter lines are absent from this listing.
//
// Throws std::runtime_error if the matrix is not fill complete.
5110  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5111  void
5115  Teuchos::ETransp mode,
5116  Scalar alpha,
5117  Scalar beta) const
5118  {
5120  const char fnName[] = "Tpetra::CrsMatrix::apply";
5121 
5122  TEUCHOS_TEST_FOR_EXCEPTION
5123  (! isFillComplete (), std::runtime_error,
5124  fnName << ": Cannot call apply() until fillComplete() "
5125  "has been called.");
5126 
5127  if (mode == Teuchos::NO_TRANS) {
5128  ProfilingRegion regionNonTranspose (fnName);
5129  this->applyNonTranspose (X, Y, alpha, beta);
5130  }
5131  else {
      // Covers every mode other than NO_TRANS; `mode` is passed along
      // so the callee can act on it.
5132  ProfilingRegion regionTranspose ("Tpetra::CrsMatrix::apply (transpose)");
5133  this->applyTranspose (X, Y, mode, alpha, beta);
5134  }
5135  }
5136 
5137 
// Returns a new, fill-complete CrsMatrix whose scalar type is T.
// The new matrix is constructed from this matrix's graph
// (getCrsGraph()), its values are filled from this matrix's local
// device values via copyConvert, and fillComplete is called with this
// matrix's domain and range Maps.
//
// Throws std::runtime_error if this matrix is not fill complete.
5138  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5139  template<class T>
5140  Teuchos::RCP<CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> >
5142  convert () const
5143  {
5144  using Teuchos::RCP;
5145  typedef CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> output_matrix_type;
5146  const char tfecfFuncName[] = "convert: ";
5147 
5148  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5149  (! this->isFillComplete (), std::runtime_error, "This matrix (the source "
5150  "of the conversion) is not fill complete. You must first call "
5151  "fillComplete() (possibly with the domain and range Map) without an "
5152  "intervening call to resumeFill(), before you may call this method.");
5153 
5154  RCP<output_matrix_type> newMatrix
5155  (new output_matrix_type (this->getCrsGraph ()));
5156  // Copy old values into new values. impl_scalar_type and T may
5157  // differ, so we can't use Kokkos::deep_copy.
      // NOTE(review): listing line 5158 is missing here; presumably it
      // brings copyConvert into scope -- confirm against the real file.
5159  copyConvert (newMatrix->getLocalMatrixDevice ().values,
5160  this->getLocalMatrixDevice ().values);
5161  // Since newMatrix has a static (const) graph, the graph already has
5162  // a column Map, and Import and Export objects already exist (if
5163  // applicable). Thus, calling fillComplete is cheap.
5164  newMatrix->fillComplete (this->getDomainMap (), this->getRangeMap ());
5165 
5166  return newMatrix;
5167  }
5168 
5169 
// Debug-only consistency check of the matrix's internal state.
// Does nothing unless Behavior::debug("CrsGraph") is true; otherwise
// each violated invariant below raises std::logic_error.
// NOTE(review): the class qualifier and method-name lines are absent
// from this listing; the name comes from tfecfFuncName below.
// NOTE(review): the debug flag is queried under the "CrsGraph" key
// rather than a matrix-specific one -- confirm that this is intended.
5170  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5171  void
5174  {
5175  const bool debug = ::Tpetra::Details::Behavior::debug ("CrsGraph");
5176  if (debug) {
5177  const char tfecfFuncName[] = "checkInternalState: ";
5178  const char err[] = "Internal state is not consistent. "
5179  "Please report this bug to the Tpetra developers.";
5180 
5181  // This version of the graph (RCP<const crs_graph_type>) must
5182  // always be nonnull.
5183  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5184  (staticGraph_.is_null (), std::logic_error, err);
5185  // myGraph == null means that the matrix has a const ("static")
5186  // graph. Otherwise, the matrix has a dynamic graph (it owns its
5187  // graph).
      // When myGraph_ is set, it must be the same graph as staticGraph_.
5188  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5189  (! myGraph_.is_null () && myGraph_ != staticGraph_,
5190  std::logic_error, err);
5191  // if matrix is fill complete, then graph must be fill complete
5192  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5193  (isFillComplete () && ! staticGraph_->isFillComplete (),
5194  std::logic_error, err << " Specifically, the matrix is fill complete, "
5195  "but its graph is NOT fill complete.");
5196  // if values are allocated and they are non-zero in number, then
5197  // one of the allocations should be present
      // Concretely: allocated indices, a nonzero allocation size and at
      // least one local row, but an empty valuesUnpacked_wdv, is an error.
5198  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5199  (staticGraph_->indicesAreAllocated () &&
5200  staticGraph_->getLocalAllocationSize() > 0 &&
5201  staticGraph_->getLocalNumRows() > 0 &&
5202  valuesUnpacked_wdv.extent (0) == 0,
5203  std::logic_error, err);
5204  }
5205  }
5206 
// Builds and returns a one-line, brace-delimited summary string of
// the matrix: optional object label, fill-complete status and global
// dimensions, plus the global entry count when fill complete.
// NOTE(review): the class qualifier and method-name lines are absent
// from this listing; presumably this is description() -- confirm.
5207  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5208  std::string
5211  {
5212  std::ostringstream os;
5213 
5214  os << "Tpetra::CrsMatrix (Kokkos refactor): {";
      // The label is emitted only when it is non-empty.
5215  if (this->getObjectLabel () != "") {
5216  os << "Label: \"" << this->getObjectLabel () << "\", ";
5217  }
5218  if (isFillComplete ()) {
5219  os << "isFillComplete: true"
5220  << ", global dimensions: [" << getGlobalNumRows () << ", "
5221  << getGlobalNumCols () << "]"
5222  << ", global number of entries: " << getGlobalNumEntries ()
5223  << "}";
5224  }
5225  else {
      // The global entry count is not printed in this branch.
5226  os << "isFillComplete: false"
5227  << ", global dimensions: [" << getGlobalNumRows () << ", "
5228  << getGlobalNumCols () << "]}";
5229  }
5230  return os.str ();
5231  }
5232 
// Prints a description of the matrix to `out` at the requested
// verbosity. VERB_DEFAULT is treated as VERB_LOW and VERB_NONE prints
// nothing. Output grows with the level:
//   low:     header, template parameters, fill state and global
//            dimensions, printed by rank 0 only;
//   medium:  additionally the row/column/domain/range Maps and
//            per-process entry counts;
//   high:    additionally one line per local row;
//   extreme: additionally the (global column index, value) pairs of
//            each row.
// NOTE(review): at medium and above every process executes
// comm->barrier() inside the per-rank loops, so this presumably has
// to be called on all processes of the communicator with the same
// verbosity -- confirm.
5233  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5234  void
5236  describe (Teuchos::FancyOStream &out,
5237  const Teuchos::EVerbosityLevel verbLevel) const
5238  {
5239  using std::endl;
5240  using std::setw;
5241  using Teuchos::ArrayView;
5242  using Teuchos::Comm;
5243  using Teuchos::RCP;
5244  using Teuchos::TypeNameTraits;
5245  using Teuchos::VERB_DEFAULT;
5246  using Teuchos::VERB_NONE;
5247  using Teuchos::VERB_LOW;
5248  using Teuchos::VERB_MEDIUM;
5249  using Teuchos::VERB_HIGH;
5250  using Teuchos::VERB_EXTREME;
5251 
5252  const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
5253 
5254  if (vl == VERB_NONE) {
5255  return; // Don't print anything at all
5256  }
5257 
5258  // By convention, describe() always begins with a tab.
5259  Teuchos::OSTab tab0 (out);
5260 
5261  RCP<const Comm<int> > comm = this->getComm();
5262  const int myRank = comm->getRank();
5263  const int numProcs = comm->getSize();
      // Column width for the per-row table: the number of decimal
      // digits of the global row count, but at least 11, plus 2.
5264  size_t width = 1;
5265  for (size_t dec=10; dec<getGlobalNumRows(); dec *= 10) {
5266  ++width;
5267  }
5268  width = std::max<size_t> (width, static_cast<size_t> (11)) + 2;
5269 
5270  // none: print nothing
5271  // low: print O(1) info from node 0
5272  // medium: print O(P) info, num entries per process
5273  // high: print O(N) info, num entries per row
5274  // extreme: print O(NNZ) info: print indices and values
5275  //
5276  // for medium and higher, print constituent objects at specified verbLevel
5277  if (myRank == 0) {
5278  out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
5279  }
5280  Teuchos::OSTab tab1 (out);
5281 
5282  if (myRank == 0) {
5283  if (this->getObjectLabel () != "") {
5284  out << "Label: \"" << this->getObjectLabel () << "\", ";
5285  }
5286  {
5287  out << "Template parameters:" << endl;
5288  Teuchos::OSTab tab2 (out);
5289  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
5290  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
5291  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
5292  << "Node: " << TypeNameTraits<Node>::name () << endl;
5293  }
5294  if (isFillComplete()) {
5295  out << "isFillComplete: true" << endl
5296  << "Global dimensions: [" << getGlobalNumRows () << ", "
5297  << getGlobalNumCols () << "]" << endl
5298  << "Global number of entries: " << getGlobalNumEntries () << endl
5299  << endl << "Global max number of entries in a row: "
5300  << getGlobalMaxNumRowEntries () << endl;
5301  }
5302  else {
5303  out << "isFillComplete: false" << endl
5304  << "Global dimensions: [" << getGlobalNumRows () << ", "
5305  << getGlobalNumCols () << "]" << endl;
5306  }
5307  }
5308 
5309  if (vl < VERB_MEDIUM) {
5310  return; // all done!
5311  }
5312 
      // In the Map sections below, only rank 0 prints the headings,
      // while each non-null Map's describe() is invoked on every rank.
5313  // Describe the row Map.
5314  if (myRank == 0) {
5315  out << endl << "Row Map:" << endl;
5316  }
5317  if (getRowMap ().is_null ()) {
5318  if (myRank == 0) {
5319  out << "null" << endl;
5320  }
5321  }
5322  else {
5323  if (myRank == 0) {
5324  out << endl;
5325  }
5326  getRowMap ()->describe (out, vl);
5327  }
5328 
5329  // Describe the column Map.
      // A Map that compares equal (as an RCP) to one already handled is
      // reported by reference instead of being described again.
5330  if (myRank == 0) {
5331  out << "Column Map: ";
5332  }
5333  if (getColMap ().is_null ()) {
5334  if (myRank == 0) {
5335  out << "null" << endl;
5336  }
5337  } else if (getColMap () == getRowMap ()) {
5338  if (myRank == 0) {
5339  out << "same as row Map" << endl;
5340  }
5341  } else {
5342  if (myRank == 0) {
5343  out << endl;
5344  }
5345  getColMap ()->describe (out, vl);
5346  }
5347 
5348  // Describe the domain Map.
5349  if (myRank == 0) {
5350  out << "Domain Map: ";
5351  }
5352  if (getDomainMap ().is_null ()) {
5353  if (myRank == 0) {
5354  out << "null" << endl;
5355  }
5356  } else if (getDomainMap () == getRowMap ()) {
5357  if (myRank == 0) {
5358  out << "same as row Map" << endl;
5359  }
5360  } else if (getDomainMap () == getColMap ()) {
5361  if (myRank == 0) {
5362  out << "same as column Map" << endl;
5363  }
5364  } else {
5365  if (myRank == 0) {
5366  out << endl;
5367  }
5368  getDomainMap ()->describe (out, vl);
5369  }
5370 
5371  // Describe the range Map.
5372  if (myRank == 0) {
5373  out << "Range Map: ";
5374  }
5375  if (getRangeMap ().is_null ()) {
5376  if (myRank == 0) {
5377  out << "null" << endl;
5378  }
5379  } else if (getRangeMap () == getDomainMap ()) {
5380  if (myRank == 0) {
5381  out << "same as domain Map" << endl;
5382  }
5383  } else if (getRangeMap () == getRowMap ()) {
5384  if (myRank == 0) {
5385  out << "same as row Map" << endl;
5386  }
5387  } else {
5388  if (myRank == 0) {
5389  out << endl;
5390  }
5391  getRangeMap ()->describe (out, vl);
5392  }
5393 
5394  // O(P) data
      // Ranks take turns: on each pass only the rank equal to curRank
      // prints, then all ranks hit the barriers.
5395  for (int curRank = 0; curRank < numProcs; ++curRank) {
5396  if (myRank == curRank) {
5397  out << "Process rank: " << curRank << endl;
5398  Teuchos::OSTab tab2 (out);
5399  if (! staticGraph_->indicesAreAllocated ()) {
5400  out << "Graph indices not allocated" << endl;
5401  }
5402  else {
5403  out << "Number of allocated entries: "
5404  << staticGraph_->getLocalAllocationSize () << endl;
5405  }
5406  out << "Number of entries: " << getLocalNumEntries () << endl
5407  << "Max number of entries per row: " << getLocalMaxNumRowEntries ()
5408  << endl;
5409  }
5410  // Give output time to complete by executing some barriers.
5411  comm->barrier ();
5412  comm->barrier ();
5413  comm->barrier ();
5414  }
5415 
5416  if (vl < VERB_HIGH) {
5417  return; // all done!
5418  }
5419 
5420  // O(N) and O(NNZ) data
5421  for (int curRank = 0; curRank < numProcs; ++curRank) {
5422  if (myRank == curRank) {
5423  out << std::setw(width) << "Proc Rank"
5424  << std::setw(width) << "Global Row"
5425  << std::setw(width) << "Num Entries";
5426  if (vl == VERB_EXTREME) {
5427  out << std::setw(width) << "(Index,Value)";
5428  }
5429  out << endl;
5430  for (size_t r = 0; r < getLocalNumRows (); ++r) {
5431  const size_t nE = getNumEntriesInLocalRow(r);
5432  GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
5433  out << std::setw(width) << myRank
5434  << std::setw(width) << gid
5435  << std::setw(width) << nE;
5436  if (vl == VERB_EXTREME) {
      // Global column indices are printed in both branches; a locally
      // indexed row is translated through the column Map first.
5437  if (isGloballyIndexed()) {
5438  global_inds_host_view_type rowinds;
5439  values_host_view_type rowvals;
5440  getGlobalRowView (gid, rowinds, rowvals);
5441  for (size_t j = 0; j < nE; ++j) {
5442  out << " (" << rowinds[j]
5443  << ", " << rowvals[j]
5444  << ") ";
5445  }
5446  }
5447  else if (isLocallyIndexed()) {
5448  local_inds_host_view_type rowinds;
5449  values_host_view_type rowvals;
5450  getLocalRowView (r, rowinds, rowvals);
5451  for (size_t j=0; j < nE; ++j) {
5452  out << " (" << getColMap()->getGlobalElement(rowinds[j])
5453  << ", " << rowvals[j]
5454  << ") ";
5455  }
5456  } // globally or locally indexed
5457  } // vl == VERB_EXTREME
5458  out << endl;
5459  } // for each row r on this process
5460  } // if (myRank == curRank)
5461 
5462  // Give output time to complete
5463  comm->barrier ();
5464  comm->barrier ();
5465  comm->barrier ();
5466  } // for each process p
5467  }
5468 
// Returns true exactly when `source` can be dynamic_cast to
// row_matrix_type; no dimension or size comparison is performed.
// NOTE(review): the class qualifier, the method name and the
// parameter line declaring `source` are absent from this listing;
// presumably this is checkSizes() -- confirm.
5469  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5470  bool
5473  {
5474  // It's not clear what kind of compatibility checks on sizes can
5475  // be performed here. Epetra_CrsGraph doesn't check any sizes for
5476  // compatibility.
5477 
5478  // Currently, the source object must be a RowMatrix with the same
5479  // four template parameters as the target CrsMatrix. We might
5480  // relax this requirement later.
5481  const row_matrix_type* srcRowMat =
5482  dynamic_cast<const row_matrix_type*> (&source);
5483  return (srcRowMat != nullptr);
5484  }
5485 
// Applies `padding` to the matrix's unpacked storage through
// Details::padCrsArrays.
// NOTE(review): the class qualifier and method-name lines are absent
// from this listing; the name comes from tfecfFuncName below.
//
// Steps, as implemented below:
//   1. If the graph's indices are not yet allocated, allocateValues()
//      is called.
//   2. The graph's unpacked row offsets are copied into a fresh device
//      array (row_ptr_beg) and a matching end-of-row array
//      (row_ptr_end) is filled, from k_numRowEntries_ when that array
//      is non-empty and from the next row's offset otherwise.
//   3. padCrsArrays() is called on the global or local index array
//      together with valuesUnpacked_wdv; afterwards the index and value
//      arrays must have equal length, else std::logic_error is thrown.
//   4. k_numRowEntries_ is recomputed (when it was in use) and the new
//      row offsets are installed with setRowPtrsUnpacked().
//
// When `verbose` is true, progress messages are written to std::cerr.
5486  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5487  void
5490  const typename crs_graph_type::padding_type& padding,
5491  const bool verbose)
5492  {
5494  using Details::padCrsArrays;
5495  using std::endl;
5496  using LO = local_ordinal_type;
5497  using row_ptrs_type =
5498  typename local_graph_device_type::row_map_type::non_const_type;
5499  using range_policy =
5500  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
5501  const char tfecfFuncName[] = "applyCrsPadding";
5502  const char suffix[] =
5503  ". Please report this bug to the Tpetra developers.";
5504  ProfilingRegion regionCAP("Tpetra::CrsMatrix::applyCrsPadding");
5505 
5506  std::unique_ptr<std::string> prefix;
5507  if (verbose) {
5508  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
5509  std::ostringstream os;
5510  os << *prefix << "padding: ";
5511  padding.print(os);
5512  os << endl;
5513  std::cerr << os.str();
5514  }
      // The rank is only looked up in verbose mode; it is -1 otherwise,
      // or when the Map or its communicator is null.
5515  const int myRank = ! verbose ? -1 : [&] () {
5516  auto map = this->getMap();
5517  if (map.is_null()) {
5518  return -1;
5519  }
5520  auto comm = map->getComm();
5521  if (comm.is_null()) {
5522  return -1;
5523  }
5524  return comm->getRank();
5525  } ();
5526 
5527  // NOTE (mfh 29 Jan 2020) This allocates the values array.
      // NOTE(review): the verbose message says "Call allocateIndices",
      // but the call made here is allocateValues -- confirm whether the
      // message is stale.
5528  if (! myGraph_->indicesAreAllocated()) {
5529  if (verbose) {
5530  std::ostringstream os;
5531  os << *prefix << "Call allocateIndices" << endl;
5532  std::cerr << os.str();
5533  }
5534  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
5535  }
5536 
5537  // FIXME (mfh 10 Feb 2020) We shouldn't actually reallocate
5538  // row_ptrs_beg or allocate row_ptrs_end unless the allocation
5539  // size needs to increase. That should be the job of
5540  // padCrsArrays.
5541 
5542  // Making copies here because rowPtrsUnpacked_ has a const type. Otherwise, we
5543  // would use it directly.
5544 
5545  if (verbose) {
5546  std::ostringstream os;
5547  os << *prefix << "Allocate row_ptrs_beg: "
5548  << myGraph_->getRowPtrsUnpackedHost().extent(0) << endl;
5549  std::cerr << os.str();
5550  }
5551  using Kokkos::view_alloc;
5552  using Kokkos::WithoutInitializing;
5553  row_ptrs_type row_ptr_beg(view_alloc("row_ptr_beg", WithoutInitializing),
5554  myGraph_->rowPtrsUnpacked_dev_.extent(0));
5555  // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
5556  Kokkos::deep_copy(execution_space(),row_ptr_beg, myGraph_->rowPtrsUnpacked_dev_);
5557 
      // N is the number of rows: one less than the offset array's
      // length, or 0 when that array is empty.
5558  const size_t N = row_ptr_beg.extent(0) == 0 ? size_t(0) :
5559  size_t(row_ptr_beg.extent(0) - 1);
5560  if (verbose) {
5561  std::ostringstream os;
5562  os << *prefix << "Allocate row_ptrs_end: " << N << endl;
5563  std::cerr << os.str();
5564  }
5565  row_ptrs_type row_ptr_end(
5566  view_alloc("row_ptr_end", WithoutInitializing), N);
5567 
5568  row_ptrs_type num_row_entries_d;
5569 
      // A non-empty k_numRowEntries_ is taken to mean unpacked storage;
      // in that case it has to be refreshed after padding (see below).
5570  const bool refill_num_row_entries =
5571  myGraph_->k_numRowEntries_.extent(0) != 0;
5572 
5573  if (refill_num_row_entries) { // unpacked storage
5574  // We can't assume correct *this capture until C++17, and it's
5575  // likely more efficient just to capture what we need anyway.
5576  num_row_entries_d = create_mirror_view_and_copy(memory_space(),
5577  myGraph_->k_numRowEntries_);
5578  Kokkos::parallel_for
5579  ("Fill end row pointers", range_policy(0, N),
5580  KOKKOS_LAMBDA (const size_t i) {
5581  row_ptr_end(i) = row_ptr_beg(i) + num_row_entries_d(i);
5582  });
5583  }
5584  else {
5585  // FIXME (mfh 04 Feb 2020) Fix padCrsArrays so that if packed
5586  // storage, we don't need row_ptr_end to be separate allocation;
5587  // could just have it alias row_ptr_beg+1.
5588  Kokkos::parallel_for
5589  ("Fill end row pointers", range_policy(0, N),
5590  KOKKOS_LAMBDA (const size_t i) {
5591  row_ptr_end(i) = row_ptr_beg(i+1);
5592  });
5593  }
5594 
      // Pad whichever index array the graph is currently using, along
      // with the values; the two must end up the same length.
5595  if (myGraph_->isGloballyIndexed()) {
5596  padCrsArrays(row_ptr_beg, row_ptr_end,
5597  myGraph_->gblInds_wdv,
5598  valuesUnpacked_wdv, padding, myRank, verbose);
5599  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5600  const auto newColIndsLen = myGraph_->gblInds_wdv.extent(0);
5601  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5602  (newValuesLen != newColIndsLen, std::logic_error,
5603  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5604  << " != myGraph_->gblInds_wdv.extent(0)=" << newColIndsLen
5605  << suffix);
5606  }
5607  else {
5608  padCrsArrays(row_ptr_beg, row_ptr_end,
5609  myGraph_->lclIndsUnpacked_wdv,
5610  valuesUnpacked_wdv, padding, myRank, verbose);
5611  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5612  const auto newColIndsLen = myGraph_->lclIndsUnpacked_wdv.extent(0);
5613  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5614  (newValuesLen != newColIndsLen, std::logic_error,
5615  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5616  << " != myGraph_->lclIndsUnpacked_wdv.extent(0)=" << newColIndsLen
5617  << suffix);
5618  }
5619 
      // Recompute the per-row entry counts from the row begin/end
      // arrays and copy them back into the graph.
5620  if (refill_num_row_entries) {
5621  Kokkos::parallel_for
5622  ("Fill num entries", range_policy(0, N),
5623  KOKKOS_LAMBDA (const size_t i) {
5624  num_row_entries_d(i) = row_ptr_end(i) - row_ptr_beg(i);
5625  });
5626  Kokkos::deep_copy(myGraph_->k_numRowEntries_, num_row_entries_d);
5627  }
5628 
      // NOTE(review): the size assertion below sits inside the verbose
      // branch, so it is only evaluated when `verbose` is true.
5629  if (verbose) {
5630  std::ostringstream os;
5631  os << *prefix << "Assign myGraph_->rowPtrsUnpacked_; "
5632  << "old size: " << myGraph_->rowPtrsUnpacked_host_.extent(0)
5633  << ", new size: " << row_ptr_beg.extent(0) << endl;
5634  std::cerr << os.str();
5635  TEUCHOS_ASSERT( myGraph_->getRowPtrsUnpackedHost().extent(0) ==
5636  row_ptr_beg.extent(0) );
5637  }
5638  myGraph_->setRowPtrsUnpacked(row_ptr_beg);
5639  }
5640 
5641  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5642  void
5643  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5644  copyAndPermuteStaticGraph(
5645  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5646  const size_t numSameIDs,
5647  const LocalOrdinal permuteToLIDs[],
5648  const LocalOrdinal permuteFromLIDs[],
5649  const size_t numPermutes)
5650  {
5651  using Details::ProfilingRegion;
5652  using Teuchos::Array;
5653  using Teuchos::ArrayView;
5654  using std::endl;
5655  using LO = LocalOrdinal;
5656  using GO = GlobalOrdinal;
5657  const char tfecfFuncName[] = "copyAndPermuteStaticGraph";
5658  const char suffix[] =
5659  " Please report this bug to the Tpetra developers.";
5660  ProfilingRegion regionCAP
5661  ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph");
5662 
5663  const bool debug = Details::Behavior::debug("CrsGraph");
5664  const bool verbose = Details::Behavior::verbose("CrsGraph");
5665  std::unique_ptr<std::string> prefix;
5666  if (verbose) {
5667  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5668  std::ostringstream os;
5669  os << *prefix << "Start" << endl;
5670  }
5671  const char* const prefix_raw =
5672  verbose ? prefix.get()->c_str() : nullptr;
5673 
5674  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5675  //
5676  // Copy the first numSame row from source to target (this matrix).
5677  // This involves copying rows corresponding to LIDs [0, numSame-1].
5678  //
5679  const map_type& srcRowMap = * (srcMat.getRowMap ());
5680  nonconst_global_inds_host_view_type rowInds;
5681  nonconst_values_host_view_type rowVals;
5682  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5683  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5684  // Global ID for the current row index in the source matrix.
5685  // The first numSameIDs GIDs in the two input lists are the
5686  // same, so sourceGID == targetGID in this case.
5687  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5688  const GO targetGID = sourceGID;
5689 
5690  ArrayView<const GO>rowIndsConstView;
5691  ArrayView<const Scalar> rowValsConstView;
5692 
5693  if (sourceIsLocallyIndexed) {
5694  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5695  if (rowLength > static_cast<size_t> (rowInds.size())) {
5696  Kokkos::resize(rowInds,rowLength);
5697  Kokkos::resize(rowVals,rowLength);
5698  }
5699  // Resizing invalidates an Array's views, so we must make new
5700  // ones, even if rowLength hasn't changed.
5701  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5702  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5703 
5704  // The source matrix is locally indexed, so we have to get a
5705  // copy. Really it's the GIDs that have to be copied (because
5706  // they have to be converted from LIDs).
5707  size_t checkRowLength = 0;
5708  srcMat.getGlobalRowCopy (sourceGID, rowIndsView,
5709  rowValsView, checkRowLength);
5710  if (debug) {
5711  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5712  (rowLength != checkRowLength, std::logic_error, "For "
5713  "global row index " << sourceGID << ", the source "
5714  "matrix's getNumEntriesInGlobalRow returns a row length "
5715  "of " << rowLength << ", but getGlobalRowCopy reports "
5716  "a row length of " << checkRowLength << "." << suffix);
5717  }
5718 
5719  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5720  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5721  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5722  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5723  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5724  rowIndsView.data(), rowIndsView.extent(0),
5725  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5726  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5727  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5728  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5729  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5730  // KDDKDD UVM TEMPORARY: KokkosView interface
5731  }
5732  else { // source matrix is globally indexed.
5733  global_inds_host_view_type rowIndsView;
5734  values_host_view_type rowValsView;
5735  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5736  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5737  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5738  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5739  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5740  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5741  rowIndsView.data(), rowIndsView.extent(0),
5742  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5743  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5744  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5745  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5746  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5747  // KDDKDD UVM TEMPORARY: KokkosView interface
5748 
5749  }
5750 
5751  // Applying a permutation to a matrix with a static graph
5752  // means REPLACE-ing entries.
5753  combineGlobalValues(targetGID, rowIndsConstView,
5754  rowValsConstView, REPLACE,
5755  prefix_raw, debug, verbose);
5756  }
5757 
5758  if (verbose) {
5759  std::ostringstream os;
5760  os << *prefix << "Do permutes" << endl;
5761  }
5762 
5763  const map_type& tgtRowMap = * (this->getRowMap ());
5764  for (size_t p = 0; p < numPermutes; ++p) {
5765  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5766  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5767 
5768  ArrayView<const GO> rowIndsConstView;
5769  ArrayView<const Scalar> rowValsConstView;
5770 
5771  if (sourceIsLocallyIndexed) {
5772  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5773  if (rowLength > static_cast<size_t> (rowInds.size ())) {
5774  Kokkos::resize(rowInds,rowLength);
5775  Kokkos::resize(rowVals,rowLength);
5776  }
5777  // Resizing invalidates an Array's views, so we must make new
5778  // ones, even if rowLength hasn't changed.
5779  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5780  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5781 
5782  // The source matrix is locally indexed, so we have to get a
5783  // copy. Really it's the GIDs that have to be copied (because
5784  // they have to be converted from LIDs).
5785  size_t checkRowLength = 0;
5786  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5787  rowValsView, checkRowLength);
5788  if (debug) {
5789  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5790  (rowLength != checkRowLength, std::logic_error, "For "
5791  "source matrix global row index " << sourceGID << ", "
5792  "getNumEntriesInGlobalRow returns a row length of " <<
5793  rowLength << ", but getGlobalRowCopy a row length of "
5794  << checkRowLength << "." << suffix);
5795  }
5796 
5797  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5798  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5799  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5800  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5801  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5802  rowIndsView.data(), rowIndsView.extent(0),
5803  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5804  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5805  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5806  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5807  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5808  // KDDKDD UVM TEMPORARY: KokkosView interface
5809  }
5810  else {
5811  global_inds_host_view_type rowIndsView;
5812  values_host_view_type rowValsView;
5813  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5814  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5815  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5816  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5817  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5818  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5819  rowIndsView.data(), rowIndsView.extent(0),
5820  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5821  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5822  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5823  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5824  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5825  // KDDKDD UVM TEMPORARY: KokkosView interface
5826  }
5827 
5828  combineGlobalValues(targetGID, rowIndsConstView,
5829  rowValsConstView, REPLACE,
5830  prefix_raw, debug, verbose);
5831  }
5832 
5833  if (verbose) {
5834  std::ostringstream os;
5835  os << *prefix << "Done" << endl;
5836  }
5837  }
5838 
5839  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5840  void
5841  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5842  copyAndPermuteNonStaticGraph(
5843  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5844  const size_t numSameIDs,
5845  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs_dv,
5846  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs_dv,
5847  const size_t numPermutes)
5848  {
5849  using Details::ProfilingRegion;
5850  using Teuchos::Array;
5851  using Teuchos::ArrayView;
5852  using std::endl;
5853  using LO = LocalOrdinal;
5854  using GO = GlobalOrdinal;
5855  const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph";
5856  const char suffix[] =
5857  " Please report this bug to the Tpetra developers.";
5858  ProfilingRegion regionCAP
5859  ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph");
5860 
5861  const bool debug = Details::Behavior::debug("CrsGraph");
5862  const bool verbose = Details::Behavior::verbose("CrsGraph");
5863  std::unique_ptr<std::string> prefix;
5864  if (verbose) {
5865  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5866  std::ostringstream os;
5867  os << *prefix << "Start" << endl;
5868  }
5869  const char* const prefix_raw =
5870  verbose ? prefix.get()->c_str() : nullptr;
5871 
5872  {
5873  using row_graph_type = RowGraph<LO, GO, Node>;
5874  const row_graph_type& srcGraph = *(srcMat.getGraph());
5875  auto padding =
5876  myGraph_->computeCrsPadding(srcGraph, numSameIDs,
5877  permuteToLIDs_dv, permuteFromLIDs_dv, verbose);
5878  applyCrsPadding(*padding, verbose);
5879  }
5880  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5881  //
5882  // Copy the first numSame row from source to target (this matrix).
5883  // This involves copying rows corresponding to LIDs [0, numSame-1].
5884  //
5885  const map_type& srcRowMap = * (srcMat.getRowMap ());
5886  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5887  using gids_type = nonconst_global_inds_host_view_type;
5888  using vals_type = nonconst_values_host_view_type;
5889  gids_type rowInds;
5890  vals_type rowVals;
5891  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5892  // Global ID for the current row index in the source matrix.
5893  // The first numSameIDs GIDs in the two input lists are the
5894  // same, so sourceGID == targetGID in this case.
5895  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5896  const GO targetGID = sourceGID;
5897 
5898  ArrayView<const GO> rowIndsConstView;
5899  ArrayView<const Scalar> rowValsConstView;
5900 
5901  if (sourceIsLocallyIndexed) {
5902 
5903  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5904  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5905  Kokkos::resize(rowInds,rowLength);
5906  Kokkos::resize(rowVals,rowLength);
5907  }
5908  // Resizing invalidates an Array's views, so we must make new
5909  // ones, even if rowLength hasn't changed.
5910  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5911  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5912 
5913  // The source matrix is locally indexed, so we have to get a
5914  // copy. Really it's the GIDs that have to be copied (because
5915  // they have to be converted from LIDs).
5916  size_t checkRowLength = 0;
5917  srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView,
5918  checkRowLength);
5919  if (debug) {
5920  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5921  (rowLength != checkRowLength, std::logic_error, ": For "
5922  "global row index " << sourceGID << ", the source "
5923  "matrix's getNumEntriesInGlobalRow returns a row length "
5924  "of " << rowLength << ", but getGlobalRowCopy reports "
5925  "a row length of " << checkRowLength << "." << suffix);
5926  }
5927  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5928  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
5929  }
5930  else { // source matrix is globally indexed.
5931  global_inds_host_view_type rowIndsView;
5932  values_host_view_type rowValsView;
5933  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5934 
5935  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5936  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5937  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5938  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5939  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5940  rowIndsView.data(), rowIndsView.extent(0),
5941  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5942  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5943  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5944  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5945  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5946  // KDDKDD UVM TEMPORARY: KokkosView interface
5947  }
5948 
5949  // Combine the data into the target matrix.
5950  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5951  rowValsConstView, prefix_raw, debug, verbose);
5952  }
5953 
5954  if (verbose) {
5955  std::ostringstream os;
5956  os << *prefix << "Do permutes" << endl;
5957  }
5958  const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data();
5959  const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data();
5960 
5961  const map_type& tgtRowMap = * (this->getRowMap ());
5962  for (size_t p = 0; p < numPermutes; ++p) {
5963  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5964  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5965 
5966  ArrayView<const GO> rowIndsConstView;
5967  ArrayView<const Scalar> rowValsConstView;
5968 
5969  if (sourceIsLocallyIndexed) {
5970  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5971  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5972  Kokkos::resize(rowInds,rowLength);
5973  Kokkos::resize(rowVals,rowLength);
5974  }
5975  // Resizing invalidates an Array's views, so we must make new
5976  // ones, even if rowLength hasn't changed.
5977  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5978  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5979 
5980  // The source matrix is locally indexed, so we have to get a
5981  // copy. Really it's the GIDs that have to be copied (because
5982  // they have to be converted from LIDs).
5983  size_t checkRowLength = 0;
5984  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5985  rowValsView, checkRowLength);
5986  if (debug) {
5987  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5988  (rowLength != checkRowLength, std::logic_error, "For "
5989  "source matrix global row index " << sourceGID << ", "
5990  "getNumEntriesInGlobalRow returns a row length of " <<
5991  rowLength << ", but getGlobalRowCopy a row length of "
5992  << checkRowLength << "." << suffix);
5993  }
5994  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5995  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
5996  }
5997  else {
5998  global_inds_host_view_type rowIndsView;
5999  values_host_view_type rowValsView;
6000  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
6001 
6002  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
6003  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
6004  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
6005  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
6006  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
6007  rowIndsView.data(), rowIndsView.extent(0),
6008  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6009  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
6010  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
6011  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6012  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6013  // KDDKDD UVM TEMPORARY: KokkosView interface
6014  }
6015 
6016  // Combine the data into the target matrix.
6017  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
6018  rowValsConstView, prefix_raw, debug, verbose);
6019  }
6020 
6021  if (verbose) {
6022  std::ostringstream os;
6023  os << *prefix << "Done" << endl;
6024  }
6025  }
6026 
6027  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6028  void
6031  const SrcDistObject& srcObj,
6032  const size_t numSameIDs,
6033  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
6034  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
6035  const CombineMode /*CM*/)
6036  {
6037  using Details::Behavior;
6040  using std::endl;
6041 
6042  // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
6043  const char tfecfFuncName[] = "copyAndPermute: ";
6044  ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermute");
6045 
6046  const bool verbose = Behavior::verbose("CrsMatrix");
6047  std::unique_ptr<std::string> prefix;
6048  if (verbose) {
6049  prefix = this->createPrefix("CrsMatrix", "copyAndPermute");
6050  std::ostringstream os;
6051  os << *prefix << endl
6052  << *prefix << " numSameIDs: " << numSameIDs << endl
6053  << *prefix << " numPermute: " << permuteToLIDs.extent(0)
6054  << endl
6055  << *prefix << " "
6056  << dualViewStatusToString (permuteToLIDs, "permuteToLIDs")
6057  << endl
6058  << *prefix << " "
6059  << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs")
6060  << endl
6061  << *prefix << " "
6062  << "isStaticGraph: " << (isStaticGraph() ? "true" : "false")
6063  << endl;
6064  std::cerr << os.str ();
6065  }
6066 
6067  const auto numPermute = permuteToLIDs.extent (0);
6068  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6069  (numPermute != permuteFromLIDs.extent (0),
6070  std::invalid_argument, "permuteToLIDs.extent(0) = "
6071  << numPermute << "!= permuteFromLIDs.extent(0) = "
6072  << permuteFromLIDs.extent (0) << ".");
6073 
6074  // This dynamic cast should succeed, because we've already tested
6075  // it in checkSizes().
6077  const RMT& srcMat = dynamic_cast<const RMT&> (srcObj);
6078  if (isStaticGraph ()) {
6079  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
6080  auto permuteToLIDs_h = permuteToLIDs.view_host ();
6081  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
6082  auto permuteFromLIDs_h = permuteFromLIDs.view_host ();
6083 
6084  copyAndPermuteStaticGraph(srcMat, numSameIDs,
6085  permuteToLIDs_h.data(),
6086  permuteFromLIDs_h.data(),
6087  numPermute);
6088  }
6089  else {
6090  copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs,
6091  permuteFromLIDs, numPermute);
6092  }
6093 
6094  if (verbose) {
6095  std::ostringstream os;
6096  os << *prefix << "Done" << endl;
6097  std::cerr << os.str();
6098  }
6099  }
6100 
6101  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6102  void
6105  (const SrcDistObject& source,
6106  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6107  Kokkos::DualView<char*, buffer_device_type>& exports,
6108  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6109  size_t& constantNumPackets)
6110  {
6111  using Details::Behavior;
6114  using Teuchos::outArg;
6115  using Teuchos::REDUCE_MAX;
6116  using Teuchos::reduceAll;
6117  using std::endl;
6118  typedef LocalOrdinal LO;
6119  typedef GlobalOrdinal GO;
6120  const char tfecfFuncName[] = "packAndPrepare: ";
6121  ProfilingRegion regionPAP ("Tpetra::CrsMatrix::packAndPrepare");
6122 
6123  const bool debug = Behavior::debug("CrsMatrix");
6124  const bool verbose = Behavior::verbose("CrsMatrix");
6125 
6126  // Processes on which the communicator is null should not participate.
6127  Teuchos::RCP<const Teuchos::Comm<int> > pComm = this->getComm ();
6128  if (pComm.is_null ()) {
6129  return;
6130  }
6131  const Teuchos::Comm<int>& comm = *pComm;
6132  const int myRank = comm.getSize ();
6133 
6134  std::unique_ptr<std::string> prefix;
6135  if (verbose) {
6136  prefix = this->createPrefix("CrsMatrix", "packAndPrepare");
6137  std::ostringstream os;
6138  os << *prefix << "Start" << endl
6139  << *prefix << " "
6140  << dualViewStatusToString (exportLIDs, "exportLIDs")
6141  << endl
6142  << *prefix << " "
6143  << dualViewStatusToString (exports, "exports")
6144  << endl
6145  << *prefix << " "
6146  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6147  << endl;
6148  std::cerr << os.str ();
6149  }
6150 
6151  // Attempt to cast the source object to CrsMatrix. If successful,
6152  // use the source object's packNew() method to pack its data for
6153  // communication. Otherwise, attempt to cast to RowMatrix; if
6154  // successful, use the source object's pack() method. Otherwise,
6155  // the source object doesn't have the right type.
6156  //
6157  // FIXME (mfh 30 Jun 2013, 11 Sep 2017) We don't even need the
6158  // RowMatrix to have the same Node type. Unfortunately, we don't
6159  // have a way to ask if the RowMatrix is "a RowMatrix with any
6160  // Node type," since RowMatrix doesn't have a base class. A
6161  // hypothetical RowMatrixBase<Scalar, LO, GO> class, which does
6162  // not currently exist, would satisfy this requirement.
6163  //
6164  // Why RowMatrixBase<Scalar, LO, GO>? The source object's Scalar
6165  // type doesn't technically need to match the target object's
6166  // Scalar type, so we could just have RowMatrixBase<LO, GO>. LO
6167  // and GO need not be the same, as long as there is no overflow of
6168  // the indices. However, checking for index overflow is global
6169  // and therefore undesirable.
6170 
6171  std::ostringstream msg; // for collecting error messages
6172  int lclBad = 0; // to be set below
6173 
6174  using crs_matrix_type = CrsMatrix<Scalar, LO, GO, Node>;
6175  const crs_matrix_type* srcCrsMat =
6176  dynamic_cast<const crs_matrix_type*> (&source);
6177  if (srcCrsMat != nullptr) {
6178  if (verbose) {
6179  std::ostringstream os;
6180  os << *prefix << "Source matrix same (CrsMatrix) type as target; "
6181  "calling packNew" << endl;
6182  std::cerr << os.str ();
6183  }
6184  try {
6185  srcCrsMat->packNew (exportLIDs, exports, numPacketsPerLID,
6186  constantNumPackets);
6187  }
6188  catch (std::exception& e) {
6189  lclBad = 1;
6190  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6191  }
6192  }
6193  else {
6194  using Kokkos::HostSpace;
6195  using Kokkos::subview;
6196  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6197  using range_type = Kokkos::pair<size_t, size_t>;
6198 
6199  if (verbose) {
6200  std::ostringstream os;
6201  os << *prefix << "Source matrix NOT same (CrsMatrix) type as target"
6202  << endl;
6203  std::cerr << os.str ();
6204  }
6205 
6206  const row_matrix_type* srcRowMat =
6207  dynamic_cast<const row_matrix_type*> (&source);
6208  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6209  (srcRowMat == nullptr, std::invalid_argument,
6210  "The source object of the Import or Export operation is neither a "
6211  "CrsMatrix (with the same template parameters as the target object), "
6212  "nor a RowMatrix (with the same first four template parameters as the "
6213  "target object).");
6214 
6215  // For the RowMatrix case, we need to convert from
6216  // Kokkos::DualView to Teuchos::Array*. This doesn't need to be
6217  // so terribly efficient, since packing a non-CrsMatrix
6218  // RowMatrix for Import/Export into a CrsMatrix is not a
6219  // critical case. Thus, we may allocate Teuchos::Array objects
6220  // here and copy to and from Kokkos::*View.
6221 
6222  // View exportLIDs's host data as a Teuchos::ArrayView.
6223  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6224  auto exportLIDs_h = exportLIDs.view_host ();
6225  Teuchos::ArrayView<const LO> exportLIDs_av (exportLIDs_h.data (),
6226  exportLIDs_h.size ());
6227 
6228  // pack() will allocate exports_a as needed. We'll copy back
6229  // into exports (after (re)allocating exports if needed) below.
6230  Teuchos::Array<char> exports_a;
6231 
6232  // View exportLIDs' host data as a Teuchos::ArrayView. We don't
6233  // need to sync, since we're doing write-only access, but we do
6234  // need to mark the DualView as modified on host.
6235 
6236  numPacketsPerLID.clear_sync_state (); // write-only access
6237  numPacketsPerLID.modify_host ();
6238  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6239  Teuchos::ArrayView<size_t> numPacketsPerLID_av (numPacketsPerLID_h.data (),
6240  numPacketsPerLID_h.size ());
6241 
6242  // Invoke RowMatrix's legacy pack() interface, using above
6243  // Teuchos::Array* objects.
6244  try {
6245  srcRowMat->pack (exportLIDs_av, exports_a, numPacketsPerLID_av,
6246  constantNumPackets);
6247  }
6248  catch (std::exception& e) {
6249  lclBad = 1;
6250  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6251  }
6252 
6253  // Allocate 'exports', and copy exports_a back into it.
6254  const size_t newAllocSize = static_cast<size_t> (exports_a.size ());
6255  if (static_cast<size_t> (exports.extent (0)) < newAllocSize) {
6256  const std::string oldLabel = exports.view_device().label ();
6257  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6258  exports = exports_type (newLabel, newAllocSize);
6259  }
6260  // It's safe to assume that we're working on host anyway, so
6261  // just keep exports sync'd to host.
6262  // ignore current device contents
6263  exports.modify_host();
6264 
6265  auto exports_h = exports.view_host ();
6266  auto exports_h_sub = subview (exports_h, range_type (0, newAllocSize));
6267 
6268  // Kokkos::deep_copy needs a Kokkos::View input, so turn
6269  // exports_a into a nonowning Kokkos::View first before copying.
6270  typedef typename exports_type::t_host::execution_space HES;
6271  typedef Kokkos::Device<HES, HostSpace> host_device_type;
6272  Kokkos::View<const char*, host_device_type>
6273  exports_a_kv (exports_a.getRawPtr (), newAllocSize);
6274  // DEEP_COPY REVIEW - NOT TESTED
6275  Kokkos::deep_copy (exports_h_sub, exports_a_kv);
6276  }
6277 
6278  if (debug) {
6279  int gblBad = 0; // output argument; to be set below
6280  reduceAll<int, int> (comm, REDUCE_MAX, lclBad, outArg (gblBad));
6281  if (gblBad != 0) {
6282  Tpetra::Details::gathervPrint (std::cerr, msg.str (), comm);
6283  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6284  (true, std::logic_error, "packNew() or pack() threw an exception on "
6285  "one or more participating processes.");
6286  }
6287  }
6288  else {
6289  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6290  (lclBad != 0, std::logic_error, "packNew threw an exception on one "
6291  "or more participating processes. Here is this process' error "
6292  "message: " << msg.str ());
6293  }
6294 
6295  if (verbose) {
6296  std::ostringstream os;
6297  os << *prefix << "packAndPrepare: Done!" << endl
6298  << *prefix << " "
6299  << dualViewStatusToString (exportLIDs, "exportLIDs")
6300  << endl
6301  << *prefix << " "
6302  << dualViewStatusToString (exports, "exports")
6303  << endl
6304  << *prefix << " "
6305  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6306  << endl;
6307  std::cerr << os.str ();
6308  }
6309  }
6310 
6311  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6312  size_t
6313  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6314  packRow (char exports[],
6315  const size_t offset,
6316  const size_t numEnt,
6317  const GlobalOrdinal gidsIn[],
6318  const impl_scalar_type valsIn[],
6319  const size_t numBytesPerValue) const
6320  {
6321  using Kokkos::View;
6322  using Kokkos::subview;
6324  typedef LocalOrdinal LO;
6325  typedef GlobalOrdinal GO;
6326  typedef impl_scalar_type ST;
6327 
6328  if (numEnt == 0) {
6329  // Empty rows always take zero bytes, to ensure sparsity.
6330  return 0;
6331  }
6332 
6333  const GO gid = 0; // packValueCount wants this
6334  const LO numEntLO = static_cast<size_t> (numEnt);
6335 
6336  const size_t numEntBeg = offset;
6337  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
6338  const size_t gidsBeg = numEntBeg + numEntLen;
6339  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6340  const size_t valsBeg = gidsBeg + gidsLen;
6341  const size_t valsLen = numEnt * numBytesPerValue;
6342 
6343  char* const numEntOut = exports + numEntBeg;
6344  char* const gidsOut = exports + gidsBeg;
6345  char* const valsOut = exports + valsBeg;
6346 
6347  size_t numBytesOut = 0;
6348  int errorCode = 0;
6349  numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
6350 
6351  {
6352  Kokkos::pair<int, size_t> p;
6353  p = PackTraits<GO>::packArray (gidsOut, gidsIn, numEnt);
6354  errorCode += p.first;
6355  numBytesOut += p.second;
6356 
6357  p = PackTraits<ST>::packArray (valsOut, valsIn, numEnt);
6358  errorCode += p.first;
6359  numBytesOut += p.second;
6360  }
6361 
6362  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6363  TEUCHOS_TEST_FOR_EXCEPTION
6364  (numBytesOut != expectedNumBytes, std::logic_error, "packRow: "
6365  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
6366  << expectedNumBytes << ".");
6367  TEUCHOS_TEST_FOR_EXCEPTION
6368  (errorCode != 0, std::runtime_error, "packRow: "
6369  "PackTraits::packArray returned a nonzero error code");
6370 
6371  return numBytesOut;
6372  }
6373 
6374  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6375  size_t
6376  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6377  unpackRow (GlobalOrdinal gidsOut[],
6378  impl_scalar_type valsOut[],
6379  const char imports[],
6380  const size_t offset,
6381  const size_t numBytes,
6382  const size_t numEnt,
6383  const size_t numBytesPerValue)
6384  {
6385  using Kokkos::View;
6386  using Kokkos::subview;
6388  typedef LocalOrdinal LO;
6389  typedef GlobalOrdinal GO;
6390  typedef impl_scalar_type ST;
6391 
6392  Details::ProfilingRegion region_upack_row(
6393  "Tpetra::CrsMatrix::unpackRow",
6394  "Import/Export"
6395  );
6396 
6397  if (numBytes == 0) {
6398  // Rows with zero bytes should always have zero entries.
6399  if (numEnt != 0) {
6400  const int myRank = this->getMap ()->getComm ()->getRank ();
6401  TEUCHOS_TEST_FOR_EXCEPTION
6402  (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6403  "unpackRow: The number of bytes to unpack numBytes=0, but the "
6404  "number of entries to unpack (as reported by numPacketsPerLID) "
6405  "for this row numEnt=" << numEnt << " != 0.");
6406  }
6407  return 0;
6408  }
6409 
6410  if (numEnt == 0 && numBytes != 0) {
6411  const int myRank = this->getMap ()->getComm ()->getRank ();
6412  TEUCHOS_TEST_FOR_EXCEPTION
6413  (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6414  "unpackRow: The number of entries to unpack (as reported by "
6415  "numPacketsPerLID) numEnt=0, but the number of bytes to unpack "
6416  "numBytes=" << numBytes << " != 0.");
6417  }
6418 
6419  const GO gid = 0; // packValueCount wants this
6420  const LO lid = 0; // packValueCount wants this
6421 
6422  const size_t numEntBeg = offset;
6423  const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
6424  const size_t gidsBeg = numEntBeg + numEntLen;
6425  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6426  const size_t valsBeg = gidsBeg + gidsLen;
6427  const size_t valsLen = numEnt * numBytesPerValue;
6428 
6429  const char* const numEntIn = imports + numEntBeg;
6430  const char* const gidsIn = imports + gidsBeg;
6431  const char* const valsIn = imports + valsBeg;
6432 
6433  size_t numBytesOut = 0;
6434  int errorCode = 0;
6435  LO numEntOut;
6436  numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
6437  if (static_cast<size_t> (numEntOut) != numEnt ||
6438  numEntOut == static_cast<LO> (0)) {
6439  const int myRank = this->getMap ()->getComm ()->getRank ();
6440  std::ostringstream os;
6441  os << "(Proc " << myRank << ") CrsMatrix::unpackRow: ";
6442  bool firstErrorCondition = false;
6443  if (static_cast<size_t> (numEntOut) != numEnt) {
6444  os << "Number of entries from numPacketsPerLID numEnt=" << numEnt
6445  << " does not equal number of entries unpacked from imports "
6446  "buffer numEntOut=" << numEntOut << ".";
6447  firstErrorCondition = true;
6448  }
6449  if (numEntOut == static_cast<LO> (0)) {
6450  if (firstErrorCondition) {
6451  os << " Also, ";
6452  }
6453  os << "Number of entries unpacked from imports buffer numEntOut=0, "
6454  "but number of bytes to unpack for this row numBytes=" << numBytes
6455  << " != 0. This should never happen, since packRow should only "
6456  "ever pack rows with a nonzero number of entries. In this case, "
6457  "the number of entries from numPacketsPerLID is numEnt=" << numEnt
6458  << ".";
6459  }
6460  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, os.str ());
6461  }
6462 
6463  {
6464  Kokkos::pair<int, size_t> p;
6465  p = PackTraits<GO>::unpackArray (gidsOut, gidsIn, numEnt);
6466  errorCode += p.first;
6467  numBytesOut += p.second;
6468 
6469  p = PackTraits<ST>::unpackArray (valsOut, valsIn, numEnt);
6470  errorCode += p.first;
6471  numBytesOut += p.second;
6472  }
6473 
6474  TEUCHOS_TEST_FOR_EXCEPTION
6475  (numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
6476  << numBytesOut << " != numBytes = " << numBytes << ".");
6477 
6478  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6479  TEUCHOS_TEST_FOR_EXCEPTION
6480  (numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
6481  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
6482  << expectedNumBytes << ".");
6483 
6484  TEUCHOS_TEST_FOR_EXCEPTION
6485  (errorCode != 0, std::runtime_error, "unpackRow: "
6486  "PackTraits::unpackArray returned a nonzero error code");
6487 
6488  return numBytesOut;
6489  }
6490 
6491  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6492  void
6493  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6494  allocatePackSpaceNew (Kokkos::DualView<char*, buffer_device_type>& exports,
6495  size_t& totalNumEntries,
6496  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs) const
6497  {
6498  using Details::Behavior;
6500  using std::endl;
6501  typedef impl_scalar_type IST;
6502  typedef LocalOrdinal LO;
6503  typedef GlobalOrdinal GO;
6504  //const char tfecfFuncName[] = "allocatePackSpaceNew: ";
6505 
6506  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
6507  // output to std::cerr on every MPI process. This is unwise for
6508  // runs with large numbers of MPI processes.
6509  const bool verbose = Behavior::verbose("CrsMatrix");
6510  std::unique_ptr<std::string> prefix;
6511  if (verbose) {
6512  prefix = this->createPrefix("CrsMatrix", "allocatePackSpaceNew");
6513  std::ostringstream os;
6514  os << *prefix << "Before:"
6515  << endl
6516  << *prefix << " "
6517  << dualViewStatusToString (exports, "exports")
6518  << endl
6519  << *prefix << " "
6520  << dualViewStatusToString (exportLIDs, "exportLIDs")
6521  << endl;
6522  std::cerr << os.str ();
6523  }
6524 
6525  // The number of export LIDs must fit in LocalOrdinal, assuming
6526  // that the LIDs are distinct and valid on the calling process.
6527  const LO numExportLIDs = static_cast<LO> (exportLIDs.extent (0));
6528 
6529  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6530  auto exportLIDs_h = exportLIDs.view_host ();
6531 
6532  // Count the total number of matrix entries to send.
6533  totalNumEntries = 0;
6534  for (LO i = 0; i < numExportLIDs; ++i) {
6535  const LO lclRow = exportLIDs_h[i];
6536  size_t curNumEntries = this->getNumEntriesInLocalRow (lclRow);
6537  // FIXME (mfh 25 Jan 2015) We should actually report invalid row
6538  // indices as an error. Just consider them nonowned for now.
6539  if (curNumEntries == Teuchos::OrdinalTraits<size_t>::invalid ()) {
6540  curNumEntries = 0;
6541  }
6542  totalNumEntries += curNumEntries;
6543  }
6544 
6545  // FIXME (mfh 24 Feb 2013, 24 Mar 2017) This code is only correct
6546  // if sizeof(IST) is a meaningful representation of the amount of
6547  // data in a Scalar instance. (LO and GO are always built-in
6548  // integer types.)
6549  //
6550  // Allocate the exports array. It does NOT need padding for
6551  // alignment, since we use memcpy to write to / read from send /
6552  // receive buffers.
6553  const size_t allocSize =
6554  static_cast<size_t> (numExportLIDs) * sizeof (LO) +
6555  totalNumEntries * (sizeof (IST) + sizeof (GO));
6556  if (static_cast<size_t> (exports.extent (0)) < allocSize) {
6557  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6558 
6559  const std::string oldLabel = exports.view_device().label ();
6560  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6561  exports = exports_type (newLabel, allocSize);
6562  }
6563 
6564  if (verbose) {
6565  std::ostringstream os;
6566  os << *prefix << "After:"
6567  << endl
6568  << *prefix << " "
6569  << dualViewStatusToString (exports, "exports")
6570  << endl
6571  << *prefix << " "
6572  << dualViewStatusToString (exportLIDs, "exportLIDs")
6573  << endl;
6574  std::cerr << os.str ();
6575  }
6576  }
6577 
6578  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6579  void
6581  packNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6582  Kokkos::DualView<char*, buffer_device_type>& exports,
6583  const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6584  size_t& constantNumPackets) const
6585  {
6586  // The call to packNew in packAndPrepare catches and handles any exceptions.
6587  Details::ProfilingRegion region_pack_new("Tpetra::CrsMatrix::packNew", "Import/Export");
6588  if (this->isStaticGraph ()) {
6590  packCrsMatrixNew (*this, exports, numPacketsPerLID, exportLIDs,
6591  constantNumPackets);
6592  }
6593  else {
6594  this->packNonStaticNew (exportLIDs, exports, numPacketsPerLID,
6595  constantNumPackets);
6596  }
6597  }
6598 
6599  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6600  void
6602  packNonStaticNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6603  Kokkos::DualView<char*, buffer_device_type>& exports,
6604  const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6605  size_t& constantNumPackets) const
6606  {
6607  using Details::Behavior;
6609  using Details::PackTraits;
6611  using Kokkos::View;
6612  using std::endl;
6613  using LO = LocalOrdinal;
6614  using GO = GlobalOrdinal;
6615  using ST = impl_scalar_type;
6616  const char tfecfFuncName[] = "packNonStaticNew: ";
6617 
6618  const bool verbose = Behavior::verbose("CrsMatrix");
6619  std::unique_ptr<std::string> prefix;
6620  if (verbose) {
6621  prefix = this->createPrefix("CrsMatrix", "packNonStaticNew");
6622  std::ostringstream os;
6623  os << *prefix << "Start" << endl;
6624  std::cerr << os.str ();
6625  }
6626 
6627  const size_t numExportLIDs = static_cast<size_t> (exportLIDs.extent (0));
6628  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6629  (numExportLIDs != static_cast<size_t> (numPacketsPerLID.extent (0)),
6630  std::invalid_argument, "exportLIDs.size() = " << numExportLIDs
6631  << " != numPacketsPerLID.size() = " << numPacketsPerLID.extent (0)
6632  << ".");
6633 
6634  // Setting this to zero tells the caller to expect a possibly
6635  // different ("nonconstant") number of packets per local index
6636  // (i.e., a possibly different number of entries per row).
6637  constantNumPackets = 0;
6638 
6639  // The pack buffer 'exports' enters this method possibly
6640  // unallocated. Do the first two parts of "Count, allocate, fill,
6641  // compute."
6642  size_t totalNumEntries = 0;
6643  this->allocatePackSpaceNew (exports, totalNumEntries, exportLIDs);
6644  const size_t bufSize = static_cast<size_t> (exports.extent (0));
6645 
6646  // Write-only host access
6647  exports.clear_sync_state();
6648  exports.modify_host();
6649  auto exports_h = exports.view_host ();
6650  if (verbose) {
6651  std::ostringstream os;
6652  os << *prefix << "After marking exports as modified on host, "
6653  << dualViewStatusToString (exports, "exports") << endl;
6654  std::cerr << os.str ();
6655  }
6656 
6657  // Read-only host access
6658  auto exportLIDs_h = exportLIDs.view_host ();
6659 
6660  // Write-only host access
6661  const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->clear_sync_state();
6662  const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->modify_host();
6663  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6664 
6665  // Compute the number of "packets" (in this case, bytes) per
6666  // export LID (in this case, local index of the row to send), and
6667  // actually pack the data.
6668  auto maxRowNumEnt = this->getLocalMaxNumRowEntries();
6669 
6670 
6671  // Temporary buffer for global column indices.
6672  typename global_inds_host_view_type::non_const_type gidsIn_k;
6673  if (this->isLocallyIndexed()) { // Need storage for Global IDs
6674  gidsIn_k =
6675  typename global_inds_host_view_type::non_const_type("packGids",
6676  maxRowNumEnt);
6677  }
6678 
6679  size_t offset = 0; // current index into 'exports' array.
6680  for (size_t i = 0; i < numExportLIDs; ++i) {
6681  const LO lclRow = exportLIDs_h[i];
6682 
6683  size_t numBytes = 0;
6684  size_t numEnt = this->getNumEntriesInLocalRow (lclRow);
6685 
6686  // Only pack this row's data if it has a nonzero number of
6687  // entries. We can do this because receiving processes get the
6688  // number of packets, and will know that zero packets means zero
6689  // entries.
6690  if (numEnt == 0) {
6691  numPacketsPerLID_h[i] = 0;
6692  continue;
6693  }
6694 
6695  if (this->isLocallyIndexed ()) {
6696  typename global_inds_host_view_type::non_const_type gidsIn;
6697  values_host_view_type valsIn;
6698  // If the matrix is locally indexed on the calling process, we
6699  // have to use its column Map (which it _must_ have in this
6700  // case) to convert to global indices.
6701  local_inds_host_view_type lidsIn;
6702  this->getLocalRowView (lclRow, lidsIn, valsIn);
6703  const map_type& colMap = * (this->getColMap ());
6704  for (size_t k = 0; k < numEnt; ++k) {
6705  gidsIn_k[k] = colMap.getGlobalElement (lidsIn[k]);
6706  }
6707  gidsIn = Kokkos::subview(gidsIn_k, Kokkos::make_pair(GO(0),GO(numEnt)));
6708 
6709  const size_t numBytesPerValue =
6710  PackTraits<ST>::packValueCount (valsIn[0]);
6711  numBytes = this->packRow (exports_h.data (), offset, numEnt,
6712  gidsIn.data (), valsIn.data (),
6713  numBytesPerValue);
6714  }
6715  else if (this->isGloballyIndexed ()) {
6716  global_inds_host_view_type gidsIn;
6717  values_host_view_type valsIn;
6718  // If the matrix is globally indexed on the calling process,
6719  // then we can use the column indices directly. However, we
6720  // have to get the global row index. The calling process must
6721  // have a row Map, since otherwise it shouldn't be participating
6722  // in packing operations.
6723  const map_type& rowMap = * (this->getRowMap ());
6724  const GO gblRow = rowMap.getGlobalElement (lclRow);
6725  this->getGlobalRowView (gblRow, gidsIn, valsIn);
6726 
6727  const size_t numBytesPerValue =
6728  PackTraits<ST>::packValueCount (valsIn[0]);
6729  numBytes = this->packRow (exports_h.data (), offset, numEnt,
6730  gidsIn.data (), valsIn.data (),
6731  numBytesPerValue);
6732  }
6733  // mfh 11 Sep 2017: Currently, if the matrix is neither globally
6734  // nor locally indexed, then it has no entries. Therefore,
6735  // there is nothing to pack. No worries!
6736 
6737  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6738  (offset > bufSize || offset + numBytes > bufSize, std::logic_error,
6739  "First invalid offset into 'exports' pack buffer at index i = " << i
6740  << ". exportLIDs_h[i]: " << exportLIDs_h[i] << ", bufSize: " <<
6741  bufSize << ", offset: " << offset << ", numBytes: " << numBytes <<
6742  ".");
6743  // numPacketsPerLID_h[i] is the number of "packets" in the
6744  // current local row i. Packet=char (really "byte") so use the
6745  // number of bytes of the packed data for that row.
6746  numPacketsPerLID_h[i] = numBytes;
6747  offset += numBytes;
6748  }
6749 
6750  if (verbose) {
6751  std::ostringstream os;
6752  os << *prefix << "Tpetra::CrsMatrix::packNonStaticNew: After:" << endl
6753  << *prefix << " "
6754  << dualViewStatusToString (exports, "exports")
6755  << endl
6756  << *prefix << " "
6757  << dualViewStatusToString (exportLIDs, "exportLIDs")
6758  << endl;
6759  std::cerr << os.str ();
6760  }
6761  }
6762 
6763  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6764  LocalOrdinal
6765  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6766  combineGlobalValuesRaw(const LocalOrdinal lclRow,
6767  const LocalOrdinal numEnt,
6768  const impl_scalar_type vals[],
6769  const GlobalOrdinal cols[],
6770  const Tpetra::CombineMode combMode,
6771  const char* const prefix,
6772  const bool debug,
6773  const bool verbose)
6774  {
6775  using GO = GlobalOrdinal;
6776 
6777  // mfh 23 Mar 2017: This branch is not thread safe in a debug
6778  // build, due to use of Teuchos::ArrayView; see #229.
6779  const GO gblRow = myGraph_->rowMap_->getGlobalElement(lclRow);
6780  Teuchos::ArrayView<const GO> cols_av
6781  (numEnt == 0 ? nullptr : cols, numEnt);
6782  Teuchos::ArrayView<const Scalar> vals_av
6783  (numEnt == 0 ? nullptr : reinterpret_cast<const Scalar*> (vals), numEnt);
6784 
6785  // FIXME (mfh 23 Mar 2017) This is a work-around for less common
6786  // combine modes. combineGlobalValues throws on error; it does
6787  // not return an error code. Thus, if it returns, it succeeded.
6788  combineGlobalValues(gblRow, cols_av, vals_av, combMode,
6789  prefix, debug, verbose);
6790  return numEnt;
6791  }
6792 
6793  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6794  void
6795  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6796  combineGlobalValues(
6797  const GlobalOrdinal globalRowIndex,
6798  const Teuchos::ArrayView<const GlobalOrdinal>& columnIndices,
6799  const Teuchos::ArrayView<const Scalar>& values,
6800  const Tpetra::CombineMode combineMode,
6801  const char* const prefix,
6802  const bool debug,
6803  const bool verbose)
6804  {
6805  const char tfecfFuncName[] = "combineGlobalValues: ";
6806 
6807  if (isStaticGraph ()) {
6808  // INSERT doesn't make sense for a static graph, since you
6809  // aren't allowed to change the structure of the graph.
6810  // However, all the other combine modes work.
6811  if (combineMode == ADD) {
6812  sumIntoGlobalValues (globalRowIndex, columnIndices, values);
6813  }
6814  else if (combineMode == REPLACE) {
6815  replaceGlobalValues (globalRowIndex, columnIndices, values);
6816  }
6817  else if (combineMode == ABSMAX) {
6818  using ::Tpetra::Details::AbsMax;
6819  AbsMax<Scalar> f;
6820  this->template transformGlobalValues<AbsMax<Scalar> > (globalRowIndex,
6821  columnIndices,
6822  values, f);
6823  }
6824  else if (combineMode == INSERT) {
6825  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6826  (isStaticGraph() && combineMode == INSERT,
6827  std::invalid_argument, "INSERT combine mode is forbidden "
6828  "if the matrix has a static (const) graph (i.e., was "
6829  "constructed with the CrsMatrix constructor that takes a "
6830  "const CrsGraph pointer).");
6831  }
6832  else {
6833  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6834  (true, std::logic_error, "Invalid combine mode; should "
6835  "never get here! "
6836  "Please report this bug to the Tpetra developers.");
6837  }
6838  }
6839  else { // The matrix has a dynamic graph.
6840  if (combineMode == ADD || combineMode == INSERT) {
6841  // For a dynamic graph, all incoming column indices are
6842  // inserted into the target graph. Duplicate indices will
6843  // have their values summed. In this context, ADD and INSERT
6844  // are equivalent. We need to call insertGlobalValues()
6845  // anyway if the column indices don't yet exist in this row,
6846  // so we just call insertGlobalValues() for both cases.
6847  insertGlobalValuesFilteredChecked(globalRowIndex,
6848  columnIndices, values, prefix, debug, verbose);
6849  }
6850  // FIXME (mfh 14 Mar 2012):
6851  //
6852  // Implementing ABSMAX or REPLACE for a dynamic graph would
6853  // require modifying assembly to attach a possibly different
6854  // combine mode to each inserted (i, j, A_ij) entry. For
6855  // example, consider two different Export operations to the same
6856  // target CrsMatrix, the first with ABSMAX combine mode and the
6857  // second with REPLACE. This isn't a common use case, so we
6858  // won't mess with it for now.
6859  else if (combineMode == ABSMAX) {
6860  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6861  ! isStaticGraph () && combineMode == ABSMAX, std::logic_error,
6862  "ABSMAX combine mode when the matrix has a dynamic graph is not yet "
6863  "implemented.");
6864  }
6865  else if (combineMode == REPLACE) {
6866  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6867  ! isStaticGraph () && combineMode == REPLACE, std::logic_error,
6868  "REPLACE combine mode when the matrix has a dynamic graph is not yet "
6869  "implemented.");
6870  }
6871  else {
6872  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6873  true, std::logic_error, "Should never get here! Please report this "
6874  "bug to the Tpetra developers.");
6875  }
6876  }
6877  }
6878 
6879  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6880  void
6883  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
6884  Kokkos::DualView<char*, buffer_device_type> imports,
6885  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6886  const size_t constantNumPackets,
6887  const CombineMode combineMode)
6888  {
6889  using Details::Behavior;
6892  using std::endl;
6893  const char tfecfFuncName[] = "unpackAndCombine: ";
6894  ProfilingRegion regionUAC ("Tpetra::CrsMatrix::unpackAndCombine");
6895 
6896  const bool debug = Behavior::debug("CrsMatrix");
6897  const bool verbose = Behavior::verbose("CrsMatrix");
6898  constexpr int numValidModes = 5;
6899  const CombineMode validModes[numValidModes] =
6900  {ADD, REPLACE, ABSMAX, INSERT, ZERO};
6901  const char* validModeNames[numValidModes] =
6902  {"ADD", "REPLACE", "ABSMAX", "INSERT", "ZERO"};
6903 
6904  std::unique_ptr<std::string> prefix;
6905  if (verbose) {
6906  prefix = this->createPrefix("CrsMatrix", "unpackAndCombine");
6907  std::ostringstream os;
6908  os << *prefix << "Start:" << endl
6909  << *prefix << " "
6910  << dualViewStatusToString (importLIDs, "importLIDs")
6911  << endl
6912  << *prefix << " "
6913  << dualViewStatusToString (imports, "imports")
6914  << endl
6915  << *prefix << " "
6916  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6917  << endl
6918  << *prefix << " constantNumPackets: " << constantNumPackets
6919  << endl
6920  << *prefix << " combineMode: " << combineModeToString (combineMode)
6921  << endl;
6922  std::cerr << os.str ();
6923  }
6924 
6925  if (debug) {
6926  if (std::find (validModes, validModes+numValidModes, combineMode) ==
6927  validModes+numValidModes) {
6928  std::ostringstream os;
6929  os << "Invalid combine mode. Valid modes are {";
6930  for (int k = 0; k < numValidModes; ++k) {
6931  os << validModeNames[k];
6932  if (k < numValidModes - 1) {
6933  os << ", ";
6934  }
6935  }
6936  os << "}.";
6937  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6938  (true, std::invalid_argument, os.str ());
6939  }
6940  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6941  (importLIDs.extent(0) != numPacketsPerLID.extent(0),
6942  std::invalid_argument, "importLIDs.extent(0)="
6943  << importLIDs.extent(0)
6944  << " != numPacketsPerLID.extent(0)="
6945  << numPacketsPerLID.extent(0) << ".");
6946  }
6947 
6948  if (combineMode == ZERO) {
6949  return; // nothing to do
6950  }
6951 
6952  if (debug) {
6953  using Teuchos::reduceAll;
6954  std::unique_ptr<std::ostringstream> msg (new std::ostringstream ());
6955  int lclBad = 0;
6956  try {
6957  unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
6958  constantNumPackets, combineMode,
6959  verbose);
6960  } catch (std::exception& e) {
6961  lclBad = 1;
6962  *msg << e.what ();
6963  }
6964  int gblBad = 0;
6965  const Teuchos::Comm<int>& comm = * (this->getComm ());
6966  reduceAll<int, int> (comm, Teuchos::REDUCE_MAX,
6967  lclBad, Teuchos::outArg (gblBad));
6968  if (gblBad != 0) {
6969  // mfh 22 Oct 2017: 'prefix' might be null, since it is only
6970  // initialized in a debug build. Thus, we get the process
6971  // rank again here. This is an error message, so the small
6972  // run-time cost doesn't matter. See #1887.
6973  std::ostringstream os;
6974  os << "Proc " << comm.getRank () << ": " << msg->str () << endl;
6975  msg = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
6976  ::Tpetra::Details::gathervPrint (*msg, os.str (), comm);
6977  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6978  (true, std::logic_error, std::endl << "unpackAndCombineImpl "
6979  "threw an exception on one or more participating processes: "
6980  << endl << msg->str ());
6981  }
6982  }
6983  else {
6984  unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
6985  constantNumPackets, combineMode,
6986  verbose);
6987  }
6988 
6989  if (verbose) {
6990  std::ostringstream os;
6991  os << *prefix << "Done!" << endl
6992  << *prefix << " "
6993  << dualViewStatusToString (importLIDs, "importLIDs")
6994  << endl
6995  << *prefix << " "
6996  << dualViewStatusToString (imports, "imports")
6997  << endl
6998  << *prefix << " "
6999  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
7000  << endl;
7001  std::cerr << os.str ();
7002  }
7003  }
7004 
7005  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7006  void
7009  const Kokkos::DualView<const local_ordinal_type*,
7010  buffer_device_type>& importLIDs,
7011  Kokkos::DualView<char*, buffer_device_type> imports,
7012  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7013  const size_t constantNumPackets,
7014  const CombineMode combineMode,
7015  const bool verbose)
7016  {
7017  Details::ProfilingRegion region_unpack_and_combine_impl(
7018  "Tpetra::CrsMatrix::unpackAndCombineImpl",
7019  "Import/Export"
7020  );
7021  using std::endl;
7022  const char tfecfFuncName[] = "unpackAndCombineImpl";
7023  std::unique_ptr<std::string> prefix;
7024  if (verbose) {
7025  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
7026  std::ostringstream os;
7027  os << *prefix << "isStaticGraph(): "
7028  << (isStaticGraph() ? "true" : "false")
7029  << ", importLIDs.extent(0): "
7030  << importLIDs.extent(0)
7031  << ", imports.extent(0): "
7032  << imports.extent(0)
7033  << ", numPacketsPerLID.extent(0): "
7034  << numPacketsPerLID.extent(0)
7035  << endl;
7036  std::cerr << os.str();
7037  }
7038 
7039  if (isStaticGraph ()) {
7040  using Details::unpackCrsMatrixAndCombineNew;
7041  unpackCrsMatrixAndCombineNew(*this, imports, numPacketsPerLID,
7042  importLIDs, constantNumPackets,
7043  combineMode);
7044  }
7045  else {
7046  {
7047  using padding_type = typename crs_graph_type::padding_type;
7048  std::unique_ptr<padding_type> padding;
7049  try {
7050  padding = myGraph_->computePaddingForCrsMatrixUnpack(
7051  importLIDs, imports, numPacketsPerLID, verbose);
7052  }
7053  catch (std::exception& e) {
7054  const auto rowMap = getRowMap();
7055  const auto comm = rowMap.is_null() ? Teuchos::null :
7056  rowMap->getComm();
7057  const int myRank = comm.is_null() ? -1 : comm->getRank();
7058  TEUCHOS_TEST_FOR_EXCEPTION
7059  (true, std::runtime_error, "Proc " << myRank << ": "
7060  "Tpetra::CrsGraph::computePaddingForCrsMatrixUnpack "
7061  "threw an exception: " << e.what());
7062  }
7063  if (verbose) {
7064  std::ostringstream os;
7065  os << *prefix << "Call applyCrsPadding" << endl;
7066  std::cerr << os.str();
7067  }
7068  applyCrsPadding(*padding, verbose);
7069  }
7070  if (verbose) {
7071  std::ostringstream os;
7072  os << *prefix << "Call unpackAndCombineImplNonStatic" << endl;
7073  std::cerr << os.str();
7074  }
7075  unpackAndCombineImplNonStatic(importLIDs, imports,
7076  numPacketsPerLID,
7077  constantNumPackets,
7078  combineMode);
7079  }
7080 
7081  if (verbose) {
7082  std::ostringstream os;
7083  os << *prefix << "Done" << endl;
7084  std::cerr << os.str();
7085  }
7086  }
7087 
7088  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7089  void
7090  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7091  unpackAndCombineImplNonStatic(
7092  const Kokkos::DualView<const local_ordinal_type*,
7093  buffer_device_type>& importLIDs,
7094  Kokkos::DualView<char*, buffer_device_type> imports,
7095  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7096  const size_t constantNumPackets,
7097  const CombineMode combineMode)
7098  {
7099  using Kokkos::View;
7100  using Kokkos::subview;
7101  using Kokkos::MemoryUnmanaged;
7102  using Details::Behavior;
7105  using Details::PackTraits;
7106  using Details::ScalarViewTraits;
7107  using std::endl;
7108  using LO = LocalOrdinal;
7109  using GO = GlobalOrdinal;
7110  using ST = impl_scalar_type;
7111  using size_type = typename Teuchos::ArrayView<LO>::size_type;
7112  using HES =
7113  typename View<int*, device_type>::HostMirror::execution_space;
7114  using pair_type = std::pair<typename View<int*, HES>::size_type,
7115  typename View<int*, HES>::size_type>;
7116  using gids_out_type = View<GO*, HES, MemoryUnmanaged>;
7117  using vals_out_type = View<ST*, HES, MemoryUnmanaged>;
7118  const char tfecfFuncName[] = "unpackAndCombineImplNonStatic";
7119 
7120  const bool debug = Behavior::debug("CrsMatrix");
7121  const bool verbose = Behavior::verbose("CrsMatrix");
7122  std::unique_ptr<std::string> prefix;
7123  if (verbose) {
7124  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
7125  std::ostringstream os;
7126  os << *prefix << endl; // we've already printed DualViews' statuses
7127  std::cerr << os.str ();
7128  }
7129  const char* const prefix_raw =
7130  verbose ? prefix.get()->c_str() : nullptr;
7131 
7132  const size_type numImportLIDs = importLIDs.extent (0);
7133  if (combineMode == ZERO || numImportLIDs == 0) {
7134  return; // nothing to do; no need to combine entries
7135  }
7136 
7137  Details::ProfilingRegion region_unpack_and_combine_impl_non_static(
7138  "Tpetra::CrsMatrix::unpackAndCombineImplNonStatic",
7139  "Import/Export"
7140  );
7141 
7142  // We're unpacking on host. This is read-only host access.
7143  if (imports.need_sync_host()) {
7144  imports.sync_host ();
7145  }
7146  auto imports_h = imports.view_host();
7147 
7148  // Read-only host access.
7149  if (numPacketsPerLID.need_sync_host()) {
7150  numPacketsPerLID.sync_host ();
7151  }
7152  auto numPacketsPerLID_h = numPacketsPerLID.view_host();
7153 
7154  TEUCHOS_ASSERT( ! importLIDs.need_sync_host() );
7155  auto importLIDs_h = importLIDs.view_host();
7156 
7157  size_t numBytesPerValue;
7158  {
7159  // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
7160  // with run-time size? We already assume that all entries in both the
7161  // source and target matrices have the same size. If the calling process
7162  // owns at least one entry in either matrix, we can use that entry to set
7163  // the size. However, it is possible that the calling process owns no
7164  // entries. In that case, we're in trouble. One way to fix this would be
7165  // for each row's data to contain the run-time size. This is only
7166  // necessary if the size is not a compile-time constant.
7167  Scalar val;
7168  numBytesPerValue = PackTraits<ST>::packValueCount (val);
7169  }
7170 
7171  // Determine the maximum number of entries in any one row
7172  size_t offset = 0;
7173  size_t maxRowNumEnt = 0;
7174  for (size_type i = 0; i < numImportLIDs; ++i) {
7175  const size_t numBytes = numPacketsPerLID_h[i];
7176  if (numBytes == 0) {
7177  continue; // empty buffer for that row means that the row is empty
7178  }
7179  // We need to unpack a nonzero number of entries for this row.
7180  if (debug) {
7181  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7182  (offset + numBytes > size_t(imports_h.extent (0)),
7183  std::logic_error, ": At local row index importLIDs_h[i="
7184  << i << "]=" << importLIDs_h[i] << ", offset (=" << offset
7185  << ") + numBytes (=" << numBytes << ") > "
7186  "imports_h.extent(0)=" << imports_h.extent (0) << ".");
7187  }
7188  LO numEntLO = 0;
7189 
7190  if (debug) {
7191  const size_t theNumBytes =
7192  PackTraits<LO>::packValueCount (numEntLO);
7193  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7194  (theNumBytes > numBytes, std::logic_error, ": theNumBytes="
7195  << theNumBytes << " > numBytes = " << numBytes << ".");
7196  }
7197  const char* const inBuf = imports_h.data () + offset;
7198  const size_t actualNumBytes =
7199  PackTraits<LO>::unpackValue (numEntLO, inBuf);
7200 
7201  if (debug) {
7202  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7203  (actualNumBytes > numBytes, std::logic_error, ": At i=" << i
7204  << ", actualNumBytes=" << actualNumBytes
7205  << " > numBytes=" << numBytes << ".");
7206  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7207  (numEntLO == 0, std::logic_error, ": At local row index "
7208  "importLIDs_h[i=" << i << "]=" << importLIDs_h[i] << ", "
7209  "the number of entries read from the packed data is "
7210  "numEntLO=" << numEntLO << ", but numBytes=" << numBytes
7211  << " != 0.");
7212  }
7213 
7214  maxRowNumEnt = std::max(size_t(numEntLO), maxRowNumEnt);
7215  offset += numBytes;
7216  }
7217 
7218  // Temporary space to cache incoming global column indices and
7219  // values. Column indices come in as global indices, in case the
7220  // source object's column Map differs from the target object's
7221  // (this's) column Map.
7222  View<GO*, HES> gblColInds;
7223  View<LO*, HES> lclColInds;
7224  View<ST*, HES> vals;
7225  {
7226  GO gid = 0;
7227  LO lid = 0;
7228  // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
7229  // with run-time size? We already assume that all entries in both the
7230  // source and target matrices have the same size. If the calling process
7231  // owns at least one entry in either matrix, we can use that entry to set
7232  // the size. However, it is possible that the calling process owns no
7233  // entries. In that case, we're in trouble. One way to fix this would be
7234  // for each row's data to contain the run-time size. This is only
7235  // necessary if the size is not a compile-time constant.
7236  Scalar val;
7237  gblColInds = ScalarViewTraits<GO, HES>::allocateArray(
7238  gid, maxRowNumEnt, "gids");
7239  lclColInds = ScalarViewTraits<LO, HES>::allocateArray(
7240  lid, maxRowNumEnt, "lids");
7241  vals = ScalarViewTraits<ST, HES>::allocateArray(
7242  val, maxRowNumEnt, "vals");
7243  }
7244 
7245  offset = 0;
7246  for (size_type i = 0; i < numImportLIDs; ++i) {
7247  const size_t numBytes = numPacketsPerLID_h[i];
7248  if (numBytes == 0) {
7249  continue; // empty buffer for that row means that the row is empty
7250  }
7251  LO numEntLO = 0;
7252  const char* const inBuf = imports_h.data () + offset;
7253  (void) PackTraits<LO>::unpackValue (numEntLO, inBuf);
7254 
7255  const size_t numEnt = static_cast<size_t>(numEntLO);;
7256  const LO lclRow = importLIDs_h[i];
7257 
7258  gids_out_type gidsOut = subview (gblColInds, pair_type (0, numEnt));
7259  vals_out_type valsOut = subview (vals, pair_type (0, numEnt));
7260 
7261  const size_t numBytesOut =
7262  unpackRow (gidsOut.data (), valsOut.data (), imports_h.data (),
7263  offset, numBytes, numEnt, numBytesPerValue);
7264  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7265  (numBytes != numBytesOut, std::logic_error, ": At i=" << i
7266  << ", numBytes=" << numBytes << " != numBytesOut="
7267  << numBytesOut << ".");
7268 
7269  const ST* const valsRaw = const_cast<const ST*> (valsOut.data ());
7270  const GO* const gidsRaw = const_cast<const GO*> (gidsOut.data ());
7271  combineGlobalValuesRaw(lclRow, numEnt, valsRaw, gidsRaw,
7272  combineMode, prefix_raw, debug, verbose);
7273  // Don't update offset until current LID has succeeded.
7274  offset += numBytes;
7275  } // for each import LID i
7276 
7277  if (verbose) {
7278  std::ostringstream os;
7279  os << *prefix << "Done" << endl;
7280  std::cerr << os.str();
7281  }
7282  }
7283 
7284  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7285  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7287  getColumnMapMultiVector (const MV& X_domainMap,
7288  const bool force) const
7289  {
7290  using Teuchos::null;
7291  using Teuchos::RCP;
7292  using Teuchos::rcp;
7293 
7294  TEUCHOS_TEST_FOR_EXCEPTION(
7295  ! this->hasColMap (), std::runtime_error, "Tpetra::CrsMatrix::getColumn"
7296  "MapMultiVector: You may only call this method if the matrix has a "
7297  "column Map. If the matrix does not yet have a column Map, you should "
7298  "first call fillComplete (with domain and range Map if necessary).");
7299 
7300  // If the graph is not fill complete, then the Import object (if
7301  // one should exist) hasn't been constructed yet.
7302  TEUCHOS_TEST_FOR_EXCEPTION(
7303  ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
7304  "CrsMatrix::getColumnMapMultiVector: You may only call this method if "
7305  "this matrix's graph is fill complete.");
7306 
7307  const size_t numVecs = X_domainMap.getNumVectors ();
7308  RCP<const import_type> importer = this->getGraph ()->getImporter ();
7309  RCP<const map_type> colMap = this->getColMap ();
7310 
7311  RCP<MV> X_colMap; // null by default
7312 
7313  // If the Import object is trivial (null), then we don't need a
7314  // separate column Map multivector. Just return null in that
7315  // case. The caller is responsible for knowing not to use the
7316  // returned null pointer.
7317  //
7318  // If the Import is nontrivial, then we do need a separate
7319  // column Map multivector for the Import operation. Check in
7320  // that case if we have to (re)create the column Map
7321  // multivector.
7322  if (! importer.is_null () || force) {
7323  if (importMV_.is_null () || importMV_->getNumVectors () != numVecs) {
7324  X_colMap = rcp (new MV (colMap, numVecs));
7325 
7326  // Cache the newly created multivector for later reuse.
7327  importMV_ = X_colMap;
7328  }
7329  else { // Yay, we can reuse the cached multivector!
7330  X_colMap = importMV_;
7331  // mfh 09 Jan 2013: We don't have to fill with zeros first,
7332  // because the Import uses INSERT combine mode, which overwrites
7333  // existing entries.
7334  //
7335  //X_colMap->putScalar (ZERO);
7336  }
7337  }
7338  return X_colMap;
7339  }
7340 
7341  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7342  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7345  const bool force) const
7346  {
7347  using Teuchos::null;
7348  using Teuchos::RCP;
7349  using Teuchos::rcp;
7350 
7351  // If the graph is not fill complete, then the Export object (if
7352  // one should exist) hasn't been constructed yet.
7353  TEUCHOS_TEST_FOR_EXCEPTION(
7354  ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
7355  "CrsMatrix::getRowMapMultiVector: You may only call this method if this "
7356  "matrix's graph is fill complete.");
7357 
7358  const size_t numVecs = Y_rangeMap.getNumVectors ();
7359  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
7360  // Every version of the constructor takes either a row Map, or a
7361  // graph (all of whose constructors take a row Map). Thus, the
7362  // matrix always has a row Map.
7363  RCP<const map_type> rowMap = this->getRowMap ();
7364 
7365  RCP<MV> Y_rowMap; // null by default
7366 
7367  // If the Export object is trivial (null), then we don't need a
7368  // separate row Map multivector. Just return null in that case.
7369  // The caller is responsible for knowing not to use the returned
7370  // null pointer.
7371  //
7372  // If the Export is nontrivial, then we do need a separate row
7373  // Map multivector for the Export operation. Check in that case
7374  // if we have to (re)create the row Map multivector.
7375  if (! exporter.is_null () || force) {
7376  if (exportMV_.is_null () || exportMV_->getNumVectors () != numVecs) {
7377  Y_rowMap = rcp (new MV (rowMap, numVecs));
7378  exportMV_ = Y_rowMap; // Cache the newly created MV for later reuse.
7379  }
7380  else { // Yay, we can reuse the cached multivector!
7381  Y_rowMap = exportMV_;
7382  }
7383  }
7384  return Y_rowMap;
7385  }
7386 
7387  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7388  void
7390  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
7391  {
7392  TEUCHOS_TEST_FOR_EXCEPTION(
7393  myGraph_.is_null (), std::logic_error, "Tpetra::CrsMatrix::"
7394  "removeEmptyProcessesInPlace: This method does not work when the matrix "
7395  "was created with a constant graph (that is, when it was created using "
7396  "the version of its constructor that takes an RCP<const CrsGraph>). "
7397  "This is because the matrix is not allowed to modify the graph in that "
7398  "case, but removing empty processes requires modifying the graph.");
7399  myGraph_->removeEmptyProcessesInPlace (newMap);
7400  // Even though CrsMatrix's row Map (as returned by getRowMap())
7401  // comes from its CrsGraph, CrsMatrix still implements DistObject,
7402  // so we also have to change the DistObject's Map.
7403  this->map_ = this->getRowMap ();
7404  // In the nonconst graph case, staticGraph_ is just a const
7405  // pointer to myGraph_. This assignment is probably redundant,
7406  // but it doesn't hurt.
7407  staticGraph_ = Teuchos::rcp_const_cast<const Graph> (myGraph_);
7408  }
7409 
7410  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7411  Teuchos::RCP<RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7413  add (const Scalar& alpha,
7415  const Scalar& beta,
7416  const Teuchos::RCP<const map_type>& domainMap,
7417  const Teuchos::RCP<const map_type>& rangeMap,
7418  const Teuchos::RCP<Teuchos::ParameterList>& params) const
7419  {
7420  using Teuchos::Array;
7421  using Teuchos::ArrayView;
7422  using Teuchos::ParameterList;
7423  using Teuchos::RCP;
7424  using Teuchos::rcp;
7425  using Teuchos::rcp_implicit_cast;
7426  using Teuchos::sublist;
7427  using std::endl;
7428  using LO = local_ordinal_type;
7429  using GO = global_ordinal_type;
7430  using crs_matrix_type =
7432  const char errPfx[] = "Tpetra::CrsMatrix::add: ";
7433 
7434  const bool debug = Details::Behavior::debug("CrsMatrix");
7435  const bool verbose = Details::Behavior::verbose("CrsMatrix");
7436  std::unique_ptr<std::string> prefix;
7437  if (verbose) {
7438  prefix = this->createPrefix("CrsMatrix", "add");
7439  std::ostringstream os;
7440  os << *prefix << "Start" << endl;
7441  std::cerr << os.str ();
7442  }
7443 
7444  const crs_matrix_type& B = *this; // a convenient abbreviation
7445  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero();
7446  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one();
7447 
7448  // If the user didn't supply a domain or range Map, then try to
7449  // get one from B first (if it has them), then from A (if it has
7450  // them). If we don't have any domain or range Maps, scold the
7451  // user.
7452  RCP<const map_type> A_domainMap = A.getDomainMap ();
7453  RCP<const map_type> A_rangeMap = A.getRangeMap ();
7454  RCP<const map_type> B_domainMap = B.getDomainMap ();
7455  RCP<const map_type> B_rangeMap = B.getRangeMap ();
7456 
7457  RCP<const map_type> theDomainMap = domainMap;
7458  RCP<const map_type> theRangeMap = rangeMap;
7459 
7460  if (domainMap.is_null ()) {
7461  if (B_domainMap.is_null ()) {
7462  TEUCHOS_TEST_FOR_EXCEPTION(
7463  A_domainMap.is_null (), std::invalid_argument,
7464  "Tpetra::CrsMatrix::add: If neither A nor B have a domain Map, "
7465  "then you must supply a nonnull domain Map to this method.");
7466  theDomainMap = A_domainMap;
7467  } else {
7468  theDomainMap = B_domainMap;
7469  }
7470  }
7471  if (rangeMap.is_null ()) {
7472  if (B_rangeMap.is_null ()) {
7473  TEUCHOS_TEST_FOR_EXCEPTION(
7474  A_rangeMap.is_null (), std::invalid_argument,
7475  "Tpetra::CrsMatrix::add: If neither A nor B have a range Map, "
7476  "then you must supply a nonnull range Map to this method.");
7477  theRangeMap = A_rangeMap;
7478  } else {
7479  theRangeMap = B_rangeMap;
7480  }
7481  }
7482 
7483  if (debug) {
7484  // In debug mode, check that A and B have matching domain and
7485  // range Maps, if they have domain and range Maps at all. (If
7486  // they aren't fill complete, then they may not yet have them.)
7487  if (! A_domainMap.is_null() && ! A_rangeMap.is_null()) {
7488  if (! B_domainMap.is_null() && ! B_rangeMap.is_null()) {
7489  TEUCHOS_TEST_FOR_EXCEPTION
7490  (! B_domainMap->isSameAs(*A_domainMap),
7491  std::invalid_argument,
7492  errPfx << "The input RowMatrix A must have a domain Map "
7493  "which is the same as (isSameAs) this RowMatrix's "
7494  "domain Map.");
7495  TEUCHOS_TEST_FOR_EXCEPTION
7496  (! B_rangeMap->isSameAs(*A_rangeMap), std::invalid_argument,
7497  errPfx << "The input RowMatrix A must have a range Map "
7498  "which is the same as (isSameAs) this RowMatrix's range "
7499  "Map.");
7500  TEUCHOS_TEST_FOR_EXCEPTION
7501  (! domainMap.is_null() &&
7502  ! domainMap->isSameAs(*B_domainMap),
7503  std::invalid_argument,
7504  errPfx << "The input domain Map must be the same as "
7505  "(isSameAs) this RowMatrix's domain Map.");
7506  TEUCHOS_TEST_FOR_EXCEPTION
7507  (! rangeMap.is_null() &&
7508  ! rangeMap->isSameAs(*B_rangeMap),
7509  std::invalid_argument,
7510  errPfx << "The input range Map must be the same as "
7511  "(isSameAs) this RowMatrix's range Map.");
7512  }
7513  }
7514  else if (! B_domainMap.is_null() && ! B_rangeMap.is_null()) {
7515  TEUCHOS_TEST_FOR_EXCEPTION
7516  (! domainMap.is_null() &&
7517  ! domainMap->isSameAs(*B_domainMap),
7518  std::invalid_argument,
7519  errPfx << "The input domain Map must be the same as "
7520  "(isSameAs) this RowMatrix's domain Map.");
7521  TEUCHOS_TEST_FOR_EXCEPTION
7522  (! rangeMap.is_null() && ! rangeMap->isSameAs(*B_rangeMap),
7523  std::invalid_argument,
7524  errPfx << "The input range Map must be the same as "
7525  "(isSameAs) this RowMatrix's range Map.");
7526  }
7527  else {
7528  TEUCHOS_TEST_FOR_EXCEPTION
7529  (domainMap.is_null() || rangeMap.is_null(),
7530  std::invalid_argument, errPfx << "If neither A nor B "
7531  "have a domain and range Map, then you must supply a "
7532  "nonnull domain and range Map to this method.");
7533  }
7534  }
7535 
7536  // What parameters do we pass to C's constructor? Do we call
7537  // fillComplete on C after filling it? And if so, what parameters
7538  // do we pass to C's fillComplete call?
7539  bool callFillComplete = true;
7540  RCP<ParameterList> constructorSublist;
7541  RCP<ParameterList> fillCompleteSublist;
7542  if (! params.is_null()) {
7543  callFillComplete =
7544  params->get("Call fillComplete", callFillComplete);
7545  constructorSublist = sublist(params, "Constructor parameters");
7546  fillCompleteSublist = sublist(params, "fillComplete parameters");
7547  }
7548 
7549  RCP<const map_type> A_rowMap = A.getRowMap ();
7550  RCP<const map_type> B_rowMap = B.getRowMap ();
7551  RCP<const map_type> C_rowMap = B_rowMap; // see discussion in documentation
7552  RCP<crs_matrix_type> C; // The result matrix.
7553 
7554  // If A and B's row Maps are the same, we can compute an upper
7555  // bound on the number of entries in each row of C, before
7556  // actually computing the sum. A reasonable upper bound is the
7557  // sum of the two entry counts in each row.
7558  if (A_rowMap->isSameAs (*B_rowMap)) {
7559  const LO localNumRows = static_cast<LO> (A_rowMap->getLocalNumElements ());
7560  Array<size_t> C_maxNumEntriesPerRow (localNumRows, 0);
7561 
7562  // Get the number of entries in each row of A.
7563  if (alpha != ZERO) {
7564  for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7565  const size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
7566  C_maxNumEntriesPerRow[localRow] += A_numEntries;
7567  }
7568  }
7569  // Get the number of entries in each row of B.
7570  if (beta != ZERO) {
7571  for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7572  const size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
7573  C_maxNumEntriesPerRow[localRow] += B_numEntries;
7574  }
7575  }
7576  // Construct the result matrix C.
7577  if (constructorSublist.is_null ()) {
7578  C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow ()));
7579  } else {
7580  C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow (),
7581  constructorSublist));
7582  }
7583  // Since A and B have the same row Maps, we could add them
7584  // together all at once and merge values before we call
7585  // insertGlobalValues. However, we don't really need to, since
7586  // we've already allocated enough space in each row of C for C
7587  // to do the merge itself.
7588  }
7589  else { // the row Maps of A and B are not the same
7590  // Construct the result matrix C.
7591  // true: !A_rowMap->isSameAs (*B_rowMap)
7592  TEUCHOS_TEST_FOR_EXCEPTION
7593  (true, std::invalid_argument, errPfx << "The row maps must "
7594  "be the same for statically allocated matrices, to ensure "
7595  "that there is sufficient space to do the addition.");
7596  }
7597 
7598  TEUCHOS_TEST_FOR_EXCEPTION
7599  (C.is_null (), std::logic_error,
7600  errPfx << "C should not be null at this point. "
7601  "Please report this bug to the Tpetra developers.");
7602 
7603  if (verbose) {
7604  std::ostringstream os;
7605  os << *prefix << "Compute C = alpha*A + beta*B" << endl;
7606  std::cerr << os.str ();
7607  }
7608  using gids_type = nonconst_global_inds_host_view_type;
7609  using vals_type = nonconst_values_host_view_type;
7610  gids_type ind;
7611  vals_type val;
7612 
7613  if (alpha != ZERO) {
7614  const LO A_localNumRows = static_cast<LO> (A_rowMap->getLocalNumElements ());
7615  for (LO localRow = 0; localRow < A_localNumRows; ++localRow) {
7616  size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
7617  const GO globalRow = A_rowMap->getGlobalElement (localRow);
7618  if (A_numEntries > static_cast<size_t> (ind.size ())) {
7619  Kokkos::resize(ind,A_numEntries);
7620  Kokkos::resize(val,A_numEntries);
7621  }
7622  gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, A_numEntries));
7623  vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, A_numEntries));
7624  A.getGlobalRowCopy (globalRow, indView, valView, A_numEntries);
7625 
7626  if (alpha != ONE) {
7627  for (size_t k = 0; k < A_numEntries; ++k) {
7628  valView[k] *= alpha;
7629  }
7630  }
7631  C->insertGlobalValues (globalRow, A_numEntries,
7632  reinterpret_cast<Scalar *>(valView.data()),
7633  indView.data());
7634  }
7635  }
7636 
7637  if (beta != ZERO) {
7638  const LO B_localNumRows = static_cast<LO> (B_rowMap->getLocalNumElements ());
7639  for (LO localRow = 0; localRow < B_localNumRows; ++localRow) {
7640  size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
7641  const GO globalRow = B_rowMap->getGlobalElement (localRow);
7642  if (B_numEntries > static_cast<size_t> (ind.size ())) {
7643  Kokkos::resize(ind,B_numEntries);
7644  Kokkos::resize(val,B_numEntries);
7645  }
7646  gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, B_numEntries));
7647  vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, B_numEntries));
7648  B.getGlobalRowCopy (globalRow, indView, valView, B_numEntries);
7649 
7650  if (beta != ONE) {
7651  for (size_t k = 0; k < B_numEntries; ++k) {
7652  valView[k] *= beta;
7653  }
7654  }
7655  C->insertGlobalValues (globalRow, B_numEntries,
7656  reinterpret_cast<Scalar *>(valView.data()),
7657  indView.data());
7658  }
7659  }
7660 
7661  if (callFillComplete) {
7662  if (verbose) {
7663  std::ostringstream os;
7664  os << *prefix << "Call fillComplete on C" << endl;
7665  std::cerr << os.str ();
7666  }
7667  if (fillCompleteSublist.is_null ()) {
7668  C->fillComplete (theDomainMap, theRangeMap);
7669  } else {
7670  C->fillComplete (theDomainMap, theRangeMap, fillCompleteSublist);
7671  }
7672  }
7673  else if (verbose) {
7674  std::ostringstream os;
7675  os << *prefix << "Do NOT call fillComplete on C" << endl;
7676  std::cerr << os.str ();
7677  }
7678 
7679  if (verbose) {
7680  std::ostringstream os;
7681  os << *prefix << "Done" << endl;
7682  std::cerr << os.str ();
7683  }
7684  return rcp_implicit_cast<row_matrix_type> (C);
7685  }
7686 
7687 
7688 
7689  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7690  void
7693  const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& rowTransfer,
7694  const Teuchos::RCP<const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node> > & domainTransfer,
7695  const Teuchos::RCP<const map_type>& domainMap,
7696  const Teuchos::RCP<const map_type>& rangeMap,
7697  const Teuchos::RCP<Teuchos::ParameterList>& params) const
7698  {
7699  using Details::Behavior;
7704  using Teuchos::ArrayRCP;
7705  using Teuchos::ArrayView;
7706  using Teuchos::Comm;
7707  using Teuchos::ParameterList;
7708  using Teuchos::RCP;
7709  using std::endl;
7710  typedef LocalOrdinal LO;
7711  typedef GlobalOrdinal GO;
7712  typedef node_type NT;
7713  typedef CrsMatrix<Scalar, LO, GO, NT> this_CRS_type;
7714  typedef Vector<int, LO, GO, NT> IntVectorType;
7715  using Teuchos::as;
7716 
7717  const bool debug = Behavior::debug("CrsMatrix");
7718  const bool verbose = Behavior::verbose("CrsMatrix");
7719  int MyPID = getComm ()->getRank ();
7720 
7721  std::unique_ptr<std::string> verbosePrefix;
7722  if (verbose) {
7723  verbosePrefix =
7724  this->createPrefix("CrsMatrix", "transferAndFillComplete");
7725  std::ostringstream os;
7726  os << "Start" << endl;
7727  std::cerr << os.str();
7728  }
7729 
7730  //
7731  // Get the caller's parameters
7732  //
7733  bool isMM = false; // optimize for matrix-matrix ops.
7734  bool reverseMode = false; // Are we in reverse mode?
7735  bool restrictComm = false; // Do we need to restrict the communicator?
7736 
7737  int mm_optimization_core_count =
7738  Behavior::TAFC_OptimizationCoreCount();
7739  RCP<ParameterList> matrixparams; // parameters for the destination matrix
7740  bool overrideAllreduce = false;
7741  bool useKokkosPath = false;
7742  if (! params.is_null ()) {
7743  matrixparams = sublist (params, "CrsMatrix");
7744  reverseMode = params->get ("Reverse Mode", reverseMode);
7745  useKokkosPath = params->get ("TAFC: use kokkos path", useKokkosPath);
7746  restrictComm = params->get ("Restrict Communicator", restrictComm);
7747  auto & slist = params->sublist("matrixmatrix: kernel params",false);
7748  isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
7749  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
7750 
7751  overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
7752  if(getComm()->getSize() < mm_optimization_core_count && isMM) isMM = false;
7753  if(reverseMode) isMM = false;
7754  }
7755 
7756  // Only used in the sparse matrix-matrix multiply (isMM) case.
7757  std::shared_ptr< ::Tpetra::Details::CommRequest> iallreduceRequest;
7758  int mismatch = 0;
7759  int reduced_mismatch = 0;
7760  if (isMM && !overrideAllreduce) {
7761 
7762  // Test for pathological matrix transfer
7763  const bool source_vals = ! getGraph ()->getImporter ().is_null();
7764  const bool target_vals = ! (rowTransfer.getExportLIDs ().size() == 0 ||
7765  rowTransfer.getRemoteLIDs ().size() == 0);
7766  mismatch = (source_vals != target_vals) ? 1 : 0;
7767  iallreduceRequest =
7768  ::Tpetra::Details::iallreduce (mismatch, reduced_mismatch,
7769  Teuchos::REDUCE_MAX, * (getComm ()));
7770  }
7771 
7772 #ifdef HAVE_TPETRA_MMM_TIMINGS
7773  using Teuchos::TimeMonitor;
7774  std::string label;
7775  if(!params.is_null())
7776  label = params->get("Timer Label",label);
7777  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
7778  std::string tlstr;
7779  {
7780  std::ostringstream os;
7781  if(isMM) os<<":MMOpt";
7782  else os<<":MMLegacy";
7783  tlstr = os.str();
7784  }
7785 
7786  Teuchos::TimeMonitor MMall(*TimeMonitor::getNewTimer(prefix + std::string("TAFC All") +tlstr ));
7787 #endif
7788 
7789  // Make sure that the input argument rowTransfer is either an
7790  // Import or an Export. Import and Export are the only two
7791  // subclasses of Transfer that we defined, but users might
7792  // (unwisely, for now at least) decide to implement their own
7793  // subclasses. Exclude this possibility.
7794  const import_type* xferAsImport = dynamic_cast<const import_type*> (&rowTransfer);
7795  const export_type* xferAsExport = dynamic_cast<const export_type*> (&rowTransfer);
7796  TEUCHOS_TEST_FOR_EXCEPTION(
7797  xferAsImport == nullptr && xferAsExport == nullptr, std::invalid_argument,
7798  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' input "
7799  "argument must be either an Import or an Export, and its template "
7800  "parameters must match the corresponding template parameters of the "
7801  "CrsMatrix.");
7802 
7803  // Make sure that the input argument domainTransfer is either an
7804  // Import or an Export. Import and Export are the only two
7805  // subclasses of Transfer that we defined, but users might
7806  // (unwisely, for now at least) decide to implement their own
7807  // subclasses. Exclude this possibility.
7808  Teuchos::RCP<const import_type> xferDomainAsImport = Teuchos::rcp_dynamic_cast<const import_type> (domainTransfer);
7809  Teuchos::RCP<const export_type> xferDomainAsExport = Teuchos::rcp_dynamic_cast<const export_type> (domainTransfer);
7810 
7811  if(! domainTransfer.is_null()) {
7812  TEUCHOS_TEST_FOR_EXCEPTION(
7813  (xferDomainAsImport.is_null() && xferDomainAsExport.is_null()), std::invalid_argument,
7814  "Tpetra::CrsMatrix::transferAndFillComplete: The 'domainTransfer' input "
7815  "argument must be either an Import or an Export, and its template "
7816  "parameters must match the corresponding template parameters of the "
7817  "CrsMatrix.");
7818 
7819  TEUCHOS_TEST_FOR_EXCEPTION(
7820  ( xferAsImport != nullptr || ! xferDomainAsImport.is_null() ) &&
7821  (( xferAsImport != nullptr && xferDomainAsImport.is_null() ) ||
7822  ( xferAsImport == nullptr && ! xferDomainAsImport.is_null() )), std::invalid_argument,
7823  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7824  "arguments must be of the same type (either Import or Export).");
7825 
7826  TEUCHOS_TEST_FOR_EXCEPTION(
7827  ( xferAsExport != nullptr || ! xferDomainAsExport.is_null() ) &&
7828  (( xferAsExport != nullptr && xferDomainAsExport.is_null() ) ||
7829  ( xferAsExport == nullptr && ! xferDomainAsExport.is_null() )), std::invalid_argument,
7830  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7831  "arguments must be of the same type (either Import or Export).");
7832  } // domainTransfer != null
7833 
7834 
7835  // FIXME (mfh 15 May 2014) Wouldn't communication still be needed,
7836  // if the source Map is not distributed but the target Map is?
7837  const bool communication_needed = rowTransfer.getSourceMap ()->isDistributed ();
7838 
7839  // Get the new domain and range Maps. We need some of them for
7840  // error checking, now that we have the reverseMode parameter.
7841  RCP<const map_type> MyRowMap = reverseMode ?
7842  rowTransfer.getSourceMap () : rowTransfer.getTargetMap ();
7843  RCP<const map_type> MyColMap; // create this below
7844  RCP<const map_type> MyDomainMap = ! domainMap.is_null () ?
7845  domainMap : getDomainMap ();
7846  RCP<const map_type> MyRangeMap = ! rangeMap.is_null () ?
7847  rangeMap : getRangeMap ();
7848  RCP<const map_type> BaseRowMap = MyRowMap;
7849  RCP<const map_type> BaseDomainMap = MyDomainMap;
7850 
7851  // If the user gave us a nonnull destMat, then check whether it's
7852  // "pristine." That means that it has no entries.
7853  //
7854  // FIXME (mfh 15 May 2014) If this is not true on all processes,
7855  // then this exception test may hang. It would be better to
7856  // forward an error flag to the next communication phase.
7857  if (! destMat.is_null ()) {
7858  // FIXME (mfh 15 May 2014): The Epetra idiom for checking
7859  // whether a graph or matrix has no entries on the calling
7860  // process, is that it is neither locally nor globally indexed.
7861  // This may change eventually with the Kokkos refactor version
7862  // of Tpetra, so it would be better just to check the quantity
7863  // of interest directly. Note that with the Kokkos refactor
7864  // version of Tpetra, asking for the total number of entries in
7865  // a graph or matrix that is not fill complete might require
7866  // computation (kernel launch), since it is not thread scalable
7867  // to update a count every time an entry is inserted.
7868  const bool NewFlag = ! destMat->getGraph ()->isLocallyIndexed () &&
7869  ! destMat->getGraph ()->isGloballyIndexed ();
7870  TEUCHOS_TEST_FOR_EXCEPTION(
7871  ! NewFlag, std::invalid_argument, "Tpetra::CrsMatrix::"
7872  "transferAndFillComplete: The input argument 'destMat' is only allowed "
7873  "to be nonnull, if its graph is empty (neither locally nor globally "
7874  "indexed).");
7875  // FIXME (mfh 15 May 2014) At some point, we want to change
7876  // graphs and matrices so that their DistObject Map
7877  // (this->getMap()) may differ from their row Map. This will
7878  // make redistribution for 2-D distributions more efficient. I
7879  // hesitate to change this check, because I'm not sure how much
7880  // the code here depends on getMap() and getRowMap() being the
7881  // same.
7882  TEUCHOS_TEST_FOR_EXCEPTION(
7883  ! destMat->getRowMap ()->isSameAs (*MyRowMap), std::invalid_argument,
7884  "Tpetra::CrsMatrix::transferAndFillComplete: The (row) Map of the "
7885  "input argument 'destMat' is not the same as the (row) Map specified "
7886  "by the input argument 'rowTransfer'.");
7887  TEUCHOS_TEST_FOR_EXCEPTION(
7888  ! destMat->checkSizes (*this), std::invalid_argument,
7889  "Tpetra::CrsMatrix::transferAndFillComplete: You provided a nonnull "
7890  "destination matrix, but checkSizes() indicates that it is not a legal "
7891  "legal target for redistribution from the source matrix (*this). This "
7892  "may mean that they do not have the same dimensions.");
7893  }
7894 
7895  // If forward mode (the default), then *this's (row) Map must be
7896  // the same as the source Map of the Transfer. If reverse mode,
7897  // then *this's (row) Map must be the same as the target Map of
7898  // the Transfer.
7899  //
7900  // FIXME (mfh 15 May 2014) At some point, we want to change graphs
7901  // and matrices so that their DistObject Map (this->getMap()) may
7902  // differ from their row Map. This will make redistribution for
7903  // 2-D distributions more efficient. I hesitate to change this
7904  // check, because I'm not sure how much the code here depends on
7905  // getMap() and getRowMap() being the same.
7906  TEUCHOS_TEST_FOR_EXCEPTION(
7907  ! (reverseMode || getRowMap ()->isSameAs (*rowTransfer.getSourceMap ())),
7908  std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
7909  "rowTransfer->getSourceMap() must match this->getRowMap() in forward mode.");
7910  TEUCHOS_TEST_FOR_EXCEPTION(
7911  ! (! reverseMode || getRowMap ()->isSameAs (*rowTransfer.getTargetMap ())),
7912  std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
7913  "rowTransfer->getTargetMap() must match this->getRowMap() in reverse mode.");
7914 
7915  // checks for domainTransfer
7916  TEUCHOS_TEST_FOR_EXCEPTION(
7917  ! xferDomainAsImport.is_null() && ! xferDomainAsImport->getTargetMap()->isSameAs(*domainMap),
7918  std::invalid_argument,
7919  "Tpetra::CrsMatrix::transferAndFillComplete: The target map of the 'domainTransfer' input "
7920  "argument must be the same as the rebalanced domain map 'domainMap'");
7921 
7922  TEUCHOS_TEST_FOR_EXCEPTION(
7923  ! xferDomainAsExport.is_null() && ! xferDomainAsExport->getSourceMap()->isSameAs(*domainMap),
7924  std::invalid_argument,
7925  "Tpetra::CrsMatrix::transferAndFillComplete: The source map of the 'domainTransfer' input "
7926  "argument must be the same as the rebalanced domain map 'domainMap'");
7927 
7928  // The basic algorithm here is:
7929  //
7930  // 1. Call the moral equivalent of "Distor.do" to handle the import.
7931  // 2. Copy all the Imported and Copy/Permuted data into the raw
7932  // CrsMatrix / CrsGraphData pointers, still using GIDs.
7933  // 3. Call an optimized version of MakeColMap that avoids the
7934  // Directory lookups (since the importer knows who owns all the
7935  // GIDs) AND reindexes to LIDs.
7936  // 4. Call expertStaticFillComplete()
7937 
7938  // Get information from the Importer
7939  const size_t NumSameIDs = rowTransfer.getNumSameIDs();
7940  ArrayView<const LO> ExportLIDs = reverseMode ?
7941  rowTransfer.getRemoteLIDs () : rowTransfer.getExportLIDs ();
7942  auto RemoteLIDs = reverseMode ?
7943  rowTransfer.getExportLIDs_dv() : rowTransfer.getRemoteLIDs_dv();
7944  auto PermuteToLIDs = reverseMode ?
7945  rowTransfer.getPermuteFromLIDs_dv() : rowTransfer.getPermuteToLIDs_dv();
7946  auto PermuteFromLIDs = reverseMode ?
7947  rowTransfer.getPermuteToLIDs_dv() : rowTransfer.getPermuteFromLIDs_dv();
7948  Distributor& Distor = rowTransfer.getDistributor ();
7949 
7950  // Owning PIDs
7951  Teuchos::Array<int> SourcePids;
7952 
7953  // Temp variables for sub-communicators
7954  RCP<const map_type> ReducedRowMap, ReducedColMap,
7955  ReducedDomainMap, ReducedRangeMap;
7956  RCP<const Comm<int> > ReducedComm;
7957 
7958  // If the user gave us a null destMat, then construct the new
7959  // destination matrix. We will replace its column Map later.
7960  if (destMat.is_null ()) {
7961  destMat = rcp (new this_CRS_type (MyRowMap, 0, matrixparams));
7962  }
7963 
7964  /***************************************************/
7965  /***** 1) First communicator restriction phase ****/
7966  /***************************************************/
7967  if (restrictComm) {
7968 #ifdef HAVE_TPETRA_MMM_TIMINGS
7969  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrictComm")));
7970 #endif
7971  ReducedRowMap = MyRowMap->removeEmptyProcesses ();
7972  ReducedComm = ReducedRowMap.is_null () ?
7973  Teuchos::null :
7974  ReducedRowMap->getComm ();
7975  destMat->removeEmptyProcessesInPlace (ReducedRowMap);
7976 
7977  ReducedDomainMap = MyRowMap.getRawPtr () == MyDomainMap.getRawPtr () ?
7978  ReducedRowMap :
7979  MyDomainMap->replaceCommWithSubset (ReducedComm);
7980  ReducedRangeMap = MyRowMap.getRawPtr () == MyRangeMap.getRawPtr () ?
7981  ReducedRowMap :
7982  MyRangeMap->replaceCommWithSubset (ReducedComm);
7983 
7984  // Reset the "my" maps
7985  MyRowMap = ReducedRowMap;
7986  MyDomainMap = ReducedDomainMap;
7987  MyRangeMap = ReducedRangeMap;
7988 
7989  // Update my PID, if we've restricted the communicator
7990  if (! ReducedComm.is_null ()) {
7991  MyPID = ReducedComm->getRank ();
7992  }
7993  else {
7994  MyPID = -2; // For debugging
7995  }
7996  }
7997  else {
7998  ReducedComm = MyRowMap->getComm ();
7999  }
8000 
8001 
8002 
8003  /***************************************************/
8004  /***** 2) From Tpetra::DistObject::doTransfer() ****/
8005  /***************************************************/
8006  // Get the owning PIDs
8007  RCP<const import_type> MyImporter = getGraph ()->getImporter ();
8008 
8009  // check whether domain maps of source matrix and base domain map is the same
8010  bool bSameDomainMap = BaseDomainMap->isSameAs (*getDomainMap ());
8011 
8012  if (! restrictComm && ! MyImporter.is_null () && bSameDomainMap ) {
8013 #ifdef HAVE_TPETRA_MMM_TIMINGS
8014  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs same map")));
8015 #endif
8016  // Same domain map as source matrix
8017  //
8018  // NOTE: This won't work for restrictComm (because the Import
8019  // doesn't know the restricted PIDs), though writing an
8020  // optimized version for that case would be easy (Import an
8021  // IntVector of the new PIDs). Might want to add this later.
8022  Import_Util::getPids (*MyImporter, SourcePids, false);
8023  }
8024  else if (restrictComm && ! MyImporter.is_null () && bSameDomainMap) {
8025  // Same domain map as source matrix (restricted communicator)
8026  // We need one import from the domain to the column map
8027 #ifdef HAVE_TPETRA_MMM_TIMINGS
8028  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs restricted comm")));
8029 #endif
8030  IntVectorType SourceDomain_pids(getDomainMap (),true);
8031  IntVectorType SourceCol_pids(getColMap());
8032  // SourceDomain_pids contains the restricted pids
8033  SourceDomain_pids.putScalar(MyPID);
8034 
8035  SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8036  SourcePids.resize (getColMap ()->getLocalNumElements ());
8037  SourceCol_pids.get1dCopy (SourcePids ());
8038  }
8039  else if (MyImporter.is_null ()) {
8040  // Matrix has no off-process entries
8041 #ifdef HAVE_TPETRA_MMM_TIMINGS
8042  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs all local entries")));
8043 #endif
8044  SourcePids.resize (getColMap ()->getLocalNumElements ());
8045  SourcePids.assign (getColMap ()->getLocalNumElements (), MyPID);
8046  }
8047  else if ( ! MyImporter.is_null () &&
8048  ! domainTransfer.is_null () ) {
8049  // general implementation for rectangular matrices with
8050  // domain map different than SourceMatrix domain map.
8051  // User has to provide a DomainTransfer object. We need
8052  // two communications (import/export)
8053 #ifdef HAVE_TPETRA_MMM_TIMINGS
8054  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs rectangular case")));
8055 #endif
8056 
8057  // TargetDomain_pids lives on the rebalanced new domain map
8058  IntVectorType TargetDomain_pids (domainMap);
8059  TargetDomain_pids.putScalar (MyPID);
8060 
8061  // SourceDomain_pids lives on the non-rebalanced old domain map
8062  IntVectorType SourceDomain_pids (getDomainMap ());
8063 
8064  // SourceCol_pids lives on the non-rebalanced old column map
8065  IntVectorType SourceCol_pids (getColMap ());
8066 
8067  if (! reverseMode && ! xferDomainAsImport.is_null() ) {
8068  SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8069  }
8070  else if (reverseMode && ! xferDomainAsExport.is_null() ) {
8071  SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8072  }
8073  else if (! reverseMode && ! xferDomainAsExport.is_null() ) {
8074  SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8075  }
8076  else if (reverseMode && ! xferDomainAsImport.is_null() ) {
8077  SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8078  }
8079  else {
8080  TEUCHOS_TEST_FOR_EXCEPTION(
8081  true, std::logic_error, "Tpetra::CrsMatrix::"
8082  "transferAndFillComplete: Should never get here! "
8083  "Please report this bug to a Tpetra developer.");
8084  }
8085  SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8086  SourcePids.resize (getColMap ()->getLocalNumElements ());
8087  SourceCol_pids.get1dCopy (SourcePids ());
8088  }
8089  else if ( ! MyImporter.is_null () &&
8090  BaseDomainMap->isSameAs (*BaseRowMap) &&
8091  getDomainMap ()->isSameAs (*getRowMap ())) {
8092  // We can use the rowTransfer + SourceMatrix's Import to find out who owns what.
8093 #ifdef HAVE_TPETRA_MMM_TIMINGS
8094  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs query import")));
8095 #endif
8096 
8097  IntVectorType TargetRow_pids (domainMap);
8098  IntVectorType SourceRow_pids (getRowMap ());
8099  IntVectorType SourceCol_pids (getColMap ());
8100 
8101  TargetRow_pids.putScalar (MyPID);
8102  if (! reverseMode && xferAsImport != nullptr) {
8103  SourceRow_pids.doExport (TargetRow_pids, *xferAsImport, INSERT);
8104  }
8105  else if (reverseMode && xferAsExport != nullptr) {
8106  SourceRow_pids.doExport (TargetRow_pids, *xferAsExport, INSERT);
8107  }
8108  else if (! reverseMode && xferAsExport != nullptr) {
8109  SourceRow_pids.doImport (TargetRow_pids, *xferAsExport, INSERT);
8110  }
8111  else if (reverseMode && xferAsImport != nullptr) {
8112  SourceRow_pids.doImport (TargetRow_pids, *xferAsImport, INSERT);
8113  }
8114  else {
8115  TEUCHOS_TEST_FOR_EXCEPTION(
8116  true, std::logic_error, "Tpetra::CrsMatrix::"
8117  "transferAndFillComplete: Should never get here! "
8118  "Please report this bug to a Tpetra developer.");
8119  }
8120 
8121  SourceCol_pids.doImport (SourceRow_pids, *MyImporter, INSERT);
8122  SourcePids.resize (getColMap ()->getLocalNumElements ());
8123  SourceCol_pids.get1dCopy (SourcePids ());
8124  }
8125  else {
8126  TEUCHOS_TEST_FOR_EXCEPTION(
8127  true, std::invalid_argument, "Tpetra::CrsMatrix::"
8128  "transferAndFillComplete: This method only allows either domainMap == "
8129  "getDomainMap (), or (domainMap == rowTransfer.getTargetMap () and "
8130  "getDomainMap () == getRowMap ()).");
8131  }
8132 
8133  // Tpetra-specific stuff
8134  size_t constantNumPackets = destMat->constantNumberOfPackets ();
8135  {
8136 #ifdef HAVE_TPETRA_MMM_TIMINGS
8137  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC reallocate buffers")));
8138 #endif
8139  if (constantNumPackets == 0) {
8140  destMat->reallocArraysForNumPacketsPerLid (ExportLIDs.size (),
8141  RemoteLIDs.view_host().size ());
8142  }
8143  else {
8144  // There are a constant number of packets per element. We
8145  // already know (from the number of "remote" (incoming)
8146  // elements) how many incoming elements we expect, so we can
8147  // resize the buffer accordingly.
8148  const size_t rbufLen = RemoteLIDs.view_host().size() * constantNumPackets;
8149  destMat->reallocImportsIfNeeded (rbufLen, false, nullptr);
8150  }
8151  }
8152 
8153  // Pack & Prepare w/ owning PIDs
8154  {
8155 #ifdef HAVE_TPETRA_MMM_TIMINGS
8156  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC pack and prepare")));
8157 #endif
8158  if (debug) {
8159  using Teuchos::outArg;
8160  using Teuchos::REDUCE_MAX;
8161  using Teuchos::reduceAll;
8162  using std::cerr;
8163  using std::endl;
8164  RCP<const Teuchos::Comm<int> > comm = this->getComm ();
8165  const int myRank = comm->getRank ();
8166 
8167  std::ostringstream errStrm;
8168  int lclErr = 0;
8169  int gblErr = 0;
8170 
8171  Teuchos::ArrayView<size_t> numExportPacketsPerLID;
8172  try {
8173  // packAndPrepare* methods modify numExportPacketsPerLID_.
8174  destMat->numExportPacketsPerLID_.modify_host ();
8175  numExportPacketsPerLID =
8176  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8177  }
8178  catch (std::exception& e) {
8179  errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw: "
8180  << e.what () << std::endl;
8181  lclErr = 1;
8182  }
8183  catch (...) {
8184  errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw "
8185  "an exception not a subclass of std::exception" << std::endl;
8186  lclErr = 1;
8187  }
8188 
8189  if (! comm.is_null ()) {
8190  reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
8191  }
8192  if (gblErr != 0) {
8193  ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
8194  TEUCHOS_TEST_FOR_EXCEPTION(
8195  true, std::runtime_error, "getArrayViewFromDualView threw an "
8196  "exception on at least one process.");
8197  }
8198 
8199  if (verbose) {
8200  std::ostringstream os;
8201  os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
8202  << std::endl;
8203  std::cerr << os.str ();
8204  }
8205  try {
8207  destMat->exports_,
8208  numExportPacketsPerLID,
8209  ExportLIDs,
8210  SourcePids,
8211  constantNumPackets);
8212  }
8213  catch (std::exception& e) {
8214  errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw: "
8215  << e.what () << std::endl;
8216  lclErr = 1;
8217  }
8218  catch (...) {
8219  errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw "
8220  "an exception not a subclass of std::exception" << std::endl;
8221  lclErr = 1;
8222  }
8223 
8224  if (verbose) {
8225  std::ostringstream os;
8226  os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
8227  << std::endl;
8228  std::cerr << os.str ();
8229  }
8230 
8231  if (! comm.is_null ()) {
8232  reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
8233  }
8234  if (gblErr != 0) {
8235  ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
8236  TEUCHOS_TEST_FOR_EXCEPTION(
8237  true, std::runtime_error, "packCrsMatrixWithOwningPIDs threw an "
8238  "exception on at least one process.");
8239  }
8240  }
8241  else {
8242  // packAndPrepare* methods modify numExportPacketsPerLID_.
8243  destMat->numExportPacketsPerLID_.modify_host ();
8244  Teuchos::ArrayView<size_t> numExportPacketsPerLID =
8245  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8246  if (verbose) {
8247  std::ostringstream os;
8248  os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
8249  << std::endl;
8250  std::cerr << os.str ();
8251  }
8253  destMat->exports_,
8254  numExportPacketsPerLID,
8255  ExportLIDs,
8256  SourcePids,
8257  constantNumPackets);
8258  if (verbose) {
8259  std::ostringstream os;
8260  os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
8261  << std::endl;
8262  std::cerr << os.str ();
8263  }
8264  }
8265  }
8266 
8267  // Do the exchange of remote data.
8268  {
8269 #ifdef HAVE_TPETRA_MMM_TIMINGS
8270  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs exchange remote data")));
8271 #endif
8272  if (! communication_needed) {
8273  if (verbose) {
8274  std::ostringstream os;
8275  os << *verbosePrefix << "Communication not needed" << std::endl;
8276  std::cerr << os.str ();
8277  }
8278  }
8279  else {
8280  if (reverseMode) {
8281  if (constantNumPackets == 0) { // variable number of packets per LID
8282  if (verbose) {
8283  std::ostringstream os;
8284  os << *verbosePrefix << "Reverse mode, variable # packets / LID"
8285  << std::endl;
8286  std::cerr << os.str ();
8287  }
8288  // Make sure that host has the latest version, since we're
8289  // using the version on host. If host has the latest
8290  // version, syncing to host does nothing.
8291  destMat->numExportPacketsPerLID_.sync_host ();
8292  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
8293  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8294  destMat->numImportPacketsPerLID_.sync_host ();
8295  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
8296  getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
8297 
8298  if (verbose) {
8299  std::ostringstream os;
8300  os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
8301  << std::endl;
8302  std::cerr << os.str ();
8303  }
8304  Distor.doReversePostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1,
8305  destMat->numImportPacketsPerLID_.view_host());
8306  if (verbose) {
8307  std::ostringstream os;
8308  os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
8309  << std::endl;
8310  std::cerr << os.str ();
8311  }
8312 
8313  size_t totalImportPackets = 0;
8314  for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
8315  totalImportPackets += numImportPacketsPerLID[i];
8316  }
8317 
8318  // Reallocation MUST go before setting the modified flag,
8319  // because it may clear out the flags.
8320  destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
8321  verbosePrefix.get ());
8322  destMat->imports_.modify_host ();
8323  auto hostImports = destMat->imports_.view_host();
8324  // This is a legacy host pack/unpack path, so use the host
8325  // version of exports_.
8326  destMat->exports_.sync_host ();
8327  auto hostExports = destMat->exports_.view_host();
8328  if (verbose) {
8329  std::ostringstream os;
8330  os << *verbosePrefix << "Calling 4-arg doReversePostsAndWaits"
8331  << std::endl;
8332  std::cerr << os.str ();
8333  }
8334  Distor.doReversePostsAndWaits (hostExports,
8335  numExportPacketsPerLID,
8336  hostImports,
8337  numImportPacketsPerLID);
8338  if (verbose) {
8339  std::ostringstream os;
8340  os << *verbosePrefix << "Finished 4-arg doReversePostsAndWaits"
8341  << std::endl;
8342  std::cerr << os.str ();
8343  }
8344  }
8345  else { // constant number of packets per LID
8346  if (verbose) {
8347  std::ostringstream os;
8348  os << *verbosePrefix << "Reverse mode, constant # packets / LID"
8349  << std::endl;
8350  std::cerr << os.str ();
8351  }
8352  destMat->imports_.modify_host ();
8353  auto hostImports = destMat->imports_.view_host();
8354  // This is a legacy host pack/unpack path, so use the host
8355  // version of exports_.
8356  destMat->exports_.sync_host ();
8357  auto hostExports = destMat->exports_.view_host();
8358  if (verbose) {
8359  std::ostringstream os;
8360  os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
8361  << std::endl;
8362  std::cerr << os.str ();
8363  }
8364  Distor.doReversePostsAndWaits (hostExports,
8365  constantNumPackets,
8366  hostImports);
8367  if (verbose) {
8368  std::ostringstream os;
8369  os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
8370  << std::endl;
8371  std::cerr << os.str ();
8372  }
8373  }
8374  }
8375  else { // forward mode (the default)
8376  if (constantNumPackets == 0) { // variable number of packets per LID
8377  if (verbose) {
8378  std::ostringstream os;
8379  os << *verbosePrefix << "Forward mode, variable # packets / LID"
8380  << std::endl;
8381  std::cerr << os.str ();
8382  }
8383  // Make sure that host has the latest version, since we're
8384  // using the version on host. If host has the latest
8385  // version, syncing to host does nothing.
8386  destMat->numExportPacketsPerLID_.sync_host ();
8387  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
8388  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8389  destMat->numImportPacketsPerLID_.sync_host ();
8390  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
8391  getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
8392  if (verbose) {
8393  std::ostringstream os;
8394  os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8395  << std::endl;
8396  std::cerr << os.str ();
8397  }
8398  Distor.doPostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1,
8399  destMat->numImportPacketsPerLID_.view_host());
8400  if (verbose) {
8401  std::ostringstream os;
8402  os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8403  << std::endl;
8404  std::cerr << os.str ();
8405  }
8406 
8407  size_t totalImportPackets = 0;
8408  for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
8409  totalImportPackets += numImportPacketsPerLID[i];
8410  }
8411 
8412  // Reallocation MUST go before setting the modified flag,
8413  // because it may clear out the flags.
8414  destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
8415  verbosePrefix.get ());
8416  destMat->imports_.modify_host ();
8417  auto hostImports = destMat->imports_.view_host();
8418  // This is a legacy host pack/unpack path, so use the host
8419  // version of exports_.
8420  destMat->exports_.sync_host ();
8421  auto hostExports = destMat->exports_.view_host();
8422  if (verbose) {
8423  std::ostringstream os;
8424  os << *verbosePrefix << "Calling 4-arg doPostsAndWaits"
8425  << std::endl;
8426  std::cerr << os.str ();
8427  }
8428  Distor.doPostsAndWaits (hostExports,
8429  numExportPacketsPerLID,
8430  hostImports,
8431  numImportPacketsPerLID);
8432  if (verbose) {
8433  std::ostringstream os;
8434  os << *verbosePrefix << "Finished 4-arg doPostsAndWaits"
8435  << std::endl;
8436  std::cerr << os.str ();
8437  }
8438  }
8439  else { // constant number of packets per LID
8440  if (verbose) {
8441  std::ostringstream os;
8442  os << *verbosePrefix << "Forward mode, constant # packets / LID"
8443  << std::endl;
8444  std::cerr << os.str ();
8445  }
8446  destMat->imports_.modify_host ();
8447  auto hostImports = destMat->imports_.view_host();
8448  // This is a legacy host pack/unpack path, so use the host
8449  // version of exports_.
8450  destMat->exports_.sync_host ();
8451  auto hostExports = destMat->exports_.view_host();
8452  if (verbose) {
8453  std::ostringstream os;
8454  os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8455  << std::endl;
8456  std::cerr << os.str ();
8457  }
8458  Distor.doPostsAndWaits (hostExports,
8459  constantNumPackets,
8460  hostImports);
8461  if (verbose) {
8462  std::ostringstream os;
8463  os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8464  << std::endl;
8465  std::cerr << os.str ();
8466  }
8467  }
8468  }
8469  }
8470  }
8471 
8472  /*********************************************************************/
8473  /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/
8474  /*********************************************************************/
8475 
8476  bool runOnHost = std::is_same_v<typename device_type::memory_space, Kokkos::HostSpace> && !useKokkosPath;
8477 
8478  Teuchos::Array<int> RemotePids;
8479  if (runOnHost) {
8480  Teuchos::Array<int> TargetPids;
8481  // Backwards compatibility measure. We'll use this again below.
8482 
8483  // TODO JHU Need to track down why numImportPacketsPerLID_ has not been correctly marked as modified on host (which it has been)
8484  // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
8485  // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
8486  destMat->numImportPacketsPerLID_.modify_host(); //FIXME
8487 
8488 # ifdef HAVE_TPETRA_MMM_TIMINGS
8489  RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
8490 # endif
8491  ArrayRCP<size_t> CSR_rowptr;
8492  ArrayRCP<GO> CSR_colind_GID;
8493  ArrayRCP<LO> CSR_colind_LID;
8494  ArrayRCP<Scalar> CSR_vals;
8495 
8496  destMat->imports_.sync_device ();
8497  destMat->numImportPacketsPerLID_.sync_device ();
8498 
8499  size_t N = BaseRowMap->getLocalNumElements ();
8500 
8501  auto RemoteLIDs_d = RemoteLIDs.view_device();
8502  auto PermuteToLIDs_d = PermuteToLIDs.view_device();
8503  auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();
8504 
8506  *this,
8507  RemoteLIDs_d,
8508  destMat->imports_.view_device(), //hostImports
8509  destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID
8510  NumSameIDs,
8511  PermuteToLIDs_d,
8512  PermuteFromLIDs_d,
8513  N,
8514  MyPID,
8515  CSR_rowptr,
8516  CSR_colind_GID,
8517  CSR_vals,
8518  SourcePids(),
8519  TargetPids);
8520 
8521  // If LO and GO are the same, we can reuse memory when
8522  // converting the column indices from global to local indices.
8523  if (typeid (LO) == typeid (GO)) {
8524  CSR_colind_LID = Teuchos::arcp_reinterpret_cast<LO> (CSR_colind_GID);
8525  }
8526  else {
8527  CSR_colind_LID.resize (CSR_colind_GID.size());
8528  }
8529  CSR_colind_LID.resize (CSR_colind_GID.size());
8530 
8531  // On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally
8532  // owned entries. Convert them to the actual PID.
8533  // JHU FIXME This can be done within unpackAndCombineIntoCrsArrays with a parallel_for.
8534  for(size_t i=0; i<static_cast<size_t>(TargetPids.size()); i++)
8535  {
8536  if(TargetPids[i] == -1) TargetPids[i] = MyPID;
8537  }
8538 #ifdef HAVE_TPETRA_MMM_TIMINGS
8539  tmCopySPRdata = Teuchos::null;
8540 #endif
8541  /**************************************************************/
8542  /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8543  /**************************************************************/
8544  // Call an optimized version of makeColMap that avoids the
8545  // Directory lookups (since the Import object knows who owns all
8546  // the GIDs).
8547  if (verbose) {
8548  std::ostringstream os;
8549  os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8550  << std::endl;
8551  std::cerr << os.str ();
8552  }
8553  {
8554 #ifdef HAVE_TPETRA_MMM_TIMINGS
8555  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC makeColMap")));
8556 #endif
8557  Import_Util::lowCommunicationMakeColMapAndReindexSerial(CSR_rowptr (),
8558  CSR_colind_LID (),
8559  CSR_colind_GID (),
8560  BaseDomainMap,
8561  TargetPids,
8562  RemotePids,
8563  MyColMap);
8564  }
8565 
8566  if (verbose) {
8567  std::ostringstream os;
8568  os << *verbosePrefix << "restrictComm="
8569  << (restrictComm ? "true" : "false") << std::endl;
8570  std::cerr << os.str ();
8571  }
8572 
8573  /*******************************************************/
8574  /**** 4) Second communicator restriction phase ****/
8575  /*******************************************************/
8576  {
8577 #ifdef HAVE_TPETRA_MMM_TIMINGS
8578  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrict colmap")));
8579 #endif
8580  if (restrictComm) {
8581  ReducedColMap = (MyRowMap.getRawPtr () == MyColMap.getRawPtr ()) ?
8582  ReducedRowMap :
8583  MyColMap->replaceCommWithSubset (ReducedComm);
8584  MyColMap = ReducedColMap; // Reset the "my" maps
8585  }
8586 
8587  // Replace the col map
8588  if (verbose) {
8589  std::ostringstream os;
8590  os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8591  std::cerr << os.str ();
8592  }
8593  destMat->replaceColMap (MyColMap);
8594 
8595  // Short circuit if the processor is no longer in the communicator
8596  //
8597  // NOTE: Epetra modifies all "removed" processes so they
8598  // have a dummy (serial) Map that doesn't touch the original
8599  // communicator. Duplicating that here might be a good idea.
8600  if (ReducedComm.is_null ()) {
8601  if (verbose) {
8602  std::ostringstream os;
8603  os << *verbosePrefix << "I am no longer in the communicator; "
8604  "returning" << std::endl;
8605  std::cerr << os.str ();
8606  }
8607  return;
8608  }
8609  }
8610 
8611  /***************************************************/
8612  /**** 5) Sort ****/
8613  /***************************************************/
8614  if ((! reverseMode && xferAsImport != nullptr) ||
8615  (reverseMode && xferAsExport != nullptr)) {
8616  if (verbose) {
8617  std::ostringstream os;
8618  os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8619  std::cerr << os.str ();
8620  }
8621 #ifdef HAVE_TPETRA_MMM_TIMINGS
8622  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortCrsEntries")));
8623 #endif
8624  Import_Util::sortCrsEntries (CSR_rowptr(),
8625  CSR_colind_LID(),
8626  CSR_vals());
8627  }
8628  else if ((! reverseMode && xferAsExport != nullptr) ||
8629  (reverseMode && xferAsImport != nullptr)) {
8630  if (verbose) {
8631  std::ostringstream os;
8632  os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8633  << endl;
8634  std::cerr << os.str();
8635  }
8636 #ifdef HAVE_TPETRA_MMM_TIMINGS
8637  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries")));
8638 #endif
8639  Import_Util::sortAndMergeCrsEntries (CSR_rowptr(),
8640  CSR_colind_LID(),
8641  CSR_vals());
8642  if (CSR_rowptr[N] != static_cast<size_t>(CSR_vals.size())) {
8643  CSR_colind_LID.resize (CSR_rowptr[N]);
8644  CSR_vals.resize (CSR_rowptr[N]);
8645  }
8646  }
8647  else {
8648  TEUCHOS_TEST_FOR_EXCEPTION(
8649  true, std::logic_error, "Tpetra::CrsMatrix::"
8650  "transferAndFillComplete: Should never get here! "
8651  "Please report this bug to a Tpetra developer.");
8652  }
8653  /***************************************************/
8654  /**** 6) Reset the colmap and the arrays ****/
8655  /***************************************************/
8656 
8657  if (verbose) {
8658  std::ostringstream os;
8659  os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8660  std::cerr << os.str ();
8661  }
8662 
8663  // Call constructor for the new matrix (restricted as needed)
8664  //
8665  // NOTE (mfh 15 May 2014) This should work fine for the Kokkos
8666  // refactor version of CrsMatrix, though it reserves the right to
8667  // make a deep copy of the arrays.
8668  {
8669 #ifdef HAVE_TPETRA_MMM_TIMINGS
8670  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC setAllValues")));
8671 #endif
8672  destMat->setAllValues (CSR_rowptr, CSR_colind_LID, CSR_vals);
8673  }
8674 
8675  } else {
8676  // run on device
8677 
8678 
8679  // Backwards compatibility measure. We'll use this again below.
8680 
8681  // TODO JHU Need to track down why numImportPacketsPerLID_ has not been correctly marked as modified on host (which it has been)
8682  // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
8683  // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
8684  destMat->numImportPacketsPerLID_.modify_host(); //FIXME
8685 
8686 # ifdef HAVE_TPETRA_MMM_TIMINGS
8687  RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
8688 # endif
8689  ArrayRCP<size_t> CSR_rowptr;
8690  ArrayRCP<GO> CSR_colind_GID;
8691  ArrayRCP<LO> CSR_colind_LID;
8692  ArrayRCP<Scalar> CSR_vals;
8693 
8694  destMat->imports_.sync_device ();
8695  destMat->numImportPacketsPerLID_.sync_device ();
8696 
8697  size_t N = BaseRowMap->getLocalNumElements ();
8698 
8699  auto RemoteLIDs_d = RemoteLIDs.view_device();
8700  auto PermuteToLIDs_d = PermuteToLIDs.view_device();
8701  auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();
8702 
8703  Kokkos::View<size_t*,device_type> CSR_rowptr_d;
8704  Kokkos::View<GO*,device_type> CSR_colind_GID_d;
8705  Kokkos::View<LO*,device_type> CSR_colind_LID_d;
8706  Kokkos::View<impl_scalar_type*,device_type> CSR_vals_d;
8707  Kokkos::View<int*,device_type> TargetPids_d;
8708 
8710  *this,
8711  RemoteLIDs_d,
8712  destMat->imports_.view_device(), //hostImports
8713  destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID
8714  NumSameIDs,
8715  PermuteToLIDs_d,
8716  PermuteFromLIDs_d,
8717  N,
8718  MyPID,
8719  CSR_rowptr_d,
8720  CSR_colind_GID_d,
8721  CSR_vals_d,
8722  SourcePids(),
8723  TargetPids_d);
8724 
8725  Kokkos::resize (CSR_colind_LID_d, CSR_colind_GID_d.size());
8726 
8727 #ifdef HAVE_TPETRA_MMM_TIMINGS
8728  tmCopySPRdata = Teuchos::null;
8729 #endif
8730  /**************************************************************/
8731  /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8732  /**************************************************************/
8733  // Call an optimized version of makeColMap that avoids the
8734  // Directory lookups (since the Import object knows who owns all
8735  // the GIDs).
8736  if (verbose) {
8737  std::ostringstream os;
8738  os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8739  << std::endl;
8740  std::cerr << os.str ();
8741  }
8742  {
8743 #ifdef HAVE_TPETRA_MMM_TIMINGS
8744  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC makeColMap")));
8745 #endif
8746  Import_Util::lowCommunicationMakeColMapAndReindex(CSR_rowptr_d,
8747  CSR_colind_LID_d,
8748  CSR_colind_GID_d,
8749  BaseDomainMap,
8750  TargetPids_d,
8751  RemotePids,
8752  MyColMap);
8753  }
8754 
8755  if (verbose) {
8756  std::ostringstream os;
8757  os << *verbosePrefix << "restrictComm="
8758  << (restrictComm ? "true" : "false") << std::endl;
8759  std::cerr << os.str ();
8760  }
8761 
8762  /*******************************************************/
8763  /**** 4) Second communicator restriction phase ****/
8764  /*******************************************************/
8765  {
8766 #ifdef HAVE_TPETRA_MMM_TIMINGS
8767  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrict colmap")));
8768 #endif
8769  if (restrictComm) {
8770  ReducedColMap = (MyRowMap.getRawPtr () == MyColMap.getRawPtr ()) ?
8771  ReducedRowMap :
8772  MyColMap->replaceCommWithSubset (ReducedComm);
8773  MyColMap = ReducedColMap; // Reset the "my" maps
8774  }
8775 
8776  // Replace the col map
8777  if (verbose) {
8778  std::ostringstream os;
8779  os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8780  std::cerr << os.str ();
8781  }
8782  destMat->replaceColMap (MyColMap);
8783 
8784  // Short circuit if the processor is no longer in the communicator
8785  //
8786  // NOTE: Epetra modifies all "removed" processes so they
8787  // have a dummy (serial) Map that doesn't touch the original
8788  // communicator. Duplicating that here might be a good idea.
8789  if (ReducedComm.is_null ()) {
8790  if (verbose) {
8791  std::ostringstream os;
8792  os << *verbosePrefix << "I am no longer in the communicator; "
8793  "returning" << std::endl;
8794  std::cerr << os.str ();
8795  }
8796  return;
8797  }
8798  }
8799 
8800  /***************************************************/
8801  /**** 5) Sort ****/
8802  /***************************************************/
8803 
8804  if ((! reverseMode && xferAsImport != nullptr) ||
8805  (reverseMode && xferAsExport != nullptr)) {
8806  if (verbose) {
8807  std::ostringstream os;
8808  os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8809  std::cerr << os.str ();
8810  }
8811 #ifdef HAVE_TPETRA_MMM_TIMINGS
8812  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortCrsEntries")));
8813 #endif
8814  Import_Util::sortCrsEntries (CSR_rowptr_d,
8815  CSR_colind_LID_d,
8816  CSR_vals_d);
8817  }
8818  else if ((! reverseMode && xferAsExport != nullptr) ||
8819  (reverseMode && xferAsImport != nullptr)) {
8820  if (verbose) {
8821  std::ostringstream os;
8822  os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8823  << endl;
8824  std::cerr << os.str();
8825  }
8826 #ifdef HAVE_TPETRA_MMM_TIMINGS
8827  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries")));
8828 #endif
8829  Import_Util::sortAndMergeCrsEntries (CSR_rowptr_d,
8830  CSR_colind_LID_d,
8831  CSR_vals_d);
8832  }
8833  else {
8834  TEUCHOS_TEST_FOR_EXCEPTION(
8835  true, std::logic_error, "Tpetra::CrsMatrix::"
8836  "transferAndFillComplete: Should never get here! "
8837  "Please report this bug to a Tpetra developer.");
8838  }
8839 
8840  /***************************************************/
8841  /**** 6) Reset the colmap and the arrays ****/
8842  /***************************************************/
8843 
8844  if (verbose) {
8845  std::ostringstream os;
8846  os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8847  std::cerr << os.str ();
8848  }
8849 
8850  {
8851 #ifdef HAVE_TPETRA_MMM_TIMINGS
8852  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC setAllValues")));
8853 #endif
8854  destMat->setAllValues (CSR_rowptr_d, CSR_colind_LID_d, CSR_vals_d);
8855  }
8856 
8857  } //if (runOnHost) .. else ..
8858 
8859  /***************************************************/
8860  /**** 7) Build Importer & Call ESFC ****/
8861  /***************************************************/
8862 #ifdef HAVE_TPETRA_MMM_TIMINGS
8863  RCP<TimeMonitor> tmIESFC = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC build importer and esfc"))));
8864 #endif
8865  // Pre-build the importer using the existing PIDs
8866  Teuchos::ParameterList esfc_params;
8867 
8868  RCP<import_type> MyImport;
8869 
8870  // Fulfill the non-blocking allreduce on reduced_mismatch.
8871  if (iallreduceRequest.get () != nullptr) {
8872  if (verbose) {
8873  std::ostringstream os;
8874  os << *verbosePrefix << "Calling iallreduceRequest->wait()"
8875  << endl;
8876  std::cerr << os.str ();
8877  }
8878  iallreduceRequest->wait ();
8879  if (reduced_mismatch != 0) {
8880  isMM = false;
8881  }
8882  }
8883 
8884  if( isMM ) {
8885 #ifdef HAVE_TPETRA_MMM_TIMINGS
8886  Teuchos::TimeMonitor MMisMM (*TimeMonitor::getNewTimer(prefix + std::string("isMM Block")));
8887 #endif
8888  // Combine all type1/2/3 lists, [filter them], then call the expert import constructor.
8889 
8890  if (verbose) {
8891  std::ostringstream os;
8892  os << *verbosePrefix << "Getting CRS pointers" << endl;
8893  std::cerr << os.str ();
8894  }
8895 
8896  Teuchos::ArrayRCP<LocalOrdinal> type3LIDs;
8897  Teuchos::ArrayRCP<int> type3PIDs;
8898  auto rowptr = getCrsGraph()->getLocalRowPtrsHost();
8899  auto colind = getCrsGraph()->getLocalIndicesHost();
8900 
8901  if (verbose) {
8902  std::ostringstream os;
8903  os << *verbosePrefix << "Calling reverseNeighborDiscovery" << std::endl;
8904  std::cerr << os.str ();
8905  }
8906 
8907  {
8908 #ifdef HAVE_TPETRA_MMM_TIMINGS
8909  TimeMonitor tm_rnd (*TimeMonitor::getNewTimer(prefix + std::string("isMMrevNeighDis")));
8910 #endif
8911  Import_Util::reverseNeighborDiscovery(*this,
8912  rowptr,
8913  colind,
8914  rowTransfer,
8915  MyImporter,
8916  MyDomainMap,
8917  type3PIDs,
8918  type3LIDs,
8919  ReducedComm);
8920  }
8921 
8922  if (verbose) {
8923  std::ostringstream os;
8924  os << *verbosePrefix << "Done with reverseNeighborDiscovery" << std::endl;
8925  std::cerr << os.str ();
8926  }
8927 
8928  Teuchos::ArrayView<const int> EPID1 = MyImporter.is_null() ? Teuchos::ArrayView<const int>() : MyImporter->getExportPIDs();
8929  Teuchos::ArrayView<const LO> ELID1 = MyImporter.is_null() ? Teuchos::ArrayView<const LO>() : MyImporter->getExportLIDs();
8930 
8931  Teuchos::ArrayView<const int> TEPID2 = rowTransfer.getExportPIDs(); // row matrix
8932  Teuchos::ArrayView<const LO> TELID2 = rowTransfer.getExportLIDs();
8933 
8934  const int numCols = getGraph()->getColMap()->getLocalNumElements(); // may be dup
8935  // from EpetraExt_MMHelpers.cpp: build_type2_exports
8936  std::vector<bool> IsOwned(numCols,true);
8937  std::vector<int> SentTo(numCols,-1);
8938  if (! MyImporter.is_null ()) {
8939  for (auto && rlid : MyImporter->getRemoteLIDs()) { // the remoteLIDs must be from sourcematrix
8940  IsOwned[rlid]=false;
8941  }
8942  }
8943 
8944  std::vector<std::pair<int,GO> > usrtg;
8945  usrtg.reserve(TEPID2.size());
8946 
8947  {
8948  const auto& colMap = * (this->getColMap ()); // *this is sourcematrix
8949  for (Array_size_type i = 0; i < TEPID2.size (); ++i) {
8950  const LO row = TELID2[i];
8951  const int pid = TEPID2[i];
8952  for (auto j = rowptr[row]; j < rowptr[row+1]; ++j) {
8953  const int col = colind[j];
8954  if (IsOwned[col] && SentTo[col] != pid) {
8955  SentTo[col] = pid;
8956  GO gid = colMap.getGlobalElement (col);
8957  usrtg.push_back (std::pair<int,GO> (pid, gid));
8958  }
8959  }
8960  }
8961  }
8962 
8963 // This sort can _not_ be omitted.
8964  std::sort(usrtg.begin(),usrtg.end()); // default comparator does the right thing, now sorted in gid order
8965  auto eopg = std ::unique(usrtg.begin(),usrtg.end());
8966  // 25 Jul 2018: Could just ignore the entries at and after eopg.
8967  usrtg.erase(eopg,usrtg.end());
8968 
8969  const Array_size_type type2_us_size = usrtg.size();
8970  Teuchos::ArrayRCP<int> EPID2=Teuchos::arcp(new int[type2_us_size],0,type2_us_size,true);
8971  Teuchos::ArrayRCP< LO> ELID2=Teuchos::arcp(new LO[type2_us_size],0,type2_us_size,true);
8972 
8973  int pos=0;
8974  for(auto && p : usrtg) {
8975  EPID2[pos]= p.first;
8976  ELID2[pos]= this->getDomainMap()->getLocalElement(p.second);
8977  pos++;
8978  }
8979 
8980  Teuchos::ArrayView<int> EPID3 = type3PIDs();
8981  Teuchos::ArrayView< LO> ELID3 = type3LIDs();
8982  GO InfGID = std::numeric_limits<GO>::max();
8983  int InfPID = INT_MAX;
8984 #ifdef TPETRA_MIN3
8985 # undef TPETRA_MIN3
8986 #endif // TPETRA_MIN3
8987 #define TPETRA_MIN3(x,y,z) ((x)<(y)?(std::min(x,z)):(std::min(y,z)))
8988  int i1=0, i2=0, i3=0;
8989  int Len1 = EPID1.size();
8990  int Len2 = EPID2.size();
8991  int Len3 = EPID3.size();
8992 
8993  int MyLen=Len1+Len2+Len3;
8994  Teuchos::ArrayRCP<LO> userExportLIDs = Teuchos::arcp(new LO[MyLen],0,MyLen,true);
8995  Teuchos::ArrayRCP<int> userExportPIDs = Teuchos::arcp(new int[MyLen],0,MyLen,true);
8996  int iloc = 0; // will be the size of the userExportLID/PIDs
8997 
8998  while(i1 < Len1 || i2 < Len2 || i3 < Len3){
8999  int PID1 = (i1<Len1)?(EPID1[i1]):InfPID;
9000  int PID2 = (i2<Len2)?(EPID2[i2]):InfPID;
9001  int PID3 = (i3<Len3)?(EPID3[i3]):InfPID;
9002 
9003  GO GID1 = (i1<Len1)?getDomainMap()->getGlobalElement(ELID1[i1]):InfGID;
9004  GO GID2 = (i2<Len2)?getDomainMap()->getGlobalElement(ELID2[i2]):InfGID;
9005  GO GID3 = (i3<Len3)?getDomainMap()->getGlobalElement(ELID3[i3]):InfGID;
9006 
9007  int MIN_PID = TPETRA_MIN3(PID1,PID2,PID3);
9008  GO MIN_GID = TPETRA_MIN3( ((PID1==MIN_PID)?GID1:InfGID), ((PID2==MIN_PID)?GID2:InfGID), ((PID3==MIN_PID)?GID3:InfGID));
9009 #ifdef TPETRA_MIN3
9010 # undef TPETRA_MIN3
9011 #endif // TPETRA_MIN3
9012  bool added_entry=false;
9013 
9014  if(PID1 == MIN_PID && GID1 == MIN_GID){
9015  userExportLIDs[iloc]=ELID1[i1];
9016  userExportPIDs[iloc]=EPID1[i1];
9017  i1++;
9018  added_entry=true;
9019  iloc++;
9020  }
9021  if(PID2 == MIN_PID && GID2 == MIN_GID){
9022  if(!added_entry) {
9023  userExportLIDs[iloc]=ELID2[i2];
9024  userExportPIDs[iloc]=EPID2[i2];
9025  added_entry=true;
9026  iloc++;
9027  }
9028  i2++;
9029  }
9030  if(PID3 == MIN_PID && GID3 == MIN_GID){
9031  if(!added_entry) {
9032  userExportLIDs[iloc]=ELID3[i3];
9033  userExportPIDs[iloc]=EPID3[i3];
9034  iloc++;
9035  }
9036  i3++;
9037  }
9038  }
9039 
9040  if (verbose) {
9041  std::ostringstream os;
9042  os << *verbosePrefix << "Create Import" << std::endl;
9043  std::cerr << os.str ();
9044  }
9045 
9046 #ifdef HAVE_TPETRA_MMM_TIMINGS
9047  auto ismmIctor(*TimeMonitor::getNewTimer(prefix + std::string("isMMIportCtor")));
9048 #endif
9049  Teuchos::RCP<Teuchos::ParameterList> plist = rcp(new Teuchos::ParameterList());
9050  // 25 Jul 2018: Test for equality with the non-isMM path's Import object.
9051  if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
9052  MyImport = rcp ( new import_type (MyDomainMap,
9053  MyColMap,
9054  RemotePids,
9055  userExportLIDs.view(0,iloc).getConst(),
9056  userExportPIDs.view(0,iloc).getConst(),
9057  plist)
9058  );
9059 
9060  if (verbose) {
9061  std::ostringstream os;
9062  os << *verbosePrefix << "Call expertStaticFillComplete" << std::endl;
9063  std::cerr << os.str ();
9064  }
9065 
9066  {
9067 #ifdef HAVE_TPETRA_MMM_TIMINGS
9068  TimeMonitor esfc (*TimeMonitor::getNewTimer(prefix + std::string("isMM::destMat->eSFC")));
9069  esfc_params.set("Timer Label",label+std::string("isMM eSFC"));
9070 #endif
9071  if(!params.is_null())
9072  esfc_params.set("compute global constants",params->get("compute global constants",true));
9073  destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap, MyImport,Teuchos::null,rcp(new Teuchos::ParameterList(esfc_params)));
9074 
9075  }
9076 
9077  } // if(isMM)
9078  else {
9079 #ifdef HAVE_TPETRA_MMM_TIMINGS
9080  TimeMonitor MMnotMMblock (*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMblock")));
9081 #endif
9082  if (verbose) {
9083  std::ostringstream os;
9084  os << *verbosePrefix << "Create Import" << std::endl;
9085  std::cerr << os.str ();
9086  }
9087 
9088 #ifdef HAVE_TPETRA_MMM_TIMINGS
9089  TimeMonitor notMMIcTor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMCreateImporter")));
9090 #endif
9091  Teuchos::RCP<Teuchos::ParameterList> mypars = rcp(new Teuchos::ParameterList);
9092  mypars->set("Timer Label","notMMFrom_tAFC");
9093  if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
9094  MyImport = rcp (new import_type (MyDomainMap, MyColMap, RemotePids, mypars));
9095 
9096  if (verbose) {
9097  std::ostringstream os;
9098  os << *verbosePrefix << "Call expertStaticFillComplete" << endl;
9099  std::cerr << os.str ();
9100  }
9101 
9102 #ifdef HAVE_TPETRA_MMM_TIMINGS
9103  TimeMonitor esfcnotmm(*TimeMonitor::getNewTimer(prefix + std::string("notMMdestMat->expertStaticFillComplete")));
9104  esfc_params.set("Timer Label",prefix+std::string("notMM eSFC"));
9105 #else
9106  esfc_params.set("Timer Label",std::string("notMM eSFC"));
9107 #endif
9108 
9109  if (!params.is_null ()) {
9110  esfc_params.set ("compute global constants",
9111  params->get ("compute global constants", true));
9112  }
9113  destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap,
9114  MyImport, Teuchos::null,
9115  rcp (new Teuchos::ParameterList (esfc_params)));
9116  }
9117 
9118 #ifdef HAVE_TPETRA_MMM_TIMINGS
9119  tmIESFC = Teuchos::null;
9120 #endif
9121 
9122  if (verbose) {
9123  std::ostringstream os;
9124  os << *verbosePrefix << "Done" << endl;
9125  std::cerr << os.str ();
9126  }
9127  } //transferAndFillComplete
9128 
9129 
// Import this matrix into destMatrix through a single Import plan and make
// the result fill complete.
//
// Thin forwarding overload: every argument is handed to
// transferAndFillComplete unchanged, with Teuchos::null in the third slot
// (the slot the two-importer overload fills with its domain importer).
//
//   destMatrix  destination matrix handle, forwarded as-is
//   importer    Import plan, forwarded as the transfer argument
//   domainMap   domain Map, forwarded as-is
//   rangeMap    range Map, forwarded as-is
//   params      parameter list, forwarded as-is
9130  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9131  void
9132  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
9133  importAndFillComplete (Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& destMatrix,
9134  const import_type& importer,
9135  const Teuchos::RCP<const map_type>& domainMap,
9136  const Teuchos::RCP<const map_type>& rangeMap,
9137  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9138  {
9139  transferAndFillComplete (destMatrix, importer, Teuchos::null, domainMap, rangeMap, params);
9140  }
9141 
// Import this matrix into destMatrix using separate row and domain Import
// plans, via transferAndFillComplete.
//
// Thin forwarding overload: rowImporter is passed as the transfer argument
// and domainImporter is wrapped with Teuchos::rcpFromRef so it can be passed
// in the RCP-typed third slot. The wrapper refers to the caller's object, so
// domainImporter must stay alive for the duration of the call.
//
//   destMatrix      destination matrix handle, forwarded as-is
//   rowImporter     Import plan forwarded as the transfer argument
//   domainImporter  Import plan forwarded (by reference wrapper) as the
//                   domain transfer
//   domainMap       domain Map, forwarded as-is
//   rangeMap        range Map, forwarded as-is
//   params          parameter list, forwarded as-is
9142  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9143  void
9144  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
9145  importAndFillComplete (Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& destMatrix,
9146  const import_type& rowImporter,
9147  const import_type& domainImporter,
9148  const Teuchos::RCP<const map_type>& domainMap,
9149  const Teuchos::RCP<const map_type>& rangeMap,
9150  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9151  {
9152  transferAndFillComplete (destMatrix, rowImporter, Teuchos::rcpFromRef(domainImporter), domainMap, rangeMap, params);
9153  }
9154 
// Export this matrix into destMatrix through a single Export plan and make
// the result fill complete.
//
// Thin forwarding overload: every argument is handed to
// transferAndFillComplete unchanged, with Teuchos::null in the third slot
// (the slot the two-exporter overload fills with its domain exporter).
//
//   destMatrix  destination matrix handle, forwarded as-is
//   exporter    Export plan, forwarded as the transfer argument
//   domainMap   domain Map, forwarded as-is
//   rangeMap    range Map, forwarded as-is
//   params      parameter list, forwarded as-is
9155  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9156  void
9157  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
9158  exportAndFillComplete (Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& destMatrix,
9159  const export_type& exporter,
9160  const Teuchos::RCP<const map_type>& domainMap,
9161  const Teuchos::RCP<const map_type>& rangeMap,
9162  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9163  {
9164  transferAndFillComplete (destMatrix, exporter, Teuchos::null, domainMap, rangeMap, params);
9165  }
9166 
// Export this matrix into destMatrix using separate row and domain Export
// plans, via transferAndFillComplete.
//
// Thin forwarding overload: rowExporter is passed as the transfer argument
// and domainExporter is wrapped with Teuchos::rcpFromRef so it can be passed
// in the RCP-typed third slot. The wrapper refers to the caller's object, so
// domainExporter must stay alive for the duration of the call.
//
//   destMatrix      destination matrix handle, forwarded as-is
//   rowExporter     Export plan forwarded as the transfer argument
//   domainExporter  Export plan forwarded (by reference wrapper) as the
//                   domain transfer
//   domainMap       domain Map, forwarded as-is
//   rangeMap        range Map, forwarded as-is
//   params          parameter list, forwarded as-is
9167  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9168  void
9169  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
9170  exportAndFillComplete (Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& destMatrix,
9171  const export_type& rowExporter,
9172  const export_type& domainExporter,
9173  const Teuchos::RCP<const map_type>& domainMap,
9174  const Teuchos::RCP<const map_type>& rangeMap,
9175  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9176  {
9177  transferAndFillComplete (destMatrix, rowExporter, Teuchos::rcpFromRef(domainExporter), domainMap, rangeMap, params);
9178  }
9179 
9180 } // namespace Tpetra
9181 
9182 //
9183 // Explicit instantiation macro
9184 //
9185 // Must be expanded from within the Tpetra namespace!
9186 //
9187 
// Explicitly instantiates the CrsMatrix class template for one
// (SCALAR, LO, GO, NODE) combination.
// CrsMatrix is named without qualification, so the macro has to be expanded
// from inside namespace Tpetra.
9188 #define TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR,LO,GO,NODE) \
9189  \
9190  template class CrsMatrix< SCALAR , LO , GO , NODE >;
9191 
// Explicitly instantiates the member function template
// CrsMatrix<SI, LO, GO, NODE>::convert<SO>() const, whose return type is
// Teuchos::RCP<CrsMatrix<SO, LO, GO, NODE>>.
//   SO  scalar type of the returned matrix
//   SI  scalar type of the matrix convert() is called on
// Like the other macros here, it names CrsMatrix unqualified and so must be
// expanded from inside namespace Tpetra.
9192 #define TPETRA_CRSMATRIX_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
9193  \
9194  template Teuchos::RCP< CrsMatrix< SO , LO , GO , NODE > > \
9195  CrsMatrix< SI , LO , GO , NODE >::convert< SO > () const;
9196 
// Declares, via `template<>`, the nonmember importAndFillCompleteCrsMatrix
// for CrsMatrix<SCALAR, LO, GO, NODE> in its single-Import form: source
// matrix, one Import (`importer`), domain Map, range Map, parameter list.
// The Import and Map template arguments are spelled through the matrix's own
// local_ordinal_type / global_ordinal_type / node_type members.
// NOTE(review): `template<>` followed by a declaration is explicit-
// specialization syntax, not an explicit instantiation (which uses `template`
// alone); the matching definition is not visible here -- confirm this is the
// intended form.
9197 #define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9198  template<> \
9199  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9200  importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9201  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9202  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9203  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& importer, \
9204  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9205  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9206  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9207  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9208  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9209  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9210  const Teuchos::RCP<Teuchos::ParameterList>& params);
9211 
// Declares, via `template<>`, the nonmember importAndFillCompleteCrsMatrix
// for CrsMatrix<SCALAR, LO, GO, NODE> in its two-Import form: source matrix,
// `rowImporter`, `domainImporter`, domain Map, range Map, parameter list.
// The Import and Map template arguments are spelled through the matrix's own
// local_ordinal_type / global_ordinal_type / node_type members.
// NOTE(review): `template<>` followed by a declaration is explicit-
// specialization syntax, not an explicit instantiation (which uses `template`
// alone); the matching definition is not visible here -- confirm this is the
// intended form.
9212 #define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9213  template<> \
9214  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9215  importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9216  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9217  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9218  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowImporter, \
9219  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9220  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9221  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainImporter, \
9222  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9223  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9224  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9225  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9226  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9227  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9228  const Teuchos::RCP<Teuchos::ParameterList>& params);
9229 
9230 
// Declares, via `template<>`, the nonmember exportAndFillCompleteCrsMatrix
// for CrsMatrix<SCALAR, LO, GO, NODE> in its single-Export form: source
// matrix, one Export (`exporter`), domain Map, range Map, parameter list.
// The Export and Map template arguments are spelled through the matrix's own
// local_ordinal_type / global_ordinal_type / node_type members.
// NOTE(review): `template<>` followed by a declaration is explicit-
// specialization syntax, not an explicit instantiation (which uses `template`
// alone); the matching definition is not visible here -- confirm this is the
// intended form.
9231 #define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9232  template<> \
9233  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9234  exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9235  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9236  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9237  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& exporter, \
9238  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9239  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9240  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9241  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9242  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9243  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9244  const Teuchos::RCP<Teuchos::ParameterList>& params);
9245 
// Declares, via `template<>`, the nonmember exportAndFillCompleteCrsMatrix
// for CrsMatrix<SCALAR, LO, GO, NODE> in its two-Export form: source matrix,
// `rowExporter`, `domainExporter`, domain Map, range Map, parameter list.
// The Export and Map template arguments are spelled through the matrix's own
// local_ordinal_type / global_ordinal_type / node_type members.
// NOTE(review): `template<>` followed by a declaration is explicit-
// specialization syntax, not an explicit instantiation (which uses `template`
// alone); the matching definition is not visible here -- confirm this is the
// intended form.
9246 #define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9247  template<> \
9248  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9249  exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9250  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9251  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9252  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowExporter, \
9253  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9254  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9255  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainExporter, \
9256  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9257  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9258  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9259  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9260  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9261  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9262  const Teuchos::RCP<Teuchos::ParameterList>& params);
9263 
9264 
// Umbrella macro: for one (SCALAR, LO, GO, NODE) combination, expands the
// class instantiation macro followed by the four import/export
// "and fill complete" macros (single- and two-transfer forms).
// TPETRA_CRSMATRIX_CONVERT_INSTANT is not part of this expansion and has to
// be invoked separately where needed.
9265 #define TPETRA_CRSMATRIX_INSTANT(SCALAR, LO, GO ,NODE) \
9266  TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR, LO, GO, NODE) \
9267  TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9268  TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9269  TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9270  TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE)
9271 
9272 #endif // TPETRA_CRSMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Teuchos::RCP< const map_type > getRowMap() const override
Returns the Map that describes the row distribution in this graph.
bool hasColMap() const override
Whether the matrix has a well-defined column Map.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
bool indicesAreSorted_
Whether the graph's indices are sorted in each row, on this process.
global_size_t getGlobalNumCols() const override
The number of global columns in the matrix.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Functor for the ABSMAX CombineMode of Import and Export operations.
void checkInternalState() const
Check that this object's state is sane; throw if it's not.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void copyOffsets(const OutputViewType &dst, const InputViewType &src)
Copy row offsets (in a sparse graph or matrix) from src to dst. The offsets may have different types...
CrsGraph< LocalOrdinal, GlobalOrdinal, Node > crs_graph_type
The CrsGraph specialization suitable for this CrsMatrix specialization.
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
virtual LocalOrdinal replaceGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceGlobalValues.
virtual bool supportsRowViews() const override
Return true if getLocalRowView() and getGlobalRowView() are valid for this object.
mag_type getNormInf() const
Compute and return the infinity norm of the matrix.
void replaceDomainMapAndImporter(const Teuchos::RCP< const map_type > &newDomainMap, Teuchos::RCP< const import_type > &newImporter)
Replace the current domain Map and Import with the given objects.
local_inds_dualv_type::t_host::const_type getLocalIndsViewHost(const RowInfo &rowinfo) const
Get a const, locally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myRo...
void merge2(IT1 &indResultOut, IT2 &valResultOut, IT1 indBeg, IT1 indEnd, IT2 valBeg, IT2)
Merge values in place, additively, with the same index.
static size_t mergeRowIndicesAndValues(size_t rowLen, local_ordinal_type *cols, impl_scalar_type *vals)
Merge duplicate row indices in the given row, along with their corresponding values.
void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row...
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
size_t insertGlobalIndicesImpl(const local_ordinal_type lclRow, const global_ordinal_type inputGblColInds[], const size_t numInputInds)
Insert global indices, using an input local row index.
std::map< GlobalOrdinal, std::pair< Teuchos::Array< GlobalOrdinal >, Teuchos::Array< Scalar > > > nonlocals_
Nonlocal data added using insertGlobalValues().
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
void sortAndMergeIndicesAndValues(const bool sorted, const bool merged)
Sort and merge duplicate local column indices in all rows on the calling process, along with their co...
typename device_type::execution_space execution_space
The Kokkos execution space.
void packNew(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< char *, buffer_device_type > &exports, const Kokkos::DualView< size_t *, buffer_device_type > &numPacketsPerLID, size_t &constantNumPackets) const
Pack this object's data for an Import or Export.
virtual void insertGlobalValuesImpl(crs_graph_type &graph, RowInfo &rowInfo, const GlobalOrdinal gblColInds[], const impl_scalar_type vals[], const size_t numInputEnt)
Common implementation detail of insertGlobalValues and insertGlobalValuesFiltered.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
size_t getNumVectors() const
Number of columns in the multivector.
void getGlobalRowView(GlobalOrdinal GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting view of a row of this matrix, using global row and column indices...
size_t getLocalLength() const
Local number of rows on the calling process.
Declaration of a function that prints strings from each process.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
mag_type getNorm1(bool assumeSymmetric=false) const
Compute and return the 1-norm of the matrix.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
Traits class for packing / unpacking data of type T.
void replaceRangeMapAndExporter(const Teuchos::RCP< const map_type > &newRangeMap, Teuchos::RCP< const export_type > &newExporter)
Replace the current Range Map and Export with the given objects.
virtual size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const =0
The current number of entries on the calling process in the specified local row.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
size_t getLocalNumCols() const override
The number of columns connected to the locally owned rows of this matrix.
Declare and define Tpetra::Details::copyOffsets, an implementation detail of Tpetra (in particular...
Declaration of Tpetra::Details::EquilibrationInfo.
void fillLocalGraphAndMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local graph and matrix.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
void leftScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the left with the given Vector.
bool noRedundancies_
Whether the graph's indices are non-redundant (merged) in each row, on this process.
GlobalOrdinal global_ordinal_type
The type of each global index in the matrix.
void padCrsArrays(const RowPtr &rowPtrBeg, const RowPtr &rowPtrEnd, Indices &indices_wdv, const Padding &padding, const int my_rank, const bool verbose)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries...
bool isDistributed() const
Whether this is a globally distributed object.
void reindexColumns(crs_graph_type *const graph, const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix's graph, as a CrsGraph.
global_inds_dualv_type::t_host::const_type getGlobalIndsViewHost(const RowInfo &rowinfo) const
Get a const, globally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myR...
static bool debug()
Whether Tpetra is in debug mode.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, over all processes in the matrix's communicator...
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRangeMap() const =0
The Map associated with the range of this operator, which must be compatible with Y...
void applyNonTranspose(const MV &X_in, MV &Y_in, Scalar alpha, Scalar beta) const
Special case of apply() for mode == Teuchos::NO_TRANS.
Teuchos::RCP< CrsMatrix< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another CrsMatrix with the same entries, but converted to a different Scalar type T...
Scalar scalar_type
The type of each entry in the matrix.
Allocation information for a locally owned row in a CrsGraph or CrsMatrix.
void swap(CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &matrix)
Swaps the data from *this with the data and maps from crsMatrix.
void fillLocalMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local matrix.
local_inds_wdv_type lclIndsUnpacked_wdv
Local ordinals of column indices for all rows Valid when isLocallyIndexed is true If OptimizedStorage...
void verbosePrintArray(std::ostream &out, const ArrayType &x, const char name[], const size_t maxNumToPrint)
Print min(x.size(), maxNumToPrint) entries of x.
bool isGloballyIndexed() const override
Whether the graph's column indices are stored as global indices.
void leftScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Left-scale a KokkosSparse::CrsMatrix.
Details::EquilibrationInfo< typename Kokkos::ArithTraits< SC >::val_type, typename NT::device_type > computeRowOneNorms(const Tpetra::RowMatrix< SC, LO, GO, NT > &A)
Compute global row one-norms ("row sums") of the input sparse matrix A, in a way suitable for one-sid...
Teuchos_Ordinal Array_size_type
Size type for Teuchos Array objects.
void globalAssemble()
Communicate nonlocal contributions to other processes.
void getLocalDiagCopy(Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &diag) const override
Get a constant, nonpersisting view of a row of this matrix, using local row and column indices...
size_t findGlobalIndices(const RowInfo &rowInfo, const Teuchos::ArrayView< const global_ordinal_type > &indices, std::function< void(const size_t, const size_t, const size_t)> fun) const
Finds indices in the given row.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
virtual LocalOrdinal sumIntoLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoLocalValues.
void sort(View &view, const size_t &size)
Convenience wrapper for std::sort for host-accessible views.
Details::EquilibrationInfo< typename Kokkos::ArithTraits< SC >::val_type, typename NT::device_type > computeRowAndColumnOneNorms(const Tpetra::RowMatrix< SC, LO, GO, NT > &A, const bool assumeSymmetric)
Compute global row and column one-norms ("row sums" and "column sums") of the input sparse matrix A...
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator, in rank order.
bool isFillActive() const
Whether the matrix is not fill complete.
Teuchos::RCP< MV > importMV_
Column Map MultiVector used in apply().
Declare and define Tpetra::Details::copyConvert, an implementation detail of Tpetra (in particular...
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
size_t global_size_t
Global size_t object.
bool hasTransposeApply() const override
Whether apply() allows applying the transpose or conjugate transpose.
void reindexColumns(const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortIndicesInEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on host. This requires that ther...
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
void exportAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const export_type &exporter, const Teuchos::RCP< const map_type > &domainMap=Teuchos::null, const Teuchos::RCP< const map_type > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Export from this to the given destination matrix, and make the result fill complete.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
Insert new values that don't currently exist.
values_dualv_type::t_dev::const_type getValuesViewDevice(const RowInfo &rowinfo) const
Get a const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow)...
bool isFillComplete() const override
Whether the matrix is fill complete.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > createOneToOne(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &M)
Nonmember constructor for a contiguous Map with user-defined weights and a user-specified, possibly nondefault Kokkos Node type.
void importAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const import_type &importer, const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Import from this to the given destination matrix, and make the result fill complete.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const =0
Get a copy of the given global row's entries.
Teuchos::RCP< const map_type > rowMap_
The Map describing the distribution of rows of the graph.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
num_row_entries_type k_numRowEntries_
The number of local entries in each locally owned row.
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
Functions for manipulating CRS arrays.
Kokkos::View< size_t *, Kokkos::LayoutLeft, device_type >::HostMirror num_row_entries_type
Row offsets for "1-D" storage.
GlobalOrdinal getIndexBase() const override
The index base for global indices for this matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2, const bool stableSort=false)
Sort the first array, and apply the resulting permutation to the second array.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getDomainMap() const =0
The Map associated with the domain of this operator, which must be compatible with X...
Teuchos::RCP< MV > getColumnMapMultiVector(const MV &X_domainMap, const bool force=false) const
Create a (or fetch a cached) column Map MultiVector.
#define TPETRA_ABUSE_WARNING(throw_exception_test, Exception, msg)
Handle an abuse warning, according to HAVE_TPETRA_THROW_ABUSE_WARNINGS and HAVE_TPETRA_PRINT_ABUSE_WA...
bool isNodeGlobalElement(global_ordinal_type globalIndex) const
Whether the given global index is owned by this Map on the calling process.
void packCrsMatrixNew(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports, const Kokkos::DualView< size_t *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &numPacketsPerLID, const Kokkos::DualView< const LO *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication, for "new" DistObject inter...
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const override
Print this object with the given verbosity level to the given output stream.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
static bool verbose()
Whether Tpetra is in verbose mode.
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< char *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode CM) override
Unpack the imported column indices and values, and combine into matrix.
bool isFillComplete() const override
Whether fillComplete() has been called and the graph is in compute mode.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
RowInfo getRowInfoFromGlobalRowIndex(const global_ordinal_type gblRow) const
Get information about the locally owned row with global index gblRow.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Utility functions for packing and unpacking sparse matrix entries.
void copyConvert(const OutputViewType &dst, const InputViewType &src)
Copy values from the 1-D Kokkos::View src, to the 1-D Kokkos::View dst, of the same length...
virtual Teuchos::RCP< RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params) const override
Implementation of RowMatrix::add: return alpha*A + beta*this.
bool fillComplete_
Whether the matrix is fill complete.
local_ordinal_type replaceGlobalValues(const global_ordinal_type globalRow, const Kokkos::View< const global_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries' values, using global indices.
Replace old value with maximum of magnitudes of old and new values.
virtual LocalOrdinal sumIntoGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoGlobalValues.
Abstract base class for objects that can be the source of an Import or Export operation.
local_ordinal_type sumIntoLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using local row and column indices.
typename Node::device_type device_type
The Kokkos device type.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Number of entries in the sparse matrix in the given local row, on the calling (MPI) process...
static LocalMapType::local_ordinal_type getDiagCopyWithoutOffsets(const DiagType &D, const LocalMapType &rowMap, const LocalMapType &colMap, const CrsMatrixType &A)
Given a locally indexed, local sparse matrix, and corresponding local row and column Maps...
Teuchos::RCP< MV > getRowMapMultiVector(const MV &Y_rangeMap, const bool force=false) const
Create a (or fetch a cached) row Map MultiVector.
std::string description() const override
A one-line description of this object.
void getLocalDiagOffsets(Teuchos::ArrayRCP< size_t > &offsets) const
Get offsets of the diagonal entries in the matrix.
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
Compute a sparse matrix-MultiVector multiply.
size_t getLocalNumEntries() const override
The local number of entries in this matrix.
Replace existing values with new values.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the range of this graph.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
Replace old values with zero.
const row_ptrs_host_view_type & getRowPtrsUnpackedHost() const
Get the unpacked row pointers on host. Lazily make a copy from device.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
static size_t rowImbalanceThreshold()
Threshold for deciding if a local matrix is "imbalanced" in the number of entries per row...
bool isLocallyComplete() const
Is this Export or Import locally complete?
virtual LocalOrdinal replaceLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceLocalValues.
Declaration and definition of Tpetra::Details::leftScaleLocalCrsMatrix.
RowInfo getRowInfo(const local_ordinal_type myRow) const
Get information about the locally owned row with local index myRow.
values_dualv_type::t_dev getValuesViewDeviceNonConst(const RowInfo &rowinfo)
Get a non-const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myR...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap) override
Remove processes owning zero rows from the Maps and their communicator.
virtual bool checkSizes(const SrcDistObject &source) override
Compare the source and target (this) objects for compatibility.
local_map_type getLocalMap() const
Get the LocalMap for Kokkos-Kernels.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
A parallel distribution of indices over processes.
void getLocalRowCopy(LocalOrdinal LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row...
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object ("forward mode").
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Teuchos::ArrayView< typename DualViewType::t_dev::value_type > getArrayViewFromDualView(const DualViewType &x)
Get a Teuchos::ArrayView which views the host Kokkos::View of the input 1-D Kokkos::DualView.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.
void insertLocalValues(const LocalOrdinal localRow, const Teuchos::ArrayView< const LocalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const CombineMode CM=ADD)
Insert one or more entries into the matrix, using local column indices.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
Details::EStorageStatus storageStatus_
Status of the matrix's storage, when not in a fill-complete state.
A read-only, row-oriented interface to a sparse matrix.
void rightScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the right with the given Vector.
void replaceDomainMap(const Teuchos::RCP< const map_type > &newDomainMap)
Replace the current domain Map with the given Map.
Scalar operator()(const Scalar &x, const Scalar &y)
Return the maximum of the magnitudes (absolute values) of x and y.
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
void getLocalRowView(LocalOrdinal LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant view of a row of this matrix, using local row and column indices.
values_dualv_type::t_host::const_type getValuesViewHost(const RowInfo &rowinfo) const
Get a const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow).
bool isLocallyIndexed() const override
Whether the graph's column indices are stored as local indices.
A distributed dense vector.
Declaration of Tpetra::Details::iallreduce.
void reduce()
Sum values of a locally replicated multivector across all processes.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
std::shared_ptr< CommRequest > iallreduce(const InputViewType &sendbuf, const OutputViewType &recvbuf, const ::Teuchos::EReductionType op, const ::Teuchos::Comm< int > &comm)
Nonblocking all-reduce, for either rank-1 or rank-0 Kokkos::View objects.
void applyTranspose(const MV &X_in, MV &Y_in, const Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Special case of apply() for mode != Teuchos::NO_TRANS.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
void allocateValues(ELocalGlobal lg, GraphAllocationStatus gas, const bool verbose)
Allocate values (and optionally indices) using the Node.
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
size_t getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const override
Number of entries in the sparse matrix in the given global row, on the calling (MPI) process...
static size_t verbosePrintCountThreshold()
Number of entries below which arrays, lists, etc. will be printed in debug mode.
local_matrix_device_type::values_type::const_type getLocalValuesDevice(Access::ReadOnlyStruct s) const
Get the Kokkos local values on device, read only.
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
local_ordinal_type replaceLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries' values, using local row and column indices.
Declaration and definition of Tpetra::Details::rightScaleLocalCrsMatrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
void replaceRangeMap(const Teuchos::RCP< const map_type > &newRangeMap)
Replace the current range Map with the given Map.
Teuchos::RCP< const map_type > colMap_
The Map describing the distribution of columns of the graph.
LocalOrdinal local_ordinal_type
The type of each local index in the matrix.
mag_type getFrobeniusNorm() const override
Compute and return the Frobenius norm of the matrix.
std::unique_ptr< std::string > createPrefix(const int myRank, const char prefix[])
Create string prefix for each line of verbose output.
Definition: Tpetra_Util.cpp:71
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRowMap() const =0
The Map that describes the distribution of rows over processes.
Accumulate new values into existing values (may not be supported in all classes)
void localApply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, const Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar &alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar &beta=Teuchos::ScalarTraits< Scalar >::zero()) const
Compute the local part of a sparse matrix-(Multi)Vector multiply.
void rightScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Right-scale a KokkosSparse::CrsMatrix.
bool isStorageOptimized() const
Returns true if storage has been optimized.
Description of Tpetra's behavior.
virtual void copyAndPermute(const SrcDistObject &source, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
values_dualv_type::t_host getValuesViewHostNonConst(const RowInfo &rowinfo)
Get a non-const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
Teuchos::RCP< MV > exportMV_
Row Map MultiVector used in apply().
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
Declaration of Tpetra::computeRowAndColumnOneNorms.
global_size_t getGlobalNumEntries() const override
The global number of entries in this matrix.