Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_CrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_CRSMATRIX_DEF_HPP
11 #define TPETRA_CRSMATRIX_DEF_HPP
12 
20 
21 #include "Tpetra_Import_Util.hpp"
22 #include "Tpetra_Import_Util2.hpp"
23 #include "Tpetra_RowMatrix.hpp"
24 #include "Tpetra_LocalCrsMatrixOperator.hpp"
25 #include "Tpetra_computeRowAndColumnOneNorms.hpp"
27 
34 #include "Tpetra_Details_getDiagCopyWithoutOffsets.hpp"
42 #include "Tpetra_Details_packCrsMatrix.hpp"
43 #include "Tpetra_Details_unpackCrsMatrixAndCombine.hpp"
45 #include "Teuchos_FancyOStream.hpp"
46 #include "Teuchos_RCP.hpp"
47 #include "Teuchos_DataAccess.hpp"
48 #include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
49 #include "KokkosBlas1_scal.hpp"
50 #include "KokkosSparse_getDiagCopy.hpp"
51 #include "KokkosSparse_spmv.hpp"
53 
54 #include <memory>
55 #include <sstream>
56 #include <typeinfo>
57 #include <utility>
58 #include <vector>
59 
60 namespace Tpetra {
61 
62 namespace { // (anonymous)
63 
64 template <class T, class BinaryFunction>
65 T atomic_binary_function_update(T* const dest,
66  const T& inputVal,
67  BinaryFunction f) {
68  T oldVal = *dest;
69  T assume;
70 
71  // NOTE (mfh 30 Nov 2015) I do NOT need a fence here for IBM
72  // POWER architectures, because 'newval' depends on 'assume',
73  // which depends on 'oldVal', which depends on '*dest'. This
74  // sets up a chain of read dependencies that should ensure
75  // correct behavior given a sane memory model.
76  do {
77  assume = oldVal;
78  T newVal = f(assume, inputVal);
79  oldVal = Kokkos::atomic_compare_exchange(dest, assume, newVal);
80  } while (assume != oldVal);
81 
82  return oldVal;
83 }
84 } // namespace
85 
86 //
87 // Users must never rely on anything in the Details namespace.
88 //
89 namespace Details {
90 
100 template <class Scalar>
101 struct AbsMax {
103  Scalar operator()(const Scalar& x, const Scalar& y) {
104  typedef Teuchos::ScalarTraits<Scalar> STS;
105  return std::max(STS::magnitude(x), STS::magnitude(y));
106  }
107 };
108 
109 } // namespace Details
110 } // namespace Tpetra
111 
112 namespace Tpetra {
113 
// Constructor: row Map plus a single upper bound on entries per row.
//
// Creates a new crs_graph_type from (rowMap, maxNumEntriesPerRow, params),
// stores it in both myGraph_ and staticGraph_, and then calls
// resumeFill(params).  If the CrsGraph constructor throws a std::exception,
// it is reported through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC with
// exception type std::runtime_error and the original what() text appended.
//
// NOTE(review): embedded listing line 139 (just before the closing brace)
// is absent from this extract, so a statement may be missing there --
// confirm against the original file.
114 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
115 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
116  CrsMatrix(const Teuchos::RCP<const map_type>& rowMap,
117  size_t maxNumEntriesPerRow,
118  const Teuchos::RCP<Teuchos::ParameterList>& params)
119  : dist_object_type(rowMap) {
 // Function-name prefix; presumably picked up by the Teuchos macro below.
120  const char tfecfFuncName[] =
121  "CrsMatrix(RCP<const Map>, size_t "
122  "[, RCP<ParameterList>]): ";
123  Teuchos::RCP<crs_graph_type> graph;
124  try {
125  graph = Teuchos::rcp(new crs_graph_type(rowMap, maxNumEntriesPerRow,
126  params));
127  } catch (std::exception& e) {
128  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
129  "CrsGraph constructor (RCP<const Map>, "
130  "size_t [, RCP<ParameterList>]) threw an exception: "
131  << e.what());
132  }
133  // myGraph_ not null means that the matrix owns the graph. That's
134  // different than the const CrsGraph constructor, where the matrix
135  // does _not_ own the graph.
136  myGraph_ = graph;
137  staticGraph_ = myGraph_;
138  resumeFill(params);
140 }
141 
// Constructor: row Map plus a per-row array of entry counts.
//
// Creates a new crs_graph_type from (rowMap, numEntPerRowToAlloc, params),
// stores it in both myGraph_ and staticGraph_, and then calls
// resumeFill(params).  A std::exception from the CrsGraph constructor is
// reported through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC as
// std::runtime_error with the original what() text appended.
//
// NOTE(review): embedded listing lines 143 (presumably the
// "CrsMatrix<...>::" qualifier) and 169 (just before the closing brace) are
// absent from this extract -- confirm against the original file.
142 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
144  CrsMatrix(const Teuchos::RCP<const map_type>& rowMap,
145  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
146  const Teuchos::RCP<Teuchos::ParameterList>& params)
147  : dist_object_type(rowMap) {
148  const char tfecfFuncName[] =
149  "CrsMatrix(RCP<const Map>, "
150  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
151  Teuchos::RCP<crs_graph_type> graph;
152  try {
153  using Teuchos::rcp;
154  graph = rcp(new crs_graph_type(rowMap, numEntPerRowToAlloc,
155  params));
156  } catch (std::exception& e) {
157  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
158  "CrsGraph constructor "
159  "(RCP<const Map>, ArrayView<const size_t>"
160  "[, RCP<ParameterList>]) threw an exception: "
161  << e.what());
162  }
163  // myGraph_ not null means that the matrix owns the graph. That's
164  // different than the const CrsGraph constructor, where the matrix
165  // does _not_ own the graph.
166  myGraph_ = graph;
167  staticGraph_ = graph;
168  resumeFill(params);
170 }
171 
// Constructor: row Map, column Map, and a single upper bound on entries
// per row.
//
// First asserts (std::logic_error) that staticGraph_ and myGraph_ are both
// still null, then creates a new crs_graph_type from
// (rowMap, colMap, maxNumEntPerRow, params), stores it in both myGraph_
// and staticGraph_, and calls resumeFill(params).  A std::exception from
// the CrsGraph constructor is reported as std::runtime_error with the
// original what() text appended.
//
// NOTE(review): embedded listing lines 173 (presumably the
// "CrsMatrix<...>::" qualifier) and 210 (just before the closing brace) are
// absent from this extract -- confirm against the original file.
172 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
174  CrsMatrix(const Teuchos::RCP<const map_type>& rowMap,
175  const Teuchos::RCP<const map_type>& colMap,
176  const size_t maxNumEntPerRow,
177  const Teuchos::RCP<Teuchos::ParameterList>& params)
178  : dist_object_type(rowMap) {
179  const char tfecfFuncName[] =
180  "CrsMatrix(RCP<const Map>, "
181  "RCP<const Map>, size_t[, RCP<ParameterList>]): ";
 // Appended to the two internal-consistency messages below.
182  const char suffix[] =
183  " Please report this bug to the Tpetra developers.";
184
185  // An artifact of debugging something a while back.
186  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!staticGraph_.is_null(), std::logic_error,
187  "staticGraph_ is not null at the beginning of the constructor."
188  << suffix);
189  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!myGraph_.is_null(), std::logic_error,
190  "myGraph_ is not null at the beginning of the constructor."
191  << suffix);
192  Teuchos::RCP<crs_graph_type> graph;
193  try {
194  graph = Teuchos::rcp(new crs_graph_type(rowMap, colMap,
195  maxNumEntPerRow,
196  params));
197  } catch (std::exception& e) {
198  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
199  "CrsGraph constructor (RCP<const Map>, "
200  "RCP<const Map>, size_t[, RCP<ParameterList>]) threw an "
201  "exception: "
202  << e.what());
203  }
204  // myGraph_ not null means that the matrix owns the graph. That's
205  // different than the const CrsGraph constructor, where the matrix
206  // does _not_ own the graph.
207  myGraph_ = graph;
208  staticGraph_ = myGraph_;
209  resumeFill(params);
211 }
212 
// Constructor: row Map, column Map, and a per-row array of entry counts.
//
// Creates a new crs_graph_type from
// (rowMap, colMap, numEntPerRowToAlloc, params), stores it in both myGraph_
// and staticGraph_, and calls resumeFill(params).  A std::exception from
// the CrsGraph constructor is reported as std::runtime_error with the
// original what() text appended.
//
// NOTE(review): embedded listing lines 214 (presumably the
// "CrsMatrix<...>::" qualifier) and 241 (just before the closing brace) are
// absent from this extract -- confirm against the original file.
213 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
215  CrsMatrix(const Teuchos::RCP<const map_type>& rowMap,
216  const Teuchos::RCP<const map_type>& colMap,
217  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
218  const Teuchos::RCP<Teuchos::ParameterList>& params)
219  : dist_object_type(rowMap) {
220  const char tfecfFuncName[] =
221  "CrsMatrix(RCP<const Map>, RCP<const Map>, "
222  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
223  Teuchos::RCP<crs_graph_type> graph;
224  try {
225  graph = Teuchos::rcp(new crs_graph_type(rowMap, colMap,
226  numEntPerRowToAlloc,
227  params));
228  } catch (std::exception& e) {
229  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
230  "CrsGraph constructor (RCP<const Map>, "
231  "RCP<const Map>, ArrayView<const size_t>[, "
232  "RCP<ParameterList>]) threw an exception: "
233  << e.what());
234  }
235  // myGraph_ not null means that the matrix owns the graph. That's
236  // different than the const CrsGraph constructor, where the matrix
237  // does _not_ own the graph.
238  myGraph_ = graph;
239  staticGraph_ = graph;
240  resumeFill(params);
242 }
243 
// Constructor: build a matrix on top of an existing, fill-complete graph.
//
// staticGraph_ is set to the caller's graph (myGraph_ is not assigned here)
// and storageStatus_ is set to Details::STORAGE_1D_PACKED.  A values array
// with graph->lclIndsPacked_wdv.extent(0) entries is allocated and used for
// both valuesPacked_wdv and valuesUnpacked_wdv.  The ParameterList argument
// is unnamed and therefore unused in the visible code.
//
// Throws std::runtime_error (via the Teuchos macro) if the graph is null or
// not fill complete.  When Details::Behavior::verbose("CrsMatrix") is true,
// progress messages are written to std::cerr.
//
// NOTE(review): graph->getRowMap() in the initializer list dereferences
// 'graph' before the graph.is_null() check in the body runs, so that check
// cannot protect the dereference -- confirm whether a null RCP is caught
// earlier by Teuchos.
// NOTE(review): embedded listing lines 245 (presumably the
// "CrsMatrix<...>::" qualifier) and 292 are absent from this extract.
244 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
246  CrsMatrix(const Teuchos::RCP<const crs_graph_type>& graph,
247  const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
248  : dist_object_type(graph->getRowMap())
249  , staticGraph_(graph)
250  , storageStatus_(Details::STORAGE_1D_PACKED) {
251  using std::endl;
252  typedef typename local_matrix_device_type::values_type values_type;
253  const char tfecfFuncName[] =
254  "CrsMatrix(RCP<const CrsGraph>[, "
255  "RCP<ParameterList>]): ";
256  const bool verbose = Details::Behavior::verbose("CrsMatrix");
257
 // prefix is only populated (and only dereferenced) when verbose is true.
258  std::unique_ptr<std::string> prefix;
259  if (verbose) {
260  prefix = this->createPrefix("CrsMatrix", "CrsMatrix(graph,params)");
261  std::ostringstream os;
262  os << *prefix << "Start" << endl;
263  std::cerr << os.str();
264  }
265
266  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(graph.is_null(), std::runtime_error, "Input graph is null.");
267  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!graph->isFillComplete(), std::runtime_error,
268  "Input graph "
269  "is not fill complete. You must call fillComplete on the "
270  "graph before using it to construct a CrsMatrix. Note that "
271  "calling resumeFill on the graph makes it not fill complete, "
272  "even if you had previously called fillComplete. In that "
273  "case, you must call fillComplete on the graph again.");
274
275  // The graph is fill complete, so it is locally indexed and has a
276  // fixed structure. This means we can allocate the (1-D) array of
277  // values and build the local matrix right now. Note that the
278  // local matrix's number of columns comes from the column Map, not
279  // the domain Map.
280
281  const size_t numEnt = graph->lclIndsPacked_wdv.extent(0);
282  if (verbose) {
283  std::ostringstream os;
284  os << *prefix << "Allocate values: " << numEnt << endl;
285  std::cerr << os.str();
286  }
287
288  values_type val("Tpetra::CrsMatrix::values", numEnt);
289  valuesPacked_wdv = values_wdv_type(val);
 // Packed and unpacked storage refer to the same wrapped view here.
290  valuesUnpacked_wdv = valuesPacked_wdv;
291
293
294  if (verbose) {
295  std::ostringstream os;
296  os << *prefix << "Done" << endl;
297  std::cerr << os.str();
298  }
299 }
300 
// Constructor: build a matrix that reuses another matrix's value storage
// together with a (possibly different) fill-complete graph.
//
// staticGraph_ is set to 'graph' and storageStatus_ is copied from
// 'matrix'.  valuesPacked_wdv / valuesUnpacked_wdv are constructed from the
// corresponding members of 'matrix' with offset 0 and lengths taken from
// graph->lclIndsPacked_wdv.extent(0) and graph->lclIndsUnpacked_wdv.extent(0)
// respectively.  Throws std::runtime_error if the graph is null or not
// fill complete.
//
// NOTE(review): embedded listing lines 302-303 (the class qualifier and the
// start of the parameter list, including the declaration of 'matrix') and
// line 328 are absent from this extract -- confirm against the original.
// NOTE(review): tfecfFuncName below describes a
// (graph, values_type, params) signature, which does not match the
// 'matrix' parameter used in this body; it looks copied from the
// (graph, values, params) constructor -- confirm.
// NOTE(review): graph->getRowMap() in the initializer list runs before the
// graph.is_null() check in the body.
301 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
304  const Teuchos::RCP<const crs_graph_type>& graph,
305  const Teuchos::RCP<Teuchos::ParameterList>& params)
306  : dist_object_type(graph->getRowMap())
307  , staticGraph_(graph)
308  , storageStatus_(matrix.storageStatus_) {
309  const char tfecfFuncName[] =
310  "CrsMatrix(RCP<const CrsGraph>, "
311  "local_matrix_device_type::values_type, "
312  "[,RCP<ParameterList>]): ";
313  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(graph.is_null(), std::runtime_error, "Input graph is null.");
314  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!graph->isFillComplete(), std::runtime_error,
315  "Input graph "
316  "is not fill complete. You must call fillComplete on the "
317  "graph before using it to construct a CrsMatrix. Note that "
318  "calling resumeFill on the graph makes it not fill complete, "
319  "even if you had previously called fillComplete. In that "
320  "case, you must call fillComplete on the graph again.");
321
322  size_t numValuesPacked = graph->lclIndsPacked_wdv.extent(0);
323  valuesPacked_wdv = values_wdv_type(matrix.valuesPacked_wdv, 0, numValuesPacked);
324
325  size_t numValuesUnpacked = graph->lclIndsUnpacked_wdv.extent(0);
326  valuesUnpacked_wdv = values_wdv_type(matrix.valuesUnpacked_wdv, 0, numValuesUnpacked);
327
329 }
330 
// Constructor: existing fill-complete graph plus a caller-supplied values
// view.
//
// staticGraph_ is set to 'graph', storageStatus_ to
// Details::STORAGE_1D_PACKED, and 'values' is wrapped (values_wdv_type) and
// used for both valuesPacked_wdv and valuesUnpacked_wdv.  The ParameterList
// argument is unnamed and therefore unused in the visible code.  Throws
// std::runtime_error if the graph is null or not fill complete.
//
// NOTE(review): no check in the visible code compares values.extent(0)
// with the graph's number of entries -- confirm whether that is validated
// elsewhere.
// NOTE(review): graph->getRowMap() in the initializer list runs before the
// graph.is_null() check in the body.
// NOTE(review): embedded listing lines 332 (presumably the
// "CrsMatrix<...>::" qualifier) and 367 are absent from this extract.
331 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
333  CrsMatrix(const Teuchos::RCP<const crs_graph_type>& graph,
334  const typename local_matrix_device_type::values_type& values,
335  const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
336  : dist_object_type(graph->getRowMap())
337  , staticGraph_(graph)
338  , storageStatus_(Details::STORAGE_1D_PACKED) {
339  const char tfecfFuncName[] =
340  "CrsMatrix(RCP<const CrsGraph>, "
341  "local_matrix_device_type::values_type, "
342  "[,RCP<ParameterList>]): ";
343  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(graph.is_null(), std::runtime_error, "Input graph is null.");
344  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!graph->isFillComplete(), std::runtime_error,
345  "Input graph "
346  "is not fill complete. You must call fillComplete on the "
347  "graph before using it to construct a CrsMatrix. Note that "
348  "calling resumeFill on the graph makes it not fill complete, "
349  "even if you had previously called fillComplete. In that "
350  "case, you must call fillComplete on the graph again.");
351
352  // The graph is fill complete, so it is locally indexed and has a
353  // fixed structure. This means we can allocate the (1-D) array of
354  // values and build the local matrix right now. Note that the
355  // local matrix's number of columns comes from the column Map, not
356  // the domain Map.
357
358  valuesPacked_wdv = values_wdv_type(values);
359  valuesUnpacked_wdv = valuesPacked_wdv;
360
361  // FIXME (22 Jun 2016) I would very much like to get rid of
362  // k_values1D_ at some point. I find it confusing to have all
363  // these extra references lying around.
364  // KDDKDD ALMOST THERE, MARK!
365  // k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
366
368 }
369 
// Constructor: row Map, column Map, and the three CRS arrays as Kokkos
// views (row offsets, column indices, values).
//
// Steps visible in this body:
//  1. Reject (std::invalid_argument) values and columnIndices of different
//     length.  In debug mode, additionally require that the last entry of
//     rowPointers equals both lengths.
//  2. Create a crs_graph_type from (rowMap, colMap, rowPointers,
//     columnIndices, params); a std::exception from that constructor is
//     reported as std::runtime_error.
//  3. Verify (std::logic_error) that the new graph's local graph has the
//     same row_map / entries extents as the inputs, and that the entries
//     extent matches values.extent(0).
//  4. Store the graph in myGraph_ and staticGraph_, and wrap 'values' for
//     both valuesPacked_wdv and valuesUnpacked_wdv.
// storageStatus_ is initialized to Details::STORAGE_1D_PACKED.  With
// Details::Behavior::verbose("CrsMatrix"), "Start"/"Done" messages go to
// std::cerr.
//
// NOTE(review): 'suffix' begins with ". ", so the first logic_error message
// below (which already ends in "correctly.") is emitted with a doubled
// period.
// NOTE(review): embedded listing lines 371 (presumably the
// "CrsMatrix<...>::" qualifier) and 473 are absent from this extract.
370 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
372  CrsMatrix(const Teuchos::RCP<const map_type>& rowMap,
373  const Teuchos::RCP<const map_type>& colMap,
374  const typename local_graph_device_type::row_map_type& rowPointers,
375  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
376  const typename local_matrix_device_type::values_type& values,
377  const Teuchos::RCP<Teuchos::ParameterList>& params)
378  : dist_object_type(rowMap)
379  , storageStatus_(Details::STORAGE_1D_PACKED) {
380  using Details::getEntryOnHost;
381  using std::endl;
382  using Teuchos::RCP;
383  const char tfecfFuncName[] =
384  "Tpetra::CrsMatrix(RCP<const Map>, "
385  "RCP<const Map>, ptr, ind, val[, params]): ";
386  const char suffix[] =
387  ". Please report this bug to the Tpetra developers.";
388  const bool debug = Details::Behavior::debug("CrsMatrix");
389  const bool verbose = Details::Behavior::verbose("CrsMatrix");
390
 // prefix is only populated (and only dereferenced) when verbose is true.
391  std::unique_ptr<std::string> prefix;
392  if (verbose) {
393  prefix = this->createPrefix(
394  "CrsMatrix", "CrsMatrix(rowMap,colMap,ptr,ind,val[,params])");
395  std::ostringstream os;
396  os << *prefix << "Start" << endl;
397  std::cerr << os.str();
398  }
399
400  // Check the user's input. Note that this might throw only on
401  // some processes but not others, causing deadlock. We prefer
402  // deadlock due to exceptions to segfaults, because users can
403  // catch exceptions.
404  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(values.extent(0) != columnIndices.extent(0),
405  std::invalid_argument, "values.extent(0)=" << values.extent(0) << " != columnIndices.extent(0) = " << columnIndices.extent(0) << ".");
 // Debug-only: the final row offset must equal the number of entries.
406  if (debug && rowPointers.extent(0) != 0) {
407  const size_t numEnt =
408  getEntryOnHost(rowPointers, rowPointers.extent(0) - 1);
409  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numEnt != size_t(columnIndices.extent(0)) ||
410  numEnt != size_t(values.extent(0)),
411  std::invalid_argument,
412  "Last entry of rowPointers says that "
413  "the matrix has "
414  << numEnt << " entr"
415  << (numEnt != 1 ? "ies" : "y") << ", but the dimensions of "
416  "columnIndices and values don't match this. "
417  "columnIndices.extent(0)="
418  << columnIndices.extent(0)
419  << " and values.extent(0)=" << values.extent(0) << ".");
420  }
421
422  RCP<crs_graph_type> graph;
423  try {
424  graph = Teuchos::rcp(new crs_graph_type(rowMap, colMap, rowPointers,
425  columnIndices, params));
426  } catch (std::exception& e) {
427  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
428  "CrsGraph constructor (RCP<const Map>, "
429  "RCP<const Map>, ptr, ind[, params]) threw an exception: "
430  << e.what());
431  }
432  // The newly created CrsGraph _must_ have a local graph at this
433  // point. We don't really care whether CrsGraph's constructor
434  // deep-copies or shallow-copies the input, but the dimensions
435  // have to be right. That's how we tell whether the CrsGraph has
436  // a local graph.
437  auto lclGraph = graph->getLocalGraphDevice();
438  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclGraph.row_map.extent(0) != rowPointers.extent(0) ||
439  lclGraph.entries.extent(0) != columnIndices.extent(0),
440  std::logic_error,
441  "CrsGraph's constructor (rowMap, colMap, ptr, "
442  "ind[, params]) did not set the local graph correctly."
443  << suffix);
444  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclGraph.entries.extent(0) != values.extent(0),
445  std::logic_error,
446  "CrsGraph's constructor (rowMap, colMap, ptr, ind[, "
447  "params]) did not set the local graph correctly. "
448  "lclGraph.entries.extent(0) = "
449  << lclGraph.entries.extent(0)
450  << " != values.extent(0) = " << values.extent(0) << suffix);
451
452  // myGraph_ not null means that the matrix owns the graph. This
453  // is true because the column indices come in as nonconst,
454  // implying shared ownership.
455  myGraph_ = graph;
456  staticGraph_ = graph;
457
458  // The graph may not be fill complete yet. However, it is locally
459  // indexed (since we have a column Map) and has a fixed structure
460  // (due to the input arrays). This means we can allocate the
461  // (1-D) array of values and build the local matrix right now.
462  // Note that the local matrix's number of columns comes from the
463  // column Map, not the domain Map.
464
465  valuesPacked_wdv = values_wdv_type(values);
466  valuesUnpacked_wdv = valuesPacked_wdv;
467
468  // FIXME (22 Jun 2016) I would very much like to get rid of
469  // k_values1D_ at some point. I find it confusing to have all
470  // these extra references lying around.
471  // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
472
474  if (verbose) {
475  std::ostringstream os;
476  os << *prefix << "Done" << endl;
477  std::cerr << os.str();
478  }
479 }
480 
// Constructor: row Map, column Map, and the three CRS arrays as
// Teuchos::ArrayRCP (row offsets 'ptr', column indices 'ind', values 'val').
//
// Creates a crs_graph_type from (rowMap, colMap, ptr, ind, params) and
// stores it in myGraph_ and staticGraph_; a std::exception from that
// constructor is reported as std::runtime_error.  It then verifies
// (std::logic_error) that the local graph's row_map / entries extents match
// ptr.size() / ind.size().  The values are reinterpreted as
// impl_scalar_type and passed through getKokkosViewDeepCopy<device_type>
// to obtain the values view, which backs both valuesPacked_wdv and
// valuesUnpacked_wdv.  storageStatus_ is initialized to
// Details::STORAGE_1D_PACKED.
//
// NOTE(review): the visible checks compare ptr and ind sizes with the local
// graph but never compare val.size() with ind.size() -- confirm whether
// that is validated elsewhere.
// NOTE(review): embedded listing lines 482 (presumably the
// "CrsMatrix<...>::" qualifier) and 546 are absent from this extract.
481 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
483  CrsMatrix(const Teuchos::RCP<const map_type>& rowMap,
484  const Teuchos::RCP<const map_type>& colMap,
485  const Teuchos::ArrayRCP<size_t>& ptr,
486  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
487  const Teuchos::ArrayRCP<Scalar>& val,
488  const Teuchos::RCP<Teuchos::ParameterList>& params)
489  : dist_object_type(rowMap)
490  , storageStatus_(Details::STORAGE_1D_PACKED) {
491  using Kokkos::Compat::getKokkosViewDeepCopy;
492  using Teuchos::av_reinterpret_cast;
493  using Teuchos::RCP;
494  using values_type = typename local_matrix_device_type::values_type;
495  using IST = impl_scalar_type;
496  const char tfecfFuncName[] =
497  "Tpetra::CrsMatrix(RCP<const Map>, "
498  "RCP<const Map>, ptr, ind, val[, params]): ";
499
500  RCP<crs_graph_type> graph;
501  try {
502  graph = Teuchos::rcp(new crs_graph_type(rowMap, colMap, ptr,
503  ind, params));
504  } catch (std::exception& e) {
505  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
506  "CrsGraph constructor (RCP<const Map>, "
507  "RCP<const Map>, ArrayRCP<size_t>, ArrayRCP<LocalOrdinal>[, "
508  "RCP<ParameterList>]) threw an exception: "
509  << e.what());
510  }
511  // myGraph_ not null means that the matrix owns the graph. This
512  // is true because the column indices come in as nonconst,
513  // implying shared ownership.
514  myGraph_ = graph;
515  staticGraph_ = graph;
516
517  // The graph may not be fill complete yet. However, it is locally
518  // indexed (since we have a column Map) and has a fixed structure
519  // (due to the input arrays). This means we can allocate the
520  // (1-D) array of values and build the local matrix right now.
521  // Note that the local matrix's number of columns comes from the
522  // column Map, not the domain Map.
523
524  // The graph _must_ have a local graph at this point. We don't
525  // really care whether CrsGraph's constructor deep-copies or
526  // shallow-copies the input, but the dimensions have to be right.
527  // That's how we tell whether the CrsGraph has a local graph.
528  auto lclGraph = staticGraph_->getLocalGraphDevice();
529  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(size_t(lclGraph.row_map.extent(0)) != size_t(ptr.size()) ||
530  size_t(lclGraph.entries.extent(0)) != size_t(ind.size()),
531  std::logic_error,
532  "CrsGraph's constructor (rowMap, colMap, "
533  "ptr, ind[, params]) did not set the local graph correctly. "
534  "Please report this bug to the Tpetra developers.");
535
 // val() yields an ArrayView of the ArrayRCP's contents; it is
 // reinterpreted as IST before being handed to getKokkosViewDeepCopy.
536  values_type valIn =
537  getKokkosViewDeepCopy<device_type>(av_reinterpret_cast<IST>(val()));
538  valuesPacked_wdv = values_wdv_type(valIn);
539  valuesUnpacked_wdv = valuesPacked_wdv;
540
541  // FIXME (22 Jun 2016) I would very much like to get rid of
542  // k_values1D_ at some point. I find it confusing to have all
543  // these extra references lying around.
544  // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
545
547 }
548 
// Constructor: row Map, column Map, and a complete local (device) matrix.
//
// Creates a crs_graph_type from (rowMap, colMap, lclMatrix.graph, params),
// requires (std::logic_error) that the resulting graph is fill complete,
// stores it in myGraph_ and staticGraph_, and wraps lclMatrix.values for
// both valuesPacked_wdv and valuesUnpacked_wdv.  fillComplete_ is
// initialized to true and storageStatus_ to Details::STORAGE_1D_PACKED; the
// two final checks assert that the matrix reports itself as fill complete
// and not fill active.  A std::exception from the CrsGraph constructor is
// reported as std::runtime_error.
//
// NOTE(review): embedded listing lines 550 (presumably the
// "CrsMatrix<...>::" qualifier) and 598 are absent from this extract.
549 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
551  CrsMatrix(const Teuchos::RCP<const map_type>& rowMap,
552  const Teuchos::RCP<const map_type>& colMap,
553  const local_matrix_device_type& lclMatrix,
554  const Teuchos::RCP<Teuchos::ParameterList>& params)
555  : dist_object_type(rowMap)
556  , storageStatus_(Details::STORAGE_1D_PACKED)
557  , fillComplete_(true) {
558  const char tfecfFuncName[] =
559  "Tpetra::CrsMatrix(RCP<const Map>, "
560  "RCP<const Map>, local_matrix_device_type[, RCP<ParameterList>]): ";
561  const char suffix[] =
562  " Please report this bug to the Tpetra developers.";
563
564  Teuchos::RCP<crs_graph_type> graph;
565  try {
566  graph = Teuchos::rcp(new crs_graph_type(rowMap, colMap,
567  lclMatrix.graph, params));
568  } catch (std::exception& e) {
569  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
570  "CrsGraph constructor (RCP<const Map>, "
571  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) threw an "
572  "exception: "
573  << e.what());
574  }
575  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!graph->isFillComplete(), std::logic_error,
576  "CrsGraph constructor (RCP"
577  "<const Map>, RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) "
578  "did not produce a fill-complete graph. Please report this bug to the "
579  "Tpetra developers.");
580  // myGraph_ not null means that the matrix owns the graph. This
581  // is true because the column indices come in as nonconst through
582  // the matrix, implying shared ownership.
583  myGraph_ = graph;
584  staticGraph_ = graph;
585
586  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
587  valuesUnpacked_wdv = valuesPacked_wdv;
588
 // Post-conditions: the matrix must already look fill complete.
589  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
590  "At the end of a CrsMatrix constructor that should produce "
591  "a fillComplete matrix, isFillActive() is true."
592  << suffix);
593  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!isFillComplete(), std::logic_error,
594  "At the end of a "
595  "CrsMatrix constructor that should produce a fillComplete "
596  "matrix, isFillComplete() is false."
597  << suffix);
599 }
600 
// Constructor: a local (device) matrix plus explicit row, column, domain,
// and range Maps.
//
// Creates a crs_graph_type from (lclMatrix.graph, rowMap, colMap,
// domainMap, rangeMap, params), requires (std::logic_error) that the graph
// is fill complete, stores it in myGraph_ and staticGraph_, and wraps
// lclMatrix.values for both valuesPacked_wdv and valuesUnpacked_wdv.
// fillComplete_ is initialized to true and storageStatus_ to
// Details::STORAGE_1D_PACKED; the two final checks assert the matrix
// reports itself as fill complete and not fill active.
//
// NOTE(review): embedded listing lines 602-603 (the class qualifier and the
// start of the parameter list, including the declaration of 'lclMatrix')
// and line 654 are absent from this extract -- confirm against the
// original file.
// NOTE(review): the message strings list the Maps before
// local_matrix_device_type / local_graph_device_type, whereas the
// crs_graph_type call passes lclMatrix.graph first -- the strings may not
// reflect the real argument order.
601 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
604  const Teuchos::RCP<const map_type>& rowMap,
605  const Teuchos::RCP<const map_type>& colMap,
606  const Teuchos::RCP<const map_type>& domainMap,
607  const Teuchos::RCP<const map_type>& rangeMap,
608  const Teuchos::RCP<Teuchos::ParameterList>& params)
609  : dist_object_type(rowMap)
610  , storageStatus_(Details::STORAGE_1D_PACKED)
611  , fillComplete_(true) {
612  const char tfecfFuncName[] =
613  "Tpetra::CrsMatrix(RCP<const Map>, "
614  "RCP<const Map>, RCP<const Map>, RCP<const Map>, "
615  "local_matrix_device_type[, RCP<ParameterList>]): ";
616  const char suffix[] =
617  " Please report this bug to the Tpetra developers.";
618
619  Teuchos::RCP<crs_graph_type> graph;
620  try {
621  graph = Teuchos::rcp(new crs_graph_type(lclMatrix.graph, rowMap, colMap,
622  domainMap, rangeMap, params));
623  } catch (std::exception& e) {
624  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
625  "CrsGraph constructor (RCP<const Map>, "
626  "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_device_type[, "
627  "RCP<ParameterList>]) threw an exception: "
628  << e.what());
629  }
630  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!graph->isFillComplete(), std::logic_error,
631  "CrsGraph "
632  "constructor (RCP<const Map>, RCP<const Map>, RCP<const Map>, "
633  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) did "
634  "not produce a fillComplete graph."
635  << suffix);
636  // myGraph_ not null means that the matrix owns the graph. This
637  // is true because the column indices come in as nonconst through
638  // the matrix, implying shared ownership.
639  myGraph_ = graph;
640  staticGraph_ = graph;
641
642  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
643  valuesUnpacked_wdv = valuesPacked_wdv;
644
645  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
646  "At the end of a CrsMatrix constructor that should produce "
647  "a fillComplete matrix, isFillActive() is true."
648  << suffix);
649  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!isFillComplete(), std::logic_error,
650  "At the end of a "
651  "CrsMatrix constructor that should produce a fillComplete "
652  "matrix, isFillComplete() is false."
653  << suffix);
655 }
656 
// Constructor: a local (device) matrix plus explicit row, column, domain,
// and range Maps and precomputed Import / Export objects.
//
// Creates a crs_graph_type from (lclMatrix.graph, rowMap, colMap,
// domainMap, rangeMap, importer, exporter, params), requires
// (std::logic_error) that the graph is fill complete, stores it in myGraph_
// and staticGraph_, and wraps lclMatrix.values for both valuesPacked_wdv
// and valuesUnpacked_wdv.  fillComplete_ is initialized to true and
// storageStatus_ to Details::STORAGE_1D_PACKED; the two final checks assert
// the matrix reports itself as fill complete and not fill active.
//
// NOTE(review): embedded listing lines 658-659 (the class qualifier and the
// start of the parameter list, including the declaration of 'lclMatrix')
// and line 712 are absent from this extract -- confirm against the
// original file.
657 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
660  const Teuchos::RCP<const map_type>& rowMap,
661  const Teuchos::RCP<const map_type>& colMap,
662  const Teuchos::RCP<const map_type>& domainMap,
663  const Teuchos::RCP<const map_type>& rangeMap,
664  const Teuchos::RCP<const import_type>& importer,
665  const Teuchos::RCP<const export_type>& exporter,
666  const Teuchos::RCP<Teuchos::ParameterList>& params)
667  : dist_object_type(rowMap)
668  , storageStatus_(Details::STORAGE_1D_PACKED)
669  , fillComplete_(true) {
670  using Teuchos::rcp;
671  const char tfecfFuncName[] =
672  "Tpetra::CrsMatrix"
673  "(lclMat,Map,Map,Map,Map,Import,Export,params): ";
674  const char suffix[] =
675  " Please report this bug to the Tpetra developers.";
676
677  Teuchos::RCP<crs_graph_type> graph;
678  try {
679  graph = rcp(new crs_graph_type(lclMatrix.graph, rowMap, colMap,
680  domainMap, rangeMap, importer,
681  exporter, params));
682  } catch (std::exception& e) {
683  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
684  "CrsGraph constructor "
685  "(local_graph_device_type, Map, Map, Map, Map, Import, Export, "
686  "params) threw: "
687  << e.what());
688  }
689  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!graph->isFillComplete(), std::logic_error,
690  "CrsGraph "
691  "constructor (local_graph_device_type, Map, Map, Map, Map, Import, "
692  "Export, params) did not produce a fill-complete graph. "
693  "Please report this bug to the Tpetra developers.");
694  // myGraph_ not null means that the matrix owns the graph. This
695  // is true because the column indices come in as nonconst through
696  // the matrix, implying shared ownership.
697  myGraph_ = graph;
698  staticGraph_ = graph;
699
700  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
701  valuesUnpacked_wdv = valuesPacked_wdv;
702
703  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
704  "At the end of a CrsMatrix constructor that should produce "
705  "a fillComplete matrix, isFillActive() is true."
706  << suffix);
707  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!isFillComplete(), std::logic_error,
708  "At the end of a "
709  "CrsMatrix constructor that should produce a fillComplete "
710  "matrix, isFillComplete() is false."
711  << suffix);
713 }
714 
// Constructor: copy or view of an existing fill-complete matrix.
//
// The new matrix shares the source's graph (staticGraph_ is set to
// source.getCrsGraph()) and copies its storageStatus_.  Depending on
// copyOrView:
//  - Teuchos::Copy: allocate a new values view of the same length without
//    initializing it, deep_copy the source's device values into it, wrap it
//    for both valuesPacked_wdv and valuesUnpacked_wdv, then call
//    fillComplete(source.getDomainMap(), source.getRangeMap()).
//  - Teuchos::View: construct valuesPacked_wdv / valuesUnpacked_wdv from
//    the source's corresponding members, then call fillComplete the same
//    way.
//  - anything else: std::invalid_argument.
// Throws std::invalid_argument if source.isFillComplete() is false.
//
// NOTE(review): the first error message says "Source graph", but the
// condition tested is the source *matrix's* isFillComplete().
// NOTE(review): source.getCrsGraph() is dereferenced in the initializer
// list without a null check in the visible code.
// NOTE(review): embedded listing lines 716-717 (the class qualifier and the
// start of the parameter list, including the declaration of 'source') and
// line 754 are absent from this extract -- confirm against the original.
715 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
718  const Teuchos::DataAccess copyOrView)
719  : dist_object_type(source.getCrsGraph()->getRowMap())
720  , staticGraph_(source.getCrsGraph())
721  , storageStatus_(source.storageStatus_) {
722  const char tfecfFuncName[] =
723  "Tpetra::CrsMatrix("
724  "const CrsMatrix&, const Teuchos::DataAccess): ";
725  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!source.isFillComplete(), std::invalid_argument,
726  "Source graph must be fillComplete().");
727
728  if (copyOrView == Teuchos::Copy) {
729  using values_type = typename local_matrix_device_type::values_type;
730  auto vals = source.getLocalValuesDevice(Access::ReadOnly);
731  using Kokkos::view_alloc;
732  using Kokkos::WithoutInitializing;
 // Skip initialization: every entry is overwritten by the deep_copy below.
733  values_type newvals(view_alloc("val", WithoutInitializing),
734  vals.extent(0));
735  // DEEP_COPY REVIEW - DEVICE-TO_DEVICE
736  Kokkos::deep_copy(newvals, vals);
737  valuesPacked_wdv = values_wdv_type(newvals);
738  valuesUnpacked_wdv = valuesPacked_wdv;
739  fillComplete(source.getDomainMap(), source.getRangeMap());
740  } else if (copyOrView == Teuchos::View) {
741  valuesPacked_wdv = values_wdv_type(source.valuesPacked_wdv);
742  valuesUnpacked_wdv = values_wdv_type(source.valuesUnpacked_wdv);
743  fillComplete(source.getDomainMap(), source.getRangeMap());
744  } else {
745  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::invalid_argument,
746  "Second argument 'copyOrView' "
747  "has an invalid value "
748  << copyOrView << ". Valid values "
749  "include Teuchos::Copy = "
750  << Teuchos::Copy << " and "
751  "Teuchos::View = "
752  << Teuchos::View << ".");
753  }
755 }
756 
// Exchanges this matrix's state with that of 'crs_matrix', member by
// member: the import/export multivectors, both graph handles, both value
// containers, the storage status, the fill-complete flag, and the
// nonlocals_ container.
//
// NOTE(review): embedded listing lines 758-759 (return type, class
// qualifier, function name, and the declaration of 'crs_matrix') are absent
// from this extract; presumably this is CrsMatrix::swap -- confirm against
// the original file.
757 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
760  std::swap(crs_matrix.importMV_, this->importMV_);
761  std::swap(crs_matrix.exportMV_, this->exportMV_);
762  std::swap(crs_matrix.staticGraph_, this->staticGraph_);
763  std::swap(crs_matrix.myGraph_, this->myGraph_);
764  std::swap(crs_matrix.valuesPacked_wdv, this->valuesPacked_wdv);
765  std::swap(crs_matrix.valuesUnpacked_wdv, this->valuesUnpacked_wdv);
766  std::swap(crs_matrix.storageStatus_, this->storageStatus_);
767  std::swap(crs_matrix.fillComplete_, this->fillComplete_);
768  std::swap(crs_matrix.nonlocals_, this->nonlocals_);
769 }
770 
// Returns the communicator reported by the matrix's graph
// (getCrsGraphRef().getComm()).
// NOTE(review): embedded listing line 773 (presumably the
// "CrsMatrix<...>::" qualifier) is absent from this extract.
771 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
772 Teuchos::RCP<const Teuchos::Comm<int>>
774  getComm() const {
775  return getCrsGraphRef().getComm();
776 }
777 
// Returns the fillComplete_ flag.
// NOTE(review): embedded listing lines 779-780 (return type, class
// qualifier, and function name) are absent from this extract; presumably
// this is isFillComplete() -- confirm against the original file.
778 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
781  return fillComplete_;
782 }
783 
// Returns the negation of the fillComplete_ flag.
// NOTE(review): embedded listing line 785 (return type and class
// qualifier) is absent from this extract.
784 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
786  isFillActive() const {
787  return !fillComplete_;
788 }
789 
// Forwards to the graph: returns getCrsGraphRef().isStorageOptimized().
// NOTE(review): embedded listing lines 791-792 (return type, class
// qualifier, and function name) are absent from this extract; presumably
// this is isStorageOptimized() -- confirm against the original file.
790 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
793  return this->getCrsGraphRef().isStorageOptimized();
794 }
795 
// Forwards to the graph: returns getCrsGraphRef().isLocallyIndexed().
// NOTE(review): embedded listing lines 797-798 (return type, class
// qualifier, and function name) are absent from this extract; presumably
// this is isLocallyIndexed() -- confirm against the original file.
796 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
799  return getCrsGraphRef().isLocallyIndexed();
800 }
801 
// Forwards to the graph: returns getCrsGraphRef().isGloballyIndexed().
// NOTE(review): embedded listing lines 803-804 (return type, class
// qualifier, and function name) are absent from this extract; presumably
// this is isGloballyIndexed() -- confirm against the original file.
802 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
805  return getCrsGraphRef().isGloballyIndexed();
806 }
807 
// Forwards to the graph: returns getCrsGraphRef().hasColMap().
// NOTE(review): embedded listing line 809 (return type and class
// qualifier) is absent from this extract.
808 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
810  hasColMap() const {
811  return getCrsGraphRef().hasColMap();
812 }
813 
// Forwards to the graph: returns getCrsGraphRef().getGlobalNumEntries().
// NOTE(review): embedded listing lines 815-817 (return type, class
// qualifier, and function name) are absent from this extract; presumably
// this is getGlobalNumEntries() -- confirm against the original file.
814 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
818  return getCrsGraphRef().getGlobalNumEntries();
819 }
820 
// Forwards to the graph: returns getCrsGraphRef().getLocalNumEntries()
// as a size_t.
// NOTE(review): embedded listing lines 823-824 (class qualifier and
// function name) are absent from this extract; presumably this is
// getLocalNumEntries() -- confirm against the original file.
821 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
822 size_t
825  return getCrsGraphRef().getLocalNumEntries();
826 }
827 
// Forwards to the graph: returns getCrsGraphRef().getGlobalNumRows().
// NOTE(review): embedded listing lines 829-831 (return type, class
// qualifier, and function name) are absent from this extract; presumably
// this is getGlobalNumRows() -- confirm against the original file.
828 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
832  return getCrsGraphRef().getGlobalNumRows();
833 }
834 
// Forwards to the graph: returns getCrsGraphRef().getGlobalNumCols().
// NOTE(review): embedded listing lines 836-838 (return type, class
// qualifier, and function name) are absent from this extract; presumably
// this is getGlobalNumCols() -- confirm against the original file.
835 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
839  return getCrsGraphRef().getGlobalNumCols();
840 }
841 
// Forwards to the graph: returns getCrsGraphRef().getLocalNumRows()
// as a size_t.
// NOTE(review): embedded listing lines 844-845 (class qualifier and
// function name) are absent from this extract; presumably this is
// getLocalNumRows() -- confirm against the original file.
842 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
843 size_t
846  return getCrsGraphRef().getLocalNumRows();
847 }
848 
// Forwards to the graph: returns getCrsGraphRef().getLocalNumCols()
// as a size_t.
// NOTE(review): embedded listing lines 851-852 (class qualifier and
// function name) are absent from this extract; presumably this is
// getLocalNumCols() -- confirm against the original file.
849 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
850 size_t
853  return getCrsGraphRef().getLocalNumCols();
854 }
855 
// Forwards to the graph: returns
// getCrsGraphRef().getNumEntriesInGlobalRow(globalRow) as a size_t.
// NOTE(review): embedded listing line 858 (presumably the
// "CrsMatrix<...>::" qualifier) is absent from this extract.
856 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
857 size_t
859  getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const {
860  return getCrsGraphRef().getNumEntriesInGlobalRow(globalRow);
861 }
862 
// Returns the graph's entry count for the row with local index localRow,
// by forwarding to getNumEntriesInLocalRow() on getCrsGraphRef().
863 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
864 size_t
866  getNumEntriesInLocalRow(LocalOrdinal localRow) const {
867  return getCrsGraphRef().getNumEntriesInLocalRow(localRow);
868 }
869 
// Forwards to getGlobalMaxNumRowEntries() on the graph from getCrsGraphRef().
// NOTE(review): the signature lines are missing from this listing;
// presumably this is getGlobalMaxNumRowEntries() const -- confirm.
870 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
871 size_t
874  return getCrsGraphRef().getGlobalMaxNumRowEntries();
875 }
876 
// Forwards to getLocalMaxNumRowEntries() on the graph from getCrsGraphRef().
// NOTE(review): the signature lines are missing from this listing;
// presumably this is getLocalMaxNumRowEntries() const -- confirm.
877 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
878 size_t
881  return getCrsGraphRef().getLocalMaxNumRowEntries();
882 }
883 
// Returns the index base reported by the row Map (getRowMap()).
// The row Map pointer is dereferenced without a null check.
884 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
885 GlobalOrdinal
887  getIndexBase() const {
888  return getRowMap()->getIndexBase();
889 }
890 
// Returns the row Map by forwarding to getRowMap() on getCrsGraphRef().
891 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
892 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>
894  getRowMap() const {
895  return getCrsGraphRef().getRowMap();
896 }
897 
// Returns the column Map by forwarding to getColMap() on getCrsGraphRef().
898 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
899 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>
901  getColMap() const {
902  return getCrsGraphRef().getColMap();
903 }
904 
// Returns the domain Map by forwarding to getDomainMap() on getCrsGraphRef().
905 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
906 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>
908  getDomainMap() const {
909  return getCrsGraphRef().getDomainMap();
910 }
911 
// Returns the range Map by forwarding to getRangeMap() on getCrsGraphRef().
912 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
913 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>
915  getRangeMap() const {
916  return getCrsGraphRef().getRangeMap();
917 }
918 
// Returns the matrix's graph as an RCP<const RowGraph>: staticGraph_ when it
// is non-null, otherwise myGraph_ (which may itself be null; no check is
// made here).
919 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
920 Teuchos::RCP<const RowGraph<LocalOrdinal, GlobalOrdinal, Node>>
922  getGraph() const {
923  if (staticGraph_ != Teuchos::null) {
924  return staticGraph_;
925  }
926  return myGraph_;
927 }
928 
// Returns the matrix's graph as an RCP<const CrsGraph>: staticGraph_ when it
// is non-null, otherwise myGraph_ (which may itself be null; no check is
// made here).
929 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
930 Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node>>
932  getCrsGraph() const {
933  if (staticGraph_ != Teuchos::null) {
934  return staticGraph_;
935  }
936  return myGraph_;
937 }
938 
// Returns a reference to the graph backing this matrix: *staticGraph_ when
// that pointer is non-null, otherwise *myGraph_.
//
// When HAVE_TPETRA_DEBUG is defined, a null myGraph_ on the fallback path is
// reported as std::logic_error through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
// Without that macro, myGraph_ is dereferenced with no null check.
939 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
942  getCrsGraphRef() const {
943 #ifdef HAVE_TPETRA_DEBUG
944  constexpr bool debug = true;
945 #else
946  constexpr bool debug = false;
947 #endif // HAVE_TPETRA_DEBUG
948 
949  if (!this->staticGraph_.is_null()) {
950  return *(this->staticGraph_);
951  } else {
952  if (debug) {
// tfecfFuncName is declared for the exception macro on the next statement.
953  const char tfecfFuncName[] = "getCrsGraphRef: ";
954  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->myGraph_.is_null(), std::logic_error,
955  "Both staticGraph_ and myGraph_ are null. "
956  "Please report this bug to the Tpetra developers.");
957  }
958  return *(this->myGraph_);
959  }
960 }
961 
// Constructs and returns a local_matrix_device_type labelled
// "Tpetra::CrsMatrix::lclMatrixDevice" from: the column Map's local element
// count, the device view of valuesPacked_wdv (requested with
// Access::ReadWrite), and staticGraph_->getLocalGraphDevice().
// staticGraph_ and its column Map are dereferenced without null checks.
// NOTE(review): the signature lines are missing from this listing;
// presumably this is getLocalMatrixDevice() const -- confirm.
962 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
963 typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type
966  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
967  return local_matrix_device_type("Tpetra::CrsMatrix::lclMatrixDevice",
968  numCols,
969  valuesPacked_wdv.getDeviceView(Access::ReadWrite),
970  staticGraph_->getLocalGraphDevice());
971 }
972 
// Constructs and returns a local_matrix_host_type labelled
// "Tpetra::CrsMatrix::lclMatrixHost" from: the column Map's local element
// count, the host view of valuesPacked_wdv (requested with
// Access::ReadWrite), and staticGraph_->getLocalGraphHost().
// staticGraph_ and its column Map are dereferenced without null checks.
973 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
974 typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type
976  getLocalMatrixHost() const {
977  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
978  return local_matrix_host_type("Tpetra::CrsMatrix::lclMatrixHost", numCols,
979  valuesPacked_wdv.getHostView(Access::ReadWrite),
980  staticGraph_->getLocalGraphHost());
981 }
982 
// Returns true exactly when myGraph_ (the nonconst, matrix-owned graph
// pointer) is null.
983 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
985  isStaticGraph() const {
986  return myGraph_.is_null();
987 }
988 
// Unconditionally returns true.
// NOTE(review): the signature lines are missing from this listing;
// presumably this is hasTransposeApply() const -- confirm against the header.
989 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
992  return true;
993 }
994 
// Unconditionally returns true.
// NOTE(review): the signature lines are missing from this listing;
// presumably this is supportsRowViews() const -- confirm against the header.
995 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
998  return true;
999 }
1000 
1001 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1003  allocateValues(ELocalGlobal lg, GraphAllocationStatus gas,
1004  const bool verbose) {
1005  using Details::Behavior;
1007  using std::endl;
1008  const char tfecfFuncName[] = "allocateValues: ";
1009  const char suffix[] =
1010  " Please report this bug to the Tpetra developers.";
1011  ProfilingRegion region("Tpetra::CrsMatrix::allocateValues");
1012 
1013  std::unique_ptr<std::string> prefix;
1014  if (verbose) {
1015  prefix = this->createPrefix("CrsMatrix", "allocateValues");
1016  std::ostringstream os;
1017  os << *prefix << "lg: "
1018  << (lg == LocalIndices ? "Local" : "Global") << "Indices"
1019  << ", gas: Graph"
1020  << (gas == GraphAlreadyAllocated ? "Already" : "NotYet")
1021  << "Allocated" << endl;
1022  std::cerr << os.str();
1023  }
1024 
1025  const bool debug = Behavior::debug("CrsMatrix");
1026  if (debug) {
1027  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->staticGraph_.is_null(), std::logic_error,
1028  "staticGraph_ is null." << suffix);
1029 
1030  // If the graph indices are already allocated, then gas should be
1031  // GraphAlreadyAllocated. Otherwise, gas should be
1032  // GraphNotYetAllocated.
1033  if ((gas == GraphAlreadyAllocated) !=
1034  staticGraph_->indicesAreAllocated()) {
1035  const char err1[] =
1036  "The caller has asserted that the graph "
1037  "is ";
1038  const char err2[] =
1039  "already allocated, but the static graph "
1040  "says that its indices are ";
1041  const char err3[] = "already allocated. ";
1042  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(gas == GraphAlreadyAllocated &&
1043  !staticGraph_->indicesAreAllocated(),
1044  std::logic_error,
1045  err1 << err2 << "not " << err3 << suffix);
1046  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(gas != GraphAlreadyAllocated &&
1047  staticGraph_->indicesAreAllocated(),
1048  std::logic_error,
1049  err1 << "not " << err2 << err3 << suffix);
1050  }
1051 
1052  // If the graph is unallocated, then it had better be a
1053  // matrix-owned graph. ("Matrix-owned graph" means that the
1054  // matrix gets to define the graph structure. If the CrsMatrix
1055  // constructor that takes an RCP<const CrsGraph> was used, then
1056  // the matrix does _not_ own the graph.)
1057  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!this->staticGraph_->indicesAreAllocated() &&
1058  this->myGraph_.is_null(),
1059  std::logic_error,
1060  "The static graph says that its indices are not allocated, "
1061  "but the graph is not owned by the matrix."
1062  << suffix);
1063  }
1064 
1065  if (gas == GraphNotYetAllocated) {
1066  if (debug) {
1067  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->myGraph_.is_null(), std::logic_error,
1068  "gas = GraphNotYetAllocated, but myGraph_ is null." << suffix);
1069  }
1070  try {
1071  this->myGraph_->allocateIndices(lg, verbose);
1072  } catch (std::exception& e) {
1073  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
1074  "CrsGraph::allocateIndices "
1075  "threw an exception: "
1076  << e.what());
1077  } catch (...) {
1078  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
1079  "CrsGraph::allocateIndices "
1080  "threw an exception not a subclass of std::exception.");
1081  }
1082  }
1083 
1084  // Allocate matrix values.
1085  const size_t lclTotalNumEntries = this->staticGraph_->getLocalAllocationSize();
1086  if (debug) {
1087  const size_t lclNumRows = this->staticGraph_->getLocalNumRows();
1088  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->staticGraph_->getRowPtrsUnpackedHost()(lclNumRows) != lclTotalNumEntries, std::logic_error,
1089  "length of staticGraph's lclIndsUnpacked does not match final entry of rowPtrsUnapcked_host." << suffix);
1090  }
1091 
1092  // Allocate array of (packed???) matrix values.
1093  using values_type = typename local_matrix_device_type::values_type;
1094  if (verbose) {
1095  std::ostringstream os;
1096  os << *prefix << "Allocate values_wdv: Pre "
1097  << valuesUnpacked_wdv.extent(0) << ", post "
1098  << lclTotalNumEntries << endl;
1099  std::cerr << os.str();
1100  }
1101  // this->k_values1D_ =
1102  valuesUnpacked_wdv = values_wdv_type(
1103  values_type("Tpetra::CrsMatrix::values",
1104  lclTotalNumEntries));
1105 }
1106 
1107 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1109  fillLocalGraphAndMatrix(const Teuchos::RCP<Teuchos::ParameterList>& params) {
1110  using std::endl;
1111  using Teuchos::arcp_const_cast;
1112  using Teuchos::Array;
1113  using Teuchos::ArrayRCP;
1114  using Teuchos::null;
1115  using Teuchos::RCP;
1116  using Teuchos::rcp;
1118  using ::Tpetra::Details::getEntryOnHost;
1119  using row_map_type = typename local_graph_device_type::row_map_type;
1120  using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
1121  using values_type = typename local_matrix_device_type::values_type;
1122  Details::ProfilingRegion regionFLGAM("Tpetra::CrsMatrix::fillLocalGraphAndMatrix");
1123 
1124  const char tfecfFuncName[] =
1125  "fillLocalGraphAndMatrix (called from "
1126  "fillComplete or expertStaticFillComplete): ";
1127  const char suffix[] =
1128  " Please report this bug to the Tpetra developers.";
1129  const bool debug = Details::Behavior::debug("CrsMatrix");
1130  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1131 
1132  std::unique_ptr<std::string> prefix;
1133  if (verbose) {
1134  prefix = this->createPrefix("CrsMatrix", "fillLocalGraphAndMatrix");
1135  std::ostringstream os;
1136  os << *prefix << endl;
1137  std::cerr << os.str();
1138  }
1139 
1140  if (debug) {
1141  // fillComplete() only calls fillLocalGraphAndMatrix() if the
1142  // matrix owns the graph, which means myGraph_ is not null.
1143  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(myGraph_.is_null(), std::logic_error,
1144  "The nonconst graph "
1145  "(myGraph_) is null. This means that the matrix has a "
1146  "const (a.k.a. \"static\") graph. fillComplete or "
1147  "expertStaticFillComplete should never call "
1148  "fillLocalGraphAndMatrix in that case."
1149  << suffix);
1150  }
1151 
1152  const size_t lclNumRows = this->getLocalNumRows();
1153 
1154  // This method's goal is to fill in the three arrays (compressed
1155  // sparse row format) that define the sparse graph's and matrix's
1156  // structure, and the sparse matrix's values.
1157  //
1158  // Get references to the data in myGraph_, so we can modify them
1159  // as well. Note that we only call fillLocalGraphAndMatrix() if
1160  // the matrix owns the graph, which means myGraph_ is not null.
1161 
1162  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1163  // See: https://github.com/trilinos/Trilinos/issues/12237
1164  // using row_entries_type = decltype (myGraph_->k_numRowEntries_);
1165  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1166 
1167  typename Graph::local_graph_device_type::row_map_type curRowOffsets =
1168  myGraph_->rowPtrsUnpacked_dev_;
1169 
1170  if (debug) {
1171  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(curRowOffsets.extent(0) == 0, std::logic_error,
1172  "curRowOffsets.extent(0) == 0.");
1173  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(curRowOffsets.extent(0) != lclNumRows + 1, std::logic_error,
1174  "curRowOffsets.extent(0) = "
1175  << curRowOffsets.extent(0) << " != lclNumRows + 1 = "
1176  << (lclNumRows + 1) << ".");
1177  const size_t numOffsets = curRowOffsets.extent(0);
1178  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1179  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numOffsets != 0 &&
1180  myGraph_->lclIndsUnpacked_wdv.extent(0) != valToCheck,
1181  std::logic_error, "numOffsets = " << numOffsets << " != 0 and myGraph_->lclIndsUnpacked_wdv.extent(0) = " << myGraph_->lclIndsUnpacked_wdv.extent(0) << " != curRowOffsets(" << numOffsets << ") = " << valToCheck << ".");
1182  }
1183 
1184  if (myGraph_->getLocalNumEntries() !=
1185  myGraph_->getLocalAllocationSize()) {
1186  // Use the nonconst version of row_map_type for k_ptrs,
1187  // because row_map_type is const and we need to modify k_ptrs here.
1188  typename row_map_type::non_const_type k_ptrs;
1189  row_map_type k_ptrs_const;
1190  lclinds_1d_type k_inds;
1191  values_type k_vals;
1192 
1193  if (verbose) {
1194  std::ostringstream os;
1195  const auto numEnt = myGraph_->getLocalNumEntries();
1196  const auto allocSize = myGraph_->getLocalAllocationSize();
1197  os << *prefix << "Unpacked 1-D storage: numEnt=" << numEnt
1198  << ", allocSize=" << allocSize << endl;
1199  std::cerr << os.str();
1200  }
1201  // The matrix's current 1-D storage is "unpacked." This means
1202  // the row offsets may differ from what the final row offsets
1203  // should be. This could happen, for example, if the user
1204  // set an upper
1205  // bound on the number of entries per row, but didn't fill all
1206  // those entries.
1207  if (debug && curRowOffsets.extent(0) != 0) {
1208  const size_t numOffsets =
1209  static_cast<size_t>(curRowOffsets.extent(0));
1210  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1211  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(valToCheck) !=
1212  static_cast<size_t>(valuesUnpacked_wdv.extent(0)),
1213  std::logic_error,
1214  "(unpacked branch) Before "
1215  "allocating or packing, curRowOffsets("
1216  << (numOffsets - 1)
1217  << ") = " << valToCheck << " != valuesUnpacked_wdv.extent(0)"
1218  " = "
1219  << valuesUnpacked_wdv.extent(0) << ".");
1220  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(valToCheck) !=
1221  static_cast<size_t>(myGraph_->lclIndsUnpacked_wdv.extent(0)),
1222  std::logic_error,
1223  "(unpacked branch) Before "
1224  "allocating or packing, curRowOffsets("
1225  << (numOffsets - 1)
1226  << ") = " << valToCheck
1227  << " != myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1228  << myGraph_->lclIndsUnpacked_wdv.extent(0) << ".");
1229  }
1230  // Pack the row offsets into k_ptrs, by doing a sum-scan of
1231  // the array of valid entry counts per row.
1232 
1233  // Total number of entries in the matrix on the calling
1234  // process. We will compute this in the loop below. It's
1235  // cheap to compute and useful as a sanity check.
1236  size_t lclTotalNumEntries = 0;
1237  {
1238  // Allocate the packed row offsets array. We use a nonconst
1239  // temporary (packedRowOffsets) here, because k_ptrs is
1240  // const. We will assign packedRowOffsets to k_ptrs below.
1241  if (verbose) {
1242  std::ostringstream os;
1243  os << *prefix << "Allocate packed row offsets: "
1244  << (lclNumRows + 1) << endl;
1245  std::cerr << os.str();
1246  }
1247  typename row_map_type::non_const_type
1248  packedRowOffsets("Tpetra::CrsGraph::ptr", lclNumRows + 1);
1249  typename row_entries_type::const_type numRowEnt_h =
1250  myGraph_->k_numRowEntries_;
1251  // We're computing offsets on device. This function can
1252  // handle numRowEnt_h being a host View.
1253  lclTotalNumEntries =
1254  computeOffsetsFromCounts(packedRowOffsets, numRowEnt_h);
1255  // packedRowOffsets is modifiable; k_ptrs isn't, so we have
1256  // to use packedRowOffsets in the loop above and assign here.
1257  k_ptrs = packedRowOffsets;
1258  k_ptrs_const = k_ptrs;
1259  }
1260 
1261  if (debug) {
1262  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(k_ptrs.extent(0)) != lclNumRows + 1,
1263  std::logic_error,
1264  "(unpacked branch) After packing k_ptrs, "
1265  "k_ptrs.extent(0) = "
1266  << k_ptrs.extent(0) << " != "
1267  "lclNumRows+1 = "
1268  << (lclNumRows + 1) << ".");
1269  const auto valToCheck = getEntryOnHost(k_ptrs, lclNumRows);
1270  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(valToCheck != lclTotalNumEntries, std::logic_error,
1271  "(unpacked branch) After filling k_ptrs, "
1272  "k_ptrs(lclNumRows="
1273  << lclNumRows << ") = " << valToCheck
1274  << " != total number of entries on the calling process = "
1275  << lclTotalNumEntries << ".");
1276  }
1277 
1278  // Allocate the arrays of packed column indices and values.
1279  if (verbose) {
1280  std::ostringstream os;
1281  os << *prefix << "Allocate packed local column indices: "
1282  << lclTotalNumEntries << endl;
1283  std::cerr << os.str();
1284  }
1285  k_inds = lclinds_1d_type("Tpetra::CrsGraph::lclInds", lclTotalNumEntries);
1286  if (verbose) {
1287  std::ostringstream os;
1288  os << *prefix << "Allocate packed values: "
1289  << lclTotalNumEntries << endl;
1290  std::cerr << os.str();
1291  }
1292  k_vals = values_type("Tpetra::CrsMatrix::values", lclTotalNumEntries);
1293 
1294  // curRowOffsets (myGraph_->rowPtrsUnpacked_) (???), lclIndsUnpacked_wdv,
1295  // and valuesUnpacked_wdv are currently unpacked. Pack them, using
1296  // the packed row offsets array k_ptrs that we created above.
1297  //
1298  // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
1299  // need to keep around the unpacked row offsets, column
1300  // indices, and values arrays.
1301 
1302  // Pack the column indices from unpacked lclIndsUnpacked_wdv into
1303  // packed k_inds. We will replace lclIndsUnpacked_wdv below.
1304  using inds_packer_type = pack_functor<
1305  typename Graph::local_graph_device_type::entries_type::non_const_type,
1306  typename Graph::local_inds_dualv_type::t_dev::const_type,
1307  typename Graph::local_graph_device_type::row_map_type::non_const_type,
1308  typename Graph::local_graph_device_type::row_map_type>;
1309  inds_packer_type indsPacker(
1310  k_inds,
1311  myGraph_->lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
1312  k_ptrs, curRowOffsets);
1313  using exec_space = typename decltype(k_inds)::execution_space;
1314  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1315  Kokkos::parallel_for("Tpetra::CrsMatrix pack column indices",
1316  range_type(0, lclNumRows), indsPacker);
1317 
1318  // Pack the values from unpacked valuesUnpacked_wdv into packed
1319  // k_vals. We will replace valuesPacked_wdv below.
1320  using vals_packer_type = pack_functor<
1321  typename values_type::non_const_type,
1322  typename values_type::const_type,
1323  typename row_map_type::non_const_type,
1324  typename row_map_type::const_type>;
1325  vals_packer_type valsPacker(
1326  k_vals,
1327  this->valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1328  k_ptrs, curRowOffsets);
1329  Kokkos::parallel_for("Tpetra::CrsMatrix pack values",
1330  range_type(0, lclNumRows), valsPacker);
1331 
1332  if (debug) {
1333  const char myPrefix[] =
1334  "(\"Optimize Storage\""
1335  "=true branch) After packing, ";
1336  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(k_ptrs.extent(0) == 0, std::logic_error, myPrefix << "k_ptrs.extent(0) = 0. This probably means that "
1337  "rowPtrsUnpacked_ was never allocated.");
1338  if (k_ptrs.extent(0) != 0) {
1339  const size_t numOffsets(k_ptrs.extent(0));
1340  const auto valToCheck =
1341  getEntryOnHost(k_ptrs, numOffsets - 1);
1342  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(size_t(valToCheck) != k_vals.extent(0),
1343  std::logic_error, myPrefix << "k_ptrs(" << (numOffsets - 1) << ") = " << valToCheck << " != k_vals.extent(0) = " << k_vals.extent(0) << ".");
1344  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(size_t(valToCheck) != k_inds.extent(0),
1345  std::logic_error, myPrefix << "k_ptrs(" << (numOffsets - 1) << ") = " << valToCheck << " != k_inds.extent(0) = " << k_inds.extent(0) << ".");
1346  }
1347  }
1348  // Build the local graph.
1349  myGraph_->setRowPtrsPacked(k_ptrs_const);
1350  myGraph_->lclIndsPacked_wdv =
1351  typename crs_graph_type::local_inds_wdv_type(k_inds);
1352  valuesPacked_wdv = values_wdv_type(k_vals);
1353  } else { // We don't have to pack, so just set the pointers.
1354  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1355  // FIXME? This is already done in the graph fill call - need to avoid the memcpy to host
1356  myGraph_->rowPtrsPacked_dev_ = myGraph_->rowPtrsUnpacked_dev_;
1357  myGraph_->rowPtrsPacked_host_ = myGraph_->rowPtrsUnpacked_host_;
1358  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1359  myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
1360  valuesPacked_wdv = valuesUnpacked_wdv;
1361 
1362  if (verbose) {
1363  std::ostringstream os;
1364  os << *prefix << "Storage already packed: rowPtrsUnpacked_: "
1365  << myGraph_->getRowPtrsUnpackedHost().extent(0) << ", lclIndsUnpacked_wdv: "
1366  << myGraph_->lclIndsUnpacked_wdv.extent(0) << ", valuesUnpacked_wdv: "
1367  << valuesUnpacked_wdv.extent(0) << endl;
1368  std::cerr << os.str();
1369  }
1370 
1371  if (debug) {
1372  const char myPrefix[] =
1373  "(\"Optimize Storage\"=false branch) ";
1374  auto rowPtrsUnpackedHost = myGraph_->getRowPtrsUnpackedHost();
1375  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(myGraph_->rowPtrsUnpacked_dev_.extent(0) == 0, std::logic_error, myPrefix << "myGraph->rowPtrsUnpacked_dev_.extent(0) = 0. This probably means "
1376  "that rowPtrsUnpacked_ was never allocated.");
1377  if (myGraph_->rowPtrsUnpacked_dev_.extent(0) != 0) {
1378  const size_t numOffsets = rowPtrsUnpackedHost.extent(0);
1379  const auto valToCheck = rowPtrsUnpackedHost(numOffsets - 1);
1380  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(size_t(valToCheck) != valuesPacked_wdv.extent(0),
1381  std::logic_error, myPrefix << "k_ptrs_const(" << (numOffsets - 1) << ") = " << valToCheck << " != valuesPacked_wdv.extent(0) = " << valuesPacked_wdv.extent(0) << ".");
1382  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(size_t(valToCheck) != myGraph_->lclIndsPacked_wdv.extent(0),
1383  std::logic_error, myPrefix << "k_ptrs_const(" << (numOffsets - 1) << ") = " << valToCheck << " != myGraph_->lclIndsPacked.extent(0) = " << myGraph_->lclIndsPacked_wdv.extent(0) << ".");
1384  }
1385  }
1386  }
1387 
1388  if (debug) {
1389  const char myPrefix[] = "After packing, ";
1390  auto rowPtrsPackedHost = myGraph_->getRowPtrsPackedHost();
1391  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(size_t(rowPtrsPackedHost.extent(0)) != size_t(lclNumRows + 1),
1392  std::logic_error, myPrefix << "myGraph_->rowPtrsPacked_host_.extent(0) = " << rowPtrsPackedHost.extent(0) << " != lclNumRows+1 = " << (lclNumRows + 1) << ".");
1393  if (rowPtrsPackedHost.extent(0) != 0) {
1394  const size_t numOffsets(rowPtrsPackedHost.extent(0));
1395  const size_t valToCheck = rowPtrsPackedHost(numOffsets - 1);
1396  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(valToCheck != size_t(valuesPacked_wdv.extent(0)),
1397  std::logic_error, myPrefix << "k_ptrs_const(" << (numOffsets - 1) << ") = " << valToCheck << " != valuesPacked_wdv.extent(0) = " << valuesPacked_wdv.extent(0) << ".");
1398  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(valToCheck != size_t(myGraph_->lclIndsPacked_wdv.extent(0)),
1399  std::logic_error, myPrefix << "k_ptrs_const(" << (numOffsets - 1) << ") = " << valToCheck << " != myGraph_->lclIndsPacked_wdvk_inds.extent(0) = " << myGraph_->lclIndsPacked_wdv.extent(0) << ".");
1400  }
1401  }
1402 
1403  // May we ditch the old allocations for the packed (and otherwise
1404  // "optimized") allocations, later in this routine? Optimize
1405  // storage if the graph is not static, or if the graph already has
1406  // optimized storage.
1407  const bool defaultOptStorage =
1408  !isStaticGraph() || staticGraph_->isStorageOptimized();
1409  const bool requestOptimizedStorage =
1410  (!params.is_null() &&
1411  params->get("Optimize Storage", defaultOptStorage)) ||
1412  (params.is_null() && defaultOptStorage);
1413 
1414  // The graph has optimized storage when indices are allocated,
1415  // myGraph_->k_numRowEntries_ is empty, and there are more than
1416  // zero rows on this process.
1417  if (requestOptimizedStorage) {
1418  // Free the old, unpacked, unoptimized allocations.
1419  // Free graph data structures that are only needed for
1420  // unpacked 1-D storage.
1421  if (verbose) {
1422  std::ostringstream os;
1423  os << *prefix << "Optimizing storage: free k_numRowEntries_: "
1424  << myGraph_->k_numRowEntries_.extent(0) << endl;
1425  std::cerr << os.str();
1426  }
1427 
1428  myGraph_->k_numRowEntries_ = row_entries_type();
1429 
1430  // Keep the new 1-D packed allocations.
1431  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1432  // We directly set the memory spaces to avoid a memcpy from device to host
1433  myGraph_->rowPtrsUnpacked_dev_ = myGraph_->rowPtrsPacked_dev_;
1434  myGraph_->rowPtrsUnpacked_host_ = myGraph_->rowPtrsPacked_host_;
1435  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1436  myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
1437  valuesUnpacked_wdv = valuesPacked_wdv;
1438 
1439  myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
1440  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1441  } else {
1442  if (verbose) {
1443  std::ostringstream os;
1444  os << *prefix << "User requested NOT to optimize storage"
1445  << endl;
1446  std::cerr << os.str();
1447  }
1448  }
1449 }
1450 
// Builds the packed values array (valuesPacked_wdv) for a matrix, using the
// structure of staticGraph_ rather than modifying any graph.
//
// If staticGraph_'s local entry count differs from its allocation size,
// packed row offsets are computed from staticGraph_->k_numRowEntries_, a new
// values array is allocated, and a pack_functor kernel copies the values
// into it. Otherwise valuesPacked_wdv is set to alias valuesUnpacked_wdv.
//
// If optimized storage is requested and staticGraph_ reports optimized
// storage, valuesUnpacked_wdv is set to alias valuesPacked_wdv and
// storageStatus_ becomes Details::STORAGE_1D_PACKED. If it is requested but
// the graph is not storage-optimized, TPETRA_ABUSE_WARNING is invoked and
// the request is dropped.
//
// \param params  May be null. Read for the bool entry "Optimize Storage".
//
// staticGraph_ is dereferenced without a null check. When
// Details::Behavior::verbose("CrsMatrix") is true, progress is written to
// std::cerr.
1451 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1453  fillLocalMatrix(const Teuchos::RCP<Teuchos::ParameterList>& params) {
1454  using std::endl;
1455  using Teuchos::Array;
1456  using Teuchos::ArrayRCP;
1457  using Teuchos::null;
1458  using Teuchos::RCP;
1459  using Teuchos::rcp;
1460  using ::Tpetra::Details::ProfilingRegion;
1461  using row_map_type = typename Graph::local_graph_device_type::row_map_type;
1462  using non_const_row_map_type = typename row_map_type::non_const_type;
1463  using values_type = typename local_matrix_device_type::values_type;
1464  ProfilingRegion regionFLM("Tpetra::CrsMatrix::fillLocalMatrix");
1465  const size_t lclNumRows = getLocalNumRows();
1466 
1467  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1468  std::unique_ptr<std::string> prefix;
1469  if (verbose) {
1470  prefix = this->createPrefix("CrsMatrix", "fillLocalMatrix");
1471  std::ostringstream os;
1472  os << *prefix << "lclNumRows: " << lclNumRows << endl;
1473  std::cerr << os.str();
1474  }
1475 
1476  // The goals of this routine are first, to allocate and fill
1477  // packed 1-D storage (see below for an explanation) in the vals
1478  // array, and second, to give vals to the local matrix and
1479  // finalize the local matrix. We only need k_ptrs, the packed 1-D
1480  // row offsets, within the scope of this routine, since we're only
1481  // filling the local matrix here (use fillLocalGraphAndMatrix() to
1482  // fill both the graph and the matrix at the same time).
1483 
1484  // get data from staticGraph_
1485  size_t nodeNumEntries = staticGraph_->getLocalNumEntries();
1486  size_t nodeNumAllocated = staticGraph_->getLocalAllocationSize();
// NOTE(review): k_rowPtrs is taken from rowPtrsPacked_dev_, yet below it is
// passed to pack_functor in the argument position where
// fillLocalGraphAndMatrix passes the unpacked offsets (curRowOffsets).
// Confirm that using the packed offsets as the source layout is intended.
1487  row_map_type k_rowPtrs = staticGraph_->rowPtrsPacked_dev_;
1488 
// NOTE(review): k_ptrs is assigned once below but never read afterwards
// within this function.
1489  row_map_type k_ptrs; // "packed" row offsets array
1490  values_type k_vals; // "packed" values array
1491 
1492  // May we ditch the old allocations for the packed (and otherwise
1493  // "optimized") allocations, later in this routine? Request
1494  // optimized storage by default.
1495  bool requestOptimizedStorage = true;
1496  const bool default_OptimizeStorage =
1497  !isStaticGraph() || staticGraph_->isStorageOptimized();
1498  if (!params.is_null() &&
1499  !params->get("Optimize Storage", default_OptimizeStorage)) {
1500  requestOptimizedStorage = false;
1501  }
1502  // If we're not allowed to change a static graph, then we can't
1503  // change the storage of the matrix, either. This means that if
1504  // the graph's storage isn't already optimized, we can't optimize
1505  // the matrix's storage either. Check and give warning, as
1506  // appropriate.
1507  if (!staticGraph_->isStorageOptimized() &&
1508  requestOptimizedStorage) {
1509  TPETRA_ABUSE_WARNING(true, std::runtime_error,
1510  "You requested optimized storage "
1511  "by setting the \"Optimize Storage\" flag to \"true\" in "
1512  "the ParameterList, or by virtue of default behavior. "
1513  "However, the associated CrsGraph was filled separately and "
1514  "requested not to optimize storage. Therefore, the "
1515  "CrsMatrix cannot optimize storage.");
1516  requestOptimizedStorage = false;
1517  }
1518 
1519  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1520  // See: https://github.com/trilinos/Trilinos/issues/12237
1521  // using row_entries_type = decltype (staticGraph_->k_numRowEntries_);
1522  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1523 
1524  // The matrix's values are currently
1525  // stored in a 1-D format. However, this format is "unpacked";
1526  // it doesn't necessarily have the same row offsets as indicated
1527  // by the ptrs array returned by allocRowPtrs. This could
1528  // happen, for example, if the user
1529  // fixed the number of matrix entries in
1530  // each row, but didn't fill all those entries.
1531  //
1532  // As above, we don't need to keep the "packed" row offsets
1533  // array ptrs here, but we do need it here temporarily, so we
1534  // have to allocate it. We'll free ptrs later in this method.
1535  //
1536  // Note that this routine checks whether storage has already
1537  // been packed. This is a common case for solution of nonlinear
1538  // PDEs using the finite element method, as long as the
1539  // structure of the sparse matrix does not change between linear
1540  // solves.
1541  if (nodeNumEntries != nodeNumAllocated) {
1542  if (verbose) {
1543  std::ostringstream os;
1544  os << *prefix << "Unpacked 1-D storage: numEnt="
1545  << nodeNumEntries << ", allocSize=" << nodeNumAllocated
1546  << endl;
1547  std::cerr << os.str();
1548  }
1549  // We have to pack the 1-D storage, since the user didn't fill
1550  // up all requested storage.
1551  if (verbose) {
1552  std::ostringstream os;
1553  os << *prefix << "Allocate packed row offsets: "
1554  << (lclNumRows + 1) << endl;
1555  std::cerr << os.str();
1556  }
1557  non_const_row_map_type tmpk_ptrs("Tpetra::CrsGraph::ptr",
1558  lclNumRows + 1);
1559  // Total number of entries in the matrix on the calling
1560  // process. We will compute this in the loop below. It's
1561  // cheap to compute and useful as a sanity check.
1562  size_t lclTotalNumEntries = 0;
1563  k_ptrs = tmpk_ptrs;
1564  {
1565  typename row_entries_type::const_type numRowEnt_h =
1566  staticGraph_->k_numRowEntries_;
1567  // This function can handle the counts being a host View.
1568  lclTotalNumEntries =
1569  Details::computeOffsetsFromCounts(tmpk_ptrs, numRowEnt_h);
1570  }
1571 
1572  // Allocate the "packed" values array.
1573  // It has exactly the right number of entries.
1574  if (verbose) {
1575  std::ostringstream os;
1576  os << *prefix << "Allocate packed values: "
1577  << lclTotalNumEntries << endl;
1578  std::cerr << os.str();
1579  }
1580  k_vals = values_type("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1581 
1582  // Pack values_wdv into k_vals. We will replace values_wdv below.
1583  pack_functor<
1584  typename values_type::non_const_type,
1585  typename values_type::const_type,
1586  typename row_map_type::non_const_type,
1587  typename row_map_type::const_type>
1588  valsPacker(k_vals, valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1589  tmpk_ptrs, k_rowPtrs);
1590 
1591  using exec_space = typename decltype(k_vals)::execution_space;
1592  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1593  Kokkos::parallel_for("Tpetra::CrsMatrix pack values",
1594  range_type(0, lclNumRows), valsPacker);
1595  valuesPacked_wdv = values_wdv_type(k_vals);
1596  } else { // We don't have to pack, so just set the pointer.
1597  valuesPacked_wdv = valuesUnpacked_wdv;
1598  if (verbose) {
1599  std::ostringstream os;
1600  os << *prefix << "Storage already packed: "
1601  << "valuesUnpacked_wdv: " << valuesUnpacked_wdv.extent(0) << endl;
1602  std::cerr << os.str();
1603  }
1604  }
1605 
1606  // May we ditch the old allocations for the packed one?
1607  if (requestOptimizedStorage) {
1608  // The user requested optimized storage, so we can dump the
1609  // unpacked 1-D storage, and keep the packed storage.
1610  valuesUnpacked_wdv = valuesPacked_wdv;
1611  // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
1612  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1613  }
1614 }
1615 
1616 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1618  insertIndicesAndValues(crs_graph_type& graph,
1619  RowInfo& rowInfo,
1620  const typename crs_graph_type::SLocalGlobalViews& newInds,
1621  const Teuchos::ArrayView<impl_scalar_type>& oldRowVals,
1622  const Teuchos::ArrayView<const impl_scalar_type>& newRowVals,
1623  const ELocalGlobal lg,
1624  const ELocalGlobal I) {
1625  const size_t oldNumEnt = rowInfo.numEntries;
1626  const size_t numInserted = graph.insertIndices(rowInfo, newInds, lg, I);
1627 
1628  // Use of memcpy here works around an issue with GCC >= 4.9.0,
1629  // that probably relates to scalar_type vs. impl_scalar_type
1630  // aliasing. See history of Tpetra_CrsGraph_def.hpp for
1631  // details; look for GCC_WORKAROUND macro definition.
1632  if (numInserted > 0) {
1633  const size_t startOffset = oldNumEnt;
1634  memcpy((void*)&oldRowVals[startOffset], &newRowVals[0],
1635  numInserted * sizeof(impl_scalar_type));
1636  }
1637 }
1638 
1639 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1641  insertLocalValues(const LocalOrdinal lclRow,
1642  const Teuchos::ArrayView<const LocalOrdinal>& indices,
1643  const Teuchos::ArrayView<const Scalar>& values,
1644  const CombineMode CM) {
1645  using std::endl;
1646  const char tfecfFuncName[] = "insertLocalValues: ";
1647 
1648  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!this->isFillActive(), std::runtime_error,
1649  "Fill is not active. After calling fillComplete, you must call "
1650  "resumeFill before you may insert entries into the matrix again.");
1651  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->isStaticGraph(), std::runtime_error,
1652  "Cannot insert indices with static graph; use replaceLocalValues() "
1653  "instead.");
1654  // At this point, we know that myGraph_ is nonnull.
1655  crs_graph_type& graph = *(this->myGraph_);
1656  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(graph.colMap_.is_null(), std::runtime_error,
1657  "Cannot insert local indices without a column map.");
1658  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(graph.isGloballyIndexed(),
1659  std::runtime_error,
1660  "Graph indices are global; use "
1661  "insertGlobalValues().");
1662  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(values.size() != indices.size(), std::runtime_error,
1663  "values.size() = " << values.size()
1664  << " != indices.size() = " << indices.size() << ".");
1665  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1666  !graph.rowMap_->isNodeLocalElement(lclRow), std::runtime_error,
1667  "Local row index " << lclRow << " does not belong to this process.");
1668 
1669  if (!graph.indicesAreAllocated()) {
1670  // We only allocate values at most once per process, so it's OK
1671  // to check TPETRA_VERBOSE here.
1672  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1673  this->allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
1674  }
1675 
1676 #ifdef HAVE_TPETRA_DEBUG
1677  const size_t numEntriesToAdd = static_cast<size_t>(indices.size());
1678  // In a debug build, test whether any of the given column indices
1679  // are not in the column Map. Keep track of the invalid column
1680  // indices so we can tell the user about them.
1681  {
1682  using Teuchos::toString;
1683 
1684  const map_type& colMap = *(graph.colMap_);
1685  Teuchos::Array<LocalOrdinal> badColInds;
1686  bool allInColMap = true;
1687  for (size_t k = 0; k < numEntriesToAdd; ++k) {
1688  if (!colMap.isNodeLocalElement(indices[k])) {
1689  allInColMap = false;
1690  badColInds.push_back(indices[k]);
1691  }
1692  }
1693  if (!allInColMap) {
1694  std::ostringstream os;
1695  os << "You attempted to insert entries in owned row " << lclRow
1696  << ", at the following column indices: " << toString(indices)
1697  << "." << endl;
1698  os << "Of those, the following indices are not in the column Map on "
1699  "this process: "
1700  << toString(badColInds) << "." << endl
1701  << "Since "
1702  "the matrix has a column Map already, it is invalid to insert "
1703  "entries at those locations.";
1704  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::invalid_argument, os.str());
1705  }
1706  }
1707 #endif // HAVE_TPETRA_DEBUG
1708 
1709  RowInfo rowInfo = graph.getRowInfo(lclRow);
1710 
1711  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1712  if (CM == ADD) {
1713  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) { valsView[offset] += values[k]; };
1714  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1715  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1716  } else if (CM == INSERT) {
1717  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) { valsView[offset] = values[k]; };
1718  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1719  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1720  } else {
1721  std::ostringstream os;
1722  os << "You attempted to use insertLocalValues with CombineMode " << combineModeToString(CM)
1723  << "but this has not been implemented." << endl;
1724  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::invalid_argument, os.str());
1725  }
1726 }
1727 
1728 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1730  insertLocalValues(const LocalOrdinal localRow,
1731  const LocalOrdinal numEnt,
1732  const Scalar vals[],
1733  const LocalOrdinal cols[],
1734  const CombineMode CM) {
1735  Teuchos::ArrayView<const LocalOrdinal> colsT(cols, numEnt);
1736  Teuchos::ArrayView<const Scalar> valsT(vals, numEnt);
1737  this->insertLocalValues(localRow, colsT, valsT, CM);
1738 }
1739 
// insertGlobalValuesImpl (name as given by tfecfFuncName below): insert
// numInputEnt entries, addressed by global column index, into the row
// described by rowInfo, adding each input value into the slot that the
// graph reports for it.
//
// rowInfo      Row description; refreshed in place if values get allocated.
// gblColInds   Global column indices, numInputEnt of them.
// vals         Values, numInputEnt of them.
// numInputEnt  Number of input entries.
//
// In a HAVE_TPETRA_DEBUG build, throws std::logic_error (with a detailed
// dump of the row) if the row's entry count after insertion disagrees with
// the count before insertion plus the number the graph says it inserted.
1740 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1743  RowInfo& rowInfo,
1744  const GlobalOrdinal gblColInds[],
1745  const impl_scalar_type vals[],
1746  const size_t numInputEnt) {
1747 #ifdef HAVE_TPETRA_DEBUG
1748  const char tfecfFuncName[] = "insertGlobalValuesImpl: ";
// Both counts are captured before any allocation below refreshes rowInfo.
1749  const size_t origNumEnt = graph.getNumEntriesInLocalRow(rowInfo.localRow);
1750  const size_t curNumEnt = rowInfo.numEntries;
1751 #endif // HAVE_TPETRA_DEBUG
1752 
1753  if (!graph.indicesAreAllocated()) {
1754  // We only allocate values at most once per process, so it's OK
1755  // to check TPETRA_VERBOSE here.
1756  using ::Tpetra::Details::Behavior;
1757  const bool verbose = Behavior::verbose("CrsMatrix");
1758  this->allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
1759  // mfh 23 Jul 2017: allocateValues invalidates existing
1760  // getRowInfo results. Once we get rid of lazy graph
1761  // allocation, we'll be able to move the getRowInfo call outside
1762  // of this method.
1763  rowInfo = graph.getRowInfo(rowInfo.localRow);
1764  }
1765 
// Callback invoked by the graph per input entry: k indexes the input
// arrays, offset indexes this row's values.  Note the += (values are summed).
1766  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1767  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1768  valsView[offset] += vals[k];
1769  };
1770  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
// The graph call below is a single statement split by the preprocessor:
// its result is bound to numInserted only in a debug build and discarded
// otherwise.
1771 #ifdef HAVE_TPETRA_DEBUG
1772  // numInserted is only used inside the debug code below.
1773  auto numInserted =
1774 #endif
1775  graph.insertGlobalIndicesImpl(rowInfo, gblColInds, numInputEnt, cb);
1776 
1777 #ifdef HAVE_TPETRA_DEBUG
// Consistency check: expected count vs. what the graph now reports.
1778  size_t newNumEnt = curNumEnt + numInserted;
1779  const size_t chkNewNumEnt =
1780  graph.getNumEntriesInLocalRow(rowInfo.localRow);
1781  if (chkNewNumEnt != newNumEnt) {
// Build the diagnostic: the counts first, then the caller's input arrays.
1782  std::ostringstream os;
1783  os << std::endl
1784  << "newNumEnt = " << newNumEnt
1785  << " != graph.getNumEntriesInLocalRow(" << rowInfo.localRow
1786  << ") = " << chkNewNumEnt << "." << std::endl
1787  << "\torigNumEnt: " << origNumEnt << std::endl
1788  << "\tnumInputEnt: " << numInputEnt << std::endl
1789  << "\tgblColInds: [";
1790  for (size_t k = 0; k < numInputEnt; ++k) {
1791  os << gblColInds[k];
1792  if (k + size_t(1) < numInputEnt) {
1793  os << ",";
1794  }
1795  }
1796  os << "]" << std::endl
1797  << "\tvals: [";
1798  for (size_t k = 0; k < numInputEnt; ++k) {
1799  os << vals[k];
1800  if (k + size_t(1) < numInputEnt) {
1801  os << ",";
1802  }
1803  }
1804  os << "]" << std::endl;
1805 
// If row views are available, also dump the row's current contents, using
// whichever index kind (global or local) the matrix currently has.
1806  if (this->supportsRowViews()) {
1807  values_host_view_type vals2;
1808  if (this->isGloballyIndexed()) {
1809  global_inds_host_view_type gblColInds2;
1810  const GlobalOrdinal gblRow =
1811  graph.rowMap_->getGlobalElement(rowInfo.localRow);
1812  if (gblRow ==
1813  Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid()) {
1814  os << "Local row index " << rowInfo.localRow << " is invalid!"
1815  << std::endl;
1816  } else {
// A failure while fetching the view is reported inside the diagnostic
// instead of replacing the logic_error thrown at the end.
1817  bool getViewThrew = false;
1818  try {
1819  this->getGlobalRowView(gblRow, gblColInds2, vals2);
1820  } catch (std::exception& e) {
1821  getViewThrew = true;
1822  os << "getGlobalRowView threw exception:" << std::endl
1823  << e.what() << std::endl;
1824  }
1825  if (!getViewThrew) {
1826  os << "\tNew global column indices: ";
1827  for (size_t jjj = 0; jjj < gblColInds2.extent(0); jjj++)
1828  os << gblColInds2[jjj] << " ";
1829  os << std::endl;
1830  os << "\tNew values: ";
1831  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1832  os << vals2[jjj] << " ";
1833  os << std::endl;
1834  }
1835  }
1836  } else if (this->isLocallyIndexed()) {
1837  local_inds_host_view_type lclColInds2;
1838  this->getLocalRowView(rowInfo.localRow, lclColInds2, vals2);
1839  os << "\tNew local column indices: ";
1840  for (size_t jjj = 0; jjj < lclColInds2.extent(0); jjj++)
1841  os << lclColInds2[jjj] << " ";
1842  os << std::endl;
1843  os << "\tNew values: ";
1844  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1845  os << vals2[jjj] << " ";
1846  os << std::endl;
1847  }
1848  }
1849 
1850  os << "Please report this bug to the Tpetra developers.";
1851  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, os.str());
1852  }
1853 #endif // HAVE_TPETRA_DEBUG
1854 }
1855 
1856 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1858  insertGlobalValues(const GlobalOrdinal gblRow,
1859  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
1860  const Teuchos::ArrayView<const Scalar>& values) {
1861  using std::endl;
1862  using Teuchos::toString;
1863  typedef impl_scalar_type IST;
1864  typedef LocalOrdinal LO;
1865  typedef GlobalOrdinal GO;
1866  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
1867  typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
1868  const char tfecfFuncName[] = "insertGlobalValues: ";
1869 
1870 #ifdef HAVE_TPETRA_DEBUG
1871  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(values.size() != indices.size(), std::runtime_error,
1872  "values.size() = " << values.size() << " != indices.size() = "
1873  << indices.size() << ".");
1874 #endif // HAVE_TPETRA_DEBUG
1875 
1876  // getRowMap() is not thread safe, because it increments RCP's
1877  // reference count. getCrsGraphRef() is thread safe.
1878  const map_type& rowMap = *(this->getCrsGraphRef().rowMap_);
1879  const LO lclRow = rowMap.getLocalElement(gblRow);
1880 
1881  if (lclRow == OTLO::invalid()) {
1882  // Input row is _not_ owned by the calling process.
1883  //
1884  // See a note (now deleted) from mfh 14 Dec 2012: If input row
1885  // is not in the row Map, it doesn't matter whether or not the
1886  // graph is static; the data just get stashed for later use by
1887  // globalAssemble().
1888  this->insertNonownedGlobalValues(gblRow, indices, values);
1889  } else { // Input row _is_ owned by the calling process
1890  if (this->isStaticGraph()) {
1891  // Uh oh! Not allowed to insert into owned rows in that case.
1892  const int myRank = rowMap.getComm()->getRank();
1893  const int numProcs = rowMap.getComm()->getSize();
1894  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
1895  "The matrix was constructed with a constant (\"static\") graph, "
1896  "yet the given global row index "
1897  << gblRow << " is in the row "
1898  "Map on the calling process (with rank "
1899  << myRank << ", of " << numProcs << " process(es)). In this case, you may not insert "
1900  "new entries into rows owned by the calling process.");
1901  }
1902 
1903  crs_graph_type& graph = *(this->myGraph_);
1904  const IST* const inputVals =
1905  reinterpret_cast<const IST*>(values.getRawPtr());
1906  const GO* const inputGblColInds = indices.getRawPtr();
1907  const size_t numInputEnt = indices.size();
1908  RowInfo rowInfo = graph.getRowInfo(lclRow);
1909 
1910  // If the matrix has a column Map, check at this point whether
1911  // the column indices belong to the column Map.
1912  //
1913  // FIXME (mfh 16 May 2013) We may want to consider deferring the
1914  // test to the CrsGraph method, since it may have to do this
1915  // anyway.
1916  if (!graph.colMap_.is_null()) {
1917  const map_type& colMap = *(graph.colMap_);
1918  // In a debug build, keep track of the nonowned ("bad") column
1919  // indices, so that we can display them in the exception
1920  // message. In a release build, just ditch the loop early if
1921  // we encounter a nonowned column index.
1922 #ifdef HAVE_TPETRA_DEBUG
1923  Teuchos::Array<GO> badColInds;
1924 #endif // HAVE_TPETRA_DEBUG
1925  const size_type numEntriesToInsert = indices.size();
1926  bool allInColMap = true;
1927  for (size_type k = 0; k < numEntriesToInsert; ++k) {
1928  if (!colMap.isNodeGlobalElement(indices[k])) {
1929  allInColMap = false;
1930 #ifdef HAVE_TPETRA_DEBUG
1931  badColInds.push_back(indices[k]);
1932 #else
1933  break;
1934 #endif // HAVE_TPETRA_DEBUG
1935  }
1936  }
1937  if (!allInColMap) {
1938  std::ostringstream os;
1939  os << "You attempted to insert entries in owned row " << gblRow
1940  << ", at the following column indices: " << toString(indices)
1941  << "." << endl;
1942 #ifdef HAVE_TPETRA_DEBUG
1943  os << "Of those, the following indices are not in the column Map "
1944  "on this process: "
1945  << toString(badColInds) << "." << endl
1946  << "Since the matrix has a column Map already, it is invalid "
1947  "to insert entries at those locations.";
1948 #else
1949  os << "At least one of those indices is not in the column Map "
1950  "on this process."
1951  << endl
1952  << "It is invalid to insert into "
1953  "columns not in the column Map on the process that owns the "
1954  "row.";
1955 #endif // HAVE_TPETRA_DEBUG
1956  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::invalid_argument, os.str());
1957  }
1958  }
1959 
1960  this->insertGlobalValuesImpl(graph, rowInfo, inputGblColInds,
1961  inputVals, numInputEnt);
1962  }
1963 }
1964 
1965 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1967  insertGlobalValues(const GlobalOrdinal globalRow,
1968  const LocalOrdinal numEnt,
1969  const Scalar vals[],
1970  const GlobalOrdinal inds[]) {
1971  Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numEnt);
1972  Teuchos::ArrayView<const Scalar> valsT(vals, numEnt);
1973  this->insertGlobalValues(globalRow, indsT, valsT);
1974 }
1975 
1976 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1979  const GlobalOrdinal gblRow,
1980  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
1981  const Teuchos::ArrayView<const Scalar>& values,
1982  const bool debug) {
1983  typedef impl_scalar_type IST;
1984  typedef LocalOrdinal LO;
1985  typedef GlobalOrdinal GO;
1986  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
1987  const char tfecfFuncName[] = "insertGlobalValuesFiltered: ";
1988 
1989  if (debug) {
1990  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(values.size() != indices.size(), std::runtime_error,
1991  "values.size() = " << values.size() << " != indices.size() = "
1992  << indices.size() << ".");
1993  }
1994 
1995  // getRowMap() is not thread safe, because it increments RCP's
1996  // reference count. getCrsGraphRef() is thread safe.
1997  const map_type& rowMap = *(this->getCrsGraphRef().rowMap_);
1998  const LO lclRow = rowMap.getLocalElement(gblRow);
1999  if (lclRow == OTLO::invalid()) {
2000  // Input row is _not_ owned by the calling process.
2001  //
2002  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2003  // is not in the row Map, it doesn't matter whether or not the
2004  // graph is static; the data just get stashed for later use by
2005  // globalAssemble().
2006  this->insertNonownedGlobalValues(gblRow, indices, values);
2007  } else { // Input row _is_ owned by the calling process
2008  if (this->isStaticGraph()) {
2009  // Uh oh! Not allowed to insert into owned rows in that case.
2010  const int myRank = rowMap.getComm()->getRank();
2011  const int numProcs = rowMap.getComm()->getSize();
2012  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
2013  "The matrix was constructed with a constant (\"static\") graph, "
2014  "yet the given global row index "
2015  << gblRow << " is in the row "
2016  "Map on the calling process (with rank "
2017  << myRank << ", of " << numProcs << " process(es)). In this case, you may not insert "
2018  "new entries into rows owned by the calling process.");
2019  }
2020 
2021  crs_graph_type& graph = *(this->myGraph_);
2022  const IST* const inputVals =
2023  reinterpret_cast<const IST*>(values.getRawPtr());
2024  const GO* const inputGblColInds = indices.getRawPtr();
2025  const size_t numInputEnt = indices.size();
2026  RowInfo rowInfo = graph.getRowInfo(lclRow);
2027 
2028  if (!graph.colMap_.is_null() && graph.isLocallyIndexed()) {
2029  // This branch is similar in function to the following branch, but for
2030  // the special case that the target graph is locally indexed.
2031  // In this case, we cannot simply filter
2032  // out global indices that don't exist on the receiving process and
2033  // insert the remaining (global) indices, but we must convert them (the
2034  // remaining global indices) to local and call `insertLocalValues`.
2035  const map_type& colMap = *(graph.colMap_);
2036  size_t curOffset = 0;
2037  while (curOffset < numInputEnt) {
2038  // Find a sequence of input indices that are in the column Map on the
2039  // calling process. Doing a sequence at a time, instead of one at a
2040  // time, amortizes some overhead.
2041  Teuchos::Array<LO> lclIndices;
2042  size_t endOffset = curOffset;
2043  for (; endOffset < numInputEnt; ++endOffset) {
2044  auto lclIndex = colMap.getLocalElement(inputGblColInds[endOffset]);
2045  if (lclIndex != OTLO::invalid())
2046  lclIndices.push_back(lclIndex);
2047  else
2048  break;
2049  }
2050  // curOffset, endOffset: half-exclusive range of indices in the column
2051  // Map on the calling process. If endOffset == curOffset, the range is
2052  // empty.
2053  const LO numIndInSeq = (endOffset - curOffset);
2054  if (numIndInSeq != 0) {
2055  this->insertLocalValues(lclRow, lclIndices(), values(curOffset, numIndInSeq));
2056  }
2057  // Invariant before the increment line: Either endOffset ==
2058  // numInputEnt, or inputGblColInds[endOffset] is not in the column Map
2059  // on the calling process.
2060  if (debug) {
2061  const bool invariant = endOffset == numInputEnt ||
2062  colMap.getLocalElement(inputGblColInds[endOffset]) == OTLO::invalid();
2063  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!invariant, std::logic_error, std::endl
2064  << "Invariant failed!");
2065  }
2066  curOffset = endOffset + 1;
2067  }
2068  } else if (!graph.colMap_.is_null()) { // We have a column Map.
2069  const map_type& colMap = *(graph.colMap_);
2070  size_t curOffset = 0;
2071  while (curOffset < numInputEnt) {
2072  // Find a sequence of input indices that are in the column
2073  // Map on the calling process. Doing a sequence at a time,
2074  // instead of one at a time, amortizes some overhead.
2075  size_t endOffset = curOffset;
2076  for (; endOffset < numInputEnt &&
2077  colMap.getLocalElement(inputGblColInds[endOffset]) != OTLO::invalid();
2078  ++endOffset) {
2079  }
2080  // curOffset, endOffset: half-exclusive range of indices in
2081  // the column Map on the calling process. If endOffset ==
2082  // curOffset, the range is empty.
2083  const LO numIndInSeq = (endOffset - curOffset);
2084  if (numIndInSeq != 0) {
2085  rowInfo = graph.getRowInfo(lclRow); // KDD 5/19 Need fresh RowInfo in each loop iteration
2086  this->insertGlobalValuesImpl(graph, rowInfo,
2087  inputGblColInds + curOffset,
2088  inputVals + curOffset,
2089  numIndInSeq);
2090  }
2091  // Invariant before the increment line: Either endOffset ==
2092  // numInputEnt, or inputGblColInds[endOffset] is not in the
2093  // column Map on the calling process.
2094  if (debug) {
2095  const bool invariant = endOffset == numInputEnt ||
2096  colMap.getLocalElement(inputGblColInds[endOffset]) == OTLO::invalid();
2097  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!invariant, std::logic_error, std::endl
2098  << "Invariant failed!");
2099  }
2100  curOffset = endOffset + 1;
2101  }
2102  } else { // we don't have a column Map.
2103  this->insertGlobalValuesImpl(graph, rowInfo, inputGblColInds,
2104  inputVals, numInputEnt);
2105  }
2106  }
2107 }
2108 
2109 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2110 void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2111  insertGlobalValuesFilteredChecked(
2112  const GlobalOrdinal gblRow,
2113  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2114  const Teuchos::ArrayView<const Scalar>& values,
2115  const char* const prefix,
2116  const bool debug,
2117  const bool verbose) {
2119  using std::endl;
2120 
2121  try {
2122  insertGlobalValuesFiltered(gblRow, indices, values, debug);
2123  } catch (std::exception& e) {
2124  std::ostringstream os;
2125  if (verbose) {
2126  const size_t maxNumToPrint =
2128  os << *prefix << ": insertGlobalValuesFiltered threw an "
2129  "exception: "
2130  << e.what() << endl
2131  << "Global row index: " << gblRow << endl;
2132  verbosePrintArray(os, indices, "Global column indices",
2133  maxNumToPrint);
2134  os << endl;
2135  verbosePrintArray(os, values, "Values", maxNumToPrint);
2136  os << endl;
2137  } else {
2138  os << ": insertGlobalValuesFiltered threw an exception: "
2139  << e.what();
2140  }
2141  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, os.str());
2142  }
2143 }
2144 
2145 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2146 LocalOrdinal
2149  const crs_graph_type& graph,
2150  const RowInfo& rowInfo,
2151  const LocalOrdinal inds[],
2152  const impl_scalar_type newVals[],
2153  const LocalOrdinal numElts) {
2154  typedef LocalOrdinal LO;
2155  typedef GlobalOrdinal GO;
2156  const bool sorted = graph.isSorted();
2157 
2158  size_t hint = 0; // Guess for the current index k into rowVals
2159  LO numValid = 0; // number of valid local column indices
2160 
2161  if (graph.isLocallyIndexed()) {
2162  // Get a view of the column indices in the row. This amortizes
2163  // the cost of getting the view over all the entries of inds.
2164  auto colInds = graph.getLocalIndsViewHost(rowInfo);
2165 
2166  for (LO j = 0; j < numElts; ++j) {
2167  const LO lclColInd = inds[j];
2168  const size_t offset =
2169  KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2170  lclColInd, hint, sorted);
2171  if (offset != rowInfo.numEntries) {
2172  rowVals[offset] = newVals[j];
2173  hint = offset + 1;
2174  ++numValid;
2175  }
2176  }
2177  } else if (graph.isGloballyIndexed()) {
2178  if (graph.colMap_.is_null()) {
2179  return Teuchos::OrdinalTraits<LO>::invalid();
2180  }
2181  const map_type colMap = *(graph.colMap_);
2182 
2183  // Get a view of the column indices in the row. This amortizes
2184  // the cost of getting the view over all the entries of inds.
2185  auto colInds = graph.getGlobalIndsViewHost(rowInfo);
2186 
2187  for (LO j = 0; j < numElts; ++j) {
2188  const GO gblColInd = colMap.getGlobalElement(inds[j]);
2189  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid()) {
2190  const size_t offset =
2191  KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2192  gblColInd, hint, sorted);
2193  if (offset != rowInfo.numEntries) {
2194  rowVals[offset] = newVals[j];
2195  hint = offset + 1;
2196  ++numValid;
2197  }
2198  }
2199  }
2200  }
2201  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2202  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2203  // to be neither locally nor globally indexed on a process.
2204  // This means that the graph or matrix has no entries on that
2205  // process. Epetra also works like this. It's related to lazy
2206  // allocation (on first insertion, not at graph / matrix
2207  // construction). Lazy allocation will go away because it is
2208  // not thread scalable.
2209 
2210  return numValid;
2211 }
2212 
2213 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2214 LocalOrdinal
2216  replaceLocalValues(const LocalOrdinal localRow,
2217  const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
2218  const Teuchos::ArrayView<const Scalar>& vals) {
2219  typedef LocalOrdinal LO;
2220 
2221  const LO numInputEnt = static_cast<LO>(lclCols.size());
2222  if (static_cast<LO>(vals.size()) != numInputEnt) {
2223  return Teuchos::OrdinalTraits<LO>::invalid();
2224  }
2225  const LO* const inputInds = lclCols.getRawPtr();
2226  const Scalar* const inputVals = vals.getRawPtr();
2227  return this->replaceLocalValues(localRow, numInputEnt,
2228  inputVals, inputInds);
2229 }
2230 
2231 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2233  local_ordinal_type
2236  const local_ordinal_type localRow,
2237  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2238  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals) {
2239  using LO = local_ordinal_type;
2240  const LO numInputEnt = inputInds.extent(0);
2241  if (numInputEnt != static_cast<LO>(inputVals.extent(0))) {
2242  return Teuchos::OrdinalTraits<LO>::invalid();
2243  }
2244  const Scalar* const inVals =
2245  reinterpret_cast<const Scalar*>(inputVals.data());
2246  return this->replaceLocalValues(localRow, numInputEnt,
2247  inVals, inputInds.data());
2248 }
2249 
2250 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2251 LocalOrdinal
2253  replaceLocalValues(const LocalOrdinal localRow,
2254  const LocalOrdinal numEnt,
2255  const Scalar inputVals[],
2256  const LocalOrdinal inputCols[]) {
2257  typedef impl_scalar_type IST;
2258  typedef LocalOrdinal LO;
2259 
2260  if (!this->isFillActive() || this->staticGraph_.is_null()) {
2261  // Fill must be active and the "nonconst" graph must exist.
2262  return Teuchos::OrdinalTraits<LO>::invalid();
2263  }
2264  const crs_graph_type& graph = *(this->staticGraph_);
2265  const RowInfo rowInfo = graph.getRowInfo(localRow);
2266 
2267  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid()) {
2268  // The calling process does not own this row, so it is not
2269  // allowed to modify its values.
2270  return static_cast<LO>(0);
2271  }
2272  auto curRowVals = this->getValuesViewHostNonConst(rowInfo);
2273  const IST* const inVals = reinterpret_cast<const IST*>(inputVals);
2274  return this->replaceLocalValuesImpl(curRowVals.data(), graph, rowInfo,
2275  inputCols, inVals, numEnt);
2276 }
2277 
2278 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2279 LocalOrdinal
2282  const crs_graph_type& graph,
2283  const RowInfo& rowInfo,
2284  const GlobalOrdinal inds[],
2285  const impl_scalar_type newVals[],
2286  const LocalOrdinal numElts) {
2287  Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
2288  auto fun =
2289  [&](size_t const k, size_t const /*start*/, size_t const offset) {
2290  rowVals[offset] = newVals[k];
2291  };
2292  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2293  return graph.findGlobalIndices(rowInfo, indsT, cb);
2294 }
2295 
2296 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2297 LocalOrdinal
2299  replaceGlobalValues(const GlobalOrdinal globalRow,
2300  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2301  const Teuchos::ArrayView<const Scalar>& inputVals) {
2302  typedef LocalOrdinal LO;
2303 
2304  const LO numInputEnt = static_cast<LO>(inputGblColInds.size());
2305  if (static_cast<LO>(inputVals.size()) != numInputEnt) {
2306  return Teuchos::OrdinalTraits<LO>::invalid();
2307  }
2308  return this->replaceGlobalValues(globalRow, numInputEnt,
2309  inputVals.getRawPtr(),
2310  inputGblColInds.getRawPtr());
2311 }
2312 
2313 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2314 LocalOrdinal
2316  replaceGlobalValues(const GlobalOrdinal globalRow,
2317  const LocalOrdinal numEnt,
2318  const Scalar inputVals[],
2319  const GlobalOrdinal inputGblColInds[]) {
2320  typedef impl_scalar_type IST;
2321  typedef LocalOrdinal LO;
2322 
2323  if (!this->isFillActive() || this->staticGraph_.is_null()) {
2324  // Fill must be active and the "nonconst" graph must exist.
2325  return Teuchos::OrdinalTraits<LO>::invalid();
2326  }
2327  const crs_graph_type& graph = *(this->staticGraph_);
2328 
2329  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex(globalRow);
2330  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid()) {
2331  // The input local row is invalid on the calling process,
2332  // which means that the calling process summed 0 entries.
2333  return static_cast<LO>(0);
2334  }
2335 
2336  auto curRowVals = this->getValuesViewHostNonConst(rowInfo);
2337  const IST* const inVals = reinterpret_cast<const IST*>(inputVals);
2338  return this->replaceGlobalValuesImpl(curRowVals.data(), graph, rowInfo,
2339  inputGblColInds, inVals, numEnt);
2340 }
2341 
2342 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2344  local_ordinal_type
2347  const global_ordinal_type globalRow,
2348  const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2349  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals) {
2350  // We use static_assert here to check the template parameters,
2351  // rather than std::enable_if (e.g., on the return value, to
2352  // enable compilation only if the template parameters match the
2353  // desired attributes). This turns obscure link errors into
2354  // clear compilation errors. It also makes the return value a
2355  // lot easier to see.
2356  using LO = local_ordinal_type;
2357  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2358  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2359  return Teuchos::OrdinalTraits<LO>::invalid();
2360  }
2361  const Scalar* const inVals =
2362  reinterpret_cast<const Scalar*>(inputVals.data());
2363  return this->replaceGlobalValues(globalRow, numInputEnt, inVals,
2364  inputInds.data());
2365 }
2366 
// For each j in [0, numElts), looks up global column index inds[j] in the
// row described by rowInfo and, if found, adds newVals[j] to the stored
// value at that position (Kokkos::atomic_add when 'atomic' is true, plain
// += otherwise).  Indices that are not found are skipped silently.
// Returns the number of input entries that were actually summed in.
// NOTE(review): the lines carrying the function name and the first
// parameter are missing from this listing.  The body writes through
// 'rowVals', and sumIntoGlobalValues calls sumIntoGlobalValuesImpl with
// curRowVals.data() as first argument, so presumably this is
// sumIntoGlobalValuesImpl(impl_scalar_type rowVals[], ...) -- confirm.
2367 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2368 LocalOrdinal
2371  const crs_graph_type& graph,
2372  const RowInfo& rowInfo,
2373  const GlobalOrdinal inds[],
2374  const impl_scalar_type newVals[],
2375  const LocalOrdinal numElts,
2376  const bool atomic) {
2377  typedef LocalOrdinal LO;
2378  typedef GlobalOrdinal GO;
2379 
2380  const bool sorted = graph.isSorted();
2381 
2382  size_t hint = 0; // guess at the index's relative offset in the row
2383  LO numValid = 0; // number of valid input column indices
2384 
2385  if (graph.isLocallyIndexed()) {
2386  // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2387  // pointer does NOT change its reference count. Thus, this
2388  // code is still thread safe.
2389  if (graph.colMap_.is_null()) {
2390  // NO input column indices are valid in this case, since if
2391  // the column Map is null on the calling process, then the
2392  // calling process owns no graph entries.
2393  return numValid;
2394  }
2395  const map_type& colMap = *(graph.colMap_);
2396 
2397  // Get a view of the column indices in the row. This amortizes
2398  // the cost of getting the view over all the entries of inds.
2399  auto colInds = graph.getLocalIndsViewHost(rowInfo);
2400  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid();
2401 
2402  for (LO j = 0; j < numElts; ++j) {
  // The graph stores local indices, so translate the global input first.
2403  const LO lclColInd = colMap.getLocalElement(inds[j]);
2404  if (lclColInd != LINV) {
2405  const size_t offset =
2406  KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2407  lclColInd, hint, sorted);
  // An offset equal to rowInfo.numEntries is treated as "not found".
2408  if (offset != rowInfo.numEntries) {
2409  if (atomic) {
2410  Kokkos::atomic_add(&rowVals[offset], newVals[j]);
2411  } else {
2412  rowVals[offset] += newVals[j];
2413  }
  // Start the next lookup's guess just past this match.
2414  hint = offset + 1;
2415  numValid++;
2416  }
2417  }
2418  }
2419  } else if (graph.isGloballyIndexed()) {
2420  // Get a view of the column indices in the row. This amortizes
2421  // the cost of getting the view over all the entries of inds.
2422  auto colInds = graph.getGlobalIndsViewHost(rowInfo);
2423 
2424  for (LO j = 0; j < numElts; ++j) {
2425  const GO gblColInd = inds[j];
2426  const size_t offset =
2427  KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2428  gblColInd, hint, sorted);
2429  if (offset != rowInfo.numEntries) {
2430  if (atomic) {
2431  Kokkos::atomic_add(&rowVals[offset], newVals[j]);
2432  } else {
2433  rowVals[offset] += newVals[j];
2434  }
2435  hint = offset + 1;
2436  numValid++;
2437  }
2438  }
2439  }
2440  // If the graph is neither locally nor globally indexed on the
2441  // calling process, that means the calling process has no graph
2442  // entries. Thus, none of the input column indices are valid.
2443 
2444  return numValid;
2445 }
2446 
// Teuchos::ArrayView overload of sumIntoGlobalValues.  Returns
// Teuchos::OrdinalTraits<LO>::invalid() if the two arrays differ in size;
// otherwise forwards the raw pointers and the entry count to the
// pointer-based sumIntoGlobalValues overload and returns its result.
// NOTE(review): the class-qualifier line is missing from this listing.
2447 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2448 LocalOrdinal
2450  sumIntoGlobalValues(const GlobalOrdinal gblRow,
2451  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2452  const Teuchos::ArrayView<const Scalar>& inputVals,
2453  const bool atomic) {
2454  typedef LocalOrdinal LO;
2455 
2456  const LO numInputEnt = static_cast<LO>(inputGblColInds.size());
2457  if (static_cast<LO>(inputVals.size()) != numInputEnt) {
2458  return Teuchos::OrdinalTraits<LO>::invalid();
2459  }
2460  return this->sumIntoGlobalValues(gblRow, numInputEnt,
2461  inputVals.getRawPtr(),
2462  inputGblColInds.getRawPtr(),
2463  atomic);
2464 }
2465 
// Pointer-based sumIntoGlobalValues.
//  - Returns Teuchos::OrdinalTraits<LO>::invalid() if fill is not active or
//    staticGraph_ is null.
//  - If gblRow has no valid local row on this process, the entries are handed
//    to insertNonownedGlobalValues() and numInputEnt is returned.
//  - Otherwise the entries are summed into the locally stored row via
//    sumIntoGlobalValuesImpl(), whose return value is passed through.
// NOTE(review): the class-qualifier line is missing from this listing.
2466 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2467 LocalOrdinal
2469  sumIntoGlobalValues(const GlobalOrdinal gblRow,
2470  const LocalOrdinal numInputEnt,
2471  const Scalar inputVals[],
2472  const GlobalOrdinal inputGblColInds[],
2473  const bool atomic) {
2474  typedef impl_scalar_type IST;
2475  typedef LocalOrdinal LO;
2476  typedef GlobalOrdinal GO;
2477 
2478  if (!this->isFillActive() || this->staticGraph_.is_null()) {
2479  // Fill must be active and the "nonconst" graph must exist.
2480  return Teuchos::OrdinalTraits<LO>::invalid();
2481  }
2482  const crs_graph_type& graph = *(this->staticGraph_);
2483 
2484  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex(gblRow);
2485  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid()) {
2486  // mfh 23 Mar 2017, 26 Jul 2017: This branch may not be
2487  // thread safe in a debug build, in part because it uses
2488  // Teuchos::ArrayView, and in part because of the data structure
2489  // used to stash outgoing entries.
2490  using Teuchos::ArrayView;
  // Pass nullptr for an empty input so the ArrayView is built from a
  // (nullptr, 0) pair rather than a possibly dangling pointer.
2491  ArrayView<const GO> inputGblColInds_av(
2492  numInputEnt == 0 ? nullptr : inputGblColInds,
2493  numInputEnt);
2494  ArrayView<const Scalar> inputVals_av(
2495  numInputEnt == 0 ? nullptr : inputVals, numInputEnt);
2496  // gblRow is not in the row Map on the calling process, so stash
2497  // the given entries away in a separate data structure.
2498  // globalAssemble() (called during fillComplete()) will exchange
2499  // that data and sum it in using sumIntoGlobalValues().
2500  this->insertNonownedGlobalValues(gblRow, inputGblColInds_av,
2501  inputVals_av);
2502  // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
2503  // since we won't know whether the given indices were valid
2504  // until globalAssemble (called in fillComplete) is called.
2505  // That's why insertNonownedGlobalValues doesn't return
2506  // anything. Just for consistency, I'll return the number of
2507  // entries that the user gave us.
2508  return numInputEnt;
2509  } else { // input row is in the row Map on the calling process
2510  auto curRowVals = this->getValuesViewHostNonConst(rowInfo);
  // Stored values use impl_scalar_type; reinterpret the caller's Scalar array.
2511  const IST* const inVals = reinterpret_cast<const IST*>(inputVals);
2512  return this->sumIntoGlobalValuesImpl(curRowVals.data(), graph, rowInfo,
2513  inputGblColInds, inVals,
2514  numInputEnt, atomic);
2515  }
2516 }
2517 
// Public entry point: applies the binary function f to the stored values of
// local row lclRow at the local column indices inputCols[0..numInputEnt),
// combining each stored value with the matching inputVals entry.
//  - Returns Teuchos::OrdinalTraits<LO>::invalid() if fill is not active or
//    staticGraph_ is null.
//  - Returns 0 if the row info reports an invalid local row.
//  - Otherwise returns the result of the (rowVals, graph, rowInfo, ...)
//    overload of transformLocalValues, which does the per-entry work.
// f is taken by value, so it is moved (not copied) into the forwarded call;
// it is not used again afterwards.
// NOTE(review): the class-qualifier line is missing from this listing.
2518 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2519 LocalOrdinal
2521  transformLocalValues(const LocalOrdinal lclRow,
2522  const LocalOrdinal numInputEnt,
2523  const impl_scalar_type inputVals[],
2524  const LocalOrdinal inputCols[],
2525  std::function<impl_scalar_type(const impl_scalar_type&, const impl_scalar_type&)> f,
2526  const bool atomic) {
2527  using Tpetra::Details::OrdinalTraits;
2528  typedef LocalOrdinal LO;
2529 
2530  if (!this->isFillActive() || this->staticGraph_.is_null()) {
2531  // Fill must be active and the "nonconst" graph must exist.
2532  return Teuchos::OrdinalTraits<LO>::invalid();
2533  }
2534  const crs_graph_type& graph = *(this->staticGraph_);
2535  const RowInfo rowInfo = graph.getRowInfo(lclRow);
2536 
2537  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid()) {
2538  // The calling process does not own this row, so it is not
2539  // allowed to modify its values.
2540  return static_cast<LO>(0);
2541  }
2542  auto curRowVals = this->getValuesViewHostNonConst(rowInfo);
2543  return this->transformLocalValues(curRowVals.data(), graph,
2544  rowInfo, inputCols, inputVals,
2545  numInputEnt, std::move(f), atomic);
2546 }
2547 
// Public entry point: applies the binary function f to the stored values of
// global row gblRow at the global column indices inputCols[0..numInputEnt),
// combining each stored value with the matching inputVals entry.
//  - Returns OrdinalTraits<LO>::invalid() if fill is not active or
//    staticGraph_ is null.
//  - Returns 0 if the row info reports an invalid local row.
//  - Otherwise returns the result of the (rowVals, graph, rowInfo, ...)
//    overload of transformGlobalValues, which does the per-entry work.
// f is taken by value, so it is moved (not copied) into the forwarded call;
// it is not used again afterwards.
2548 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2549 LocalOrdinal
2550 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2551  transformGlobalValues(const GlobalOrdinal gblRow,
2552  const LocalOrdinal numInputEnt,
2553  const impl_scalar_type inputVals[],
2554  const GlobalOrdinal inputCols[],
2555  std::function<impl_scalar_type(const impl_scalar_type&, const impl_scalar_type&)> f,
2556  const bool atomic) {
2557  using Tpetra::Details::OrdinalTraits;
2558  typedef LocalOrdinal LO;
2559 
2560  if (!this->isFillActive() || this->staticGraph_.is_null()) {
2561  // Fill must be active and the "nonconst" graph must exist.
2562  return OrdinalTraits<LO>::invalid();
2563  }
2564  const crs_graph_type& graph = *(this->staticGraph_);
2565  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex(gblRow);
2566 
2567  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid()) {
2568  // The calling process does not own this row, so it is not
2569  // allowed to modify its values.
2570  return static_cast<LO>(0);
2571  }
2572  auto curRowVals = this->getValuesViewHostNonConst(rowInfo);
2573  return this->transformGlobalValues(curRowVals.data(), graph,
2574  rowInfo, inputCols, inputVals,
2575  numInputEnt, std::move(f), atomic);
2576 }
2577 
// Per-row worker taking LOCAL input column indices.  For each j in
// [0, numElts) it locates inds[j] in the row described by rowInfo and, if
// found, replaces the stored value with f(stored value, newVals[j]).
// When 'atomic' is true the update goes through
// atomic_binary_function_update(); otherwise it is a plain read-modify-write.
// If the graph is globally indexed, each local input index is first
// converted to a global index through the graph's column Map.
// Returns the number of input entries that were applied; returns 0 early if
// the graph is globally indexed but has no column Map.
2578 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2579 LocalOrdinal
2580 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2581  transformLocalValues(impl_scalar_type rowVals[],
2582  const crs_graph_type& graph,
2583  const RowInfo& rowInfo,
2584  const LocalOrdinal inds[],
2585  const impl_scalar_type newVals[],
2586  const LocalOrdinal numElts,
2587  std::function<impl_scalar_type(const impl_scalar_type&, const impl_scalar_type&)> f,
2588  const bool atomic) {
2589  typedef impl_scalar_type ST;
2590  typedef LocalOrdinal LO;
2591  typedef GlobalOrdinal GO;
2592 
  // Leftover from an earlier interface that took array views; kept disabled.
2593  // if (newVals.extent (0) != inds.extent (0)) {
2594  // The sizes of the input arrays must match.
2595  // return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2596  // }
2597  // const LO numElts = static_cast<LO> (inds.extent (0));
2598  const bool sorted = graph.isSorted();
2599 
2600  LO numValid = 0; // number of valid input column indices
2601  size_t hint = 0; // Guess for the current index k into rowVals
2602 
2603  if (graph.isLocallyIndexed()) {
2604  // Get a view of the column indices in the row. This amortizes
2605  // the cost of getting the view over all the entries of inds.
2606  auto colInds = graph.getLocalIndsViewHost(rowInfo);
2607 
2608  for (LO j = 0; j < numElts; ++j) {
2609  const LO lclColInd = inds[j];
2610  const size_t offset =
2611  KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2612  lclColInd, hint, sorted);
  // An offset equal to rowInfo.numEntries is treated as "not found".
2613  if (offset != rowInfo.numEntries) {
2614  if (atomic) {
2615  // NOTE (mfh 30 Nov 2015) The commented-out code is
2616  // wrong because another thread may have changed
2617  // rowVals[offset] between those two lines of code.
2618  //
2619  // const ST newVal = f (rowVals[offset], newVals[j]);
2620  // Kokkos::atomic_assign (&rowVals[offset], newVal);
2621 
2622  ST* const dest = &rowVals[offset];
2623  (void)atomic_binary_function_update(dest, newVals[j], f);
2624  } else {
2625  // use binary function f
2626  rowVals[offset] = f(rowVals[offset], newVals[j]);
2627  }
  // Start the next lookup's guess just past this match.
2628  hint = offset + 1;
2629  ++numValid;
2630  }
2631  }
2632  } else if (graph.isGloballyIndexed()) {
2633  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2634  // pointer does NOT change its reference count. Thus, this
2635  // code is still thread safe.
2636  if (graph.colMap_.is_null()) {
2637  // NO input column indices are valid in this case. Either
2638  // the column Map hasn't been set yet (so local indices
2639  // don't exist yet), or the calling process owns no graph
2640  // entries.
2641  return numValid;
2642  }
2643  const map_type& colMap = *(graph.colMap_);
2644  // Get a view of the column indices in the row. This amortizes
2645  // the cost of getting the view over all the entries of inds.
2646  auto colInds = graph.getGlobalIndsViewHost(rowInfo);
2647 
2648  const GO GINV = Teuchos::OrdinalTraits<GO>::invalid();
2649  for (LO j = 0; j < numElts; ++j) {
  // The graph stores global indices, so translate the local input first.
2650  const GO gblColInd = colMap.getGlobalElement(inds[j]);
2651  if (gblColInd != GINV) {
2652  const size_t offset =
2653  KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2654  gblColInd, hint, sorted);
2655  if (offset != rowInfo.numEntries) {
2656  if (atomic) {
2657  // NOTE (mfh 30 Nov 2015) The commented-out code is
2658  // wrong because another thread may have changed
2659  // rowVals[offset] between those two lines of code.
2660  //
2661  // const ST newVal = f (rowVals[offset], newVals[j]);
2662  // Kokkos::atomic_assign (&rowVals[offset], newVal);
2663 
2664  ST* const dest = &rowVals[offset];
2665  (void)atomic_binary_function_update(dest, newVals[j], f);
2666  } else {
2667  // use binary function f
2668  rowVals[offset] = f(rowVals[offset], newVals[j]);
2669  }
2670  hint = offset + 1;
2671  numValid++;
2672  }
2673  }
2674  }
2675  }
2676  // If the graph is neither locally nor globally indexed on the
2677  // calling process, that means the calling process has no graph
2678  // entries. Thus, none of the input column indices are valid.
2679 
2680  return numValid;
2681 }
2682 
// Per-row worker taking GLOBAL input column indices.  For each j in
// [0, numElts) it locates inds[j] in the row described by rowInfo and, if
// found, replaces the stored value with f(stored value, newVals[j]).
// When 'atomic' is true the update goes through
// atomic_binary_function_update(); otherwise it is a plain read-modify-write.
// If the graph is locally indexed, each global input index is first
// converted to a local index through the graph's column Map.
// Returns the number of input entries that were applied; returns 0 early if
// the graph is locally indexed but has no column Map.
2683 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2684 LocalOrdinal
2685 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2686  transformGlobalValues(impl_scalar_type rowVals[],
2687  const crs_graph_type& graph,
2688  const RowInfo& rowInfo,
2689  const GlobalOrdinal inds[],
2690  const impl_scalar_type newVals[],
2691  const LocalOrdinal numElts,
2692  std::function<impl_scalar_type(const impl_scalar_type&, const impl_scalar_type&)> f,
2693  const bool atomic) {
2694  typedef impl_scalar_type ST;
2695  typedef LocalOrdinal LO;
2696  typedef GlobalOrdinal GO;
2697 
  // Leftover from an earlier interface that took array views; kept disabled.
2698  // if (newVals.extent (0) != inds.extent (0)) {
2699  // The sizes of the input arrays must match.
2700  // return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2701  // }
2702  // const LO numElts = static_cast<LO> (inds.extent (0));
2703  const bool sorted = graph.isSorted();
2704 
2705  LO numValid = 0; // number of valid input column indices
2706  size_t hint = 0; // Guess for the current index k into rowVals
2707 
2708  if (graph.isGloballyIndexed()) {
2709  // Get a view of the column indices in the row. This amortizes
2710  // the cost of getting the view over all the entries of inds.
2711  auto colInds = graph.getGlobalIndsViewHost(rowInfo);
2712 
2713  for (LO j = 0; j < numElts; ++j) {
2714  const GO gblColInd = inds[j];
2715  const size_t offset =
2716  KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2717  gblColInd, hint, sorted);
  // An offset equal to rowInfo.numEntries is treated as "not found".
2718  if (offset != rowInfo.numEntries) {
2719  if (atomic) {
2720  // NOTE (mfh 30 Nov 2015) The commented-out code is
2721  // wrong because another thread may have changed
2722  // rowVals[offset] between those two lines of code.
2723  //
2724  // const ST newVal = f (rowVals[offset], newVals[j]);
2725  // Kokkos::atomic_assign (&rowVals[offset], newVal);
2726 
2727  ST* const dest = &rowVals[offset];
2728  (void)atomic_binary_function_update(dest, newVals[j], f);
2729  } else {
2730  // use binary function f
2731  rowVals[offset] = f(rowVals[offset], newVals[j]);
2732  }
  // Start the next lookup's guess just past this match.
2733  hint = offset + 1;
2734  ++numValid;
2735  }
2736  }
2737  } else if (graph.isLocallyIndexed()) {
2738  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2739  // pointer does NOT change its reference count. Thus, this
2740  // code is still thread safe.
2741  if (graph.colMap_.is_null()) {
2742  // NO input column indices are valid in this case. Either the
2743  // column Map hasn't been set yet (so local indices don't
2744  // exist yet), or the calling process owns no graph entries.
2745  return numValid;
2746  }
2747  const map_type& colMap = *(graph.colMap_);
2748  // Get a view of the column indices in the row. This amortizes
2749  // the cost of getting the view over all the entries of inds.
2750  auto colInds = graph.getLocalIndsViewHost(rowInfo);
2751 
2752  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid();
2753  for (LO j = 0; j < numElts; ++j) {
  // The graph stores local indices, so translate the global input first.
2754  const LO lclColInd = colMap.getLocalElement(inds[j]);
2755  if (lclColInd != LINV) {
2756  const size_t offset =
2757  KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2758  lclColInd, hint, sorted);
2759  if (offset != rowInfo.numEntries) {
2760  if (atomic) {
2761  // NOTE (mfh 30 Nov 2015) The commented-out code is
2762  // wrong because another thread may have changed
2763  // rowVals[offset] between those two lines of code.
2764  //
2765  // const ST newVal = f (rowVals[offset], newVals[j]);
2766  // Kokkos::atomic_assign (&rowVals[offset], newVal);
2767 
2768  ST* const dest = &rowVals[offset];
2769  (void)atomic_binary_function_update(dest, newVals[j], f);
2770  } else {
2771  // use binary function f
2772  rowVals[offset] = f(rowVals[offset], newVals[j]);
2773  }
2774  hint = offset + 1;
2775  numValid++;
2776  }
2777  }
2778  }
2779  }
2780  // If the graph is neither locally nor globally indexed on the
2781  // calling process, that means the calling process has no graph
2782  // entries. Thus, none of the input column indices are valid.
2783 
2784  return numValid;
2785 }
2786 
2787 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2788 LocalOrdinal
2791  const crs_graph_type& graph,
2792  const RowInfo& rowInfo,
2793  const LocalOrdinal inds[],
2794  const impl_scalar_type newVals[],
2795  const LocalOrdinal numElts,
2796  const bool atomic) {
2797  typedef LocalOrdinal LO;
2798  typedef GlobalOrdinal GO;
2799 
2800  const bool sorted = graph.isSorted();
2801 
2802  size_t hint = 0; // Guess for the current index k into rowVals
2803  LO numValid = 0; // number of valid local column indices
2804 
2805  if (graph.isLocallyIndexed()) {
2806  // Get a view of the column indices in the row. This amortizes
2807  // the cost of getting the view over all the entries of inds.
2808  auto colInds = graph.getLocalIndsViewHost(rowInfo);
2809 
2810  for (LO j = 0; j < numElts; ++j) {
2811  const LO lclColInd = inds[j];
2812  const size_t offset =
2813  KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2814  lclColInd, hint, sorted);
2815  if (offset != rowInfo.numEntries) {
2816  if (atomic) {
2817  Kokkos::atomic_add(&rowVals[offset], newVals[j]);
2818  } else {
2819  rowVals[offset] += newVals[j];
2820  }
2821  hint = offset + 1;
2822  ++numValid;
2823  }
2824  }
2825  } else if (graph.isGloballyIndexed()) {
2826  if (graph.colMap_.is_null()) {
2827  return Teuchos::OrdinalTraits<LO>::invalid();
2828  }
2829  const map_type colMap = *(graph.colMap_);
2830 
2831  // Get a view of the column indices in the row. This amortizes
2832  // the cost of getting the view over all the entries of inds.
2833  auto colInds = graph.getGlobalIndsViewHost(rowInfo);
2834 
2835  for (LO j = 0; j < numElts; ++j) {
2836  const GO gblColInd = colMap.getGlobalElement(inds[j]);
2837  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid()) {
2838  const size_t offset =
2839  KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2840  gblColInd, hint, sorted);
2841  if (offset != rowInfo.numEntries) {
2842  if (atomic) {
2843  Kokkos::atomic_add(&rowVals[offset], newVals[j]);
2844  } else {
2845  rowVals[offset] += newVals[j];
2846  }
2847  hint = offset + 1;
2848  ++numValid;
2849  }
2850  }
2851  }
2852  }
2853  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2854  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2855  // to be neither locally nor globally indexed on a process.
2856  // This means that the graph or matrix has no entries on that
2857  // process. Epetra also works like this. It's related to lazy
2858  // allocation (on first insertion, not at graph / matrix
2859  // construction). Lazy allocation will go away because it is
2860  // not thread scalable.
2861 
2862  return numValid;
2863 }
2864 
// Teuchos::ArrayView overload of sumIntoLocalValues.  Returns
// Teuchos::OrdinalTraits<LO>::invalid() if 'indices' and 'values' differ in
// size; otherwise forwards the raw pointers and the entry count to the
// pointer-based sumIntoLocalValues overload and returns its result.
// NOTE(review): the class-qualifier line is missing from this listing.
2865 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2866 LocalOrdinal
2868  sumIntoLocalValues(const LocalOrdinal localRow,
2869  const Teuchos::ArrayView<const LocalOrdinal>& indices,
2870  const Teuchos::ArrayView<const Scalar>& values,
2871  const bool atomic) {
2872  using LO = local_ordinal_type;
2873  const LO numInputEnt = static_cast<LO>(indices.size());
2874  if (static_cast<LO>(values.size()) != numInputEnt) {
2875  return Teuchos::OrdinalTraits<LO>::invalid();
2876  }
2877  const LO* const inputInds = indices.getRawPtr();
2878  const scalar_type* const inputVals = values.getRawPtr();
2879  return this->sumIntoLocalValues(localRow, numInputEnt,
2880  inputVals, inputInds, atomic);
2881 }
2882 
// Kokkos::View overload: checks that inputInds and inputVals have the same
// extent, then forwards raw pointers to the pointer-based
// sumIntoLocalValues(localRow, numInputEnt, vals, inds, atomic) overload.
// Returns Teuchos::OrdinalTraits<LO>::invalid() on a length mismatch;
// otherwise returns whatever the forwarded call returns.
// NOTE(review): the lines carrying the class qualifier and the function name
// are missing from this listing; presumably this is sumIntoLocalValues
// (it forwards to that name below) -- confirm against the real header.
2883 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2885  local_ordinal_type
2888  const local_ordinal_type localRow,
2889  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2890  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
2891  const bool atomic) {
2892  using LO = local_ordinal_type;
2893  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2894  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2895  return Teuchos::OrdinalTraits<LO>::invalid();
2896  }
  // The view stores impl_scalar_type; the pointer overload takes scalar_type.
2897  const scalar_type* inVals =
2898  reinterpret_cast<const scalar_type*>(inputVals.data());
2899  return this->sumIntoLocalValues(localRow, numInputEnt, inVals,
2900  inputInds.data(), atomic);
2901 }
2902 
// Pointer-based sumIntoLocalValues.
//  - Returns Teuchos::OrdinalTraits<LO>::invalid() if fill is not active or
//    staticGraph_ is null.
//  - Returns 0 if the row info for localRow reports an invalid local row.
//  - Otherwise sums the numEnt (cols[k], vals[k]) pairs into the stored row
//    via sumIntoLocalValuesImpl() and returns its result.
// NOTE(review): the class-qualifier line is missing from this listing.
2903 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2904 LocalOrdinal
2906  sumIntoLocalValues(const LocalOrdinal localRow,
2907  const LocalOrdinal numEnt,
2908  const Scalar vals[],
2909  const LocalOrdinal cols[],
2910  const bool atomic) {
2911  typedef impl_scalar_type IST;
2912  typedef LocalOrdinal LO;
2913 
2914  if (!this->isFillActive() || this->staticGraph_.is_null()) {
2915  // Fill must be active and the "nonconst" graph must exist.
2916  return Teuchos::OrdinalTraits<LO>::invalid();
2917  }
2918  const crs_graph_type& graph = *(this->staticGraph_);
2919  const RowInfo rowInfo = graph.getRowInfo(localRow);
2920 
2921  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid()) {
2922  // The calling process does not own this row, so it is not
2923  // allowed to modify its values.
2924  return static_cast<LO>(0);
2925  }
2926  auto curRowVals = this->getValuesViewHostNonConst(rowInfo);
  // Stored values use impl_scalar_type; reinterpret the caller's Scalar array.
2927  const IST* const inputVals = reinterpret_cast<const IST*>(vals);
2928  return this->sumIntoLocalValuesImpl(curRowVals.data(), graph, rowInfo,
2929  cols, inputVals, numEnt, atomic);
2930 }
2931 
// Returns a const host view of one row's value storage.  If the row has no
// allocation (rowinfo.allocSize == 0) or valuesUnpacked_wdv is empty, a
// default-constructed view is returned; otherwise the result is the host
// subview of valuesUnpacked_wdv requested with rowinfo.offset1D and
// rowinfo.allocSize, using read-only access.
// NOTE(review): parts of the return-type / class-qualifier lines are missing
// from this listing.
2932 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2934  values_dualv_type::t_host::const_type
2936  getValuesViewHost(const RowInfo& rowinfo) const {
2937  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
2938  return typename values_dualv_type::t_host::const_type();
2939  else
2940  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
2941  rowinfo.allocSize,
2942  Access::ReadOnly);
2943 }
2944 
// Read-write counterpart of getValuesViewHost: returns a default-constructed
// host view if the row has no allocation or valuesUnpacked_wdv is empty,
// otherwise the host subview of valuesUnpacked_wdv requested with
// rowinfo.offset1D and rowinfo.allocSize, using read-write access.
// NOTE(review): the lines carrying the function name and its parameter list
// are missing from this listing; presumably this is
// getValuesViewHostNonConst(const RowInfo& rowinfo), which other methods in
// this file call -- confirm against the real header.
2945 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2947  values_dualv_type::t_host
2950  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
2951  return typename values_dualv_type::t_host();
2952  else
2953  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
2954  rowinfo.allocSize,
2955  Access::ReadWrite);
2956 }
2957 
// Returns a const device view of one row's value storage.  If the row has no
// allocation (rowinfo.allocSize == 0) or valuesUnpacked_wdv is empty, a
// default-constructed view is returned; otherwise the result is the device
// subview of valuesUnpacked_wdv requested with rowinfo.offset1D and
// rowinfo.allocSize, using read-only access.
// NOTE(review): parts of the return-type / class-qualifier lines are missing
// from this listing.
2958 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2960  values_dualv_type::t_dev::const_type
2962  getValuesViewDevice(const RowInfo& rowinfo) const {
2963  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
2964  return typename values_dualv_type::t_dev::const_type();
2965  else
2966  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
2967  rowinfo.allocSize,
2968  Access::ReadOnly);
2969 }
2970 
// Read-write counterpart of getValuesViewDevice: returns a
// default-constructed device view if the row has no allocation or
// valuesUnpacked_wdv is empty, otherwise the device subview of
// valuesUnpacked_wdv requested with rowinfo.offset1D and rowinfo.allocSize,
// using read-write access.
// NOTE(review): the lines carrying the function name and its parameter list
// are missing from this listing; presumably this is
// getValuesViewDeviceNonConst(const RowInfo& rowinfo) -- confirm against the
// real header.
2971 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2973  values_dualv_type::t_dev
2976  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
2977  return typename values_dualv_type::t_dev();
2978  else
2979  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
2980  rowinfo.allocSize,
2981  Access::ReadWrite);
2982 }
2983 
// Copies one local row's entries into caller-provided storage, reporting
// column indices as LOCAL indices.
//  - indices / values: output views; each must hold at least as many entries
//    as the row has, otherwise std::runtime_error is raised through
//    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
//  - numEntries: set to the row's entry count before any copying happens.
// std::runtime_error is also raised if the matrix has no column Map.
// If the graph is globally indexed, each stored global index is converted
// with the column Map's getLocalElement().  Nothing is copied if the row
// info reports an invalid local row.
// NOTE(review): the lines carrying the return type, function name and the
// 'localRow' parameter are missing from this listing; the name is taken from
// tfecfFuncName below.
2984 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2987  nonconst_local_inds_host_view_type& indices,
2988  nonconst_values_host_view_type& values,
2989  size_t& numEntries) const {
  // NOTE(review): neither using-declaration is referenced in the visible body.
2990  using Teuchos::ArrayView;
2991  using Teuchos::av_reinterpret_cast;
2992  const char tfecfFuncName[] = "getLocalRowCopy: ";
2993 
2994  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!this->hasColMap(), std::runtime_error,
2995  "The matrix does not have a column Map yet. This means we don't have "
2996  "local indices for columns yet, so it doesn't make sense to call this "
2997  "method. If the matrix doesn't have a column Map yet, you should call "
2998  "fillComplete on it first.");
2999 
3000  const RowInfo rowinfo = staticGraph_->getRowInfo(localRow);
3001  const size_t theNumEntries = rowinfo.numEntries;
3002  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(indices.size()) < theNumEntries ||
3003  static_cast<size_t>(values.size()) < theNumEntries,
3004  std::runtime_error, "Row with local index " << localRow << " has " << theNumEntries << " entry/ies, but indices.size() = " << indices.size() << " and values.size() = " << values.size() << ".");
3005  numEntries = theNumEntries; // first side effect
3006 
3007  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid()) {
3008  if (staticGraph_->isLocallyIndexed()) {
3009  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3010  auto curVals = getValuesViewHost(rowinfo);
3011 
3012  for (size_t j = 0; j < theNumEntries; ++j) {
3013  values[j] = curVals[j];
3014  indices[j] = curLclInds(j);
3015  }
3016  } else if (staticGraph_->isGloballyIndexed()) {
3017  // Don't call getColMap(), because it touches RCP's reference count.
3018  const map_type& colMap = *(staticGraph_->colMap_);
3019  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3020  auto curVals = getValuesViewHost(rowinfo);
3021 
3022  for (size_t j = 0; j < theNumEntries; ++j) {
3023  values[j] = curVals[j];
3024  indices[j] = colMap.getLocalElement(curGblInds(j));
3025  }
3026  }
3027  }
3028 }
3029 
// Copies one row's entries, identified by global row index, into
// caller-provided storage, reporting column indices as GLOBAL indices.
//  - indices / values: output views; each must hold at least as many entries
//    as the row has, otherwise std::runtime_error is raised through
//    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
//  - numEntries: set to the row's entry count before any copying happens.
// If the graph is locally indexed, each stored local index is converted with
// the column Map's getGlobalElement().  Nothing is copied if the row info
// reports an invalid local row.
// NOTE(review): the lines carrying the return type, function name and the
// 'globalRow' parameter are missing from this listing; the name is taken
// from tfecfFuncName below.
3030 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3033  nonconst_global_inds_host_view_type& indices,
3034  nonconst_values_host_view_type& values,
3035  size_t& numEntries) const {
  // NOTE(review): neither using-declaration is referenced in the visible body.
3036  using Teuchos::ArrayView;
3037  using Teuchos::av_reinterpret_cast;
3038  const char tfecfFuncName[] = "getGlobalRowCopy: ";
3039 
3040  const RowInfo rowinfo =
3041  staticGraph_->getRowInfoFromGlobalRowIndex(globalRow);
3042  const size_t theNumEntries = rowinfo.numEntries;
3043  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3044  static_cast<size_t>(indices.size()) < theNumEntries ||
3045  static_cast<size_t>(values.size()) < theNumEntries,
3046  std::runtime_error, "Row with global index " << globalRow << " has " << theNumEntries << " entry/ies, but indices.size() = " << indices.size() << " and values.size() = " << values.size() << ".");
3047  numEntries = theNumEntries; // first side effect
3048 
3049  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid()) {
3050  if (staticGraph_->isLocallyIndexed()) {
  // Read colMap_ directly, as the other methods in this file do.
3051  const map_type& colMap = *(staticGraph_->colMap_);
3052  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3053  auto curVals = getValuesViewHost(rowinfo);
3054 
3055  for (size_t j = 0; j < theNumEntries; ++j) {
3056  values[j] = curVals[j];
3057  indices[j] = colMap.getGlobalElement(curLclInds(j));
3058  }
3059  } else if (staticGraph_->isGloballyIndexed()) {
3060  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3061  auto curVals = getValuesViewHost(rowinfo);
3062 
3063  for (size_t j = 0; j < theNumEntries; ++j) {
3064  values[j] = curVals[j];
3065  indices[j] = curGblInds(j);
3066  }
3067  }
3068  }
3069 }
3070 
3071 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3073  getLocalRowView(LocalOrdinal localRow,
3074  local_inds_host_view_type& indices,
3075  values_host_view_type& values) const {
3076  const char tfecfFuncName[] = "getLocalRowView: ";
3077 
3078  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3079  isGloballyIndexed(), std::runtime_error,
3080  "The matrix currently stores "
3081  "its indices as global indices, so you cannot get a view with local "
3082  "column indices. If the matrix has a column Map, you may call "
3083  "getLocalRowCopy() to get local column indices; otherwise, you may get "
3084  "a view with global column indices by calling getGlobalRowCopy().");
3085 
3086  const RowInfo rowInfo = staticGraph_->getRowInfo(localRow);
3087  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid() &&
3088  rowInfo.numEntries > 0) {
3089  indices = staticGraph_->lclIndsUnpacked_wdv.getHostSubview(
3090  rowInfo.offset1D,
3091  rowInfo.numEntries,
3092  Access::ReadOnly);
3093  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3094  rowInfo.numEntries,
3095  Access::ReadOnly);
3096  } else {
3097  // This does the right thing (reports an empty row) if the input
3098  // row is invalid.
3099  indices = local_inds_host_view_type();
3100  values = values_host_view_type();
3101  }
3102 
3103 #ifdef HAVE_TPETRA_DEBUG
3104  const char suffix[] =
3105  ". This should never happen. Please report this "
3106  "bug to the Tpetra developers.";
3107  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(indices.size()) !=
3108  static_cast<size_t>(values.size()),
3109  std::logic_error,
3110  "At the end of this method, for local row " << localRow << ", "
3111  "indices.size() = "
3112  << indices.size() << " != values.size () = "
3113  << values.size() << suffix);
3114  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(indices.size()) !=
3115  static_cast<size_t>(rowInfo.numEntries),
3116  std::logic_error,
3117  "At the end of this method, for local row " << localRow << ", "
3118  "indices.size() = "
3119  << indices.size() << " != rowInfo.numEntries = "
3120  << rowInfo.numEntries << suffix);
3121  const size_t expectedNumEntries = getNumEntriesInLocalRow(localRow);
3122  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowInfo.numEntries != expectedNumEntries, std::logic_error,
3123  "At the end "
3124  "of this method, for local row "
3125  << localRow << ", rowInfo.numEntries = "
3126  << rowInfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " << expectedNumEntries << suffix);
3127 #endif // HAVE_TPETRA_DEBUG
3128 }
3129 
// Sets 'indices' and 'values' to read-only host views of one row's column
// indices (global) and values, with the row identified by global index.
//  - Raises std::runtime_error (through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC)
//    if the matrix is locally indexed.
//  - If the row info reports an invalid local row, or the row has no
//    entries, both outputs are set to default-constructed (empty) views.
// In HAVE_TPETRA_DEBUG builds the resulting sizes are cross-checked against
// rowInfo.numEntries and getNumEntriesInGlobalRow(globalRow).
// NOTE(review): the return-type / class-qualifier line is missing from this
// listing.
3130 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3132  getGlobalRowView(GlobalOrdinal globalRow,
3133  global_inds_host_view_type& indices,
3134  values_host_view_type& values) const {
3135  const char tfecfFuncName[] = "getGlobalRowView: ";
3136 
3137  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3138  isLocallyIndexed(), std::runtime_error,
3139  "The matrix is locally indexed, so we cannot return a view of the row "
3140  "with global column indices. Use getGlobalRowCopy() instead.");
3141 
3142  // This does the right thing (reports an empty row) if the input
3143  // row is invalid.
3144  const RowInfo rowInfo =
3145  staticGraph_->getRowInfoFromGlobalRowIndex(globalRow);
3146  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid() &&
3147  rowInfo.numEntries > 0) {
  // Indices and values are requested with the same offset and length.
3148  indices = staticGraph_->gblInds_wdv.getHostSubview(rowInfo.offset1D,
3149  rowInfo.numEntries,
3150  Access::ReadOnly);
3151  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3152  rowInfo.numEntries,
3153  Access::ReadOnly);
3154  } else {
3155  indices = global_inds_host_view_type();
3156  values = values_host_view_type();
3157  }
3158 
3159 #ifdef HAVE_TPETRA_DEBUG
3160  const char suffix[] =
3161  ". This should never happen. Please report this "
3162  "bug to the Tpetra developers.";
3163  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(indices.size()) !=
3164  static_cast<size_t>(values.size()),
3165  std::logic_error,
3166  "At the end of this method, for global row " << globalRow << ", "
3167  "indices.size() = "
3168  << indices.size() << " != values.size () = "
3169  << values.size() << suffix);
3170  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(indices.size()) !=
3171  static_cast<size_t>(rowInfo.numEntries),
3172  std::logic_error,
3173  "At the end of this method, for global row " << globalRow << ", "
3174  "indices.size() = "
3175  << indices.size() << " != rowInfo.numEntries = "
3176  << rowInfo.numEntries << suffix);
3177  const size_t expectedNumEntries = getNumEntriesInGlobalRow(globalRow);
3178  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowInfo.numEntries != expectedNumEntries, std::logic_error,
3179  "At the end "
3180  "of this method, for global row "
3181  << globalRow << ", rowInfo.numEntries "
3182  "= "
3183  << rowInfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
3184  " "
3185  << expectedNumEntries << suffix);
3186 #endif // HAVE_TPETRA_DEBUG
3187 }
3188 
// Scales the matrix values by alpha: obtains a read-write device view of
// valuesPacked_wdv and calls KokkosBlas::scal(vals, theAlpha, vals), i.e.
// the same view is passed as both output and input.
// Does nothing if the graph's indices are not allocated, or if the graph
// reports zero local rows or zero local entries.
// NOTE(review): this method uses valuesPacked_wdv whereas setAllToScalar
// writes valuesUnpacked_wdv; whether the two refer to the same storage here
// cannot be determined from this listing -- confirm.
// NOTE(review): the return-type / class-qualifier line is missing from this
// listing.
3189 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3191  scale(const Scalar& alpha) {
3192  const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
3193 
3194  const size_t nlrs = staticGraph_->getLocalNumRows();
3195  const size_t numEntries = staticGraph_->getLocalNumEntries();
3196  if (!staticGraph_->indicesAreAllocated() ||
3197  nlrs == 0 || numEntries == 0) {
3198  // do nothing
3199  } else {
3200  auto vals = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
3201  KokkosBlas::scal(vals, theAlpha, vals);
3202  }
3203 }
3204 
3205 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3207  setAllToScalar(const Scalar& alpha) {
3208  const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
3209 
3210  // replace all values in the matrix
3211  // it is easiest to replace all allocated values, instead of replacing only the ones with valid entries
3212  // however, if there are no valid entries, we can short-circuit
3213  // furthermore, if the values aren't allocated, we can short-circuit (no entry have been inserted so far)
3214  const size_t numEntries = staticGraph_->getLocalNumEntries();
3215  if (!staticGraph_->indicesAreAllocated() || numEntries == 0) {
3216  // do nothing
3217  } else {
3218  // DEEP_COPY REVIEW - VALUE-TO-DEVICE
3219  Kokkos::deep_copy(execution_space(), valuesUnpacked_wdv.getDeviceView(Access::OverwriteAll),
3220  theAlpha);
3221  // CAG: This fence was found to be required on Cuda with UVM=on.
3222  Kokkos::fence("CrsMatrix::setAllToScalar");
3223  }
3224 }
3225 
3226 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3228  setAllValues(const typename local_graph_device_type::row_map_type& rowPointers,
3229  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
3230  const typename local_matrix_device_type::values_type& values) {
3231  using ProfilingRegion = Details::ProfilingRegion;
3232  ProfilingRegion region("Tpetra::CrsMatrix::setAllValues");
3233  const char tfecfFuncName[] = "setAllValues: ";
3234  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(columnIndices.size() != values.size(), std::invalid_argument,
3235  "columnIndices.size() = " << columnIndices.size() << " != values.size()"
3236  " = "
3237  << values.size() << ".");
3238  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(myGraph_.is_null(), std::runtime_error, "myGraph_ must not be null.");
3239 
3240  try {
3241  myGraph_->setAllIndices(rowPointers, columnIndices);
3242  } catch (std::exception& e) {
3243  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
3244  "myGraph_->setAllIndices() threw an "
3245  "exception: "
3246  << e.what());
3247  }
3248 
3249  // Make sure that myGraph_ now has a local graph. It may not be
3250  // fillComplete yet, so it's important to check. We don't care
3251  // whether setAllIndices() did a shallow copy or a deep copy, so a
3252  // good way to check is to compare dimensions.
3253  auto lclGraph = myGraph_->getLocalGraphDevice();
3254  const size_t numEnt = lclGraph.entries.extent(0);
3255  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclGraph.row_map.extent(0) != rowPointers.extent(0) ||
3256  numEnt != static_cast<size_t>(columnIndices.extent(0)),
3257  std::logic_error,
3258  "myGraph_->setAllIndices() did not correctly create "
3259  "local graph. Please report this bug to the Tpetra developers.");
3260 
3261  valuesPacked_wdv = values_wdv_type(values);
3262  valuesUnpacked_wdv = valuesPacked_wdv;
3263 
3264  // Storage MUST be packed, since the interface doesn't give any
3265  // way to indicate any extra space at the end of each row.
3266  this->storageStatus_ = Details::STORAGE_1D_PACKED;
3267 
3268  checkInternalState();
3269 }
3270 
3271 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3273  setAllValues(const local_matrix_device_type& localDeviceMatrix) {
3274  using ProfilingRegion = Details::ProfilingRegion;
3275  ProfilingRegion region("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix");
3276 
3277  auto graph = localDeviceMatrix.graph;
3278  // FIXME how to check whether graph is allocated
3279 
3280  auto rows = graph.row_map;
3281  auto columns = graph.entries;
3282  auto values = localDeviceMatrix.values;
3283 
3284  setAllValues(rows, columns, values);
3285 }
3286 
3287 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3289  setAllValues(const Teuchos::ArrayRCP<size_t>& ptr,
3290  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
3291  const Teuchos::ArrayRCP<Scalar>& val) {
3292  using Kokkos::Compat::getKokkosViewDeepCopy;
3293  using Teuchos::ArrayRCP;
3294  using Teuchos::av_reinterpret_cast;
3295  typedef device_type DT;
3296  typedef impl_scalar_type IST;
3297  typedef typename local_graph_device_type::row_map_type row_map_type;
3298  // typedef typename row_map_type::non_const_value_type row_offset_type;
3299  const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
3300 
3301  // The row offset type may depend on the execution space. It may
3302  // not necessarily be size_t. If it's not, we need to make a deep
3303  // copy. We need to make a deep copy anyway so that Kokkos can
3304  // own the memory. Regardless, ptrIn gets the copy.
3305  typename row_map_type::non_const_type ptrNative("ptr", ptr.size());
3306  Kokkos::View<const size_t*,
3307  typename row_map_type::array_layout,
3308  Kokkos::HostSpace,
3309  Kokkos::MemoryUnmanaged>
3310  ptrSizeT(ptr.getRawPtr(), ptr.size());
3311  ::Tpetra::Details::copyOffsets(ptrNative, ptrSizeT);
3312 
3313  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(ptrNative.extent(0) != ptrSizeT.extent(0),
3314  std::logic_error, "ptrNative.extent(0) = " << ptrNative.extent(0) << " != ptrSizeT.extent(0) = " << ptrSizeT.extent(0) << ". Please report this bug to the "
3315  "Tpetra developers.");
3316 
3317  auto indIn = getKokkosViewDeepCopy<DT>(ind());
3318  auto valIn = getKokkosViewDeepCopy<DT>(av_reinterpret_cast<IST>(val()));
3319  this->setAllValues(ptrNative, indIn, valIn);
3320 }
3321 
3322 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3324  getLocalDiagOffsets(Teuchos::ArrayRCP<size_t>& offsets) const {
3325  const char tfecfFuncName[] = "getLocalDiagOffsets: ";
3326  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(staticGraph_.is_null(), std::runtime_error, "The matrix has no graph.");
3327 
3328  // mfh 11 May 2016: We plan to deprecate the ArrayRCP version of
3329  // this method in CrsGraph too, so don't call it (otherwise build
3330  // warnings will show up and annoy users). Instead, copy results
3331  // in and out, if the memory space requires it.
3332 
3333  const size_t lclNumRows = staticGraph_->getLocalNumRows();
3334  if (static_cast<size_t>(offsets.size()) < lclNumRows) {
3335  offsets.resize(lclNumRows);
3336  }
3337 
3338  // The input ArrayRCP must always be a host pointer. Thus, if
3339  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
3340  // to write to that allocation directly as a Kokkos::View.
3341  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
3342  // It is always syntactically correct to assign a raw host
3343  // pointer to a device View, so this code will compile correctly
3344  // even if this branch never runs.
3345  typedef Kokkos::View<size_t*, device_type,
3346  Kokkos::MemoryUnmanaged>
3347  output_type;
3348  output_type offsetsOut(offsets.getRawPtr(), lclNumRows);
3349  staticGraph_->getLocalDiagOffsets(offsetsOut);
3350  } else {
3351  Kokkos::View<size_t*, device_type> offsetsTmp("diagOffsets", lclNumRows);
3352  staticGraph_->getLocalDiagOffsets(offsetsTmp);
3353  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
3354  Kokkos::MemoryUnmanaged>
3355  output_type;
3356  output_type offsetsOut(offsets.getRawPtr(), lclNumRows);
3357  // DEEP_COPY REVIEW - DEVICE-TO-HOST
3358  Kokkos::deep_copy(execution_space(), offsetsOut, offsetsTmp);
3359  }
3360 }
3361 
// Copy the matrix diagonal into `diag` without precomputed offsets
// (error prefix: "getLocalDiagCopy (1-arg): ").
//
// Requires a graph and a column Map (std::runtime_error otherwise).
// Returns without doing anything on processes whose row Map or its
// communicator is null. In HAVE_TPETRA_DEBUG builds, diag's Map must be
// compatible with the row Map.
//
// NOTE(review): the signature lines and at least one declaration line
// (apparently the one bringing getDiagCopyWithoutOffsets into scope) are
// absent from this listing.
3362 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3365  using Teuchos::ArrayRCP;
3366  using Teuchos::ArrayView;
3367  using Teuchos::av_reinterpret_cast;
3368  const char tfecfFuncName[] = "getLocalDiagCopy (1-arg): ";
3369  typedef local_ordinal_type LO;
3370 
3371  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3372  staticGraph_.is_null(), std::runtime_error,
3373  "This method requires that the matrix have a graph.");
3374  auto rowMapPtr = this->getRowMap();
3375  if (rowMapPtr.is_null() || rowMapPtr->getComm().is_null()) {
3376  // Processes on which the row Map or its communicator is null
3377  // don't participate. Users shouldn't even call this method on
3378  // those processes.
3379  return;
3380  }
3381  auto colMapPtr = this->getColMap();
3382  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!this->hasColMap() || colMapPtr.is_null(), std::runtime_error,
3383  "This method requires that the matrix have a column Map.");
3384  const map_type& rowMap = *rowMapPtr;
3385  const map_type& colMap = *colMapPtr;
3386  const LO myNumRows = static_cast<LO>(this->getLocalNumRows());
3387 
3388 #ifdef HAVE_TPETRA_DEBUG
3389  // isCompatible() requires an all-reduce, and thus this check
3390  // should only be done in debug mode.
3391  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3392  !diag.getMap()->isCompatible(rowMap), std::runtime_error,
3393  "The input Vector's Map must be compatible with the CrsMatrix's row "
3394  "Map. You may check this by using Map's isCompatible method: "
3395  "diag.getMap ()->isCompatible (A.getRowMap ());");
3396 #endif // HAVE_TPETRA_DEBUG
3397 
      // The output is written through diag's device view, requested with
      // OverwriteAll access.
3398  const auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
3399  // 1-D subview of the first (and only) column of D_lcl.
3400  const auto D_lcl_1d =
3401  Kokkos::subview(D_lcl, Kokkos::make_pair(LO(0), myNumRows), 0);
3402 
      // The helper receives the local row and column Maps together with
      // the local device matrix; its return value is deliberately
      // discarded.
3403  const auto lclRowMap = rowMap.getLocalMap();
3404  const auto lclColMap = colMap.getLocalMap();
3406  (void)getDiagCopyWithoutOffsets(D_lcl_1d, lclRowMap,
3407  lclColMap,
3408  getLocalMatrixDevice());
3409 }
3410 
// Copy the matrix diagonal into `diag` using precomputed per-row
// offsets supplied as an unmanaged device View.
//
// In HAVE_TPETRA_DEBUG builds, throws std::runtime_error if diag's Map is
// not compatible with the row Map.
//
// NOTE(review): the leading signature lines (return type, class
// qualifier, method name and the `diag` parameter) are absent from this
// listing.
3411 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3414  const Kokkos::View<const size_t*, device_type,
3415  Kokkos::MemoryUnmanaged>& offsets) const {
3416  typedef LocalOrdinal LO;
3417 
3418 #ifdef HAVE_TPETRA_DEBUG
3419  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3420  const map_type& rowMap = *(this->getRowMap());
3421  // isCompatible() requires an all-reduce, and thus this check
3422  // should only be done in debug mode.
3423  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3424  !diag.getMap()->isCompatible(rowMap), std::runtime_error,
3425  "The input Vector's Map must be compatible with (in the sense of Map::"
3426  "isCompatible) the CrsMatrix's row Map.");
3427 #endif // HAVE_TPETRA_DEBUG
3428 
3429  // The Vector is filled through its device view: a 1-D subview of
3430  // that view is handed to KokkosSparse::getDiagCopy together with
3431  // the precomputed offsets and the local device matrix.
3432  //
3433  // NOTE(review): an earlier comment here described a host-side fill
3434  // that assumed UVM; the code below does not match that description.
3435 
3436  auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
3437  const LO myNumRows = static_cast<LO>(this->getLocalNumRows());
3438  // Get 1-D subview of the first (and only) column of D_lcl.
3439  auto D_lcl_1d =
3440  Kokkos::subview(D_lcl, Kokkos::make_pair(LO(0), myNumRows), 0);
3441 
3442  KokkosSparse::getDiagCopy(D_lcl_1d, offsets,
3443  getLocalMatrixDevice());
3444 }
3445 
// Copy the matrix diagonal into `diag` using precomputed per-row
// offsets supplied as a host Teuchos::ArrayView.
//
// The fill runs on the default host execution space. For each local
// row, the output is zero when the row's offset is the invalid marker,
// and otherwise the packed value at (row start + offset).
//
// NOTE(review): the leading signature lines (return type, class
// qualifier, method name and the `diag` parameter) are absent from this
// listing.
3446 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3449  const Teuchos::ArrayView<const size_t>& offsets) const {
3450  using LO = LocalOrdinal;
3451  using host_execution_space = Kokkos::DefaultHostExecutionSpace;
3452  using IST = impl_scalar_type;
3453 
3454 #ifdef HAVE_TPETRA_DEBUG
3455  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3456  const map_type& rowMap = *(this->getRowMap());
3457  // isCompatible() requires an all-reduce, and thus this check
3458  // should only be done in debug mode.
3459  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3460  !diag.getMap()->isCompatible(rowMap), std::runtime_error,
3461  "The input Vector's Map must be compatible with (in the sense of Map::"
3462  "isCompatible) the CrsMatrix's row Map.");
3463 #endif // HAVE_TPETRA_DEBUG
3464 
3465  // See #1510. In case diag has already been marked modified on
3466  // device, we need to clear that flag, since the code below works
3467  // on host.
3468  // diag.clear_sync_state ();
3469 
3470  // For now, we fill the Vector on the host. (The explicit sync to
3471  // device at the end of this method is commented out.) Later, we
3472  // may write a parallel kernel that works entirely on device.
3473  auto lclVecHost = diag.getLocalViewHost(Access::OverwriteAll);
3474  // 1-D subview of the first (and only) column of lclVecHost.
3475  auto lclVecHost1d = Kokkos::subview(lclVecHost, Kokkos::ALL(), 0);
3476 
      // Wrap the caller's offsets in an unmanaged host View so the lambda
      // below can capture it by value.
3477  using host_offsets_view_type =
3478  Kokkos::View<const size_t*, Kokkos::HostSpace,
3479  Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
3480  host_offsets_view_type h_offsets(offsets.getRawPtr(), offsets.size());
3481  // Find the diagonal entries and put them in lclVecHost1d.
3482  using range_type = Kokkos::RangePolicy<host_execution_space, LO>;
3483  const LO myNumRows = static_cast<LO>(this->getLocalNumRows());
      // INV is the marker compared against each offset; rows whose offset
      // equals it keep the default value of zero.
3484  const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid();
3485 
3486  auto rowPtrsPackedHost = staticGraph_->getRowPtrsPackedHost();
3487  auto valuesPackedHost = valuesPacked_wdv.getHostView(Access::ReadOnly);
3488  Kokkos::parallel_for("Tpetra::CrsMatrix::getLocalDiagCopy",
3489  range_type(0, myNumRows),
3490  [&, INV, h_offsets](const LO lclRow) { // Value capture is a workaround for cuda + gcc-7.2 compiler bug w/c++14
3491  lclVecHost1d(lclRow) = STS::zero(); // default value if no diag entry
3492  if (h_offsets[lclRow] != INV) {
3493  auto curRowOffset = rowPtrsPackedHost(lclRow);
3494  lclVecHost1d(lclRow) =
3495  static_cast<IST>(valuesPackedHost(curRowOffset + h_offsets[lclRow]));
3496  }
3497  });
3498  // diag.sync_device ();
3499 }
3500 
// Scale the matrix on the left by the entries of the vector x
// (error prefix: "leftScale: ").
//
// x's Map must be the same as the range Map or the row Map; otherwise
// std::invalid_argument is thrown. When x lives on the range Map and the
// graph has an exporter, x is first redistributed to the row Map. The
// matrix must be fillComplete, otherwise std::runtime_error is thrown.
//
// NOTE(review): the signature lines, the declaration of vec_type and the
// line bringing leftScaleLocalCrsMatrix into scope are absent from this
// listing.
3501 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3504  using Teuchos::ArrayRCP;
3505  using Teuchos::ArrayView;
3506  using Teuchos::null;
3507  using Teuchos::RCP;
3508  using Teuchos::rcp;
3509  using Teuchos::rcpFromRef;
3510  using ::Tpetra::Details::ProfilingRegion;
3512  const char tfecfFuncName[] = "leftScale: ";
3513 
3514  ProfilingRegion region("Tpetra::CrsMatrix::leftScale");
3515 
      // xp ends up referring to a vector distributed by the row Map:
      // either x itself (non-owning) or a redistributed temporary.
3516  RCP<const vec_type> xp;
3517  if (this->getRangeMap()->isSameAs(*(x.getMap()))) {
3518  // Take from Epetra: If we have a non-trivial exporter, we must
3519  // import elements that are permuted or are on other processors.
3520  auto exporter = this->getCrsGraphRef().getExporter();
3521  if (exporter.get() != nullptr) {
3522  RCP<vec_type> tempVec(new vec_type(this->getRowMap()));
3523  tempVec->doImport(x, *exporter, REPLACE); // reverse mode
3524  xp = tempVec;
3525  } else {
3526  xp = rcpFromRef(x);
3527  }
3528  } else if (this->getRowMap()->isSameAs(*(x.getMap()))) {
3529  xp = rcpFromRef(x);
3530  } else {
3531  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::invalid_argument,
3532  "x's Map must be the same as "
3533  "either the row Map or the range Map of the CrsMatrix.");
3534  }
3535 
      // Note that the fillComplete requirement is only checked here,
      // after any redistribution of x above.
3536  if (this->isFillComplete()) {
3537  auto x_lcl = xp->getLocalViewDevice(Access::ReadOnly);
3538  auto x_lcl_1d = Kokkos::subview(x_lcl, Kokkos::ALL(), 0);
3540  leftScaleLocalCrsMatrix(getLocalMatrixDevice(),
3541  x_lcl_1d, false, false);
3542  } else {
3543  // 6/2020 Disallow leftScale of non-fillComplete matrices #7446
3544  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
3545  "CrsMatrix::leftScale requires matrix to be"
3546  " fillComplete");
3547  }
3548 }
3549 
// Scale the matrix on the right by the entries of the vector x
// (error prefix: "rightScale: ").
//
// x's Map must be the same as the domain Map or the column Map;
// otherwise std::runtime_error is thrown. When x lives on the domain Map
// and the graph has an importer, x is first imported to the column Map.
// The matrix must be fillComplete, otherwise std::runtime_error is
// thrown.
//
// NOTE(review): the signature lines, the declaration of vec_type and the
// line bringing rightScaleLocalCrsMatrix into scope are absent from this
// listing.
// NOTE(review): the Map-mismatch error here is std::runtime_error, while
// the corresponding check in leftScale uses std::invalid_argument;
// confirm whether the difference is intended.
3550 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3553  using Teuchos::ArrayRCP;
3554  using Teuchos::ArrayView;
3555  using Teuchos::null;
3556  using Teuchos::RCP;
3557  using Teuchos::rcp;
3558  using Teuchos::rcpFromRef;
3559  using ::Tpetra::Details::ProfilingRegion;
3561  const char tfecfFuncName[] = "rightScale: ";
3562 
3563  ProfilingRegion region("Tpetra::CrsMatrix::rightScale");
3564 
      // xp ends up referring to a vector distributed by the column Map:
      // either x itself (non-owning) or an imported temporary.
3565  RCP<const vec_type> xp;
3566  if (this->getDomainMap()->isSameAs(*(x.getMap()))) {
3567  // Take from Epetra: If we have a non-trivial importer, we must
3568  // import elements that are permuted or are on other processors.
3569  auto importer = this->getCrsGraphRef().getImporter();
3570  if (importer.get() != nullptr) {
3571  RCP<vec_type> tempVec(new vec_type(this->getColMap()));
3572  tempVec->doImport(x, *importer, REPLACE);
3573  xp = tempVec;
3574  } else {
3575  xp = rcpFromRef(x);
3576  }
3577  } else if (this->getColMap()->isSameAs(*(x.getMap()))) {
3578  xp = rcpFromRef(x);
3579  } else {
3580  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
3581  "x's Map must be the same as "
3582  "either the domain Map or the column Map of the CrsMatrix.");
3583  }
3584 
      // Note that the fillComplete requirement is only checked here,
      // after any import of x above.
3585  if (this->isFillComplete()) {
3586  auto x_lcl = xp->getLocalViewDevice(Access::ReadOnly);
3587  auto x_lcl_1d = Kokkos::subview(x_lcl, Kokkos::ALL(), 0);
3589  rightScaleLocalCrsMatrix(getLocalMatrixDevice(),
3590  x_lcl_1d, false, false);
3591  } else {
3592  // 6/2020 Disallow rightScale of non-fillComplete matrices #7446
3593  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
3594  "CrsMatrix::rightScale requires matrix to be"
3595  " fillComplete");
3596  }
3597 }
3598 
3599 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3602  auto equilInfo = computeRowOneNorms(*this);
3603  mag_type myMax;
3604  using range_type = Kokkos::RangePolicy<execution_space, local_ordinal_type>;
3605  Kokkos::parallel_reduce(
3606  "getNormInf", range_type(0, equilInfo.rowNorms.extent(0)),
3607  KOKKOS_LAMBDA(local_ordinal_type i, mag_type & max) {
3608  max = equilInfo.rowNorms(i);
3609  },
3610  Kokkos::Max<mag_type>(myMax));
3611  mag_type totalMax = STM::zero();
3612  Teuchos::reduceAll<int, mag_type>(*(getComm()), Teuchos::REDUCE_MAX, myMax,
3613  Teuchos::outArg(totalMax));
3614  return totalMax;
3615 }
3616 
3617 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3620  getNorm1(const bool assumeSymmetric) const {
3621  if (assumeSymmetric)
3622  return getNormInf();
3623  auto equilInfo = computeRowAndColumnOneNorms(*this, false);
3624  mag_type myMax;
3625  using range_type = Kokkos::RangePolicy<execution_space, local_ordinal_type>;
3626  Kokkos::parallel_reduce(
3627  "getNorm1", range_type(0, equilInfo.colNorms.extent(0)),
3628  KOKKOS_LAMBDA(local_ordinal_type i, mag_type & max) {
3629  max = equilInfo.colNorms(i);
3630  },
3631  Kokkos::Max<mag_type>(myMax));
3632  mag_type totalMax = STM::zero();
3633  Teuchos::reduceAll<int, mag_type>(*(getComm()), Teuchos::REDUCE_MAX, myMax,
3634  Teuchos::outArg(totalMax));
3635  return totalMax;
3636 }
3637 
// Return the square root of the global sum, over all processes, of the
// squared magnitudes of the locally stored values.
//
// The local sum is accumulated in a sequential host loop and combined
// across the communicator with a REDUCE_SUM all-reduce, so every process
// must call this method.
//
// NOTE(review): the signature lines (return type, class qualifier and
// method name) are absent from this listing.
3638 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3642  using Teuchos::ArrayView;
3643  using Teuchos::outArg;
3644  using Teuchos::REDUCE_SUM;
3645  using Teuchos::reduceAll;
3646 
3647  // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
3648  // local part of this computation. It could make sense to put
3649  // this operation in the Kokkos::CrsMatrix.
3650 
3651  // Accumulate this process's sum of squared magnitudes.
3652  mag_type mySum = STM::zero();
3653  if (getLocalNumEntries() > 0) {
3654  if (isStorageOptimized()) {
3655  // "Optimized" storage is packed storage. That means we can
3656  // iterate in one pass through the 1-D values array.
3657  const size_t numEntries = getLocalNumEntries();
3658  auto values = valuesPacked_wdv.getHostView(Access::ReadOnly);
3659  for (size_t k = 0; k < numEntries; ++k) {
3660  auto val = values[k];
3661  // Note (etp 06 Jan 2015) We need abs() here for composite types
3662  // (in general, if mag_type is on the left-hand-side, we need
3663  // abs() on the right-hand-side)
3664  const mag_type val_abs = STS::abs(val);
3665  mySum += val_abs * val_abs;
3666  }
3667  } else {
      // Storage is not packed: walk each row's valid entries separately.
      // NOTE(review): this branch dereferences myGraph_, which other
      // methods in this file treat as possibly null (matrix with a const
      // graph); confirm it cannot be null here, or whether staticGraph_
      // should be used instead.
3668  const LocalOrdinal numRows =
3669  static_cast<LocalOrdinal>(this->getLocalNumRows());
3670  for (LocalOrdinal r = 0; r < numRows; ++r) {
3671  const RowInfo rowInfo = myGraph_->getRowInfo(r);
3672  const size_t numEntries = rowInfo.numEntries;
3673  auto A_r = this->getValuesViewHost(rowInfo);
3674  for (size_t k = 0; k < numEntries; ++k) {
3675  const impl_scalar_type val = A_r[k];
3676  const mag_type val_abs = STS::abs(val);
3677  mySum += val_abs * val_abs;
3678  }
3679  }
3680  }
3681  }
3682  mag_type totalSum = STM::zero();
3683  reduceAll<int, mag_type>(*(getComm()), REDUCE_SUM,
3684  mySum, outArg(totalSum));
3685  return STM::sqrt(totalSum);
3686 }
3687 
// Replace the column Map by forwarding newColMap to the owned graph.
//
// Throws std::runtime_error if the matrix does not own its graph
// (myGraph_ is null), since the graph is the object that must be
// modified.
//
// NOTE(review): the return type / class qualifier line is absent from
// this listing.
3688 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3690  replaceColMap(const Teuchos::RCP<const map_type>& newColMap) {
3691  const char tfecfFuncName[] = "replaceColMap: ";
3692  // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
3693  // Then replacing the column Map might mean that we need to
3694  // reindex the column indices.
3695  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3696  myGraph_.is_null(), std::runtime_error,
3697  "This method does not work if the matrix has a const graph. The whole "
3698  "idea of a const graph is that you are not allowed to change it, but "
3699  "this method necessarily must modify the graph, since the graph owns "
3700  "the matrix's column Map.");
3701  myGraph_->replaceColMap(newColMap);
3702 }
3703 
// Reindex the columns of a graph to newColMap / newImport and, if
// requested, sort each row's column indices together with this matrix's
// values.
//
// If `graph` is null the matrix's own graph is used; if the matrix does
// not own a graph either, std::invalid_argument is thrown. The graph is
// always asked not to sort on its own, so that indices and values can be
// sorted jointly here.
//
// NOTE(review): the leading signature lines (return type, class
// qualifier, method name and the `graph` parameter) are absent from this
// listing.
3704 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3707  const Teuchos::RCP<const map_type>& newColMap,
3708  const Teuchos::RCP<const import_type>& newImport,
3709  const bool sortEachRow) {
3710  const char tfecfFuncName[] = "reindexColumns: ";
3711  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3712  graph == nullptr && myGraph_.is_null(), std::invalid_argument,
3713  "The input graph is null, but the matrix does not own its graph.");
3714 
3715  crs_graph_type& theGraph = (graph == nullptr) ? *myGraph_ : *graph;
3716  const bool sortGraph = false; // we'll sort graph & matrix together below
3717 
3718  theGraph.reindexColumns(newColMap, newImport, sortGraph);
3719 
      // Sorting only happens for a locally indexed graph that is not
      // already sorted.
3720  if (sortEachRow && theGraph.isLocallyIndexed() && !theGraph.isSorted()) {
3721  const LocalOrdinal lclNumRows =
3722  static_cast<LocalOrdinal>(theGraph.getLocalNumRows());
3723 
3724  for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
3725  const RowInfo rowInfo = theGraph.getRowInfo(row);
3726  auto lclColInds = theGraph.getLocalIndsViewHostNonConst(rowInfo);
3727  auto vals = this->getValuesViewHostNonConst(rowInfo);
3728 
      // Sort the row's first numEntries column indices, permuting the
      // values alongside them.
3729  sort2(lclColInds.data(),
3730  lclColInds.data() + rowInfo.numEntries,
3731  vals.data());
3732  }
      // Record on the graph that its indices are now sorted.
3733  theGraph.indicesAreSorted_ = true;
3734  }
3735 }
3736 
// Replace the domain Map by forwarding newDomainMap to the owned graph.
//
// Throws std::runtime_error if the matrix does not own its graph
// (myGraph_ is null), since the graph is the object that must be
// modified.
//
// NOTE(review): the return type / class qualifier line is absent from
// this listing.
3737 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3739  replaceDomainMap(const Teuchos::RCP<const map_type>& newDomainMap) {
3740  const char tfecfFuncName[] = "replaceDomainMap: ";
3741  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3742  myGraph_.is_null(), std::runtime_error,
3743  "This method does not work if the matrix has a const graph. The whole "
3744  "idea of a const graph is that you are not allowed to change it, but this"
3745  " method necessarily must modify the graph, since the graph owns the "
3746  "matrix's domain Map and Import objects.");
3747  myGraph_->replaceDomainMap(newDomainMap);
3748 }
3749 
// Replace the domain Map and the importer together by forwarding both
// arguments to the owned graph.
//
// Throws std::runtime_error if the matrix does not own its graph
// (myGraph_ is null), since the graph is the object that must be
// modified.
//
// NOTE(review): the return type / class qualifier line is absent from
// this listing.
3750 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3752  replaceDomainMapAndImporter(const Teuchos::RCP<const map_type>& newDomainMap,
3753  Teuchos::RCP<const import_type>& newImporter) {
3754  const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
3755  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3756  myGraph_.is_null(), std::runtime_error,
3757  "This method does not work if the matrix has a const graph. The whole "
3758  "idea of a const graph is that you are not allowed to change it, but this"
3759  " method necessarily must modify the graph, since the graph owns the "
3760  "matrix's domain Map and Import objects.");
3761  myGraph_->replaceDomainMapAndImporter(newDomainMap, newImporter);
3762 }
3763 
3764 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3766  replaceRangeMap(const Teuchos::RCP<const map_type>& newRangeMap) {
3767  const char tfecfFuncName[] = "replaceRangeMap: ";
3768  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3769  myGraph_.is_null(), std::runtime_error,
3770  "This method does not work if the matrix has a const graph. The whole "
3771  "idea of a const graph is that you are not allowed to change it, but this"
3772  " method necessarily must modify the graph, since the graph owns the "
3773  "matrix's domain Map and Import objects.");
3774  myGraph_->replaceRangeMap(newRangeMap);
3775 }
3776 
3777 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3779  replaceRangeMapAndExporter(const Teuchos::RCP<const map_type>& newRangeMap,
3780  Teuchos::RCP<const export_type>& newExporter) {
3781  const char tfecfFuncName[] = "replaceRangeMapAndExporter: ";
3782  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3783  myGraph_.is_null(), std::runtime_error,
3784  "This method does not work if the matrix has a const graph. The whole "
3785  "idea of a const graph is that you are not allowed to change it, but this"
3786  " method necessarily must modify the graph, since the graph owns the "
3787  "matrix's domain Map and Import objects.");
3788  myGraph_->replaceRangeMapAndExporter(newRangeMap, newExporter);
3789 }
3790 
3791 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3793  insertNonownedGlobalValues(const GlobalOrdinal globalRow,
3794  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
3795  const Teuchos::ArrayView<const Scalar>& values) {
3796  using Teuchos::Array;
3797  typedef GlobalOrdinal GO;
3798  typedef typename Array<GO>::size_type size_type;
3799 
3800  const size_type numToInsert = indices.size();
3801  // Add the new data to the list of nonlocals.
3802  // This creates the arrays if they don't exist yet.
3803  std::pair<Array<GO>, Array<Scalar>>& curRow = nonlocals_[globalRow];
3804  Array<GO>& curRowInds = curRow.first;
3805  Array<Scalar>& curRowVals = curRow.second;
3806  const size_type newCapacity = curRowInds.size() + numToInsert;
3807  curRowInds.reserve(newCapacity);
3808  curRowVals.reserve(newCapacity);
3809  for (size_type k = 0; k < numToInsert; ++k) {
3810  curRowInds.push_back(indices[k]);
3811  curRowVals.push_back(values[k]);
3812  }
3813 }
3814 
3815 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3818  using Details::Behavior;
3820  using std::endl;
3821  using Teuchos::Comm;
3822  using Teuchos::outArg;
3823  using Teuchos::RCP;
3824  using Teuchos::rcp;
3825  using Teuchos::REDUCE_MAX;
3826  using Teuchos::REDUCE_MIN;
3827  using Teuchos::reduceAll;
3829  // typedef LocalOrdinal LO;
3830  typedef GlobalOrdinal GO;
3831  typedef typename Teuchos::Array<GO>::size_type size_type;
3832  const char tfecfFuncName[] = "globalAssemble: "; // for exception macro
3833  ProfilingRegion regionGlobalAssemble("Tpetra::CrsMatrix::globalAssemble");
3834 
3835  const bool verbose = Behavior::verbose("CrsMatrix");
3836  std::unique_ptr<std::string> prefix;
3837  if (verbose) {
3838  prefix = this->createPrefix("CrsMatrix", "globalAssemble");
3839  std::ostringstream os;
3840  os << *prefix << "nonlocals_.size()=" << nonlocals_.size()
3841  << endl;
3842  std::cerr << os.str();
3843  }
3844  RCP<const Comm<int>> comm = getComm();
3845 
3846  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!isFillActive(), std::runtime_error,
3847  "Fill must be active before "
3848  "you may call this method.");
3849 
3850  const size_t myNumNonlocalRows = nonlocals_.size();
3851 
3852  // If no processes have nonlocal rows, then we don't have to do
3853  // anything. Checking this is probably cheaper than constructing
3854  // the Map of nonlocal rows (see below) and noticing that it has
3855  // zero global entries.
3856  {
3857  const int iHaveNonlocalRows = (myNumNonlocalRows == 0) ? 0 : 1;
3858  int someoneHasNonlocalRows = 0;
3859  reduceAll<int, int>(*comm, REDUCE_MAX, iHaveNonlocalRows,
3860  outArg(someoneHasNonlocalRows));
3861  if (someoneHasNonlocalRows == 0) {
3862  return; // no process has nonlocal rows, so nothing to do
3863  }
3864  }
3865 
3866  // 1. Create a list of the "nonlocal" rows on each process. this
3867  // requires iterating over nonlocals_, so while we do this,
3868  // deduplicate the entries and get a count for each nonlocal
3869  // row on this process.
3870  // 2. Construct a new row Map corresponding to those rows. This
3871  // Map is likely overlapping. We know that the Map is not
3872  // empty on all processes, because the above all-reduce and
3873  // return exclude that case.
3874 
3875  RCP<const map_type> nonlocalRowMap;
3876  Teuchos::Array<size_t> numEntPerNonlocalRow(myNumNonlocalRows);
3877  {
3878  Teuchos::Array<GO> myNonlocalGblRows(myNumNonlocalRows);
3879  size_type curPos = 0;
3880  for (auto mapIter = nonlocals_.begin(); mapIter != nonlocals_.end();
3881  ++mapIter, ++curPos) {
3882  myNonlocalGblRows[curPos] = mapIter->first;
3883  // Get the values and column indices by reference, since we
3884  // intend to change them in place (that's what "erase" does).
3885  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
3886  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
3887 
3888  // Sort both arrays jointly, using the column indices as keys,
3889  // then merge them jointly. "Merge" here adds values
3890  // corresponding to the same column indices. The first 2 args
3891  // of merge2 are output arguments that work just like the
3892  // return value of std::unique.
3893  sort2(gblCols.begin(), gblCols.end(), vals.begin());
3894  typename Teuchos::Array<GO>::iterator gblCols_newEnd;
3895  typename Teuchos::Array<Scalar>::iterator vals_newEnd;
3896  merge2(gblCols_newEnd, vals_newEnd,
3897  gblCols.begin(), gblCols.end(),
3898  vals.begin(), vals.end());
3899  gblCols.erase(gblCols_newEnd, gblCols.end());
3900  vals.erase(vals_newEnd, vals.end());
3901  numEntPerNonlocalRow[curPos] = gblCols.size();
3902  }
3903 
3904  // Currently, Map requires that its indexBase be the global min
3905  // of all its global indices. Map won't compute this for us, so
3906  // we must do it. If our process has no nonlocal rows, set the
3907  // "min" to the max possible GO value. This ensures that if
3908  // some process has at least one nonlocal row, then it will pick
3909  // that up as the min. We know that at least one process has a
3910  // nonlocal row, since the all-reduce and return at the top of
3911  // this method excluded that case.
3912  GO myMinNonlocalGblRow = std::numeric_limits<GO>::max();
3913  {
3914  auto iter = std::min_element(myNonlocalGblRows.begin(),
3915  myNonlocalGblRows.end());
3916  if (iter != myNonlocalGblRows.end()) {
3917  myMinNonlocalGblRow = *iter;
3918  }
3919  }
3920  GO gblMinNonlocalGblRow = 0;
3921  reduceAll<int, GO>(*comm, REDUCE_MIN, myMinNonlocalGblRow,
3922  outArg(gblMinNonlocalGblRow));
3923  const GO indexBase = gblMinNonlocalGblRow;
3924  const global_size_t INV = Teuchos::OrdinalTraits<global_size_t>::invalid();
3925  nonlocalRowMap = rcp(new map_type(INV, myNonlocalGblRows(), indexBase, comm));
3926  }
3927 
3928  // 3. Use the values and column indices for each nonlocal row, as
3929  // stored in nonlocals_, to construct a CrsMatrix corresponding
3930  // to nonlocal rows. We have
3931  // exact counts of the number of entries in each nonlocal row.
3932 
3933  if (verbose) {
3934  std::ostringstream os;
3935  os << *prefix << "Create nonlocal matrix" << endl;
3936  std::cerr << os.str();
3937  }
3938  RCP<crs_matrix_type> nonlocalMatrix =
3939  rcp(new crs_matrix_type(nonlocalRowMap, numEntPerNonlocalRow()));
3940  {
3941  size_type curPos = 0;
3942  for (auto mapIter = nonlocals_.begin(); mapIter != nonlocals_.end();
3943  ++mapIter, ++curPos) {
3944  const GO gblRow = mapIter->first;
3945  // Get values & column indices by ref, just to avoid copy.
3946  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
3947  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
3948  // const LO numEnt = static_cast<LO> (numEntPerNonlocalRow[curPos]);
3949  nonlocalMatrix->insertGlobalValues(gblRow, gblCols(), vals());
3950  }
3951  }
3952  // There's no need to fill-complete the nonlocals matrix.
3953  // We just use it as a temporary container for the Export.
3954 
3955  // 4. If the original row Map is one to one, then we can Export
3956  // directly from nonlocalMatrix into this. Otherwise, we have
3957  // to create a temporary matrix with a one-to-one row Map,
3958  // Export into that, then Import from the temporary matrix into
3959  // *this.
3960 
3961  auto origRowMap = this->getRowMap();
3962  const bool origRowMapIsOneToOne = origRowMap->isOneToOne();
3963 
3964  int isLocallyComplete = 1; // true by default
3965 
3966  if (origRowMapIsOneToOne) {
3967  if (verbose) {
3968  std::ostringstream os;
3969  os << *prefix << "Original row Map is 1-to-1" << endl;
3970  std::cerr << os.str();
3971  }
3972  export_type exportToOrig(nonlocalRowMap, origRowMap);
3973  if (!exportToOrig.isLocallyComplete()) {
3974  isLocallyComplete = 0;
3975  }
3976  if (verbose) {
3977  std::ostringstream os;
3978  os << *prefix << "doExport from nonlocalMatrix" << endl;
3979  std::cerr << os.str();
3980  }
3981  this->doExport(*nonlocalMatrix, exportToOrig, Tpetra::ADD);
3982  // We're done at this point!
3983  } else {
3984  if (verbose) {
3985  std::ostringstream os;
3986  os << *prefix << "Original row Map is NOT 1-to-1" << endl;
3987  std::cerr << os.str();
3988  }
3989  // If you ask a Map whether it is one to one, it does some
3990  // communication and stashes intermediate results for later use
3991  // by createOneToOne. Thus, calling createOneToOne doesn't cost
3992  // much more than the original cost of calling isOneToOne.
3993  auto oneToOneRowMap = Tpetra::createOneToOne(origRowMap);
3994  export_type exportToOneToOne(nonlocalRowMap, oneToOneRowMap);
3995  if (!exportToOneToOne.isLocallyComplete()) {
3996  isLocallyComplete = 0;
3997  }
3998 
3999  // Create a temporary matrix with the one-to-one row Map.
4000  //
4001  // TODO (mfh 09 Sep 2016, 12 Sep 2016) Estimate # entries in
4002  // each row, to avoid reallocation during the Export operation.
4003  if (verbose) {
4004  std::ostringstream os;
4005  os << *prefix << "Create & doExport into 1-to-1 matrix"
4006  << endl;
4007  std::cerr << os.str();
4008  }
4009  crs_matrix_type oneToOneMatrix(oneToOneRowMap, 0);
4010  // Export from matrix of nonlocals into the temp one-to-one matrix.
4011  oneToOneMatrix.doExport(*nonlocalMatrix, exportToOneToOne,
4012  Tpetra::ADD);
4013 
4014  // We don't need the matrix of nonlocals anymore, so get rid of
4015  // it, to keep the memory high-water mark down.
4016  if (verbose) {
4017  std::ostringstream os;
4018  os << *prefix << "Free nonlocalMatrix" << endl;
4019  std::cerr << os.str();
4020  }
4021  nonlocalMatrix = Teuchos::null;
4022 
4023  // Import from the one-to-one matrix to the original matrix.
4024  if (verbose) {
4025  std::ostringstream os;
4026  os << *prefix << "doImport from 1-to-1 matrix" << endl;
4027  std::cerr << os.str();
4028  }
4029  import_type importToOrig(oneToOneRowMap, origRowMap);
4030  this->doImport(oneToOneMatrix, importToOrig, Tpetra::ADD);
4031  }
4032 
4033  // It's safe now to clear out nonlocals_, since we've already
4034  // committed side effects to *this. The standard idiom for
4035  // clearing a Container like std::map, is to swap it with an empty
4036  // Container and let the swapped Container fall out of scope.
4037  if (verbose) {
4038  std::ostringstream os;
4039  os << *prefix << "Free nonlocals_ (std::map)" << endl;
4040  std::cerr << os.str();
4041  }
4042  decltype(nonlocals_) newNonlocals;
4043  std::swap(nonlocals_, newNonlocals);
4044 
4045  // FIXME (mfh 12 Sep 2016) I don't like this all-reduce, and I
4046  // don't like throwing an exception here. A local return value
4047  // would likely be more useful to users. However, if users find
4048  // themselves exercising nonlocal inserts often, then they are
4049  // probably novice users who need the help. See GitHub Issues
4050  // #603 and #601 (esp. the latter) for discussion.
4051 
4052  int isGloballyComplete = 0; // output argument of reduceAll
4053  reduceAll<int, int>(*comm, REDUCE_MIN, isLocallyComplete,
4054  outArg(isGloballyComplete));
4055  TEUCHOS_TEST_FOR_EXCEPTION(isGloballyComplete != 1, std::runtime_error,
4056  "On at least one process, "
4057  "you called insertGlobalValues with a global row index which is not in "
4058  "the matrix's row Map on any process in its communicator.");
4059 }
4060 
4061 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4063  resumeFill(const Teuchos::RCP<Teuchos::ParameterList>& params) {
4064  if (!isStaticGraph()) { // Don't resume fill of a nonowned graph.
4065  myGraph_->resumeFill(params);
4066  }
4067  // Delete the apply helper (if it exists)
4068  applyHelper.reset();
4069  fillComplete_ = false;
4070 }
4071 
// Forwards to the graph: returns whatever
// getCrsGraphRef().haveGlobalConstants() returns.
//
// NOTE(review): the declarator (listing lines 4073-4074) is absent from
// this extract; presumably this is CrsMatrix::haveGlobalConstants() --
// confirm the exact signature against the class declaration.
4072 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4075  return getCrsGraphRef().haveGlobalConstants();
4076 }
4077 
4078 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4080  fillComplete(const Teuchos::RCP<Teuchos::ParameterList>& params) {
4081  const char tfecfFuncName[] = "fillComplete(params): ";
4082 
4083  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->getCrsGraph().is_null(), std::logic_error,
4084  "getCrsGraph() returns null. This should not happen at this point. "
4085  "Please report this bug to the Tpetra developers.");
4086 
4087  const crs_graph_type& graph = this->getCrsGraphRef();
4088  if (this->isStaticGraph() && graph.isFillComplete()) {
4089  // If this matrix's graph is fill complete and the user did not
4090  // supply a domain or range Map, use the graph's domain and
4091  // range Maps.
4092  this->fillComplete(graph.getDomainMap(), graph.getRangeMap(), params);
4093  } else { // assume that user's row Map is the domain and range Map
4094  Teuchos::RCP<const map_type> rangeMap = graph.getRowMap();
4095  Teuchos::RCP<const map_type> domainMap = rangeMap;
4096  this->fillComplete(domainMap, rangeMap, params);
4097  }
4098 }
4099 
// Fill-complete this matrix using the caller-supplied domain and range Maps.
//
// Throws std::runtime_error unless isFillActive() is true and
// isFillComplete() is false on entry.  The steps, in order, are:
//   1. Read "No Nonlocal Changes" (default false) and the ghost-GID sort
//      flag (default true; two spellings of the key are accepted) from
//      params, if params is non-null.
//   2. Allocate values if the graph's indices are not yet allocated.
//   3. Call globalAssemble() when there is more than one process and the
//      caller did not assert "No Nonlocal Changes".
//   4. Static graph: (HAVE_TPETRA_DEBUG only) check that the Maps match the
//      graph's Maps, then fillLocalMatrix().  Owned graph: set the graph's
//      domain/range Maps, build the column Map if there is none, make
//      indices local, sort and merge, build Import/Export, call
//      fillLocalGraphAndMatrix(), compute global or local constants
//      depending on "compute global constants" (default true), and mark
//      the graph fill complete.
//   5. Mark the matrix fill complete and run checkInternalState().
//
// NOTE(review): the unqualified ProfilingRegion used below needs a
// using-declaration; listing line 4106 is absent from this extract and
// presumably supplies it -- confirm.
4100 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4102  fillComplete(const Teuchos::RCP<const map_type>& domainMap,
4103  const Teuchos::RCP<const map_type>& rangeMap,
4104  const Teuchos::RCP<Teuchos::ParameterList>& params) {
4105  using Details::Behavior;
4107  using std::endl;
4108  using Teuchos::ArrayRCP;
4109  using Teuchos::RCP;
4110  using Teuchos::rcp;
4111  const char tfecfFuncName[] = "fillComplete: ";
4112  ProfilingRegion regionFillComplete("Tpetra::CrsMatrix::fillComplete");
4113  const bool verbose = Behavior::verbose("CrsMatrix");
      // prefix stays null unless verbose output is enabled.
4114  std::unique_ptr<std::string> prefix;
4115  if (verbose) {
4116  prefix = this->createPrefix("CrsMatrix", "fillComplete(dom,ran,p)");
4117  std::ostringstream os;
4118  os << *prefix << endl;
4119  std::cerr << os.str();
4120  }
      // NOTE(review): "fillCompete" here and in the nested regions below looks
      // like a misspelling of "fillComplete".  These labels are runtime
      // strings, so they are left untouched; confirm before renaming.  This
      // region also overlaps regionFillComplete declared above.
4121  Details::ProfilingRegion region(
4122  "Tpetra::CrsMatrix::fillCompete",
4123  "fillCompete");
4124 
4125  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!this->isFillActive() || this->isFillComplete(), std::runtime_error,
4126  "Matrix fill state must be active (isFillActive() "
4127  "must be true) before you may call fillComplete().");
4128  const int numProcs = this->getComm()->getSize();
4129 
4130  //
4131  // Read parameters from the input ParameterList.
4132  //
4133  {
4134  Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");
4135 
4136  // If true, the caller promises that no process did nonlocal
4137  // changes since the last call to fillComplete.
4138  bool assertNoNonlocalInserts = false;
4139  // If true, makeColMap sorts remote GIDs (within each remote
4140  // process' group).
4141  bool sortGhosts = true;
4142 
4143  if (!params.is_null()) {
4144  assertNoNonlocalInserts = params->get("No Nonlocal Changes",
4145  assertNoNonlocalInserts);
      // Two spellings of the same option are accepted; the all-lowercase
      // one is checked first.
4146  if (params->isParameter("sort column map ghost gids")) {
4147  sortGhosts = params->get("sort column map ghost gids", sortGhosts);
4148  } else if (params->isParameter("Sort column Map ghost GIDs")) {
4149  sortGhosts = params->get("Sort column Map ghost GIDs", sortGhosts);
4150  }
4151  }
4152  // We also don't need to do global assembly if there is only one
4153  // process in the communicator.
4154  const bool needGlobalAssemble = !assertNoNonlocalInserts && numProcs > 1;
4155  // This parameter only matters if this matrix owns its graph.
4156  if (!this->myGraph_.is_null()) {
4157  this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
4158  }
4159 
4160  if (!this->getCrsGraphRef().indicesAreAllocated()) {
4161  if (this->hasColMap()) { // use local indices
4162  allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
4163  } else { // no column Map, so use global indices
4164  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
4165  }
4166  }
4167  // Global assemble, if we need to. This call only costs a single
4168  // all-reduce if we didn't need global assembly after all.
4169  if (needGlobalAssemble) {
4170  this->globalAssemble();
4171  } else {
      // Reached either because the caller asserted "No Nonlocal Changes"
      // or because there is only one process.  Only the one-process case
      // is validated here.
4172  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numProcs == 1 && nonlocals_.size() > 0,
4173  std::runtime_error,
4174  "Cannot have nonlocal entries on a serial run. "
4175  "An invalid entry (i.e., with row index not in the row Map) must have "
4176  "been submitted to the CrsMatrix.");
4177  }
4178  }
4179  if (this->isStaticGraph()) {
4180  Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
4181  // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
4182  // checks below only in debug mode. It would be nicer to do a
4183  // local check, then propagate the error state in a deferred
4184  // way, whenever communication happens. That would reduce the
4185  // cost of checking, to the point where it may make sense to
4186  // enable it even in release mode.
4187 #ifdef HAVE_TPETRA_DEBUG
4188  // FIXME (mfh 18 Jun 2014) This check for correctness of the
4189  // input Maps incurs a penalty of two all-reduces for the
4190  // otherwise optimal const graph case.
4191  //
4192  // We could turn these (max) 2 all-reduces into (max) 1, by
4193  // fusing them. We could do this by adding a "locallySameAs"
4194  // method to Map, which would return one of four states:
4195  //
4196  // a. Certainly globally the same
4197  // b. Certainly globally not the same
4198  // c. Locally the same
4199  // d. Locally not the same
4200  //
4201  // The first two states don't require further communication.
4202  // The latter two states require an all-reduce to communicate
4203  // globally, but we only need one all-reduce, since we only need
4204  // to check whether at least one of the Maps is wrong.
4205  const bool domainMapsMatch =
4206  this->staticGraph_->getDomainMap()->isSameAs(*domainMap);
4207  const bool rangeMapsMatch =
4208  this->staticGraph_->getRangeMap()->isSameAs(*rangeMap);
4209 
4210  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!domainMapsMatch, std::runtime_error,
4211  "The CrsMatrix's domain Map does not match the graph's domain Map. "
4212  "The graph cannot be changed because it was given to the CrsMatrix "
4213  "constructor as const. You can fix this by passing in the graph's "
4214  "domain Map and range Map to the matrix's fillComplete call.");
4215 
4216  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!rangeMapsMatch, std::runtime_error,
4217  "The CrsMatrix's range Map does not match the graph's range Map. "
4218  "The graph cannot be changed because it was given to the CrsMatrix "
4219  "constructor as const. You can fix this by passing in the graph's "
4220  "domain Map and range Map to the matrix's fillComplete call.");
4221 #endif // HAVE_TPETRA_DEBUG
4222 
4223  // The matrix does _not_ own the graph, and the graph's
4224  // structure is already fixed, so just fill the local matrix.
4225  this->fillLocalMatrix(params);
4226  } else {
4227  Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
4228  // Set the graph's domain and range Maps. This will clear the
4229  // Import if the domain Map has changed (is a different
4230  // pointer), and the Export if the range Map has changed (is a
4231  // different pointer).
4232  this->myGraph_->setDomainRangeMaps(domainMap, rangeMap);
4233 
4234  // Make the graph's column Map, if necessary.
      // remotePIDs is filled by makeColMap (when it runs) and handed to
      // makeImportExport below.
4235  Teuchos::Array<int> remotePIDs(0);
4236  const bool mustBuildColMap = !this->hasColMap();
4237  if (mustBuildColMap) {
4238  this->myGraph_->makeColMap(remotePIDs);
4239  }
4240 
4241  // Make indices local, if necessary. The method won't do
4242  // anything if the graph is already locally indexed.
4243  const std::pair<size_t, std::string> makeIndicesLocalResult =
4244  this->myGraph_->makeIndicesLocal(verbose);
4245  // TODO (mfh 20 Jul 2017) Instead of throwing here, pass along
4246  // the error state to makeImportExport
4247  // which may do all-reduces and thus may
4248  // have the opportunity to communicate that error state.
      // A nonzero .first is treated as failure; .second is used as the
      // exception message.
4249  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(makeIndicesLocalResult.first != 0, std::runtime_error,
4250  makeIndicesLocalResult.second);
4251 
4252  const bool sorted = this->myGraph_->isSorted();
4253  const bool merged = this->myGraph_->isMerged();
4254  this->sortAndMergeIndicesAndValues(sorted, merged);
4255 
4256  // Make Import and Export objects, if they haven't been made
4257  // already. If we made a column Map above, reuse information
4258  // from that process to avoid communication in the Import setup.
4259  this->myGraph_->makeImportExport(remotePIDs, mustBuildColMap);
4260 
4261  // The matrix _does_ own the graph, so fill the local graph at
4262  // the same time as the local matrix.
4263  this->fillLocalGraphAndMatrix(params);
4264 
      // Global constants are computed when params is null or when
      // "compute global constants" is true (its default); otherwise only
      // local constants are computed.
4265  const bool callGraphComputeGlobalConstants = params.get() == nullptr ||
4266  params->get("compute global constants", true);
4267  if (callGraphComputeGlobalConstants) {
4268  this->myGraph_->computeGlobalConstants();
4269  } else {
4270  this->myGraph_->computeLocalConstants();
4271  }
4272  this->myGraph_->fillComplete_ = true;
4273  this->myGraph_->checkInternalState();
4274  }
4275 
4276  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4277 
4278  this->fillComplete_ = true; // Now we're fill complete!
4279  {
4280  Details::ProfilingRegion region_cis(
4281  "Tpetra::CrsMatrix::fillCompete", "checkInternalState");
4282  this->checkInternalState();
4283  }
4284 } // fillComplete(domainMap, rangeMap, params)
4285 
4286 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4288  expertStaticFillComplete(const Teuchos::RCP<const map_type>& domainMap,
4289  const Teuchos::RCP<const map_type>& rangeMap,
4290  const Teuchos::RCP<const import_type>& importer,
4291  const Teuchos::RCP<const export_type>& exporter,
4292  const Teuchos::RCP<Teuchos::ParameterList>& params) {
4293 #ifdef HAVE_TPETRA_MMM_TIMINGS
4294  std::string label;
4295  if (!params.is_null())
4296  label = params->get("Timer Label", label);
4297  std::string prefix = std::string("Tpetra ") + label + std::string(": ");
4298  using Teuchos::TimeMonitor;
4299 
4300  Teuchos::TimeMonitor all(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-all")));
4301 #endif
4302 
4303  const char tfecfFuncName[] = "expertStaticFillComplete: ";
4304  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!isFillActive() || isFillComplete(),
4305  std::runtime_error,
4306  "Matrix fill state must be active (isFillActive() "
4307  "must be true) before calling fillComplete().");
4308  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4309  myGraph_.is_null(), std::logic_error, "myGraph_ is null. This is not allowed.");
4310 
4311  {
4312 #ifdef HAVE_TPETRA_MMM_TIMINGS
4313  Teuchos::TimeMonitor graph(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-Graph")));
4314 #endif
4315  // We will presume globalAssemble is not needed, so we do the ESFC on the graph
4316  myGraph_->expertStaticFillComplete(domainMap, rangeMap, importer, exporter, params);
4317  }
4318 
4319  {
4320 #ifdef HAVE_TPETRA_MMM_TIMINGS
4321  TimeMonitor fLGAM(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-fLGAM")));
4322 #endif
4323  // Fill the local graph and matrix
4324  fillLocalGraphAndMatrix(params);
4325  }
4326  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4327 
4328  // Now we're fill complete!
4329  fillComplete_ = true;
4330 
4331  // Sanity checks at the end.
4332 #ifdef HAVE_TPETRA_DEBUG
4333  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
4334  ": We're at the end of fillComplete(), but isFillActive() is true. "
4335  "Please report this bug to the Tpetra developers.");
4336  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!isFillComplete(), std::logic_error,
4337  ": We're at the end of fillComplete(), but isFillActive() is true. "
4338  "Please report this bug to the Tpetra developers.");
4339 #endif // HAVE_TPETRA_DEBUG
4340  {
4341 #ifdef HAVE_TPETRA_MMM_TIMINGS
4342  Teuchos::TimeMonitor cIS(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-M-cIS")));
4343 #endif
4344 
4345  checkInternalState();
4346  }
4347 }
4348 
4349 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4351  mergeRowIndicesAndValues(size_t rowLen, LocalOrdinal* cols, impl_scalar_type* vals) {
4352  impl_scalar_type* rowValueIter = vals;
4353  // beg,end define a half-exclusive interval over which to iterate.
4354  LocalOrdinal* beg = cols;
4355  LocalOrdinal* end = cols + rowLen;
4356  LocalOrdinal* newend = beg;
4357  if (beg != end) {
4358  LocalOrdinal* cur = beg + 1;
4359  impl_scalar_type* vcur = rowValueIter + 1;
4360  impl_scalar_type* vend = rowValueIter;
4361  cur = beg + 1;
4362  while (cur != end) {
4363  if (*cur != *newend) {
4364  // new entry; save it
4365  ++newend;
4366  ++vend;
4367  (*newend) = (*cur);
4368  (*vend) = (*vcur);
4369  } else {
4370  // old entry; merge it
4371  //(*vend) = f (*vend, *vcur);
4372  (*vend) += *vcur;
4373  }
4374  ++cur;
4375  ++vcur;
4376  }
4377  ++newend; // one past the last entry, per typical [beg,end) semantics
4378  }
4379  return newend - beg;
4380 }
4381 
4382 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4384  sortAndMergeIndicesAndValues(const bool sorted, const bool merged) {
4385  using ::Tpetra::Details::ProfilingRegion;
4386  typedef LocalOrdinal LO;
4387  typedef typename Kokkos::View<LO*, device_type>::host_mirror_type::execution_space
4388  host_execution_space;
4389  typedef Kokkos::RangePolicy<host_execution_space, LO> range_type;
4390  const char tfecfFuncName[] = "sortAndMergeIndicesAndValues: ";
4391  ProfilingRegion regionSAM("Tpetra::CrsMatrix::sortAndMergeIndicesAndValues");
4392 
4393  if (!sorted || !merged) {
4394  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->isStaticGraph(), std::runtime_error,
4395  "Cannot sort or merge with "
4396  "\"static\" (const) graph, since the matrix does not own the graph.");
4397  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->myGraph_.is_null(), std::logic_error,
4398  "myGraph_ is null, but "
4399  "this matrix claims ! isStaticGraph(). "
4400  "Please report this bug to the Tpetra developers.");
4401  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->isStorageOptimized(), std::logic_error,
4402  "It is invalid to call "
4403  "this method if the graph's storage has already been optimized. "
4404  "Please report this bug to the Tpetra developers.");
4405 
4406  crs_graph_type& graph = *(this->myGraph_);
4407  const LO lclNumRows = static_cast<LO>(this->getLocalNumRows());
4408  size_t totalNumDups = 0;
4409  {
4410  // Accessing host unpacked (4-array CRS) local matrix.
4411  auto rowBegins_ = graph.getRowPtrsUnpackedHost();
4412  auto rowLengths_ = graph.k_numRowEntries_;
4413  auto vals_ = this->valuesUnpacked_wdv.getHostView(Access::ReadWrite);
4414  auto cols_ = graph.lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
4415  Kokkos::parallel_reduce(
4416  "sortAndMergeIndicesAndValues", range_type(0, lclNumRows),
4417  [=](const LO lclRow, size_t& numDups) {
4418  size_t rowBegin = rowBegins_(lclRow);
4419  size_t rowLen = rowLengths_(lclRow);
4420  LO* cols = cols_.data() + rowBegin;
4421  impl_scalar_type* vals = vals_.data() + rowBegin;
4422  if (!sorted) {
4423  sort2(cols, cols + rowLen, vals);
4424  }
4425  if (!merged) {
4426  size_t newRowLength = mergeRowIndicesAndValues(rowLen, cols, vals);
4427  rowLengths_(lclRow) = newRowLength;
4428  numDups += rowLen - newRowLength;
4429  }
4430  },
4431  totalNumDups);
4432  }
4433  if (!sorted) {
4434  graph.indicesAreSorted_ = true; // we just sorted every row
4435  }
4436  if (!merged) {
4437  graph.noRedundancies_ = true; // we just merged every row
4438  }
4439  }
4440 }
4441 
// Non-transpose sparse matrix-(multi)vector product:
//   Y_in = beta*Y_in + alpha*A*X_in.
//
// Outline:
//   - alpha == 0: only scale or zero Y_in, then return.
//   - Import X_in into a column Map MV if the graph has an Import;
//     otherwise use X_in directly (copying it first if it does not have
//     constant stride).
//   - With an Export: multiply into the row Map MV, then Export with
//     ADD_ASSIGN into the (zeroed or beta-scaled) Y_in.
//   - Without an Export: multiply into Y_in directly, or via the row Map
//     MV when Y_in lacks constant stride or is the same object as X.
//   - If Y_in is replicated on a multi-process communicator, finish with
//     Y_in.reduce().
//
// NOTE(review): the declarator lines (listing lines 4443-4445) and one
// using-declaration (4452) are absent from this extract.  Presumably this
// is CrsMatrix::applyNonTranspose(const MV& X_in, MV& Y_in, ...) and the
// missing using-declaration supplies ProfilingRegion -- confirm.
4442 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4446  Scalar alpha,
4447  Scalar beta) const {
4448  using Teuchos::RCP;
4449  using Teuchos::rcp;
4450  using Teuchos::rcp_const_cast;
4451  using Teuchos::rcpFromRef;
4453  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero();
4454  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one();
4455 
4456  // mfh 05 Jun 2014: Special case for alpha == 0. I added this to
4457  // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
4458  // failing only for the Kokkos refactor version of Tpetra. It's a
4459  // good idea regardless to have the bypass.
4460  if (alpha == ZERO) {
4461  if (beta == ZERO) {
4462  Y_in.putScalar(ZERO);
4463  } else if (beta != ONE) {
4464  Y_in.scale(beta);
4465  }
4466  return;
4467  }
4468 
4469  // It's possible that X is a view of Y or vice versa. We don't
4470  // allow this (apply() requires that X and Y not alias one
4471  // another), but it's helpful to detect and work around this case.
4472  // We don't try to detect the more subtle cases (e.g., one is a
4473  // subview of the other, but their initial pointers differ). We
4474  // only need to do this if this matrix's Import is trivial;
4475  // otherwise, we don't actually apply the operator from X into Y.
4476 
4477  RCP<const import_type> importer = this->getGraph()->getImporter();
4478  RCP<const export_type> exporter = this->getGraph()->getExporter();
4479 
4480  // If beta == 0, then the output MV will be overwritten; none of
4481  // its entries should be read. (Sparse BLAS semantics say that we
4482  // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
4483  // This matters if we need to do an Export operation; see below.
      // Computed from the caller's beta, before beta may be overridden for
      // the replicated-output case below.
4484  const bool Y_is_overwritten = (beta == ZERO);
4485 
4486  // We treat the case of a replicated MV output specially.
4487  const bool Y_is_replicated =
4488  (!Y_in.isDistributed() && this->getComm()->getSize() != 1);
4489 
4490  // This is part of the special case for replicated MV output.
4491  // We'll let each process do its thing, but do an all-reduce at
4492  // the end to sum up the results. Setting beta=0 on all processes
4493  // but Proc 0 makes the math work out for the all-reduce. (This
4494  // assumes that the replicated data is correctly replicated, so
4495  // that the data are the same on all processes.)
4496  if (Y_is_replicated && this->getComm()->getRank() > 0) {
4497  beta = ZERO;
4498  }
4499 
4500  // Temporary MV for Import operation. After the block of code
4501  // below, this will be an (Imported if necessary) column Map MV
4502  // ready to give to localApply(...).
4503  RCP<const MV> X_colMap;
4504  if (importer.is_null()) {
4505  if (!X_in.isConstantStride()) {
4506  // Not all sparse mat-vec kernels can handle an input MV with
4507  // nonconstant stride correctly, so we have to copy it in that
4508  // case into a constant stride MV. To make a constant stride
4509  // copy of X_in, we force creation of the column (== domain)
4510  // Map MV (if it hasn't already been created, else fetch the
4511  // cached copy). This avoids creating a new MV each time.
4512  RCP<MV> X_colMapNonConst = getColumnMapMultiVector(X_in, true);
4513  Tpetra::deep_copy(*X_colMapNonConst, X_in);
4514  X_colMap = rcp_const_cast<const MV>(X_colMapNonConst);
4515  } else {
4516  // The domain and column Maps are the same, so do the local
4517  // multiply using the domain Map input MV X_in.
4518  X_colMap = rcpFromRef(X_in);
4519  }
4520  } else { // need to Import source (multi)vector
4521  ProfilingRegion regionImport("Tpetra::CrsMatrix::apply: Import");
4522 
4523  // We're doing an Import anyway, which will copy the relevant
4524  // elements of the domain Map MV X_in into a separate column Map
4525  // MV. Thus, we don't have to worry whether X_in is constant
4526  // stride.
4527  RCP<MV> X_colMapNonConst = getColumnMapMultiVector(X_in);
4528 
4529  // Import from the domain Map MV to the column Map MV.
4530  X_colMapNonConst->doImport(X_in, *importer, INSERT);
4531  X_colMap = rcp_const_cast<const MV>(X_colMapNonConst);
4532  }
4533 
4534  // Temporary MV for doExport (if needed), or for copying a
4535  // nonconstant stride output MV into a constant stride MV. This
4536  // is null if we don't need the temporary MV, that is, if the
4537  // Export is trivial (null).
4538  RCP<MV> Y_rowMap = getRowMapMultiVector(Y_in);
4539 
4540  // If we have a nontrivial Export object, we must perform an
4541  // Export. In that case, the local multiply result will go into
4542  // the row Map multivector. We don't have to make a
4543  // constant-stride version of Y_in in this case, because we had to
4544  // make a constant stride Y_rowMap MV and do an Export anyway.
4545  if (!exporter.is_null()) {
      // beta is passed as ZERO here: Y_rowMap receives only alpha*A*X, and
      // the beta*Y_in term is applied to Y_in itself before the Export.
4546  this->localApply(*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, ZERO);
4547  {
4548  ProfilingRegion regionExport("Tpetra::CrsMatrix::apply: Export");
4549 
4550  // If we're overwriting the output MV Y_in completely (beta ==
4551  // 0), then make sure that it is filled with zeros before we
4552  // do the Export. Otherwise, the ADD combine mode will use
4553  // data in Y_in, which is supposed to be zero.
4554  if (Y_is_overwritten) {
4555  Y_in.putScalar(ZERO);
4556  } else {
4557  // Scale output MV by beta, so that doExport sums in the
4558  // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
4559  Y_in.scale(beta);
4560  }
4561  // Do the Export operation.
4562  Y_in.doExport(*Y_rowMap, *exporter, ADD_ASSIGN);
4563  }
4564  } else { // Don't do an Export: row Map and range Map are the same.
4565  //
4566  // If Y_in does not have constant stride, or if the column Map
4567  // MV aliases Y_in, then we can't let the kernel write directly
4568  // to Y_in. Instead, we have to use the cached row (== range)
4569  // Map MV as temporary storage.
4570  //
4571  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4572  // the user passed in the same MultiVector for both X and Y. It
4573  // won't detect whether one MultiVector views the other. We
4574  // should also check the MultiVectors' raw data pointers.
4575  if (!Y_in.isConstantStride() || X_colMap.getRawPtr() == &Y_in) {
4576  // Force creating the MV if it hasn't been created already.
4577  // This will reuse a previously created cached MV.
4578  Y_rowMap = getRowMapMultiVector(Y_in, true);
4579 
4580  // If beta == 0, we don't need to copy Y_in into Y_rowMap,
4581  // since we're overwriting it anyway.
4582  if (beta != ZERO) {
4583  Tpetra::deep_copy(*Y_rowMap, Y_in);
4584  }
4585  this->localApply(*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, beta);
4586  Tpetra::deep_copy(Y_in, *Y_rowMap);
4587  } else {
4588  this->localApply(*X_colMap, Y_in, Teuchos::NO_TRANS, alpha, beta);
4589  }
4590  }
4591 
4592  // If the range Map is a locally replicated Map, sum up
4593  // contributions from each process. We set beta = 0 on all
4594  // processes but Proc 0 initially, so this will handle the scaling
4595  // factor beta correctly.
4596  if (Y_is_replicated) {
4597  ProfilingRegion regionReduce("Tpetra::CrsMatrix::apply: Reduce Y");
4598  Y_in.reduce();
4599  }
4600 }
4601 
// Transpose-mode sparse matrix-(multi)vector product; `mode` is forwarded
// unchanged to localApply.
//
// Outline:
//   - alpha == 0: only scale or zero Y_in, then return.
//   - The roles of Import and Export are swapped relative to the
//     non-transpose case: with an Export, X_in is first brought into
//     exportMV_ via doImport; with an Import, the local product is
//     computed into importMV_ and then combined into Y_in via doExport
//     with ADD_ASSIGN.
//   - importMV_ / exportMV_ are cached members, reallocated when the
//     number of vectors changes.
//   - If Y_in is replicated on a multi-process communicator, finish with
//     Y_in.reduce().
//
// NOTE(review): the declarator lines (listing lines 4603-4605) and one
// using-declaration (4614) are absent from this extract.  Presumably this
// is CrsMatrix::applyTranspose(const MV& X_in, MV& Y_in, ...) and the
// missing using-declaration supplies ProfilingRegion -- confirm.
4602 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4606  const Teuchos::ETransp mode,
4607  Scalar alpha,
4608  Scalar beta) const {
4609  using Teuchos::null;
4610  using Teuchos::RCP;
4611  using Teuchos::rcp;
4612  using Teuchos::rcp_const_cast;
4613  using Teuchos::rcpFromRef;
4615  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero();
4616 
4617  // Take shortcuts for alpha == 0.
4618  if (alpha == ZERO) {
4619  // Follow the Sparse BLAS convention by ignoring both the matrix
4620  // and X_in, in this case.
4621  if (beta == ZERO) {
4622  // Follow the Sparse BLAS convention by overwriting any Inf or
4623  // NaN values in Y_in, in this case.
4624  Y_in.putScalar(ZERO);
4625  } else {
4626  Y_in.scale(beta);
4627  }
4628  return;
4629  }
4630 
4631  const size_t numVectors = X_in.getNumVectors();
4632 
4633  // We don't allow X_in and Y_in to alias one another. It's hard
4634  // to check this, because advanced users could create views from
4635  // raw pointers. However, if X_in and Y_in reference the same
4636  // object, we will do the user a favor by copying X into new
4637  // storage (with a warning). We only need to do this if we have
4638  // trivial importers; otherwise, we don't actually apply the
4639  // operator from X into Y.
4640  RCP<const import_type> importer = this->getGraph()->getImporter();
4641  RCP<const export_type> exporter = this->getGraph()->getExporter();
4642  // access X indirectly, in case we need to create temporary storage
4643  RCP<const MV> X;
4644 
4645  // some parameters for below
4646  const bool Y_is_replicated = (!Y_in.isDistributed() && this->getComm()->getSize() != 1);
      // Computed from the caller's beta, before beta may be overridden for
      // the replicated-output case just below.
4647  const bool Y_is_overwritten = (beta == ZERO);
4648  if (Y_is_replicated && this->getComm()->getRank() > 0) {
4649  beta = ZERO;
4650  }
4651 
4652  // The kernels do not allow input or output with nonconstant stride.
      // NOTE(review): this tests importer.is_null(), but in this transpose
      // path X is only replaced by a temporary when the *exporter* is
      // non-null (see below).  With a null exporter, a non-null importer
      // and a nonconstant-stride X_in, X still refers to X_in when it
      // reaches localApply.  Confirm whether exporter.is_null() was
      // intended here.
4653  if (!X_in.isConstantStride() && importer.is_null()) {
4654  X = rcp(new MV(X_in, Teuchos::Copy)); // Constant-stride copy of X_in
4655  } else {
4656  X = rcpFromRef(X_in); // Reference to X_in
4657  }
4658 
4659  // Set up temporary multivectors for Import and/or Export.
      // A cached MV is discarded and reallocated when its number of
      // vectors differs from X_in's.
4660  if (importer != Teuchos::null) {
4661  if (importMV_ != Teuchos::null && importMV_->getNumVectors() != numVectors) {
4662  importMV_ = null;
4663  }
4664  if (importMV_ == null) {
4665  importMV_ = rcp(new MV(this->getColMap(), numVectors));
4666  }
4667  }
4668  if (exporter != Teuchos::null) {
4669  if (exportMV_ != Teuchos::null && exportMV_->getNumVectors() != numVectors) {
4670  exportMV_ = null;
4671  }
4672  if (exportMV_ == null) {
4673  exportMV_ = rcp(new MV(this->getRowMap(), numVectors));
4674  }
4675  }
4676 
4677  // If we have a non-trivial exporter, we must import elements that
4678  // are permuted or are on other processors.
4679  if (!exporter.is_null()) {
4680  ProfilingRegion regionImport("Tpetra::CrsMatrix::apply (transpose): Import");
4681  exportMV_->doImport(X_in, *exporter, INSERT);
4682  X = exportMV_; // multiply out of exportMV_
4683  }
4684 
4685  // If we have a non-trivial importer, we must export elements that
4686  // are permuted or belong to other processors. We will compute
4687  // solution into the to-be-exported MV; get a view.
4688  if (importer != Teuchos::null) {
4689  ProfilingRegion regionExport("Tpetra::CrsMatrix::apply (transpose): Export");
4690 
4691  // FIXME (mfh 18 Apr 2015) Temporary fix suggested by Clark
4692  // Dohrmann on Fri 17 Apr 2015. At some point, we need to go
4693  // back and figure out why this helps. importMV_ SHOULD be
4694  // completely overwritten in the localApply(...) call
4695  // below, because beta == ZERO there.
4696  importMV_->putScalar(ZERO);
4697  // Do the local computation.
4698  this->localApply(*X, *importMV_, mode, alpha, ZERO);
4699 
      // Apply the beta term to Y_in itself (zero it or scale it) so that
      // the ADD_ASSIGN Export below adds the product on top.
4700  if (Y_is_overwritten) {
4701  Y_in.putScalar(ZERO);
4702  } else {
4703  Y_in.scale(beta);
4704  }
4705  Y_in.doExport(*importMV_, *importer, ADD_ASSIGN);
4706  }
4707  // otherwise, multiply into Y
4708  else {
4709  // can't multiply in-situ; can't multiply into non-strided multivector
4710  //
4711  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4712  // the user passed in the same MultiVector for both X and Y. It
4713  // won't detect whether one MultiVector views the other. We
4714  // should also check the MultiVectors' raw data pointers.
4715  if (!Y_in.isConstantStride() || X.getRawPtr() == &Y_in) {
4716  // Make a deep copy of Y_in, into which to write the multiply result.
4717  MV Y(Y_in, Teuchos::Copy);
4718  this->localApply(*X, Y, mode, alpha, beta);
4719  Tpetra::deep_copy(Y_in, Y);
4720  } else {
4721  this->localApply(*X, Y_in, mode, alpha, beta);
4722  }
4723  }
4724 
4725  // If the range Map is a locally replicated map, sum the
4726  // contributions from each process. (That's why we set beta=0
4727  // above for all processes but Proc 0.)
4728  if (Y_is_replicated) {
4729  ProfilingRegion regionReduce("Tpetra::CrsMatrix::apply (transpose): Reduce Y");
4730  Y_in.reduce();
4731  }
4732 }
4733 
// localApply: on-process sparse matrix times multivector.
//
// Hands the local device matrix, the read-only device view of X and the
// read-write device view of Y to KokkosSparse::spmv, together with alpha
// and beta converted to impl_scalar_type.  `mode` selects which operator
// string (NoTranspose / Transpose / ConjugateTranspose) is passed to spmv.
// No Import/Export communication happens in this function.
//
// Throws std::invalid_argument if `mode` is not one of the three Teuchos
// transpose enumerators.  When Tpetra's debug behavior is enabled, it also
// throws std::runtime_error on mismatched column counts, wrong local
// lengths, a matrix that is not fill complete, non-constant-stride
// arguments, or X and Y sharing the same nonnull data pointer.
//
// NOTE(review): the listing lines that carry the function name and the
// X / Y parameter declarations are not present in this extract -- confirm
// the exact signature against the class declaration.
4734 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4738  const Teuchos::ETransp mode,
4739  const Scalar& alpha,
4740  const Scalar& beta) const {
4741  using Teuchos::NO_TRANS;
4743  ProfilingRegion regionLocalApply("Tpetra::CrsMatrix::localApply");
4744 
  // X is only read; Y is requested ReadWrite because spmv is given a beta
  // that may be nonzero.
4745  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
4746  auto Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
4747 
  // All argument validation below is skipped unless debug mode is on.
4748  const bool debug = ::Tpetra::Details::Behavior::debug();
4749  if (debug) {
4750  const char tfecfFuncName[] = "localApply: ";
4751  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(X.getNumVectors() != Y.getNumVectors(), std::runtime_error,
4752  "X.getNumVectors() = " << X.getNumVectors() << " != "
4753  "Y.getNumVectors() = "
4754  << Y.getNumVectors() << ".");
  // Non-transpose: X must match the column Map, Y the row Map.
  // Transpose / conjugate transpose: the roles are swapped.
4755  const bool transpose = (mode != Teuchos::NO_TRANS);
4756  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!transpose && X.getLocalLength() !=
4757  getColMap()->getLocalNumElements(),
4758  std::runtime_error,
4759  "NO_TRANS case: X has the wrong number of local rows. "
4760  "X.getLocalLength() = "
4761  << X.getLocalLength() << " != "
4762  "getColMap()->getLocalNumElements() = "
4763  << getColMap()->getLocalNumElements() << ".");
4764  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!transpose && Y.getLocalLength() !=
4765  getRowMap()->getLocalNumElements(),
4766  std::runtime_error,
4767  "NO_TRANS case: Y has the wrong number of local rows. "
4768  "Y.getLocalLength() = "
4769  << Y.getLocalLength() << " != "
4770  "getRowMap()->getLocalNumElements() = "
4771  << getRowMap()->getLocalNumElements() << ".");
4772  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(transpose && X.getLocalLength() !=
4773  getRowMap()->getLocalNumElements(),
4774  std::runtime_error,
4775  "TRANS or CONJ_TRANS case: X has the wrong number of local "
4776  "rows. X.getLocalLength() = "
4777  << X.getLocalLength()
4778  << " != getRowMap()->getLocalNumElements() = "
4779  << getRowMap()->getLocalNumElements() << ".");
  // NOTE(review): this check is on Y, but the message text below begins
  // "X has the wrong number of local rows" -- looks like a copy/paste slip
  // in the message; the condition itself tests Y against the column Map.
4780  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(transpose && Y.getLocalLength() !=
4781  getColMap()->getLocalNumElements(),
4782  std::runtime_error,
4783  "TRANS or CONJ_TRANS case: X has the wrong number of local "
4784  "rows. Y.getLocalLength() = "
4785  << Y.getLocalLength()
4786  << " != getColMap()->getLocalNumElements() = "
4787  << getColMap()->getLocalNumElements() << ".");
4788  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!isFillComplete(), std::runtime_error,
4789  "The matrix is not "
4790  "fill complete. You must call fillComplete() (possibly with "
4791  "domain and range Map arguments) without an intervening "
4792  "resumeFill() call before you may call this method.");
4793  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!X.isConstantStride() || !Y.isConstantStride(),
4794  std::runtime_error, "X and Y must be constant stride.");
4795  // If the two pointers are null, then they don't alias one
4796  // another, even though they are equal.
4797  // Kokkos does not guarantee that zero row-extent vectors
4798  // point to different places, so we have to check that too.
4799  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(X_lcl.data() == Y_lcl.data() && X_lcl.data() != nullptr && X_lcl.extent(0) != 0,
4800  std::runtime_error, "X and Y may not alias one another.");
4801  }
4802 
4803  auto A_lcl = getLocalMatrixDevice();
4804 
  // The helper is built on first use only; later calls reuse it, so the
  // merge-path decision below is made once per helper instance.
  // NOTE(review): this method is const yet assigns applyHelper, so the
  // member is presumably declared mutable -- confirm in the class header.
4805  if (!applyHelper.get()) {
4806  // The apply helper does not exist, so create it.
4807  // Decide now whether to use the imbalanced row path, or the default.
4808  bool useMergePath = false;
4809 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
4810  // TODO: when https://github.com/kokkos/kokkos-kernels/issues/2166 is fixed and,
4811  // we can use SPMV_MERGE_PATH for the native spmv as well.
4812  // Take out this ifdef to enable that.
4813  //
4814  // Until then, only use SPMV_MERGE_PATH when calling cuSPARSE.
4815  if constexpr (std::is_same_v<execution_space, Kokkos::Cuda>) {
  // Imbalance = (longest local row) - (average entries per local row,
  // integer division).  The nrows != 0 guard avoids dividing by zero.
4816  LocalOrdinal nrows = getLocalNumRows();
4817  LocalOrdinal maxRowImbalance = 0;
4818  if (nrows != 0)
4819  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
4820 
4821  if (size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
4822  useMergePath = true;
4823  }
4824 #endif
4825  applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map,
4826  useMergePath ? KokkosSparse::SPMV_MERGE_PATH : KokkosSparse::SPMV_DEFAULT);
4827  }
4828 
4829  // Translate mode (Teuchos enum) to KokkosKernels (1-character string)
4830  const char* modeKK = nullptr;
4831  switch (mode) {
4832  case Teuchos::NO_TRANS:
4833  modeKK = KokkosSparse::NoTranspose;
4834  break;
4835  case Teuchos::TRANS:
4836  modeKK = KokkosSparse::Transpose;
4837  break;
4838  case Teuchos::CONJ_TRANS:
4839  modeKK = KokkosSparse::ConjugateTranspose;
4840  break;
4841  default:
4842  throw std::invalid_argument("Tpetra::CrsMatrix::localApply: invalid mode");
4843  }
4844 
  // The helper decides whether to run spmv on a variant of the local
  // matrix obtained from getIntRowptrMatrix (with its own handle) or on
  // A_lcl directly.
4845  if (applyHelper->shouldUseIntRowptrs()) {
4846  auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
4847  KokkosSparse::spmv(
4848  &applyHelper->handle_int, modeKK,
4849  impl_scalar_type(alpha), A_lcl_int_rowptrs, X_lcl, impl_scalar_type(beta), Y_lcl);
4850  } else {
4851  KokkosSparse::spmv(
4852  &applyHelper->handle, modeKK,
4853  impl_scalar_type(alpha), A_lcl, X_lcl, impl_scalar_type(beta), Y_lcl);
4854  }
4855 }
4856 
// apply: public entry point for the matrix-multivector product.
//
// Requires the matrix to be fill complete; otherwise throws
// std::runtime_error.  Dispatches on `mode`: Teuchos::NO_TRANS goes to
// applyNonTranspose(X, Y, alpha, beta), every other value goes to
// applyTranspose(X, Y, mode, alpha, beta).  alpha and beta are taken by
// value here.
//
// NOTE(review): the listing lines with the function name and the X / Y
// parameter declarations are not present in this extract.
4857 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4861  Teuchos::ETransp mode,
4862  Scalar alpha,
4863  Scalar beta) const {
4865  const char fnName[] = "Tpetra::CrsMatrix::apply";
4866 
4867  TEUCHOS_TEST_FOR_EXCEPTION(!isFillComplete(), std::runtime_error,
4868  fnName << ": Cannot call apply() until fillComplete() "
4869  "has been called.");
4870 
  // Each branch opens its own profiling region so the two paths show up
  // under different labels.
4871  if (mode == Teuchos::NO_TRANS) {
4872  ProfilingRegion regionNonTranspose(fnName);
4873  this->applyNonTranspose(X, Y, alpha, beta);
4874  } else {
4875  ProfilingRegion regionTranspose("Tpetra::CrsMatrix::apply (transpose)");
4876  this->applyTranspose(X, Y, mode, alpha, beta);
4877  }
4878 }
4879 
// convert<T>: build a new CrsMatrix whose scalar type is T from this one.
//
// The new matrix is constructed from this matrix's graph
// (this->getCrsGraph()), its values are filled from this matrix's local
// device values via copyConvert, and it is fill-completed with this
// matrix's domain and range Maps before being returned.
//
// Throws std::runtime_error if this matrix is not fill complete.
4880 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4881 template <class T>
4882 Teuchos::RCP<CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node>>
4884  convert() const {
4885  using Teuchos::RCP;
4886  typedef CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> output_matrix_type;
4887  const char tfecfFuncName[] = "convert: ";
4888 
4889  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!this->isFillComplete(), std::runtime_error,
4890  "This matrix (the source "
4891  "of the conversion) is not fill complete. You must first call "
4892  "fillComplete() (possibly with the domain and range Map) without an "
4893  "intervening call to resumeFill(), before you may call this method.");
4894 
4895  RCP<output_matrix_type> newMatrix(new output_matrix_type(this->getCrsGraph()));
4896  // Copy old values into new values. impl_scalar_type and T may
4897  // differ, so we can't use Kokkos::deep_copy.
  // NOTE(review): one listing line is missing just above the call below;
  // presumably it brings copyConvert into scope -- confirm in the source.
  // Argument order is (destination, source).
4899  copyConvert(newMatrix->getLocalMatrixDevice().values,
4900  this->getLocalMatrixDevice().values);
4901  // Since newmat has a static (const) graph, the graph already has
4902  // a column Map, and Import and Export objects already exist (if
4903  // applicable). Thus, calling fillComplete is cheap.
4904  newMatrix->fillComplete(this->getDomainMap(), this->getRangeMap());
4905 
4906  return newMatrix;
4907 }
4908 
// checkInternalState: debug-only consistency checks on the matrix/graph
// pairing.  Does nothing unless debug behavior is enabled; each failed
// check throws std::logic_error with a "please report this bug" message.
//
// NOTE(review): the debug flag is looked up under the label "CrsGraph",
// not "CrsMatrix" -- confirm that is intentional.
// NOTE(review): the listing lines carrying the return type and function
// name are not present in this extract.
4909 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4912  const bool debug = ::Tpetra::Details::Behavior::debug("CrsGraph");
4913  if (debug) {
4914  const char tfecfFuncName[] = "checkInternalState: ";
4915  const char err[] =
4916  "Internal state is not consistent. "
4917  "Please report this bug to the Tpetra developers.";
4918 
4919  // This version of the graph (RCP<const crs_graph_type>) must
4920  // always be nonnull.
4921  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(staticGraph_.is_null(), std::logic_error, err);
4922  // myGraph == null means that the matrix has a const ("static")
4923  // graph. Otherwise, the matrix has a dynamic graph (it owns its
4924  // graph).
  // If the matrix owns a graph, it must be the same object that
  // staticGraph_ refers to.
4925  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!myGraph_.is_null() && myGraph_ != staticGraph_,
4926  std::logic_error, err);
4927  // if matrix is fill complete, then graph must be fill complete
4928  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillComplete() && !staticGraph_->isFillComplete(),
4929  std::logic_error, err << " Specifically, the matrix is fill complete, "
4930  "but its graph is NOT fill complete.");
4931  // if values are allocated and they are non-zero in number, then
4932  // one of the allocations should be present
  // Concretely: allocated indices, nonzero allocation size and at least
  // one local row, yet an empty valuesUnpacked_wdv, is an error.
4933  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(staticGraph_->indicesAreAllocated() &&
4934  staticGraph_->getLocalAllocationSize() > 0 &&
4935  staticGraph_->getLocalNumRows() > 0 &&
4936  valuesUnpacked_wdv.extent(0) == 0,
4937  std::logic_error, err);
4938  }
4939 }
4940 
// description: one-line, brace-delimited summary of the matrix.
//
// Always includes the class banner, the object label when it is nonempty,
// the fill-complete flag and the global dimensions.  The global entry
// count is included only when the matrix is fill complete.
4941 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4942 std::string
4944  description() const {
4945  std::ostringstream os;
4946 
4947  os << "Tpetra::CrsMatrix (Kokkos refactor): {";
4948  if (this->getObjectLabel() != "") {
4949  os << "Label: \"" << this->getObjectLabel() << "\", ";
4950  }
4951  if (isFillComplete()) {
4952  os << "isFillComplete: true"
4953  << ", global dimensions: [" << getGlobalNumRows() << ", "
4954  << getGlobalNumCols() << "]"
4955  << ", global number of entries: " << getGlobalNumEntries()
4956  << "}";
4957  } else {
  // Not fill complete: the global entry count is not printed.
4958  os << "isFillComplete: false"
4959  << ", global dimensions: [" << getGlobalNumRows() << ", "
4960  << getGlobalNumCols() << "]}";
4961  }
4962  return os.str();
4963 }
4964 
// describe: print the matrix to `out` at the requested verbosity.
//
// VERB_DEFAULT is treated as VERB_LOW; VERB_NONE prints nothing.
//   LOW     - header, template parameters and global summary (rank 0 only)
//   MEDIUM  - additionally the row/column/domain/range Maps and one
//             per-process block of entry counts
//   HIGH    - additionally one line per local row (rank, global row,
//             number of entries)
//   EXTREME - additionally the (global column index, value) pairs per row
//
// From MEDIUM upward the function calls comm->barrier() inside loops over
// all ranks, so it presumably has to be called by every process in the
// communicator at those levels -- confirm against the class docs.
//
// NOTE(review): the listing line carrying the return type and class
// qualifier is not present in this extract.
4965 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4967  describe(Teuchos::FancyOStream& out,
4968  const Teuchos::EVerbosityLevel verbLevel) const {
4969  using std::endl;
4970  using std::setw;
4971  using Teuchos::ArrayView;
4972  using Teuchos::Comm;
4973  using Teuchos::RCP;
4974  using Teuchos::TypeNameTraits;
4975  using Teuchos::VERB_DEFAULT;
4976  using Teuchos::VERB_EXTREME;
4977  using Teuchos::VERB_HIGH;
4978  using Teuchos::VERB_LOW;
4979  using Teuchos::VERB_MEDIUM;
4980  using Teuchos::VERB_NONE;
4981 
4982  const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4983 
4984  if (vl == VERB_NONE) {
4985  return; // Don't print anything at all
4986  }
4987 
4988  // By convention, describe() always begins with a tab.
4989  Teuchos::OSTab tab0(out);
4990 
4991  RCP<const Comm<int>> comm = this->getComm();
4992  const int myRank = comm->getRank();
4993  const int numProcs = comm->getSize();
  // Column width for the per-row table: number of decimal digits in the
  // global row count, but at least 11, plus 2 of padding.
4994  size_t width = 1;
4995  for (size_t dec = 10; dec < getGlobalNumRows(); dec *= 10) {
4996  ++width;
4997  }
4998  width = std::max<size_t>(width, static_cast<size_t>(11)) + 2;
4999 
5000  // none: print nothing
5001  // low: print O(1) info from node 0
5002  // medium: print O(P) info, num entries per process
5003  // high: print O(N) info, num entries per row
5004  // extreme: print O(NNZ) info: print indices and values
5005  //
5006  // for medium and higher, print constituent objects at specified verbLevel
5007  if (myRank == 0) {
5008  out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
5009  }
5010  Teuchos::OSTab tab1(out);
5011 
5012  if (myRank == 0) {
5013  if (this->getObjectLabel() != "") {
5014  out << "Label: \"" << this->getObjectLabel() << "\", ";
5015  }
5016  {
5017  out << "Template parameters:" << endl;
5018  Teuchos::OSTab tab2(out);
5019  out << "Scalar: " << TypeNameTraits<Scalar>::name() << endl
5020  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name() << endl
5021  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name() << endl
5022  << "Node: " << TypeNameTraits<Node>::name() << endl;
5023  }
5024  if (isFillComplete()) {
5025  out << "isFillComplete: true" << endl
5026  << "Global dimensions: [" << getGlobalNumRows() << ", "
5027  << getGlobalNumCols() << "]" << endl
5028  << "Global number of entries: " << getGlobalNumEntries() << endl
5029  << endl
5030  << "Global max number of entries in a row: "
5031  << getGlobalMaxNumRowEntries() << endl;
5032  } else {
5033  out << "isFillComplete: false" << endl
5034  << "Global dimensions: [" << getGlobalNumRows() << ", "
5035  << getGlobalNumCols() << "]" << endl;
5036  }
5037  }
5038 
5039  if (vl < VERB_MEDIUM) {
5040  return; // all done!
5041  }
5042 
  // For each Map below, only rank 0 prints the heading, while the Map's
  // own describe() is invoked on every rank.
5043  // Describe the row Map.
5044  if (myRank == 0) {
5045  out << endl
5046  << "Row Map:" << endl;
5047  }
5048  if (getRowMap().is_null()) {
5049  if (myRank == 0) {
5050  out << "null" << endl;
5051  }
5052  } else {
5053  if (myRank == 0) {
5054  out << endl;
5055  }
5056  getRowMap()->describe(out, vl);
5057  }
5058 
5059  // Describe the column Map.
5060  if (myRank == 0) {
5061  out << "Column Map: ";
5062  }
5063  if (getColMap().is_null()) {
5064  if (myRank == 0) {
5065  out << "null" << endl;
5066  }
5067  } else if (getColMap() == getRowMap()) {
5068  if (myRank == 0) {
5069  out << "same as row Map" << endl;
5070  }
5071  } else {
5072  if (myRank == 0) {
5073  out << endl;
5074  }
5075  getColMap()->describe(out, vl);
5076  }
5077 
5078  // Describe the domain Map.
5079  if (myRank == 0) {
5080  out << "Domain Map: ";
5081  }
5082  if (getDomainMap().is_null()) {
5083  if (myRank == 0) {
5084  out << "null" << endl;
5085  }
5086  } else if (getDomainMap() == getRowMap()) {
5087  if (myRank == 0) {
5088  out << "same as row Map" << endl;
5089  }
5090  } else if (getDomainMap() == getColMap()) {
5091  if (myRank == 0) {
5092  out << "same as column Map" << endl;
5093  }
5094  } else {
5095  if (myRank == 0) {
5096  out << endl;
5097  }
5098  getDomainMap()->describe(out, vl);
5099  }
5100 
5101  // Describe the range Map.
5102  if (myRank == 0) {
5103  out << "Range Map: ";
5104  }
5105  if (getRangeMap().is_null()) {
5106  if (myRank == 0) {
5107  out << "null" << endl;
5108  }
5109  } else if (getRangeMap() == getDomainMap()) {
5110  if (myRank == 0) {
5111  out << "same as domain Map" << endl;
5112  }
5113  } else if (getRangeMap() == getRowMap()) {
5114  if (myRank == 0) {
5115  out << "same as row Map" << endl;
5116  }
5117  } else {
5118  if (myRank == 0) {
5119  out << endl;
5120  }
5121  getRangeMap()->describe(out, vl);
5122  }
5123 
  // Ranks take turns: in iteration curRank only that rank prints, then
  // all ranks hit the barriers before the next rank's turn.
5124  // O(P) data
5125  for (int curRank = 0; curRank < numProcs; ++curRank) {
5126  if (myRank == curRank) {
5127  out << "Process rank: " << curRank << endl;
5128  Teuchos::OSTab tab2(out);
5129  if (!staticGraph_->indicesAreAllocated()) {
5130  out << "Graph indices not allocated" << endl;
5131  } else {
5132  out << "Number of allocated entries: "
5133  << staticGraph_->getLocalAllocationSize() << endl;
5134  }
5135  out << "Number of entries: " << getLocalNumEntries() << endl
5136  << "Max number of entries per row: " << getLocalMaxNumRowEntries()
5137  << endl;
5138  }
5139  // Give output time to complete by executing some barriers.
5140  comm->barrier();
5141  comm->barrier();
5142  comm->barrier();
5143  }
5144 
5145  if (vl < VERB_HIGH) {
5146  return; // all done!
5147  }
5148 
5149  // O(N) and O(NNZ) data
5150  for (int curRank = 0; curRank < numProcs; ++curRank) {
5151  if (myRank == curRank) {
5152  out << std::setw(width) << "Proc Rank"
5153  << std::setw(width) << "Global Row"
5154  << std::setw(width) << "Num Entries";
5155  if (vl == VERB_EXTREME) {
5156  out << std::setw(width) << "(Index,Value)";
5157  }
5158  out << endl;
5159  for (size_t r = 0; r < getLocalNumRows(); ++r) {
5160  const size_t nE = getNumEntriesInLocalRow(r);
5161  GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
5162  out << std::setw(width) << myRank
5163  << std::setw(width) << gid
5164  << std::setw(width) << nE;
5165  if (vl == VERB_EXTREME) {
  // Column indices are always printed as global indices: taken directly
  // when globally indexed, translated through the column Map when
  // locally indexed.  If the matrix is neither, no pairs are printed.
5166  if (isGloballyIndexed()) {
5167  global_inds_host_view_type rowinds;
5168  values_host_view_type rowvals;
5169  getGlobalRowView(gid, rowinds, rowvals);
5170  for (size_t j = 0; j < nE; ++j) {
5171  out << " (" << rowinds[j]
5172  << ", " << rowvals[j]
5173  << ") ";
5174  }
5175  } else if (isLocallyIndexed()) {
5176  local_inds_host_view_type rowinds;
5177  values_host_view_type rowvals;
5178  getLocalRowView(r, rowinds, rowvals);
5179  for (size_t j = 0; j < nE; ++j) {
5180  out << " (" << getColMap()->getGlobalElement(rowinds[j])
5181  << ", " << rowvals[j]
5182  << ") ";
5183  }
5184  } // globally or locally indexed
5185  } // vl == VERB_EXTREME
5186  out << endl;
5187  } // for each row r on this process
5188  } // if (myRank == curRank)
5189 
5190  // Give output time to complete
5191  comm->barrier();
5192  comm->barrier();
5193  comm->barrier();
5194  } // for each process p
5195 }
5196 
// checkSizes: report whether `source` is an acceptable source object for
// this matrix.  The only test performed is a dynamic_cast of `source` to
// row_matrix_type; the result is true exactly when that cast succeeds.
// No dimensions or Maps are compared.
//
// NOTE(review): the listing line carrying the return type and class
// qualifier is not present in this extract.
5197 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5199  checkSizes(const SrcDistObject& source) {
5200  // It's not clear what kind of compatibility checks on sizes can
5201  // be performed here. Epetra_CrsGraph doesn't check any sizes for
5202  // compatibility.
5203 
5204  // Currently, the source object must be a RowMatrix with the same
5205  // four template parameters as the target CrsMatrix. We might
5206  // relax this requirement later.
5207  const row_matrix_type* srcRowMat =
5208  dynamic_cast<const row_matrix_type*>(&source);
5209  return (srcRowMat != nullptr);
5210 }
5211 
// applyCrsPadding: grow the unpacked index and value storage of the owned
// graph/matrix according to `padding`.
//
// Steps, in order:
//   1. allocate values (and graph storage) if the graph's indices are not
//      yet allocated;
//   2. copy the graph's unpacked device row pointers into a fresh
//      row_ptr_beg, and build row_ptr_end from either the per-row entry
//      counts (unpacked storage) or row_ptr_beg(i + 1) (packed storage);
//   3. call Details::padCrsArrays on the global or local index array
//      together with valuesUnpacked_wdv;
//   4. write the per-row entry counts back when they were in use;
//   5. install row_ptr_beg as the graph's unpacked row pointers.
//
// Throws std::logic_error if, after padding, the values array and the
// index array have different lengths.  When `verbose` is true, progress
// messages are written to std::cerr.
//
// NOTE(review): myGraph_ is dereferenced without a null check, so this
// presumably may only be called when the matrix owns its graph -- confirm.
// NOTE(review): the listing lines carrying the function name and its
// first line(s) of declaration are not present in this extract.
5212 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5215  const typename crs_graph_type::padding_type& padding,
5216  const bool verbose) {
5217  using Details::padCrsArrays;
5219  using std::endl;
5220  using LO = local_ordinal_type;
5221  using row_ptrs_type =
5222  typename local_graph_device_type::row_map_type::non_const_type;
5223  using range_policy =
5224  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
  // tfecfFuncName has no trailing ": " here; the exception messages below
  // start with ": " themselves.
5225  const char tfecfFuncName[] = "applyCrsPadding";
5226  const char suffix[] =
5227  ". Please report this bug to the Tpetra developers.";
5228  ProfilingRegion regionCAP("Tpetra::CrsMatrix::applyCrsPadding");
5229 
5230  std::unique_ptr<std::string> prefix;
5231  if (verbose) {
5232  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
5233  std::ostringstream os;
5234  os << *prefix << "padding: ";
5235  padding.print(os);
5236  os << endl;
5237  std::cerr << os.str();
5238  }
  // The rank is looked up only in verbose mode; -1 otherwise, and also
  // when the Map or its communicator is null.
5239  const int myRank = !verbose ? -1 : [&]() {
5240  auto map = this->getMap();
5241  if (map.is_null()) {
5242  return -1;
5243  }
5244  auto comm = map->getComm();
5245  if (comm.is_null()) {
5246  return -1;
5247  }
5248  return comm->getRank();
5249  }();
5250 
5251  // NOTE (mfh 29 Jan 2020) This allocates the values array.
5252  if (!myGraph_->indicesAreAllocated()) {
5253  if (verbose) {
5254  std::ostringstream os;
  // NOTE(review): the message says "allocateIndices" but the call made
  // right after this block is allocateValues(...).
5255  os << *prefix << "Call allocateIndices" << endl;
5256  std::cerr << os.str();
5257  }
5258  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
5259  }
5260 
5261  // FIXME (mfh 10 Feb 2020) We shouldn't actually reallocate
5262  // row_ptrs_beg or allocate row_ptrs_end unless the allocation
5263  // size needs to increase. That should be the job of
5264  // padCrsArrays.
5265 
5266  // Making copies here because rowPtrsUnpacked_ has a const type. Otherwise, we
5267  // would use it directly.
5268 
5269  if (verbose) {
5270  std::ostringstream os;
5271  os << *prefix << "Allocate row_ptrs_beg: "
5272  << myGraph_->getRowPtrsUnpackedHost().extent(0) << endl;
5273  std::cerr << os.str();
5274  }
5275  using Kokkos::view_alloc;
5276  using Kokkos::WithoutInitializing;
  // Uninitialized allocation is fine: the deep_copy below overwrites
  // every entry.
5277  row_ptrs_type row_ptr_beg(view_alloc("row_ptr_beg", WithoutInitializing),
5278  myGraph_->rowPtrsUnpacked_dev_.extent(0));
5279  // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
5280  Kokkos::deep_copy(execution_space(), row_ptr_beg, myGraph_->rowPtrsUnpacked_dev_);
5281 
  // N = number of rows = (row-pointer length - 1), or 0 when the
  // row-pointer array is empty.
5282  const size_t N = row_ptr_beg.extent(0) == 0 ? size_t(0) : size_t(row_ptr_beg.extent(0) - 1);
5283  if (verbose) {
5284  std::ostringstream os;
5285  os << *prefix << "Allocate row_ptrs_end: " << N << endl;
5286  std::cerr << os.str();
5287  }
5288  row_ptrs_type row_ptr_end(
5289  view_alloc("row_ptr_end", WithoutInitializing), N);
5290 
5291  row_ptrs_type num_row_entries_d;
5292 
  // A nonempty k_numRowEntries_ is used as the signal that storage is
  // unpacked and per-row counts must be kept in sync.
5293  const bool refill_num_row_entries =
5294  myGraph_->k_numRowEntries_.extent(0) != 0;
5295 
  // NOTE(review): the kernels below use range_policy with IndexType<LO>
  // while the lambdas take `const size_t i` -- confirm the index-type
  // mismatch is intended.
5296  if (refill_num_row_entries) { // unpacked storage
5297  // We can't assume correct *this capture until C++17, and it's
5298  // likely more efficient just to capture what we need anyway.
5299  num_row_entries_d = create_mirror_view_and_copy(memory_space(),
5300  myGraph_->k_numRowEntries_);
5301  Kokkos::parallel_for(
5302  "Fill end row pointers", range_policy(0, N),
5303  KOKKOS_LAMBDA(const size_t i) {
5304  row_ptr_end(i) = row_ptr_beg(i) + num_row_entries_d(i);
5305  });
5306  } else {
5307  // FIXME (mfh 04 Feb 2020) Fix padCrsArrays so that if packed
5308  // storage, we don't need row_ptr_end to be separate allocation;
5309  // could just have it alias row_ptr_beg+1.
5310  Kokkos::parallel_for(
5311  "Fill end row pointers", range_policy(0, N),
5312  KOKKOS_LAMBDA(const size_t i) {
5313  row_ptr_end(i) = row_ptr_beg(i + 1);
5314  });
5315  }
5316 
  // Pad whichever index array is in use, together with the values; the
  // two arrays must end up the same length.
5317  if (myGraph_->isGloballyIndexed()) {
5318  padCrsArrays(row_ptr_beg, row_ptr_end,
5319  myGraph_->gblInds_wdv,
5320  valuesUnpacked_wdv, padding, myRank, verbose);
5321  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5322  const auto newColIndsLen = myGraph_->gblInds_wdv.extent(0);
5323  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(newValuesLen != newColIndsLen, std::logic_error,
5324  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5325  << " != myGraph_->gblInds_wdv.extent(0)=" << newColIndsLen
5326  << suffix);
5327  } else {
5328  padCrsArrays(row_ptr_beg, row_ptr_end,
5329  myGraph_->lclIndsUnpacked_wdv,
5330  valuesUnpacked_wdv, padding, myRank, verbose);
5331  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5332  const auto newColIndsLen = myGraph_->lclIndsUnpacked_wdv.extent(0);
5333  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(newValuesLen != newColIndsLen, std::logic_error,
5334  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5335  << " != myGraph_->lclIndsUnpacked_wdv.extent(0)=" << newColIndsLen
5336  << suffix);
5337  }
5338 
  // Recompute the per-row counts from the current begin/end pointers and
  // copy them back into the graph.
5339  if (refill_num_row_entries) {
5340  Kokkos::parallel_for(
5341  "Fill num entries", range_policy(0, N),
5342  KOKKOS_LAMBDA(const size_t i) {
5343  num_row_entries_d(i) = row_ptr_end(i) - row_ptr_beg(i);
5344  });
5345  Kokkos::deep_copy(myGraph_->k_numRowEntries_, num_row_entries_d);
5346  }
5347 
  // Note that the size assertion below runs only in verbose mode.
5348  if (verbose) {
5349  std::ostringstream os;
5350  os << *prefix << "Assign myGraph_->rowPtrsUnpacked_; "
5351  << "old size: " << myGraph_->rowPtrsUnpacked_host_.extent(0)
5352  << ", new size: " << row_ptr_beg.extent(0) << endl;
5353  std::cerr << os.str();
5354  TEUCHOS_ASSERT(myGraph_->getRowPtrsUnpackedHost().extent(0) ==
5355  row_ptr_beg.extent(0));
5356  }
5357  myGraph_->setRowPtrsUnpacked(row_ptr_beg);
5358 }
5359 
5360 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5361 void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5362  copyAndPermuteStaticGraph(
5363  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5364  const size_t numSameIDs,
5365  const LocalOrdinal permuteToLIDs[],
5366  const LocalOrdinal permuteFromLIDs[],
5367  const size_t numPermutes) {
5368  using Details::ProfilingRegion;
5369  using std::endl;
5370  using Teuchos::Array;
5371  using Teuchos::ArrayView;
5372  using LO = LocalOrdinal;
5373  using GO = GlobalOrdinal;
5374  const char tfecfFuncName[] = "copyAndPermuteStaticGraph";
5375  const char suffix[] =
5376  " Please report this bug to the Tpetra developers.";
5377  ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermuteStaticGraph");
5378 
5379  const bool debug = Details::Behavior::debug("CrsGraph");
5380  const bool verbose = Details::Behavior::verbose("CrsGraph");
5381  std::unique_ptr<std::string> prefix;
5382  if (verbose) {
5383  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5384  std::ostringstream os;
5385  os << *prefix << "Start" << endl;
5386  }
5387  const char* const prefix_raw =
5388  verbose ? prefix.get()->c_str() : nullptr;
5389 
5390  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed();
5391  //
5392  // Copy the first numSame row from source to target (this matrix).
5393  // This involves copying rows corresponding to LIDs [0, numSame-1].
5394  //
5395  const map_type& srcRowMap = *(srcMat.getRowMap());
5396  nonconst_global_inds_host_view_type rowInds;
5397  nonconst_values_host_view_type rowVals;
5398  const LO numSameIDs_as_LID = static_cast<LO>(numSameIDs);
5399  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5400  // Global ID for the current row index in the source matrix.
5401  // The first numSameIDs GIDs in the two input lists are the
5402  // same, so sourceGID == targetGID in this case.
5403  const GO sourceGID = srcRowMap.getGlobalElement(sourceLID);
5404  const GO targetGID = sourceGID;
5405 
5406  ArrayView<const GO> rowIndsConstView;
5407  ArrayView<const Scalar> rowValsConstView;
5408 
5409  if (sourceIsLocallyIndexed) {
5410  const size_t rowLength = srcMat.getNumEntriesInGlobalRow(sourceGID);
5411  if (rowLength > static_cast<size_t>(rowInds.size())) {
5412  Kokkos::resize(rowInds, rowLength);
5413  Kokkos::resize(rowVals, rowLength);
5414  }
5415  // Resizing invalidates an Array's views, so we must make new
5416  // ones, even if rowLength hasn't changed.
5417  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds, std::make_pair((size_t)0, rowLength));
5418  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals, std::make_pair((size_t)0, rowLength));
5419 
5420  // The source matrix is locally indexed, so we have to get a
5421  // copy. Really it's the GIDs that have to be copied (because
5422  // they have to be converted from LIDs).
5423  size_t checkRowLength = 0;
5424  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5425  rowValsView, checkRowLength);
5426  if (debug) {
5427  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowLength != checkRowLength, std::logic_error,
5428  "For "
5429  "global row index "
5430  << sourceGID << ", the source "
5431  "matrix's getNumEntriesInGlobalRow returns a row length "
5432  "of "
5433  << rowLength << ", but getGlobalRowCopy reports "
5434  "a row length of "
5435  << checkRowLength << "." << suffix);
5436  }
5437 
5438  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5439  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5440  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5441  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5442  rowIndsConstView = Teuchos::ArrayView<const GO>( // BAD BAD BAD
5443  rowIndsView.data(), rowIndsView.extent(0),
5444  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5445  rowValsConstView = Teuchos::ArrayView<const Scalar>( // BAD BAD BAD
5446  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5447  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5448  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5449  // KDDKDD UVM TEMPORARY: KokkosView interface
5450  } else { // source matrix is globally indexed.
5451  global_inds_host_view_type rowIndsView;
5452  values_host_view_type rowValsView;
5453  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5454  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5455  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5456  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5457  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5458  rowIndsConstView = Teuchos::ArrayView<const GO>( // BAD BAD BAD
5459  rowIndsView.data(), rowIndsView.extent(0),
5460  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5461  rowValsConstView = Teuchos::ArrayView<const Scalar>( // BAD BAD BAD
5462  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5463  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5464  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5465  // KDDKDD UVM TEMPORARY: KokkosView interface
5466  }
5467 
5468  // Applying a permutation to a matrix with a static graph
5469  // means REPLACE-ing entries.
5470  combineGlobalValues(targetGID, rowIndsConstView,
5471  rowValsConstView, REPLACE,
5472  prefix_raw, debug, verbose);
5473  }
5474 
5475  if (verbose) {
5476  std::ostringstream os;
5477  os << *prefix << "Do permutes" << endl;
5478  }
5479 
5480  const map_type& tgtRowMap = *(this->getRowMap());
5481  for (size_t p = 0; p < numPermutes; ++p) {
5482  const GO sourceGID = srcRowMap.getGlobalElement(permuteFromLIDs[p]);
5483  const GO targetGID = tgtRowMap.getGlobalElement(permuteToLIDs[p]);
5484 
5485  ArrayView<const GO> rowIndsConstView;
5486  ArrayView<const Scalar> rowValsConstView;
5487 
5488  if (sourceIsLocallyIndexed) {
5489  const size_t rowLength = srcMat.getNumEntriesInGlobalRow(sourceGID);
5490  if (rowLength > static_cast<size_t>(rowInds.size())) {
5491  Kokkos::resize(rowInds, rowLength);
5492  Kokkos::resize(rowVals, rowLength);
5493  }
5494  // Resizing invalidates an Array's views, so we must make new
5495  // ones, even if rowLength hasn't changed.
5496  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds, std::make_pair((size_t)0, rowLength));
5497  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals, std::make_pair((size_t)0, rowLength));
5498 
5499  // The source matrix is locally indexed, so we have to get a
5500  // copy. Really it's the GIDs that have to be copied (because
5501  // they have to be converted from LIDs).
5502  size_t checkRowLength = 0;
5503  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5504  rowValsView, checkRowLength);
5505  if (debug) {
5506  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowLength != checkRowLength, std::logic_error,
5507  "For "
5508  "source matrix global row index "
5509  << sourceGID << ", "
5510  "getNumEntriesInGlobalRow returns a row length of "
5511  << rowLength << ", but getGlobalRowCopy a row length of "
5512  << checkRowLength << "." << suffix);
5513  }
5514 
5515  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5516  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5517  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5518  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5519  rowIndsConstView = Teuchos::ArrayView<const GO>( // BAD BAD BAD
5520  rowIndsView.data(), rowIndsView.extent(0),
5521  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5522  rowValsConstView = Teuchos::ArrayView<const Scalar>( // BAD BAD BAD
5523  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5524  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5525  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5526  // KDDKDD UVM TEMPORARY: KokkosView interface
5527  } else {
5528  global_inds_host_view_type rowIndsView;
5529  values_host_view_type rowValsView;
5530  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5531  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5532  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5533  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5534  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5535  rowIndsConstView = Teuchos::ArrayView<const GO>( // BAD BAD BAD
5536  rowIndsView.data(), rowIndsView.extent(0),
5537  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5538  rowValsConstView = Teuchos::ArrayView<const Scalar>( // BAD BAD BAD
5539  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5540  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5541  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5542  // KDDKDD UVM TEMPORARY: KokkosView interface
5543  }
5544 
5545  combineGlobalValues(targetGID, rowIndsConstView,
5546  rowValsConstView, REPLACE,
5547  prefix_raw, debug, verbose);
5548  }
5549 
5550  if (verbose) {
5551  std::ostringstream os;
5552  os << *prefix << "Done" << endl;
5553  }
5554 }
5555 
5556 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5557 void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5558  copyAndPermuteNonStaticGraph(
5559  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5560  const size_t numSameIDs,
5561  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs_dv,
5562  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs_dv,
5563  const size_t numPermutes) {
5564  using Details::ProfilingRegion;
5565  using std::endl;
5566  using Teuchos::Array;
5567  using Teuchos::ArrayView;
5568  using LO = LocalOrdinal;
5569  using GO = GlobalOrdinal;
5570  const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph";
5571  const char suffix[] =
5572  " Please report this bug to the Tpetra developers.";
5573  ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph");
5574 
5575  const bool debug = Details::Behavior::debug("CrsGraph");
5576  const bool verbose = Details::Behavior::verbose("CrsGraph");
5577  std::unique_ptr<std::string> prefix;
5578  if (verbose) {
5579  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5580  std::ostringstream os;
5581  os << *prefix << "Start" << endl;
5582  }
5583  const char* const prefix_raw =
5584  verbose ? prefix.get()->c_str() : nullptr;
5585 
5586  {
5587  using row_graph_type = RowGraph<LO, GO, Node>;
5588  const row_graph_type& srcGraph = *(srcMat.getGraph());
5589  auto padding =
5590  myGraph_->computeCrsPadding(srcGraph, numSameIDs,
5591  permuteToLIDs_dv, permuteFromLIDs_dv, verbose);
5592  applyCrsPadding(*padding, verbose);
5593  }
5594  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed();
5595  //
5596  // Copy the first numSame row from source to target (this matrix).
5597  // This involves copying rows corresponding to LIDs [0, numSame-1].
5598  //
5599  const map_type& srcRowMap = *(srcMat.getRowMap());
5600  const LO numSameIDs_as_LID = static_cast<LO>(numSameIDs);
5601  using gids_type = nonconst_global_inds_host_view_type;
5602  using vals_type = nonconst_values_host_view_type;
5603  gids_type rowInds;
5604  vals_type rowVals;
5605  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5606  // Global ID for the current row index in the source matrix.
5607  // The first numSameIDs GIDs in the two input lists are the
5608  // same, so sourceGID == targetGID in this case.
5609  const GO sourceGID = srcRowMap.getGlobalElement(sourceLID);
5610  const GO targetGID = sourceGID;
5611 
5612  ArrayView<const GO> rowIndsConstView;
5613  ArrayView<const Scalar> rowValsConstView;
5614 
5615  if (sourceIsLocallyIndexed) {
5616  const size_t rowLength = srcMat.getNumEntriesInGlobalRow(sourceGID);
5617  if (rowLength > static_cast<size_t>(rowInds.extent(0))) {
5618  Kokkos::resize(rowInds, rowLength);
5619  Kokkos::resize(rowVals, rowLength);
5620  }
5621  // Resizing invalidates an Array's views, so we must make new
5622  // ones, even if rowLength hasn't changed.
5623  gids_type rowIndsView = Kokkos::subview(rowInds, std::make_pair((size_t)0, rowLength));
5624  vals_type rowValsView = Kokkos::subview(rowVals, std::make_pair((size_t)0, rowLength));
5625 
5626  // The source matrix is locally indexed, so we have to get a
5627  // copy. Really it's the GIDs that have to be copied (because
5628  // they have to be converted from LIDs).
5629  size_t checkRowLength = 0;
5630  srcMat.getGlobalRowCopy(sourceGID, rowIndsView, rowValsView,
5631  checkRowLength);
5632  if (debug) {
5633  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowLength != checkRowLength, std::logic_error,
5634  ": For "
5635  "global row index "
5636  << sourceGID << ", the source "
5637  "matrix's getNumEntriesInGlobalRow returns a row length "
5638  "of "
5639  << rowLength << ", but getGlobalRowCopy reports "
5640  "a row length of "
5641  << checkRowLength << "." << suffix);
5642  }
5643  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5644  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar*>(rowValsView.data()), rowLength);
5645  } else { // source matrix is globally indexed.
5646  global_inds_host_view_type rowIndsView;
5647  values_host_view_type rowValsView;
5648  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5649 
5650  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5651  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5652  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5653  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5654  rowIndsConstView = Teuchos::ArrayView<const GO>( // BAD BAD BAD
5655  rowIndsView.data(), rowIndsView.extent(0),
5656  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5657  rowValsConstView = Teuchos::ArrayView<const Scalar>( // BAD BAD BAD
5658  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5659  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5660  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5661  // KDDKDD UVM TEMPORARY: KokkosView interface
5662  }
5663 
5664  // Combine the data into the target matrix.
5665  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5666  rowValsConstView, prefix_raw, debug, verbose);
5667  }
5668 
5669  if (verbose) {
5670  std::ostringstream os;
5671  os << *prefix << "Do permutes" << endl;
5672  }
5673  const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data();
5674  const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data();
5675 
5676  const map_type& tgtRowMap = *(this->getRowMap());
5677  for (size_t p = 0; p < numPermutes; ++p) {
5678  const GO sourceGID = srcRowMap.getGlobalElement(permuteFromLIDs[p]);
5679  const GO targetGID = tgtRowMap.getGlobalElement(permuteToLIDs[p]);
5680 
5681  ArrayView<const GO> rowIndsConstView;
5682  ArrayView<const Scalar> rowValsConstView;
5683 
5684  if (sourceIsLocallyIndexed) {
5685  const size_t rowLength = srcMat.getNumEntriesInGlobalRow(sourceGID);
5686  if (rowLength > static_cast<size_t>(rowInds.extent(0))) {
5687  Kokkos::resize(rowInds, rowLength);
5688  Kokkos::resize(rowVals, rowLength);
5689  }
5690  // Resizing invalidates an Array's views, so we must make new
5691  // ones, even if rowLength hasn't changed.
5692  gids_type rowIndsView = Kokkos::subview(rowInds, std::make_pair((size_t)0, rowLength));
5693  vals_type rowValsView = Kokkos::subview(rowVals, std::make_pair((size_t)0, rowLength));
5694 
5695  // The source matrix is locally indexed, so we have to get a
5696  // copy. Really it's the GIDs that have to be copied (because
5697  // they have to be converted from LIDs).
5698  size_t checkRowLength = 0;
5699  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5700  rowValsView, checkRowLength);
5701  if (debug) {
5702  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowLength != checkRowLength, std::logic_error,
5703  "For "
5704  "source matrix global row index "
5705  << sourceGID << ", "
5706  "getNumEntriesInGlobalRow returns a row length of "
5707  << rowLength << ", but getGlobalRowCopy a row length of "
5708  << checkRowLength << "." << suffix);
5709  }
5710  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5711  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar*>(rowValsView.data()), rowLength);
5712  } else {
5713  global_inds_host_view_type rowIndsView;
5714  values_host_view_type rowValsView;
5715  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5716 
5717  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5718  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5719  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5720  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5721  rowIndsConstView = Teuchos::ArrayView<const GO>( // BAD BAD BAD
5722  rowIndsView.data(), rowIndsView.extent(0),
5723  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5724  rowValsConstView = Teuchos::ArrayView<const Scalar>( // BAD BAD BAD
5725  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5726  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5727  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5728  // KDDKDD UVM TEMPORARY: KokkosView interface
5729  }
5730 
5731  // Combine the data into the target matrix.
5732  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5733  rowValsConstView, prefix_raw, debug, verbose);
5734  }
5735 
5736  if (verbose) {
5737  std::ostringstream os;
5738  os << *prefix << "Done" << endl;
5739  }
5740 }
5741 
5742 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5745  const SrcDistObject& srcObj,
5746  const size_t numSameIDs,
5747  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
5748  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
5749  const CombineMode /*CM*/) {
5750  using Details::Behavior;
5753  using std::endl;
5754 
5755  // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
5756  const char tfecfFuncName[] = "copyAndPermute: ";
5757  ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermute");
5758 
5759  const bool verbose = Behavior::verbose("CrsMatrix");
5760  std::unique_ptr<std::string> prefix;
5761  if (verbose) {
5762  prefix = this->createPrefix("CrsMatrix", "copyAndPermute");
5763  std::ostringstream os;
5764  os << *prefix << endl
5765  << *prefix << " numSameIDs: " << numSameIDs << endl
5766  << *prefix << " numPermute: " << permuteToLIDs.extent(0)
5767  << endl
5768  << *prefix << " "
5769  << dualViewStatusToString(permuteToLIDs, "permuteToLIDs")
5770  << endl
5771  << *prefix << " "
5772  << dualViewStatusToString(permuteFromLIDs, "permuteFromLIDs")
5773  << endl
5774  << *prefix << " "
5775  << "isStaticGraph: " << (isStaticGraph() ? "true" : "false")
5776  << endl;
5777  std::cerr << os.str();
5778  }
5779 
5780  const auto numPermute = permuteToLIDs.extent(0);
5781  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numPermute != permuteFromLIDs.extent(0),
5782  std::invalid_argument, "permuteToLIDs.extent(0) = " << numPermute << "!= permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0) << ".");
5783 
5784  // This dynamic cast should succeed, because we've already tested
5785  // it in checkSizes().
5787  const RMT& srcMat = dynamic_cast<const RMT&>(srcObj);
5788  if (isStaticGraph()) {
5789  TEUCHOS_ASSERT(!permuteToLIDs.need_sync_host());
5790  auto permuteToLIDs_h = permuteToLIDs.view_host();
5791  TEUCHOS_ASSERT(!permuteFromLIDs.need_sync_host());
5792  auto permuteFromLIDs_h = permuteFromLIDs.view_host();
5793 
5794  copyAndPermuteStaticGraph(srcMat, numSameIDs,
5795  permuteToLIDs_h.data(),
5796  permuteFromLIDs_h.data(),
5797  numPermute);
5798  } else {
5799  copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs,
5800  permuteFromLIDs, numPermute);
5801  }
5802 
5803  if (verbose) {
5804  std::ostringstream os;
5805  os << *prefix << "Done" << endl;
5806  std::cerr << os.str();
5807  }
5808 }
5809 
5810 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5812  packAndPrepare(const SrcDistObject& source,
5813  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
5814  Kokkos::DualView<char*, buffer_device_type>& exports,
5815  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
5816  size_t& constantNumPackets) {
5817  using Details::Behavior;
5820  using std::endl;
5821  using Teuchos::outArg;
5822  using Teuchos::REDUCE_MAX;
5823  using Teuchos::reduceAll;
5824  typedef LocalOrdinal LO;
5825  typedef GlobalOrdinal GO;
5826  const char tfecfFuncName[] = "packAndPrepare: ";
5827  ProfilingRegion regionPAP("Tpetra::CrsMatrix::packAndPrepare");
5828 
5829  const bool debug = Behavior::debug("CrsMatrix");
5830  const bool verbose = Behavior::verbose("CrsMatrix");
5831 
5832  // Processes on which the communicator is null should not participate.
5833  Teuchos::RCP<const Teuchos::Comm<int>> pComm = this->getComm();
5834  if (pComm.is_null()) {
5835  return;
5836  }
5837  const Teuchos::Comm<int>& comm = *pComm;
5838  const int myRank = comm.getSize();
5839 
5840  std::unique_ptr<std::string> prefix;
5841  if (verbose) {
5842  prefix = this->createPrefix("CrsMatrix", "packAndPrepare");
5843  std::ostringstream os;
5844  os << *prefix << "Start" << endl
5845  << *prefix << " "
5846  << dualViewStatusToString(exportLIDs, "exportLIDs")
5847  << endl
5848  << *prefix << " "
5849  << dualViewStatusToString(exports, "exports")
5850  << endl
5851  << *prefix << " "
5852  << dualViewStatusToString(numPacketsPerLID, "numPacketsPerLID")
5853  << endl;
5854  std::cerr << os.str();
5855  }
5856 
5857  // Attempt to cast the source object to CrsMatrix. If successful,
5858  // use the source object's packNew() method to pack its data for
5859  // communication. Otherwise, attempt to cast to RowMatrix; if
5860  // successful, use the source object's pack() method. Otherwise,
5861  // the source object doesn't have the right type.
5862  //
5863  // FIXME (mfh 30 Jun 2013, 11 Sep 2017) We don't even need the
5864  // RowMatrix to have the same Node type. Unfortunately, we don't
5865  // have a way to ask if the RowMatrix is "a RowMatrix with any
5866  // Node type," since RowMatrix doesn't have a base class. A
5867  // hypothetical RowMatrixBase<Scalar, LO, GO> class, which does
5868  // not currently exist, would satisfy this requirement.
5869  //
5870  // Why RowMatrixBase<Scalar, LO, GO>? The source object's Scalar
5871  // type doesn't technically need to match the target object's
5872  // Scalar type, so we could just have RowMatrixBase<LO, GO>. LO
5873  // and GO need not be the same, as long as there is no overflow of
5874  // the indices. However, checking for index overflow is global
5875  // and therefore undesirable.
5876 
5877  std::ostringstream msg; // for collecting error messages
5878  int lclBad = 0; // to be set below
5879 
5880  using crs_matrix_type = CrsMatrix<Scalar, LO, GO, Node>;
5881  const crs_matrix_type* srcCrsMat =
5882  dynamic_cast<const crs_matrix_type*>(&source);
5883  if (srcCrsMat != nullptr) {
5884  if (verbose) {
5885  std::ostringstream os;
5886  os << *prefix << "Source matrix same (CrsMatrix) type as target; "
5887  "calling packNew"
5888  << endl;
5889  std::cerr << os.str();
5890  }
5891  try {
5892  srcCrsMat->packNew(exportLIDs, exports, numPacketsPerLID,
5893  constantNumPackets);
5894  } catch (std::exception& e) {
5895  lclBad = 1;
5896  msg << "Proc " << myRank << ": " << e.what() << std::endl;
5897  }
5898  } else {
5899  using Kokkos::HostSpace;
5900  using Kokkos::subview;
5901  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
5902  using range_type = Kokkos::pair<size_t, size_t>;
5903 
5904  if (verbose) {
5905  std::ostringstream os;
5906  os << *prefix << "Source matrix NOT same (CrsMatrix) type as target"
5907  << endl;
5908  std::cerr << os.str();
5909  }
5910 
5911  const row_matrix_type* srcRowMat =
5912  dynamic_cast<const row_matrix_type*>(&source);
5913  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(srcRowMat == nullptr, std::invalid_argument,
5914  "The source object of the Import or Export operation is neither a "
5915  "CrsMatrix (with the same template parameters as the target object), "
5916  "nor a RowMatrix (with the same first four template parameters as the "
5917  "target object).");
5918 
5919  // For the RowMatrix case, we need to convert from
5920  // Kokkos::DualView to Teuchos::Array*. This doesn't need to be
5921  // so terribly efficient, since packing a non-CrsMatrix
5922  // RowMatrix for Import/Export into a CrsMatrix is not a
5923  // critical case. Thus, we may allocate Teuchos::Array objects
5924  // here and copy to and from Kokkos::*View.
5925 
5926  // View exportLIDs's host data as a Teuchos::ArrayView.
5927  TEUCHOS_ASSERT(!exportLIDs.need_sync_host());
5928  auto exportLIDs_h = exportLIDs.view_host();
5929  Teuchos::ArrayView<const LO> exportLIDs_av(exportLIDs_h.data(),
5930  exportLIDs_h.size());
5931 
5932  // pack() will allocate exports_a as needed. We'll copy back
5933  // into exports (after (re)allocating exports if needed) below.
5934  Teuchos::Array<char> exports_a;
5935 
5936  // View exportLIDs' host data as a Teuchos::ArrayView. We don't
5937  // need to sync, since we're doing write-only access, but we do
5938  // need to mark the DualView as modified on host.
5939 
5940  numPacketsPerLID.clear_sync_state(); // write-only access
5941  numPacketsPerLID.modify_host();
5942  auto numPacketsPerLID_h = numPacketsPerLID.view_host();
5943  Teuchos::ArrayView<size_t> numPacketsPerLID_av(numPacketsPerLID_h.data(),
5944  numPacketsPerLID_h.size());
5945 
5946  // Invoke RowMatrix's legacy pack() interface, using above
5947  // Teuchos::Array* objects.
5948  try {
5949  srcRowMat->pack(exportLIDs_av, exports_a, numPacketsPerLID_av,
5950  constantNumPackets);
5951  } catch (std::exception& e) {
5952  lclBad = 1;
5953  msg << "Proc " << myRank << ": " << e.what() << std::endl;
5954  }
5955 
5956  // Allocate 'exports', and copy exports_a back into it.
5957  const size_t newAllocSize = static_cast<size_t>(exports_a.size());
5958  if (static_cast<size_t>(exports.extent(0)) < newAllocSize) {
5959  const std::string oldLabel = exports.view_device().label();
5960  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
5961  exports = exports_type(newLabel, newAllocSize);
5962  }
5963  // It's safe to assume that we're working on host anyway, so
5964  // just keep exports sync'd to host.
5965  // ignore current device contents
5966  exports.modify_host();
5967 
5968  auto exports_h = exports.view_host();
5969  auto exports_h_sub = subview(exports_h, range_type(0, newAllocSize));
5970 
5971  // Kokkos::deep_copy needs a Kokkos::View input, so turn
5972  // exports_a into a nonowning Kokkos::View first before copying.
5973  typedef typename exports_type::t_host::execution_space HES;
5974  typedef Kokkos::Device<HES, HostSpace> host_device_type;
5975  Kokkos::View<const char*, host_device_type>
5976  exports_a_kv(exports_a.getRawPtr(), newAllocSize);
5977  // DEEP_COPY REVIEW - NOT TESTED
5978  Kokkos::deep_copy(exports_h_sub, exports_a_kv);
5979  }
5980 
5981  if (debug) {
5982  int gblBad = 0; // output argument; to be set below
5983  reduceAll<int, int>(comm, REDUCE_MAX, lclBad, outArg(gblBad));
5984  if (gblBad != 0) {
5985  Tpetra::Details::gathervPrint(std::cerr, msg.str(), comm);
5986  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error,
5987  "packNew() or pack() threw an exception on "
5988  "one or more participating processes.");
5989  }
5990  } else {
5991  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclBad != 0, std::logic_error,
5992  "packNew threw an exception on one "
5993  "or more participating processes. Here is this process' error "
5994  "message: "
5995  << msg.str());
5996  }
5997 
5998  if (verbose) {
5999  std::ostringstream os;
6000  os << *prefix << "packAndPrepare: Done!" << endl
6001  << *prefix << " "
6002  << dualViewStatusToString(exportLIDs, "exportLIDs")
6003  << endl
6004  << *prefix << " "
6005  << dualViewStatusToString(exports, "exports")
6006  << endl
6007  << *prefix << " "
6008  << dualViewStatusToString(numPacketsPerLID, "numPacketsPerLID")
6009  << endl;
6010  std::cerr << os.str();
6011  }
6012 }
6013 
6014 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6015 size_t
6016 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6017  packRow(char exports[],
6018  const size_t offset,
6019  const size_t numEnt,
6020  const GlobalOrdinal gidsIn[],
6021  const impl_scalar_type valsIn[],
6022  const size_t numBytesPerValue) const {
6023  using Kokkos::subview;
6024  using Kokkos::View;
6026  typedef LocalOrdinal LO;
6027  typedef GlobalOrdinal GO;
6028  typedef impl_scalar_type ST;
6029 
6030  if (numEnt == 0) {
6031  // Empty rows always take zero bytes, to ensure sparsity.
6032  return 0;
6033  }
6034 
6035  const GO gid = 0; // packValueCount wants this
6036  const LO numEntLO = static_cast<size_t>(numEnt);
6037 
6038  const size_t numEntBeg = offset;
6039  const size_t numEntLen = PackTraits<LO>::packValueCount(numEntLO);
6040  const size_t gidsBeg = numEntBeg + numEntLen;
6041  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount(gid);
6042  const size_t valsBeg = gidsBeg + gidsLen;
6043  const size_t valsLen = numEnt * numBytesPerValue;
6044 
6045  char* const numEntOut = exports + numEntBeg;
6046  char* const gidsOut = exports + gidsBeg;
6047  char* const valsOut = exports + valsBeg;
6048 
6049  size_t numBytesOut = 0;
6050  int errorCode = 0;
6051  numBytesOut += PackTraits<LO>::packValue(numEntOut, numEntLO);
6052 
6053  {
6054  Kokkos::pair<int, size_t> p;
6055  p = PackTraits<GO>::packArray(gidsOut, gidsIn, numEnt);
6056  errorCode += p.first;
6057  numBytesOut += p.second;
6058 
6059  p = PackTraits<ST>::packArray(valsOut, valsIn, numEnt);
6060  errorCode += p.first;
6061  numBytesOut += p.second;
6062  }
6063 
6064  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6065  TEUCHOS_TEST_FOR_EXCEPTION(numBytesOut != expectedNumBytes, std::logic_error,
6066  "packRow: "
6067  "numBytesOut = "
6068  << numBytesOut << " != expectedNumBytes = "
6069  << expectedNumBytes << ".");
6070  TEUCHOS_TEST_FOR_EXCEPTION(errorCode != 0, std::runtime_error,
6071  "packRow: "
6072  "PackTraits::packArray returned a nonzero error code");
6073 
6074  return numBytesOut;
6075 }
6076 
6077 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6078 size_t
6079 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6080  unpackRow(GlobalOrdinal gidsOut[],
6081  impl_scalar_type valsOut[],
6082  const char imports[],
6083  const size_t offset,
6084  const size_t numBytes,
6085  const size_t numEnt,
6086  const size_t numBytesPerValue) {
6087  using Kokkos::subview;
6088  using Kokkos::View;
6090  typedef LocalOrdinal LO;
6091  typedef GlobalOrdinal GO;
6092  typedef impl_scalar_type ST;
6093 
6094  Details::ProfilingRegion region_upack_row(
6095  "Tpetra::CrsMatrix::unpackRow",
6096  "Import/Export");
6097 
6098  if (numBytes == 0) {
6099  // Rows with zero bytes should always have zero entries.
6100  if (numEnt != 0) {
6101  const int myRank = this->getMap()->getComm()->getRank();
6102  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6103  "unpackRow: The number of bytes to unpack numBytes=0, but the "
6104  "number of entries to unpack (as reported by numPacketsPerLID) "
6105  "for this row numEnt="
6106  << numEnt << " != 0.");
6107  }
6108  return 0;
6109  }
6110 
6111  if (numEnt == 0 && numBytes != 0) {
6112  const int myRank = this->getMap()->getComm()->getRank();
6113  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6114  "unpackRow: The number of entries to unpack (as reported by "
6115  "numPacketsPerLID) numEnt=0, but the number of bytes to unpack "
6116  "numBytes="
6117  << numBytes << " != 0.");
6118  }
6119 
6120  const GO gid = 0; // packValueCount wants this
6121  const LO lid = 0; // packValueCount wants this
6122 
6123  const size_t numEntBeg = offset;
6124  const size_t numEntLen = PackTraits<LO>::packValueCount(lid);
6125  const size_t gidsBeg = numEntBeg + numEntLen;
6126  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount(gid);
6127  const size_t valsBeg = gidsBeg + gidsLen;
6128  const size_t valsLen = numEnt * numBytesPerValue;
6129 
6130  const char* const numEntIn = imports + numEntBeg;
6131  const char* const gidsIn = imports + gidsBeg;
6132  const char* const valsIn = imports + valsBeg;
6133 
6134  size_t numBytesOut = 0;
6135  int errorCode = 0;
6136  LO numEntOut;
6137  numBytesOut += PackTraits<LO>::unpackValue(numEntOut, numEntIn);
6138  if (static_cast<size_t>(numEntOut) != numEnt ||
6139  numEntOut == static_cast<LO>(0)) {
6140  const int myRank = this->getMap()->getComm()->getRank();
6141  std::ostringstream os;
6142  os << "(Proc " << myRank << ") CrsMatrix::unpackRow: ";
6143  bool firstErrorCondition = false;
6144  if (static_cast<size_t>(numEntOut) != numEnt) {
6145  os << "Number of entries from numPacketsPerLID numEnt=" << numEnt
6146  << " does not equal number of entries unpacked from imports "
6147  "buffer numEntOut="
6148  << numEntOut << ".";
6149  firstErrorCondition = true;
6150  }
6151  if (numEntOut == static_cast<LO>(0)) {
6152  if (firstErrorCondition) {
6153  os << " Also, ";
6154  }
6155  os << "Number of entries unpacked from imports buffer numEntOut=0, "
6156  "but number of bytes to unpack for this row numBytes="
6157  << numBytes
6158  << " != 0. This should never happen, since packRow should only "
6159  "ever pack rows with a nonzero number of entries. In this case, "
6160  "the number of entries from numPacketsPerLID is numEnt="
6161  << numEnt
6162  << ".";
6163  }
6164  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, os.str());
6165  }
6166 
6167  {
6168  Kokkos::pair<int, size_t> p;
6169  p = PackTraits<GO>::unpackArray(gidsOut, gidsIn, numEnt);
6170  errorCode += p.first;
6171  numBytesOut += p.second;
6172 
6173  p = PackTraits<ST>::unpackArray(valsOut, valsIn, numEnt);
6174  errorCode += p.first;
6175  numBytesOut += p.second;
6176  }
6177 
6178  TEUCHOS_TEST_FOR_EXCEPTION(numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = " << numBytesOut << " != numBytes = " << numBytes << ".");
6179 
6180  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6181  TEUCHOS_TEST_FOR_EXCEPTION(numBytesOut != expectedNumBytes, std::logic_error,
6182  "unpackRow: "
6183  "numBytesOut = "
6184  << numBytesOut << " != expectedNumBytes = "
6185  << expectedNumBytes << ".");
6186 
6187  TEUCHOS_TEST_FOR_EXCEPTION(errorCode != 0, std::runtime_error,
6188  "unpackRow: "
6189  "PackTraits::unpackArray returned a nonzero error code");
6190 
6191  return numBytesOut;
6192 }
6193 
6194 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6195 void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6196  allocatePackSpaceNew(Kokkos::DualView<char*, buffer_device_type>& exports,
6197  size_t& totalNumEntries,
6198  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs) const {
6199  using Details::Behavior;
6201  using std::endl;
6202  typedef impl_scalar_type IST;
6203  typedef LocalOrdinal LO;
6204  typedef GlobalOrdinal GO;
6205  // const char tfecfFuncName[] = "allocatePackSpaceNew: ";
6206 
6207  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
6208  // output to std::cerr on every MPI process. This is unwise for
6209  // runs with large numbers of MPI processes.
6210  const bool verbose = Behavior::verbose("CrsMatrix");
6211  std::unique_ptr<std::string> prefix;
6212  if (verbose) {
6213  prefix = this->createPrefix("CrsMatrix", "allocatePackSpaceNew");
6214  std::ostringstream os;
6215  os << *prefix << "Before:"
6216  << endl
6217  << *prefix << " "
6218  << dualViewStatusToString(exports, "exports")
6219  << endl
6220  << *prefix << " "
6221  << dualViewStatusToString(exportLIDs, "exportLIDs")
6222  << endl;
6223  std::cerr << os.str();
6224  }
6225 
6226  // The number of export LIDs must fit in LocalOrdinal, assuming
6227  // that the LIDs are distinct and valid on the calling process.
6228  const LO numExportLIDs = static_cast<LO>(exportLIDs.extent(0));
6229 
6230  TEUCHOS_ASSERT(!exportLIDs.need_sync_host());
6231  auto exportLIDs_h = exportLIDs.view_host();
6232 
6233  // Count the total number of matrix entries to send.
6234  totalNumEntries = 0;
6235  for (LO i = 0; i < numExportLIDs; ++i) {
6236  const LO lclRow = exportLIDs_h[i];
6237  size_t curNumEntries = this->getNumEntriesInLocalRow(lclRow);
6238  // FIXME (mfh 25 Jan 2015) We should actually report invalid row
6239  // indices as an error. Just consider them nonowned for now.
6240  if (curNumEntries == Teuchos::OrdinalTraits<size_t>::invalid()) {
6241  curNumEntries = 0;
6242  }
6243  totalNumEntries += curNumEntries;
6244  }
6245 
6246  // FIXME (mfh 24 Feb 2013, 24 Mar 2017) This code is only correct
6247  // if sizeof(IST) is a meaningful representation of the amount of
6248  // data in a Scalar instance. (LO and GO are always built-in
6249  // integer types.)
6250  //
6251  // Allocate the exports array. It does NOT need padding for
6252  // alignment, since we use memcpy to write to / read from send /
6253  // receive buffers.
6254  const size_t allocSize =
6255  static_cast<size_t>(numExportLIDs) * sizeof(LO) +
6256  totalNumEntries * (sizeof(IST) + sizeof(GO));
6257  if (static_cast<size_t>(exports.extent(0)) < allocSize) {
6258  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6259 
6260  const std::string oldLabel = exports.view_device().label();
6261  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6262  exports = exports_type(newLabel, allocSize);
6263  }
6264 
6265  if (verbose) {
6266  std::ostringstream os;
6267  os << *prefix << "After:"
6268  << endl
6269  << *prefix << " "
6270  << dualViewStatusToString(exports, "exports")
6271  << endl
6272  << *prefix << " "
6273  << dualViewStatusToString(exportLIDs, "exportLIDs")
6274  << endl;
6275  std::cerr << os.str();
6276  }
6277 }
6278 
// packNew: pack the rows named by exportLIDs into the byte buffer
// 'exports', choosing the packing routine by graph type:
//   - static graph:  packCrsMatrixNew(*this, ...)
//   - otherwise:     this->packNonStaticNew(...)
// The four arguments are forwarded unchanged (note that the two callees
// take them in a different order).
//
// NOTE(review): the embedded listing numbers skip 6280 and 6288, so the
// return type / class qualifier and (presumably) the declaration that
// brings packCrsMatrixNew into scope are not visible in this listing.
6279 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6281  packNew(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6282  Kokkos::DualView<char*, buffer_device_type>& exports,
6283  const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6284  size_t& constantNumPackets) const {
6285  // The call to packNew in packAndPrepare catches and handles any exceptions.
6286  Details::ProfilingRegion region_pack_new("Tpetra::CrsMatrix::packNew", "Import/Export");
6287  if (this->isStaticGraph()) {
6289  packCrsMatrixNew(*this, exports, numPacketsPerLID, exportLIDs,
6290  constantNumPackets);
6291  } else {
6292  this->packNonStaticNew(exportLIDs, exports, numPacketsPerLID,
6293  constantNumPackets);
6294  }
6295 }
6296 
// packNonStaticNew: host-side packing path that packNew takes when the
// graph is not static.
//
// Parameters:
//   exportLIDs         [in]  local row indices to pack (read on host).
//   exports            [out] byte buffer; sized by allocatePackSpaceNew
//                            and then filled row by row via packRow.
//   numPacketsPerLID   [out] entry i receives the number of bytes packed
//                            for exportLIDs[i] (0 for an empty row).
//   constantNumPackets [out] always set to 0 ("nonconstant" row sizes).
//
// Throws std::invalid_argument if exportLIDs and numPacketsPerLID have
// different extents, and std::logic_error if a row's packed bytes would
// run past the end of 'exports'.
//
// NOTE(review): the embedded listing numbers skip 6298 and 6304-6305, so
// the return type / class qualifier and some using-declarations
// (presumably including dualViewStatusToString) are not visible here.
6297 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6299  packNonStaticNew(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6300  Kokkos::DualView<char*, buffer_device_type>& exports,
6301  const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6302  size_t& constantNumPackets) const {
6303  using Details::Behavior;
6306  using Details::PackTraits;
6307  using Kokkos::View;
6308  using std::endl;
6309  using LO = LocalOrdinal;
6310  using GO = GlobalOrdinal;
6311  using ST = impl_scalar_type;
6312  const char tfecfFuncName[] = "packNonStaticNew: ";
6313 
6314  const bool verbose = Behavior::verbose("CrsMatrix");
6315  std::unique_ptr<std::string> prefix;
6316  if (verbose) {
6317  prefix = this->createPrefix("CrsMatrix", "packNonStaticNew");
6318  std::ostringstream os;
6319  os << *prefix << "Start" << endl;
6320  std::cerr << os.str();
6321  }
6322 
6323  const size_t numExportLIDs = static_cast<size_t>(exportLIDs.extent(0));
6324  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numExportLIDs != static_cast<size_t>(numPacketsPerLID.extent(0)),
6325  std::invalid_argument, "exportLIDs.size() = " << numExportLIDs << " != numPacketsPerLID.size() = " << numPacketsPerLID.extent(0) << ".");
6326 
6327  // Setting this to zero tells the caller to expect a possibly
6328  // different ("nonconstant") number of packets per local index
6329  // (i.e., a possibly different number of entries per row).
6330  constantNumPackets = 0;
6331 
6332  // The pack buffer 'exports' enters this method possibly
6333  // unallocated. Do the first two parts of "Count, allocate, fill,
6334  // compute."
6335  size_t totalNumEntries = 0;
6336  this->allocatePackSpaceNew(exports, totalNumEntries, exportLIDs);
6337  const size_t bufSize = static_cast<size_t>(exports.extent(0));
6338 
6339  // Write-only host access
6340  exports.clear_sync_state();
6341  exports.modify_host();
6342  auto exports_h = exports.view_host();
6343  if (verbose) {
6344  std::ostringstream os;
6345  os << *prefix << "After marking exports as modified on host, "
6346  << dualViewStatusToString(exports, "exports") << endl;
6347  std::cerr << os.str();
6348  }
6349 
6350  // Read-only host access
6351  auto exportLIDs_h = exportLIDs.view_host();
6352 
6353  // Write-only host access
  // numPacketsPerLID arrives by const reference, so the const_cast is
  // what allows its sync state to be reset and marked host-modified.
6354  const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->clear_sync_state();
6355  const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->modify_host();
6356  auto numPacketsPerLID_h = numPacketsPerLID.view_host();
6357 
6358  // Compute the number of "packets" (in this case, bytes) per
6359  // export LID (in this case, local index of the row to send), and
6360  // actually pack the data.
6361  auto maxRowNumEnt = this->getLocalMaxNumRowEntries();
6362 
6363  // Temporary buffer for global column indices.
  // It is allocated once, sized for the longest local row, and reused
  // for every row in the loop below.
6364  typename global_inds_host_view_type::non_const_type gidsIn_k;
6365  if (this->isLocallyIndexed()) { // Need storage for Global IDs
6366  gidsIn_k =
6367  typename global_inds_host_view_type::non_const_type("packGids",
6368  maxRowNumEnt);
6369  }
6370 
6371  size_t offset = 0; // current index into 'exports' array.
6372  for (size_t i = 0; i < numExportLIDs; ++i) {
6373  const LO lclRow = exportLIDs_h[i];
6374 
6375  size_t numBytes = 0;
6376  size_t numEnt = this->getNumEntriesInLocalRow(lclRow);
6377 
6378  // Only pack this row's data if it has a nonzero number of
6379  // entries. We can do this because receiving processes get the
6380  // number of packets, and will know that zero packets means zero
6381  // entries.
6382  if (numEnt == 0) {
6383  numPacketsPerLID_h[i] = 0;
6384  continue;
6385  }
6386 
6387  if (this->isLocallyIndexed()) {
6388  typename global_inds_host_view_type::non_const_type gidsIn;
6389  values_host_view_type valsIn;
6390  // If the matrix is locally indexed on the calling process, we
6391  // have to use its column Map (which it _must_ have in this
6392  // case) to convert to global indices.
6393  local_inds_host_view_type lidsIn;
6394  this->getLocalRowView(lclRow, lidsIn, valsIn);
6395  const map_type& colMap = *(this->getColMap());
6396  for (size_t k = 0; k < numEnt; ++k) {
6397  gidsIn_k[k] = colMap.getGlobalElement(lidsIn[k]);
6398  }
6399  gidsIn = Kokkos::subview(gidsIn_k, Kokkos::make_pair(GO(0), GO(numEnt)));
6400 
  // numEnt != 0 was checked above, so the row has a first value to
  // measure the per-value byte count from.
6401  const size_t numBytesPerValue =
6402  PackTraits<ST>::packValueCount(valsIn[0]);
6403  numBytes = this->packRow(exports_h.data(), offset, numEnt,
6404  gidsIn.data(), valsIn.data(),
6405  numBytesPerValue);
6406  } else if (this->isGloballyIndexed()) {
6407  global_inds_host_view_type gidsIn;
6408  values_host_view_type valsIn;
6409  // If the matrix is globally indexed on the calling process,
6410  // then we can use the column indices directly. However, we
6411  // have to get the global row index. The calling process must
6412  // have a row Map, since otherwise it shouldn't be participating
6413  // in packing operations.
6414  const map_type& rowMap = *(this->getRowMap());
6415  const GO gblRow = rowMap.getGlobalElement(lclRow);
6416  this->getGlobalRowView(gblRow, gidsIn, valsIn);
6417 
6418  const size_t numBytesPerValue =
6419  PackTraits<ST>::packValueCount(valsIn[0]);
6420  numBytes = this->packRow(exports_h.data(), offset, numEnt,
6421  gidsIn.data(), valsIn.data(),
6422  numBytesPerValue);
6423  }
6424  // mfh 11 Sep 2017: Currently, if the matrix is neither globally
6425  // nor locally indexed, then it has no entries. Therefore,
6426  // there is nothing to pack. No worries!
6427 
  // NOTE(review): this bounds check runs after packRow has already been
  // invoked for this row, so it reports an overrun rather than
  // preventing one.
6428  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(offset > bufSize || offset + numBytes > bufSize, std::logic_error,
6429  "First invalid offset into 'exports' pack buffer at index i = " << i
6430  << ". exportLIDs_h[i]: " << exportLIDs_h[i] << ", bufSize: " << bufSize << ", offset: " << offset << ", numBytes: " << numBytes << ".");
6431  // numPacketsPerLID_h[i] is the number of "packets" in the
6432  // current local row i. Packet=char (really "byte") so use the
6433  // number of bytes of the packed data for that row.
6434  numPacketsPerLID_h[i] = numBytes;
6435  offset += numBytes;
6436  }
6437 
6438  if (verbose) {
6439  std::ostringstream os;
6440  os << *prefix << "Tpetra::CrsMatrix::packNonStaticNew: After:" << endl
6441  << *prefix << " "
6442  << dualViewStatusToString(exports, "exports")
6443  << endl
6444  << *prefix << " "
6445  << dualViewStatusToString(exportLIDs, "exportLIDs")
6446  << endl;
6447  std::cerr << os.str();
6448  }
6449 }
6450 
6451 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6452 LocalOrdinal
6453 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6454  combineGlobalValuesRaw(const LocalOrdinal lclRow,
6455  const LocalOrdinal numEnt,
6456  const impl_scalar_type vals[],
6457  const GlobalOrdinal cols[],
6458  const Tpetra::CombineMode combMode,
6459  const char* const prefix,
6460  const bool debug,
6461  const bool verbose) {
6462  using GO = GlobalOrdinal;
6463 
6464  // mfh 23 Mar 2017: This branch is not thread safe in a debug
6465  // build, due to use of Teuchos::ArrayView; see #229.
6466  const GO gblRow = myGraph_->rowMap_->getGlobalElement(lclRow);
6467  Teuchos::ArrayView<const GO> cols_av(numEnt == 0 ? nullptr : cols, numEnt);
6468  Teuchos::ArrayView<const Scalar> vals_av(numEnt == 0 ? nullptr : reinterpret_cast<const Scalar*>(vals), numEnt);
6469 
6470  // FIXME (mfh 23 Mar 2017) This is a work-around for less common
6471  // combine modes. combineGlobalValues throws on error; it does
6472  // not return an error code. Thus, if it returns, it succeeded.
6473  combineGlobalValues(gblRow, cols_av, vals_av, combMode,
6474  prefix, debug, verbose);
6475  return numEnt;
6476 }
6477 
6478 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6479 void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6480  combineGlobalValues(
6481  const GlobalOrdinal globalRowIndex,
6482  const Teuchos::ArrayView<const GlobalOrdinal>& columnIndices,
6483  const Teuchos::ArrayView<const Scalar>& values,
6484  const Tpetra::CombineMode combineMode,
6485  const char* const prefix,
6486  const bool debug,
6487  const bool verbose) {
6488  const char tfecfFuncName[] = "combineGlobalValues: ";
6489 
6490  if (isStaticGraph()) {
6491  // INSERT doesn't make sense for a static graph, since you
6492  // aren't allowed to change the structure of the graph.
6493  // However, all the other combine modes work.
6494  if (combineMode == ADD) {
6495  sumIntoGlobalValues(globalRowIndex, columnIndices, values);
6496  } else if (combineMode == REPLACE) {
6497  replaceGlobalValues(globalRowIndex, columnIndices, values);
6498  } else if (combineMode == ABSMAX) {
6499  using ::Tpetra::Details::AbsMax;
6500  AbsMax<Scalar> f;
6501  this->template transformGlobalValues<AbsMax<Scalar>>(globalRowIndex,
6502  columnIndices,
6503  values, f);
6504  } else if (combineMode == INSERT) {
6505  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isStaticGraph() && combineMode == INSERT,
6506  std::invalid_argument,
6507  "INSERT combine mode is forbidden "
6508  "if the matrix has a static (const) graph (i.e., was "
6509  "constructed with the CrsMatrix constructor that takes a "
6510  "const CrsGraph pointer).");
6511  } else {
6512  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error,
6513  "Invalid combine mode; should "
6514  "never get here! "
6515  "Please report this bug to the Tpetra developers.");
6516  }
6517  } else { // The matrix has a dynamic graph.
6518  if (combineMode == ADD || combineMode == INSERT) {
6519  // For a dynamic graph, all incoming column indices are
6520  // inserted into the target graph. Duplicate indices will
6521  // have their values summed. In this context, ADD and INSERT
6522  // are equivalent. We need to call insertGlobalValues()
6523  // anyway if the column indices don't yet exist in this row,
6524  // so we just call insertGlobalValues() for both cases.
6525  insertGlobalValuesFilteredChecked(globalRowIndex,
6526  columnIndices, values, prefix, debug, verbose);
6527  }
6528  // FIXME (mfh 14 Mar 2012):
6529  //
6530  // Implementing ABSMAX or REPLACE for a dynamic graph would
6531  // require modifying assembly to attach a possibly different
6532  // combine mode to each inserted (i, j, A_ij) entry. For
6533  // example, consider two different Export operations to the same
6534  // target CrsMatrix, the first with ABSMAX combine mode and the
6535  // second with REPLACE. This isn't a common use case, so we
6536  // won't mess with it for now.
6537  else if (combineMode == ABSMAX) {
6538  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6539  !isStaticGraph() && combineMode == ABSMAX, std::logic_error,
6540  "ABSMAX combine mode when the matrix has a dynamic graph is not yet "
6541  "implemented.");
6542  } else if (combineMode == REPLACE) {
6543  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6544  !isStaticGraph() && combineMode == REPLACE, std::logic_error,
6545  "REPLACE combine mode when the matrix has a dynamic graph is not yet "
6546  "implemented.");
6547  } else {
6548  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6549  true, std::logic_error,
6550  "Should never get here! Please report this "
6551  "bug to the Tpetra developers.");
6552  }
6553  }
6554 }
6555 
// unpackAndCombine: entry point for unpacking received row data and
// combining it into this matrix.
//
// Behavior visible here:
//   - In debug mode, an unrecognized combineMode or mismatched
//     importLIDs / numPacketsPerLID extents throws std::invalid_argument.
//   - combineMode == ZERO returns immediately without touching the data.
//   - In debug mode, unpackAndCombineImpl runs inside a try/catch; the
//     local success flag is max-reduced over the communicator and, if any
//     process failed, every process throws std::logic_error carrying the
//     gathered messages.
//   - Otherwise unpackAndCombineImpl is called directly and its
//     exceptions propagate.
//
// NOTE(review): the embedded listing numbers skip 6557 and 6564-6565, so
// the return type / class qualifier and some using-declarations
// (presumably dualViewStatusToString and ProfilingRegion) are not visible
// in this listing.
6556 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6558  unpackAndCombine(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
6559  Kokkos::DualView<char*, buffer_device_type> imports,
6560  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6561  const size_t constantNumPackets,
6562  const CombineMode combineMode) {
6563  using Details::Behavior;
6566  using std::endl;
6567  const char tfecfFuncName[] = "unpackAndCombine: ";
6568  ProfilingRegion regionUAC("Tpetra::CrsMatrix::unpackAndCombine");
6569 
6570  const bool debug = Behavior::debug("CrsMatrix");
6571  const bool verbose = Behavior::verbose("CrsMatrix");
  // The two tables below are parallel: validModeNames[k] is the printable
  // name of validModes[k]. They are only consulted in debug mode.
6572  constexpr int numValidModes = 5;
6573  const CombineMode validModes[numValidModes] =
6574  {ADD, REPLACE, ABSMAX, INSERT, ZERO};
6575  const char* validModeNames[numValidModes] =
6576  {"ADD", "REPLACE", "ABSMAX", "INSERT", "ZERO"};
6577 
6578  std::unique_ptr<std::string> prefix;
6579  if (verbose) {
6580  prefix = this->createPrefix("CrsMatrix", "unpackAndCombine");
6581  std::ostringstream os;
6582  os << *prefix << "Start:" << endl
6583  << *prefix << " "
6584  << dualViewStatusToString(importLIDs, "importLIDs")
6585  << endl
6586  << *prefix << " "
6587  << dualViewStatusToString(imports, "imports")
6588  << endl
6589  << *prefix << " "
6590  << dualViewStatusToString(numPacketsPerLID, "numPacketsPerLID")
6591  << endl
6592  << *prefix << " constantNumPackets: " << constantNumPackets
6593  << endl
6594  << *prefix << " combineMode: " << combineModeToString(combineMode)
6595  << endl;
6596  std::cerr << os.str();
6597  }
6598 
  // Debug-only argument validation.
6599  if (debug) {
6600  if (std::find(validModes, validModes + numValidModes, combineMode) ==
6601  validModes + numValidModes) {
6602  std::ostringstream os;
6603  os << "Invalid combine mode. Valid modes are {";
6604  for (int k = 0; k < numValidModes; ++k) {
6605  os << validModeNames[k];
6606  if (k < numValidModes - 1) {
6607  os << ", ";
6608  }
6609  }
6610  os << "}.";
6611  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::invalid_argument, os.str());
6612  }
6613  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(importLIDs.extent(0) != numPacketsPerLID.extent(0),
6614  std::invalid_argument, "importLIDs.extent(0)=" << importLIDs.extent(0) << " != numPacketsPerLID.extent(0)=" << numPacketsPerLID.extent(0) << ".");
6615  }
6616 
6617  if (combineMode == ZERO) {
6618  return; // nothing to do
6619  }
6620 
6621  if (debug) {
6622  using Teuchos::reduceAll;
6623  std::unique_ptr<std::ostringstream> msg(new std::ostringstream());
6624  int lclBad = 0;
6625  try {
6626  unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
6627  constantNumPackets, combineMode,
6628  verbose);
6629  } catch (std::exception& e) {
6630  lclBad = 1;
6631  *msg << e.what();
6632  }
  // Agree across all processes on whether any of them failed, so that
  // they all take the same branch below.
6633  int gblBad = 0;
6634  const Teuchos::Comm<int>& comm = *(this->getComm());
6635  reduceAll<int, int>(comm, Teuchos::REDUCE_MAX,
6636  lclBad, Teuchos::outArg(gblBad));
6637  if (gblBad != 0) {
6638  // mfh 22 Oct 2017: 'prefix' might be null, since it is only
6639  // initialized when verbose output is enabled. Thus, we get the
6640  // process rank again here. This is an error message, so the small
6641  // run-time cost doesn't matter. See #1887.
6642  std::ostringstream os;
6643  os << "Proc " << comm.getRank() << ": " << msg->str() << endl;
6644  msg = std::unique_ptr<std::ostringstream>(new std::ostringstream());
6645  ::Tpetra::Details::gathervPrint(*msg, os.str(), comm);
6646  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, std::endl
6647  << "unpackAndCombineImpl "
6648  "threw an exception on one or more participating processes: "
6649  << endl
6650  << msg->str());
6651  }
6652  } else {
6653  unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
6654  constantNumPackets, combineMode,
6655  verbose);
6656  }
6657 
6658  if (verbose) {
6659  std::ostringstream os;
6660  os << *prefix << "Done!" << endl
6661  << *prefix << " "
6662  << dualViewStatusToString(importLIDs, "importLIDs")
6663  << endl
6664  << *prefix << " "
6665  << dualViewStatusToString(imports, "imports")
6666  << endl
6667  << *prefix << " "
6668  << dualViewStatusToString(numPacketsPerLID, "numPacketsPerLID")
6669  << endl;
6670  std::cerr << os.str();
6671  }
6672 }
6673 
// unpackAndCombineImpl: choose the unpack path by graph type.
//
//   - Static graph: forwards to Details::unpackCrsMatrixAndCombineNew.
//   - Otherwise: asks the graph for the padding required by the incoming
//     data (computePaddingForCrsMatrixUnpack), applies it with
//     applyCrsPadding, and then calls unpackAndCombineImplNonStatic.
//
// An exception thrown while computing the padding is caught and rethrown
// as std::runtime_error with the process rank prepended (-1 if no
// communicator is available).
//
// NOTE(review): the embedded listing numbers skip 6675-6676, so the
// return type, class qualifier and the line carrying the function name
// are not visible in this listing.
6674 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6677  const Kokkos::DualView<const local_ordinal_type*,
6678  buffer_device_type>& importLIDs,
6679  Kokkos::DualView<char*, buffer_device_type> imports,
6680  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6681  const size_t constantNumPackets,
6682  const CombineMode combineMode,
6683  const bool verbose) {
6684  Details::ProfilingRegion region_unpack_and_combine_impl(
6685  "Tpetra::CrsMatrix::unpackAndCombineImpl",
6686  "Import/Export");
6687  using std::endl;
  // Here tfecfFuncName is used only as the method name given to
  // createPrefix.
6688  const char tfecfFuncName[] = "unpackAndCombineImpl";
6689  std::unique_ptr<std::string> prefix;
6690  if (verbose) {
6691  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
6692  std::ostringstream os;
6693  os << *prefix << "isStaticGraph(): "
6694  << (isStaticGraph() ? "true" : "false")
6695  << ", importLIDs.extent(0): "
6696  << importLIDs.extent(0)
6697  << ", imports.extent(0): "
6698  << imports.extent(0)
6699  << ", numPacketsPerLID.extent(0): "
6700  << numPacketsPerLID.extent(0)
6701  << endl;
6702  std::cerr << os.str();
6703  }
6704 
6705  if (isStaticGraph()) {
6706  using Details::unpackCrsMatrixAndCombineNew;
6707  unpackCrsMatrixAndCombineNew(*this, imports, numPacketsPerLID,
6708  importLIDs, constantNumPackets,
6709  combineMode);
6710  } else {
  // The inner scope limits the lifetime of 'padding' to the padding step.
6711  {
6712  using padding_type = typename crs_graph_type::padding_type;
6713  std::unique_ptr<padding_type> padding;
6714  try {
6715  padding = myGraph_->computePaddingForCrsMatrixUnpack(
6716  importLIDs, imports, numPacketsPerLID, verbose);
6717  } catch (std::exception& e) {
6718  const auto rowMap = getRowMap();
6719  const auto comm = rowMap.is_null() ? Teuchos::null : rowMap->getComm();
6720  const int myRank = comm.is_null() ? -1 : comm->getRank();
6721  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "Proc " << myRank << ": "
6722  "Tpetra::CrsGraph::computePaddingForCrsMatrixUnpack "
6723  "threw an exception: "
6724  << e.what());
6725  }
6726  if (verbose) {
6727  std::ostringstream os;
6728  os << *prefix << "Call applyCrsPadding" << endl;
6729  std::cerr << os.str();
6730  }
6731  applyCrsPadding(*padding, verbose);
6732  }
6733  if (verbose) {
6734  std::ostringstream os;
6735  os << *prefix << "Call unpackAndCombineImplNonStatic" << endl;
6736  std::cerr << os.str();
6737  }
6738  unpackAndCombineImplNonStatic(importLIDs, imports,
6739  numPacketsPerLID,
6740  constantNumPackets,
6741  combineMode);
6742  }
6743 
6744  if (verbose) {
6745  std::ostringstream os;
6746  os << *prefix << "Done" << endl;
6747  std::cerr << os.str();
6748  }
6749 }
6750 
// unpackAndCombineImplNonStatic: host-side unpack-and-combine for a
// matrix whose graph is not static.
//
// Outline:
//   1. Return early if combineMode == ZERO or there are no import LIDs.
//   2. Sync 'imports' and 'numPacketsPerLID' to host if needed and take
//      host views. importLIDs must already be in sync on host
//      (TEUCHOS_ASSERT).
//   3. First pass over the rows: read each nonempty row's leading entry
//      count from the buffer to find the largest row.
//   4. Allocate temporary index/value arrays of that size.
//   5. Second pass: unpack each nonempty row with unpackRow and hand it to
//      combineGlobalValuesRaw.
//
// numPacketsPerLID_h[i] is the byte count of row i in 'imports'; rows are
// laid out back to back, so 'offset' advances by that count per row.
//
// Throws std::logic_error when unpackRow consumes a different number of
// bytes than numPacketsPerLID_h[i]; in debug mode it additionally checks
// buffer bounds and that a nonempty packet has a nonzero entry count.
//
// NOTE(review): the embedded listing numbers skip 6761-6762 (presumably
// using-declarations) and 6850 (the initializer of 'theNumBytes' in the
// first pass), so those lines are not visible in this listing.
6751 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6752 void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6753  unpackAndCombineImplNonStatic(
6754  const Kokkos::DualView<const local_ordinal_type*,
6755  buffer_device_type>& importLIDs,
6756  Kokkos::DualView<char*, buffer_device_type> imports,
6757  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6758  const size_t constantNumPackets,
6759  const CombineMode combineMode) {
6760  using Details::Behavior;
6763  using Details::PackTraits;
6764  using Details::ScalarViewTraits;
6765  using Kokkos::MemoryUnmanaged;
6766  using Kokkos::subview;
6767  using Kokkos::View;
6768  using std::endl;
6769  using LO = LocalOrdinal;
6770  using GO = GlobalOrdinal;
6771  using ST = impl_scalar_type;
6772  using size_type = typename Teuchos::ArrayView<LO>::size_type;
6773  using HES =
6774  typename View<int*, device_type>::host_mirror_type::execution_space;
6775  using pair_type = std::pair<typename View<int*, HES>::size_type,
6776  typename View<int*, HES>::size_type>;
6777  using gids_out_type = View<GO*, HES, MemoryUnmanaged>;
6778  using vals_out_type = View<ST*, HES, MemoryUnmanaged>;
6779  const char tfecfFuncName[] = "unpackAndCombineImplNonStatic";
6780 
6781  const bool debug = Behavior::debug("CrsMatrix");
6782  const bool verbose = Behavior::verbose("CrsMatrix");
6783  std::unique_ptr<std::string> prefix;
6784  if (verbose) {
6785  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
6786  std::ostringstream os;
6787  os << *prefix << endl; // we've already printed DualViews' statuses
6788  std::cerr << os.str();
6789  }
  // C-string form of the prefix for combineGlobalValuesRaw; null unless
  // verbose.
6790  const char* const prefix_raw =
6791  verbose ? prefix.get()->c_str() : nullptr;
6792 
6793  const size_type numImportLIDs = importLIDs.extent(0);
6794  if (combineMode == ZERO || numImportLIDs == 0) {
6795  return; // nothing to do; no need to combine entries
6796  }
6797 
6798  Details::ProfilingRegion region_unpack_and_combine_impl_non_static(
6799  "Tpetra::CrsMatrix::unpackAndCombineImplNonStatic",
6800  "Import/Export");
6801 
6802  // We're unpacking on host. This is read-only host access.
6803  if (imports.need_sync_host()) {
6804  imports.sync_host();
6805  }
6806  auto imports_h = imports.view_host();
6807 
6808  // Read-only host access.
6809  if (numPacketsPerLID.need_sync_host()) {
6810  numPacketsPerLID.sync_host();
6811  }
6812  auto numPacketsPerLID_h = numPacketsPerLID.view_host();
6813 
6814  TEUCHOS_ASSERT(!importLIDs.need_sync_host());
6815  auto importLIDs_h = importLIDs.view_host();
6816 
6817  size_t numBytesPerValue;
6818  {
6819  // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
6820  // with run-time size? We already assume that all entries in both the
6821  // source and target matrices have the same size. If the calling process
6822  // owns at least one entry in either matrix, we can use that entry to set
6823  // the size. However, it is possible that the calling process owns no
6824  // entries. In that case, we're in trouble. One way to fix this would be
6825  // for each row's data to contain the run-time size. This is only
6826  // necessary if the size is not a compile-time constant.
6827  Scalar val;
6828  numBytesPerValue = PackTraits<ST>::packValueCount(val);
6829  }
6830 
6831  // Determine the maximum number of entries in any one row
6832  size_t offset = 0;
6833  size_t maxRowNumEnt = 0;
6834  for (size_type i = 0; i < numImportLIDs; ++i) {
6835  const size_t numBytes = numPacketsPerLID_h[i];
6836  if (numBytes == 0) {
6837  continue; // empty buffer for that row means that the row is empty
6838  }
6839  // We need to unpack a nonzero number of entries for this row.
6840  if (debug) {
6841  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(offset + numBytes > size_t(imports_h.extent(0)),
6842  std::logic_error, ": At local row index importLIDs_h[i=" << i << "]=" << importLIDs_h[i] << ", offset (=" << offset << ") + numBytes (=" << numBytes << ") > "
6843  "imports_h.extent(0)="
6844  << imports_h.extent(0) << ".");
6845  }
6846  LO numEntLO = 0;
6847 
  // NOTE(review): the initializer of theNumBytes (embedded line 6850) is
  // missing from this listing.
6848  if (debug) {
6849  const size_t theNumBytes =
6851  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(theNumBytes > numBytes, std::logic_error, ": theNumBytes=" << theNumBytes << " > numBytes = " << numBytes << ".");
6852  }
  // The row's packed data starts with its entry count, stored as an LO.
6853  const char* const inBuf = imports_h.data() + offset;
6854  const size_t actualNumBytes =
6855  PackTraits<LO>::unpackValue(numEntLO, inBuf);
6856 
6857  if (debug) {
6858  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(actualNumBytes > numBytes, std::logic_error, ": At i=" << i << ", actualNumBytes=" << actualNumBytes << " > numBytes=" << numBytes << ".");
6859  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numEntLO == 0, std::logic_error,
6860  ": At local row index "
6861  "importLIDs_h[i="
6862  << i << "]=" << importLIDs_h[i] << ", "
6863  "the number of entries read from the packed data is "
6864  "numEntLO="
6865  << numEntLO << ", but numBytes=" << numBytes
6866  << " != 0.");
6867  }
6868 
6869  maxRowNumEnt = std::max(size_t(numEntLO), maxRowNumEnt);
6870  offset += numBytes;
6871  }
6872 
6873  // Temporary space to cache incoming global column indices and
6874  // values. Column indices come in as global indices, in case the
6875  // source object's column Map differs from the target object's
6876  // (this's) column Map.
6877  View<GO*, HES> gblColInds;
6878  View<LO*, HES> lclColInds;
6879  View<ST*, HES> vals;
6880  {
6881  GO gid = 0;
6882  LO lid = 0;
6883  // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
6884  // with run-time size? We already assume that all entries in both the
6885  // source and target matrices have the same size. If the calling process
6886  // owns at least one entry in either matrix, we can use that entry to set
6887  // the size. However, it is possible that the calling process owns no
6888  // entries. In that case, we're in trouble. One way to fix this would be
6889  // for each row's data to contain the run-time size. This is only
6890  // necessary if the size is not a compile-time constant.
6891  Scalar val;
6892  gblColInds = ScalarViewTraits<GO, HES>::allocateArray(
6893  gid, maxRowNumEnt, "gids");
6894  lclColInds = ScalarViewTraits<LO, HES>::allocateArray(
6895  lid, maxRowNumEnt, "lids");
6896  vals = ScalarViewTraits<ST, HES>::allocateArray(
6897  val, maxRowNumEnt, "vals");
6898  }
6899 
  // Second pass: restart at the beginning of the buffer and unpack and
  // combine one row at a time.
6900  offset = 0;
6901  for (size_type i = 0; i < numImportLIDs; ++i) {
6902  const size_t numBytes = numPacketsPerLID_h[i];
6903  if (numBytes == 0) {
6904  continue; // empty buffer for that row means that the row is empty
6905  }
6906  LO numEntLO = 0;
6907  const char* const inBuf = imports_h.data() + offset;
6908  (void)PackTraits<LO>::unpackValue(numEntLO, inBuf);
6909 
6910  const size_t numEnt = static_cast<size_t>(numEntLO);
6911  ;
6912  const LO lclRow = importLIDs_h[i];
6913 
  // Unmanaged views over the first numEnt slots of the temporaries.
6914  gids_out_type gidsOut = subview(gblColInds, pair_type(0, numEnt));
6915  vals_out_type valsOut = subview(vals, pair_type(0, numEnt));
6916 
6917  const size_t numBytesOut =
6918  unpackRow(gidsOut.data(), valsOut.data(), imports_h.data(),
6919  offset, numBytes, numEnt, numBytesPerValue);
6920  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numBytes != numBytesOut, std::logic_error, ": At i=" << i << ", numBytes=" << numBytes << " != numBytesOut=" << numBytesOut << ".");
6921 
6922  const ST* const valsRaw = const_cast<const ST*>(valsOut.data());
6923  const GO* const gidsRaw = const_cast<const GO*>(gidsOut.data());
6924  combineGlobalValuesRaw(lclRow, numEnt, valsRaw, gidsRaw,
6925  combineMode, prefix_raw, debug, verbose);
6926  // Don't update offset until current LID has succeeded.
6927  offset += numBytes;
6928  } // for each import LID i
6929 
6930  if (verbose) {
6931  std::ostringstream os;
6932  os << *prefix << "Done" << endl;
6933  std::cerr << os.str();
6934  }
6935 }
6936 
// getColumnMapMultiVector: return a multivector over the column Map with
// the same number of columns as X_domainMap, reusing the cached importMV_
// when its column count matches and replacing the cache otherwise.
//
// Returns null when the graph has no Import object and 'force' is false.
//
// Throws std::runtime_error if the matrix has no column Map or if its
// graph is not fill complete.
//
// NOTE(review): the embedded listing number 6939 (presumably the class
// qualifier line) is not visible in this listing.
6937 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6938 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>>
6940  getColumnMapMultiVector(const MV& X_domainMap,
6941  const bool force) const {
6942  using Teuchos::null;
6943  using Teuchos::RCP;
6944  using Teuchos::rcp;
6945 
6946  TEUCHOS_TEST_FOR_EXCEPTION(
6947  !this->hasColMap(), std::runtime_error,
6948  "Tpetra::CrsMatrix::getColumn"
6949  "MapMultiVector: You may only call this method if the matrix has a "
6950  "column Map. If the matrix does not yet have a column Map, you should "
6951  "first call fillComplete (with domain and range Map if necessary).");
6952 
6953  // If the graph is not fill complete, then the Import object (if
6954  // one should exist) hasn't been constructed yet.
6955  TEUCHOS_TEST_FOR_EXCEPTION(
6956  !this->getGraph()->isFillComplete(), std::runtime_error,
6957  "Tpetra::"
6958  "CrsMatrix::getColumnMapMultiVector: You may only call this method if "
6959  "this matrix's graph is fill complete.");
6960 
6961  const size_t numVecs = X_domainMap.getNumVectors();
6962  RCP<const import_type> importer = this->getGraph()->getImporter();
6963  RCP<const map_type> colMap = this->getColMap();
6964 
6965  RCP<MV> X_colMap; // null by default
6966 
6967  // If the Import object is trivial (null), then we don't need a
6968  // separate column Map multivector. Just return null in that
6969  // case (unless 'force' is true). The caller is responsible for
6970  // knowing not to use the returned null pointer.
6971  //
6972  // If the Import is nontrivial, then we do need a separate
6973  // column Map multivector for the Import operation. Check in
6974  // that case if we have to (re)create the column Map
6975  // multivector.
6976  if (!importer.is_null() || force) {
6977  if (importMV_.is_null() || importMV_->getNumVectors() != numVecs) {
6978  X_colMap = rcp(new MV(colMap, numVecs));
6979 
6980  // Cache the newly created multivector for later reuse.
6981  importMV_ = X_colMap;
6982  } else { // Yay, we can reuse the cached multivector!
6983  X_colMap = importMV_;
6984  // mfh 09 Jan 2013: We don't have to fill with zeros first,
6985  // because the Import uses INSERT combine mode, which overwrites
6986  // existing entries.
6987  //
6988  // X_colMap->putScalar (ZERO);
6989  }
6990  }
6991  return X_colMap;
6992 }
6993 
// getRowMapMultiVector: return a multivector over the row Map with the
// same number of columns as Y_rangeMap, reusing the cached exportMV_ when
// its column count matches and replacing the cache otherwise.
//
// Returns null when the graph has no Export object and 'force' is false.
//
// Throws std::runtime_error if the matrix's graph is not fill complete.
//
// NOTE(review): the embedded listing numbers skip 6996-6997, so the class
// qualifier and the line carrying the function name and the Y_rangeMap
// parameter are not visible in this listing.
6994 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6995 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>>
6998  const bool force) const {
6999  using Teuchos::null;
7000  using Teuchos::RCP;
7001  using Teuchos::rcp;
7002 
7003  // If the graph is not fill complete, then the Export object (if
7004  // one should exist) hasn't been constructed yet.
7005  TEUCHOS_TEST_FOR_EXCEPTION(
7006  !this->getGraph()->isFillComplete(), std::runtime_error,
7007  "Tpetra::"
7008  "CrsMatrix::getRowMapMultiVector: You may only call this method if this "
7009  "matrix's graph is fill complete.");
7010 
7011  const size_t numVecs = Y_rangeMap.getNumVectors();
7012  RCP<const export_type> exporter = this->getGraph()->getExporter();
7013  // Every version of the constructor takes either a row Map, or a
7014  // graph (all of whose constructors take a row Map). Thus, the
7015  // matrix always has a row Map.
7016  RCP<const map_type> rowMap = this->getRowMap();
7017 
7018  RCP<MV> Y_rowMap; // null by default
7019 
7020  // If the Export object is trivial (null), then we don't need a
7021  // separate row Map multivector. Just return null in that case
7022  // (unless 'force' is true). The caller is responsible for knowing
7023  // not to use the returned null pointer.
7024  //
7025  // If the Export is nontrivial, then we do need a separate row
7026  // Map multivector for the Export operation. Check in that case
7027  // if we have to (re)create the row Map multivector.
7028  if (!exporter.is_null() || force) {
7029  if (exportMV_.is_null() || exportMV_->getNumVectors() != numVecs) {
7030  Y_rowMap = rcp(new MV(rowMap, numVecs));
7031  exportMV_ = Y_rowMap; // Cache the newly created MV for later reuse.
7032  } else { // Yay, we can reuse the cached multivector!
7033  Y_rowMap = exportMV_;
7034  }
7035  }
7036  return Y_rowMap;
7037 }
7038 
// removeEmptyProcessesInPlace: forward the request to the owned
// (nonconst) graph, then refresh this object's own Map and its
// staticGraph_ pointer to match.
//
// Throws std::logic_error if myGraph_ is null, i.e. the matrix was
// created with a constant graph that it is not allowed to modify.
//
// NOTE(review): the embedded listing number 7040 (presumably the return
// type / class qualifier line) is not visible in this listing.
7039 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7041  removeEmptyProcessesInPlace(const Teuchos::RCP<const map_type>& newMap) {
7042  TEUCHOS_TEST_FOR_EXCEPTION(
7043  myGraph_.is_null(), std::logic_error,
7044  "Tpetra::CrsMatrix::"
7045  "removeEmptyProcessesInPlace: This method does not work when the matrix "
7046  "was created with a constant graph (that is, when it was created using "
7047  "the version of its constructor that takes an RCP<const CrsGraph>). "
7048  "This is because the matrix is not allowed to modify the graph in that "
7049  "case, but removing empty processes requires modifying the graph.");
7050  myGraph_->removeEmptyProcessesInPlace(newMap);
7051  // Even though CrsMatrix's row Map (as returned by getRowMap())
7052  // comes from its CrsGraph, CrsMatrix still implements DistObject,
7053  // so we also have to change the DistObject's Map.
7054  this->map_ = this->getRowMap();
7055  // In the nonconst graph case, staticGraph_ is just a const
7056  // pointer to myGraph_. This assignment is probably redundant,
7057  // but it doesn't hurt.
7058  staticGraph_ = Teuchos::rcp_const_cast<const Graph>(myGraph_);
7059 }
7060 
// add: build and return a new matrix C = alpha*A + beta*B, where B is
// *this (see the "convenient abbreviation" below).
//
// NOTE(review): this listing omits some original lines (the numbering
// jumps 7062->7064, 7064->7066 and 7080->7082). Judging by how `A` is
// used in the body, one of the missing lines declares the input matrix
// parameter `A`, and another supplies the right-hand side of the
// crs_matrix_type alias -- confirm against the real source file.
//
// alpha, beta: scalar weights. A weight equal to ZERO means the
//   corresponding matrix is neither counted in the per-row allocation
//   nor inserted into C.
// domainMap, rangeMap: optional. When null, B's Map is used if it is
//   nonnull, otherwise A's; if both are null, std::invalid_argument is
//   thrown.
// params: optional. When nonnull, "Call fillComplete" (default true)
//   controls whether C->fillComplete is invoked, and the sublists
//   "Constructor parameters" and "fillComplete parameters" are obtained
//   via sublist() and forwarded when nonnull.
//
// Throws std::invalid_argument if A's and B's row Maps are not the same
// (isSameAs), and, in debug mode only, if the domain/range Map
// consistency checks below fail.
//
// Returns C converted to RCP<row_matrix_type>. C uses B's row Map.
7061 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7062 Teuchos::RCP<RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>>
7064  add(const Scalar& alpha,
7066  const Scalar& beta,
7067  const Teuchos::RCP<const map_type>& domainMap,
7068  const Teuchos::RCP<const map_type>& rangeMap,
7069  const Teuchos::RCP<Teuchos::ParameterList>& params) const {
7070  using std::endl;
7071  using Teuchos::Array;
7072  using Teuchos::ArrayView;
7073  using Teuchos::ParameterList;
7074  using Teuchos::RCP;
7075  using Teuchos::rcp;
7076  using Teuchos::rcp_implicit_cast;
7077  using Teuchos::sublist;
7078  using LO = local_ordinal_type;
7079  using GO = global_ordinal_type;
7080  using crs_matrix_type =
7082  const char errPfx[] = "Tpetra::CrsMatrix::add: ";
7083 
7084  const bool debug = Details::Behavior::debug("CrsMatrix");
7085  const bool verbose = Details::Behavior::verbose("CrsMatrix");
  // prefix is only set in verbose mode; every later dereference of it
  // is guarded by `if (verbose)`.
7086  std::unique_ptr<std::string> prefix;
7087  if (verbose) {
7088  prefix = this->createPrefix("CrsMatrix", "add");
7089  std::ostringstream os;
7090  os << *prefix << "Start" << endl;
7091  std::cerr << os.str();
7092  }
7093 
7094  const crs_matrix_type& B = *this; // a convenient abbreviation
7095  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero();
7096  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one();
7097 
7098  // If the user didn't supply a domain or range Map, then try to
7099  // get one from B first (if it has them), then from A (if it has
7100  // them). If we don't have any domain or range Maps, scold the
7101  // user.
7102  RCP<const map_type> A_domainMap = A.getDomainMap();
7103  RCP<const map_type> A_rangeMap = A.getRangeMap();
7104  RCP<const map_type> B_domainMap = B.getDomainMap();
7105  RCP<const map_type> B_rangeMap = B.getRangeMap();
7106 
  // Start from the caller's Maps; they are only replaced below when the
  // caller passed null.
7107  RCP<const map_type> theDomainMap = domainMap;
7108  RCP<const map_type> theRangeMap = rangeMap;
7109 
7110  if (domainMap.is_null()) {
7111  if (B_domainMap.is_null()) {
7112  TEUCHOS_TEST_FOR_EXCEPTION(
7113  A_domainMap.is_null(), std::invalid_argument,
7114  "Tpetra::CrsMatrix::add: If neither A nor B have a domain Map, "
7115  "then you must supply a nonnull domain Map to this method.");
7116  theDomainMap = A_domainMap;
7117  } else {
7118  theDomainMap = B_domainMap;
7119  }
7120  }
7121  if (rangeMap.is_null()) {
7122  if (B_rangeMap.is_null()) {
7123  TEUCHOS_TEST_FOR_EXCEPTION(
7124  A_rangeMap.is_null(), std::invalid_argument,
7125  "Tpetra::CrsMatrix::add: If neither A nor B have a range Map, "
7126  "then you must supply a nonnull range Map to this method.");
7127  theRangeMap = A_rangeMap;
7128  } else {
7129  theRangeMap = B_rangeMap;
7130  }
7131  }
7132 
7133  if (debug) {
7134  // In debug mode, check that A and B have matching domain and
7135  // range Maps, if they have domain and range Maps at all. (If
7136  // they aren't fill complete, then they may not yet have them.)
7137  if (!A_domainMap.is_null() && !A_rangeMap.is_null()) {
    // Note: when A has both Maps but B lacks one, no check is performed
    // in this branch.
7138  if (!B_domainMap.is_null() && !B_rangeMap.is_null()) {
7139  TEUCHOS_TEST_FOR_EXCEPTION(!B_domainMap->isSameAs(*A_domainMap),
7140  std::invalid_argument,
7141  errPfx << "The input RowMatrix A must have a domain Map "
7142  "which is the same as (isSameAs) this RowMatrix's "
7143  "domain Map.");
7144  TEUCHOS_TEST_FOR_EXCEPTION(!B_rangeMap->isSameAs(*A_rangeMap), std::invalid_argument,
7145  errPfx << "The input RowMatrix A must have a range Map "
7146  "which is the same as (isSameAs) this RowMatrix's range "
7147  "Map.");
7148  TEUCHOS_TEST_FOR_EXCEPTION(!domainMap.is_null() &&
7149  !domainMap->isSameAs(*B_domainMap),
7150  std::invalid_argument,
7151  errPfx << "The input domain Map must be the same as "
7152  "(isSameAs) this RowMatrix's domain Map.");
7153  TEUCHOS_TEST_FOR_EXCEPTION(!rangeMap.is_null() &&
7154  !rangeMap->isSameAs(*B_rangeMap),
7155  std::invalid_argument,
7156  errPfx << "The input range Map must be the same as "
7157  "(isSameAs) this RowMatrix's range Map.");
7158  }
7159  } else if (!B_domainMap.is_null() && !B_rangeMap.is_null()) {
    // Only B has both Maps: the caller's Maps, if given, must match B's.
7160  TEUCHOS_TEST_FOR_EXCEPTION(!domainMap.is_null() &&
7161  !domainMap->isSameAs(*B_domainMap),
7162  std::invalid_argument,
7163  errPfx << "The input domain Map must be the same as "
7164  "(isSameAs) this RowMatrix's domain Map.");
7165  TEUCHOS_TEST_FOR_EXCEPTION(!rangeMap.is_null() && !rangeMap->isSameAs(*B_rangeMap),
7166  std::invalid_argument,
7167  errPfx << "The input range Map must be the same as "
7168  "(isSameAs) this RowMatrix's range Map.");
7169  } else {
    // Neither A nor B has both Maps: the caller must supply both.
7170  TEUCHOS_TEST_FOR_EXCEPTION(domainMap.is_null() || rangeMap.is_null(),
7171  std::invalid_argument, errPfx << "If neither A nor B "
7172  "have a domain and range Map, then you must supply a "
7173  "nonnull domain and range Map to this method.");
7174  }
7175  }
7176 
7177  // What parameters do we pass to C's constructor? Do we call
7178  // fillComplete on C after filling it? And if so, what parameters
7179  // do we pass to C's fillComplete call?
7180  bool callFillComplete = true;
7181  RCP<ParameterList> constructorSublist;
7182  RCP<ParameterList> fillCompleteSublist;
7183  if (!params.is_null()) {
7184  callFillComplete =
7185  params->get("Call fillComplete", callFillComplete);
7186  constructorSublist = sublist(params, "Constructor parameters");
7187  fillCompleteSublist = sublist(params, "fillComplete parameters");
7188  }
7189 
7190  RCP<const map_type> A_rowMap = A.getRowMap();
7191  RCP<const map_type> B_rowMap = B.getRowMap();
7192  RCP<const map_type> C_rowMap = B_rowMap; // see discussion in documentation
7193  RCP<crs_matrix_type> C; // The result matrix.
7194 
7195  // If A and B's row Maps are the same, we can compute an upper
7196  // bound on the number of entries in each row of C, before
7197  // actually computing the sum. A reasonable upper bound is the
7198  // sum of the two entry counts in each row.
7199  if (A_rowMap->isSameAs(*B_rowMap)) {
7200  const LO localNumRows = static_cast<LO>(A_rowMap->getLocalNumElements());
7201  Array<size_t> C_maxNumEntriesPerRow(localNumRows, 0);
7202 
7203  // Get the number of entries in each row of A.
7204  if (alpha != ZERO) {
7205  for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7206  const size_t A_numEntries = A.getNumEntriesInLocalRow(localRow);
7207  C_maxNumEntriesPerRow[localRow] += A_numEntries;
7208  }
7209  }
7210  // Get the number of entries in each row of B.
7211  if (beta != ZERO) {
7212  for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7213  const size_t B_numEntries = B.getNumEntriesInLocalRow(localRow);
7214  C_maxNumEntriesPerRow[localRow] += B_numEntries;
7215  }
7216  }
7217  // Construct the result matrix C.
7218  if (constructorSublist.is_null()) {
7219  C = rcp(new crs_matrix_type(C_rowMap, C_maxNumEntriesPerRow()));
7220  } else {
7221  C = rcp(new crs_matrix_type(C_rowMap, C_maxNumEntriesPerRow(),
7222  constructorSublist));
7223  }
7224  // Since A and B have the same row Maps, we could add them
7225  // together all at once and merge values before we call
7226  // insertGlobalValues. However, we don't really need to, since
7227  // we've already allocated enough space in each row of C for C
7228  // to do the merge itself.
7229  } else { // the row Maps of A and B are not the same
7230  // Construct the result matrix C.
7231  // true: !A_rowMap->isSameAs (*B_rowMap)
    // This branch always throws, so C is never constructed here.
7232  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, errPfx << "The row maps must "
7233  "be the same for statically allocated matrices, to ensure "
7234  "that there is sufficient space to do the addition.");
7235  }
7236 
7237  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null(), std::logic_error,
7238  errPfx << "C should not be null at this point. "
7239  "Please report this bug to the Tpetra developers.");
7240 
7241  if (verbose) {
7242  std::ostringstream os;
7243  os << *prefix << "Compute C = alpha*A + beta*B" << endl;
7244  std::cerr << os.str();
7245  }
  // Scratch buffers shared by the A pass and the B pass below. They are
  // resized together and only when a row needs more room than they
  // currently hold, so checking ind.size() alone is sufficient.
7246  using gids_type = nonconst_global_inds_host_view_type;
7247  using vals_type = nonconst_values_host_view_type;
7248  gids_type ind;
7249  vals_type val;
7250 
7251  if (alpha != ZERO) {
7252  const LO A_localNumRows = static_cast<LO>(A_rowMap->getLocalNumElements());
7253  for (LO localRow = 0; localRow < A_localNumRows; ++localRow) {
7254  size_t A_numEntries = A.getNumEntriesInLocalRow(localRow);
7255  const GO globalRow = A_rowMap->getGlobalElement(localRow);
7256  if (A_numEntries > static_cast<size_t>(ind.size())) {
7257  Kokkos::resize(ind, A_numEntries);
7258  Kokkos::resize(val, A_numEntries);
7259  }
    // Subviews of exactly A_numEntries elements are what get passed to
    // getGlobalRowCopy, since the scratch buffers may be longer.
7260  gids_type indView = Kokkos::subview(ind, std::make_pair((size_t)0, A_numEntries));
7261  vals_type valView = Kokkos::subview(val, std::make_pair((size_t)0, A_numEntries));
7262  A.getGlobalRowCopy(globalRow, indView, valView, A_numEntries);
7263 
    // Scaling is skipped when alpha is exactly ONE.
7264  if (alpha != ONE) {
7265  for (size_t k = 0; k < A_numEntries; ++k) {
7266  valView[k] *= alpha;
7267  }
7268  }
7269  C->insertGlobalValues(globalRow, A_numEntries,
7270  reinterpret_cast<Scalar*>(valView.data()),
7271  indView.data());
7272  }
7273  }
7274 
  // Same procedure for B's rows, scaled by beta.
7275  if (beta != ZERO) {
7276  const LO B_localNumRows = static_cast<LO>(B_rowMap->getLocalNumElements());
7277  for (LO localRow = 0; localRow < B_localNumRows; ++localRow) {
7278  size_t B_numEntries = B.getNumEntriesInLocalRow(localRow);
7279  const GO globalRow = B_rowMap->getGlobalElement(localRow);
7280  if (B_numEntries > static_cast<size_t>(ind.size())) {
7281  Kokkos::resize(ind, B_numEntries);
7282  Kokkos::resize(val, B_numEntries);
7283  }
7284  gids_type indView = Kokkos::subview(ind, std::make_pair((size_t)0, B_numEntries));
7285  vals_type valView = Kokkos::subview(val, std::make_pair((size_t)0, B_numEntries));
7286  B.getGlobalRowCopy(globalRow, indView, valView, B_numEntries);
7287 
7288  if (beta != ONE) {
7289  for (size_t k = 0; k < B_numEntries; ++k) {
7290  valView[k] *= beta;
7291  }
7292  }
7293  C->insertGlobalValues(globalRow, B_numEntries,
7294  reinterpret_cast<Scalar*>(valView.data()),
7295  indView.data());
7296  }
7297  }
7298 
7299  if (callFillComplete) {
7300  if (verbose) {
7301  std::ostringstream os;
7302  os << *prefix << "Call fillComplete on C" << endl;
7303  std::cerr << os.str();
7304  }
7305  if (fillCompleteSublist.is_null()) {
7306  C->fillComplete(theDomainMap, theRangeMap);
7307  } else {
7308  C->fillComplete(theDomainMap, theRangeMap, fillCompleteSublist);
7309  }
7310  } else if (verbose) {
7311  std::ostringstream os;
7312  os << *prefix << "Do NOT call fillComplete on C" << endl;
7313  std::cerr << os.str();
7314  }
7315 
7316  if (verbose) {
7317  std::ostringstream os;
7318  os << *prefix << "Done" << endl;
7319  std::cerr << os.str();
7320  }
7321  return rcp_implicit_cast<row_matrix_type>(C);
7322 }
7323 
7324 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7327  const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& rowTransfer,
7328  const Teuchos::RCP<const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>>& domainTransfer,
7329  const Teuchos::RCP<const map_type>& domainMap,
7330  const Teuchos::RCP<const map_type>& rangeMap,
7331  const Teuchos::RCP<Teuchos::ParameterList>& params) const {
7332  using Details::Behavior;
7337  using std::endl;
7338  using Teuchos::ArrayRCP;
7339  using Teuchos::ArrayView;
7340  using Teuchos::Comm;
7341  using Teuchos::ParameterList;
7342  using Teuchos::RCP;
7343  typedef LocalOrdinal LO;
7344  typedef GlobalOrdinal GO;
7345  typedef node_type NT;
7346  typedef CrsMatrix<Scalar, LO, GO, NT> this_CRS_type;
7347  typedef Vector<int, LO, GO, NT> IntVectorType;
7348  using Teuchos::as;
7349 
7350  const bool debug = Behavior::debug("CrsMatrix");
7351  const bool verbose = Behavior::verbose("CrsMatrix");
7352  int MyPID = getComm()->getRank();
7353 
7354  std::unique_ptr<std::string> verbosePrefix;
7355  if (verbose) {
7356  verbosePrefix =
7357  this->createPrefix("CrsMatrix", "transferAndFillComplete");
7358  std::ostringstream os;
7359  os << "Start" << endl;
7360  std::cerr << os.str();
7361  }
7362 
7363  //
7364  // Get the caller's parameters
7365  //
7366  bool isMM = false; // optimize for matrix-matrix ops.
7367  bool reverseMode = false; // Are we in reverse mode?
7368  bool restrictComm = false; // Do we need to restrict the communicator?
7369 
7370  int mm_optimization_core_count =
7371  Behavior::TAFC_OptimizationCoreCount();
7372  RCP<ParameterList> matrixparams; // parameters for the destination matrix
7373  bool overrideAllreduce = false;
7374  bool useKokkosPath = false;
7375  if (!params.is_null()) {
7376  matrixparams = sublist(params, "CrsMatrix");
7377  reverseMode = params->get("Reverse Mode", reverseMode);
7378  useKokkosPath = params->get("TAFC: use kokkos path", useKokkosPath);
7379  restrictComm = params->get("Restrict Communicator", restrictComm);
7380  auto& slist = params->sublist("matrixmatrix: kernel params", false);
7381  isMM = slist.get("isMatrixMatrix_TransferAndFillComplete", false);
7382  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
7383 
7384  overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
7385  if (getComm()->getSize() < mm_optimization_core_count && isMM) isMM = false;
7386  if (reverseMode) isMM = false;
7387  }
7388 
7389  // Only used in the sparse matrix-matrix multiply (isMM) case.
7390  std::shared_ptr<::Tpetra::Details::CommRequest> iallreduceRequest;
7391  int mismatch = 0;
7392  int reduced_mismatch = 0;
7393  if (isMM && !overrideAllreduce) {
7394  // Test for pathological matrix transfer
7395  const bool source_vals = !getGraph()->getImporter().is_null();
7396  const bool target_vals = !(rowTransfer.getExportLIDs().size() == 0 ||
7397  rowTransfer.getRemoteLIDs().size() == 0);
7398  mismatch = (source_vals != target_vals) ? 1 : 0;
7399  iallreduceRequest =
7400  ::Tpetra::Details::iallreduce(mismatch, reduced_mismatch,
7401  Teuchos::REDUCE_MAX, *(getComm()));
7402  }
7403 
7404 #ifdef HAVE_TPETRA_MMM_TIMINGS
7405  using Teuchos::TimeMonitor;
7406  std::string label;
7407  if (!params.is_null())
7408  label = params->get("Timer Label", label);
7409  std::string prefix = std::string("Tpetra ") + label + std::string(": ");
7410  std::string tlstr;
7411  {
7412  std::ostringstream os;
7413  if (isMM)
7414  os << ":MMOpt";
7415  else
7416  os << ":MMLegacy";
7417  tlstr = os.str();
7418  }
7419 
7420  Teuchos::TimeMonitor MMall(*TimeMonitor::getNewTimer(prefix + std::string("TAFC All") + tlstr));
7421 #endif
7422 
7423  // Make sure that the input argument rowTransfer is either an
7424  // Import or an Export. Import and Export are the only two
7425  // subclasses of Transfer that we defined, but users might
7426  // (unwisely, for now at least) decide to implement their own
7427  // subclasses. Exclude this possibility.
7428  const import_type* xferAsImport = dynamic_cast<const import_type*>(&rowTransfer);
7429  const export_type* xferAsExport = dynamic_cast<const export_type*>(&rowTransfer);
7430  TEUCHOS_TEST_FOR_EXCEPTION(
7431  xferAsImport == nullptr && xferAsExport == nullptr, std::invalid_argument,
7432  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' input "
7433  "argument must be either an Import or an Export, and its template "
7434  "parameters must match the corresponding template parameters of the "
7435  "CrsMatrix.");
7436 
7437  // Make sure that the input argument domainTransfer is either an
7438  // Import or an Export. Import and Export are the only two
7439  // subclasses of Transfer that we defined, but users might
7440  // (unwisely, for now at least) decide to implement their own
7441  // subclasses. Exclude this possibility.
7442  Teuchos::RCP<const import_type> xferDomainAsImport = Teuchos::rcp_dynamic_cast<const import_type>(domainTransfer);
7443  Teuchos::RCP<const export_type> xferDomainAsExport = Teuchos::rcp_dynamic_cast<const export_type>(domainTransfer);
7444 
7445  if (!domainTransfer.is_null()) {
7446  TEUCHOS_TEST_FOR_EXCEPTION(
7447  (xferDomainAsImport.is_null() && xferDomainAsExport.is_null()), std::invalid_argument,
7448  "Tpetra::CrsMatrix::transferAndFillComplete: The 'domainTransfer' input "
7449  "argument must be either an Import or an Export, and its template "
7450  "parameters must match the corresponding template parameters of the "
7451  "CrsMatrix.");
7452 
7453  TEUCHOS_TEST_FOR_EXCEPTION(
7454  (xferAsImport != nullptr || !xferDomainAsImport.is_null()) &&
7455  ((xferAsImport != nullptr && xferDomainAsImport.is_null()) ||
7456  (xferAsImport == nullptr && !xferDomainAsImport.is_null())),
7457  std::invalid_argument,
7458  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7459  "arguments must be of the same type (either Import or Export).");
7460 
7461  TEUCHOS_TEST_FOR_EXCEPTION(
7462  (xferAsExport != nullptr || !xferDomainAsExport.is_null()) &&
7463  ((xferAsExport != nullptr && xferDomainAsExport.is_null()) ||
7464  (xferAsExport == nullptr && !xferDomainAsExport.is_null())),
7465  std::invalid_argument,
7466  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7467  "arguments must be of the same type (either Import or Export).");
7468  } // domainTransfer != null
7469 
7470  // FIXME (mfh 15 May 2014) Wouldn't communication still be needed,
7471  // if the source Map is not distributed but the target Map is?
7472  const bool communication_needed = rowTransfer.getSourceMap()->isDistributed();
7473 
7474  // Get the new domain and range Maps. We need some of them for
7475  // error checking, now that we have the reverseMode parameter.
7476  RCP<const map_type> MyRowMap = reverseMode ? rowTransfer.getSourceMap() : rowTransfer.getTargetMap();
7477  RCP<const map_type> MyColMap; // create this below
7478  RCP<const map_type> MyDomainMap = !domainMap.is_null() ? domainMap : getDomainMap();
7479  RCP<const map_type> MyRangeMap = !rangeMap.is_null() ? rangeMap : getRangeMap();
7480  RCP<const map_type> BaseRowMap = MyRowMap;
7481  RCP<const map_type> BaseDomainMap = MyDomainMap;
7482 
7483  // If the user gave us a nonnull destMat, then check whether it's
7484  // "pristine." That means that it has no entries.
7485  //
7486  // FIXME (mfh 15 May 2014) If this is not true on all processes,
7487  // then this exception test may hang. It would be better to
7488  // forward an error flag to the next communication phase.
7489  if (!destMat.is_null()) {
7490  // FIXME (mfh 15 May 2014): The Epetra idiom for checking
7491  // whether a graph or matrix has no entries on the calling
7492  // process, is that it is neither locally nor globally indexed.
7493  // This may change eventually with the Kokkos refactor version
7494  // of Tpetra, so it would be better just to check the quantity
7495  // of interest directly. Note that with the Kokkos refactor
7496  // version of Tpetra, asking for the total number of entries in
7497  // a graph or matrix that is not fill complete might require
7498  // computation (kernel launch), since it is not thread scalable
7499  // to update a count every time an entry is inserted.
7500  const bool NewFlag = !destMat->getGraph()->isLocallyIndexed() &&
7501  !destMat->getGraph()->isGloballyIndexed();
7502  TEUCHOS_TEST_FOR_EXCEPTION(
7503  !NewFlag, std::invalid_argument,
7504  "Tpetra::CrsMatrix::"
7505  "transferAndFillComplete: The input argument 'destMat' is only allowed "
7506  "to be nonnull, if its graph is empty (neither locally nor globally "
7507  "indexed).");
7508  // FIXME (mfh 15 May 2014) At some point, we want to change
7509  // graphs and matrices so that their DistObject Map
7510  // (this->getMap()) may differ from their row Map. This will
7511  // make redistribution for 2-D distributions more efficient. I
7512  // hesitate to change this check, because I'm not sure how much
7513  // the code here depends on getMap() and getRowMap() being the
7514  // same.
7515  TEUCHOS_TEST_FOR_EXCEPTION(
7516  !destMat->getRowMap()->isSameAs(*MyRowMap), std::invalid_argument,
7517  "Tpetra::CrsMatrix::transferAndFillComplete: The (row) Map of the "
7518  "input argument 'destMat' is not the same as the (row) Map specified "
7519  "by the input argument 'rowTransfer'.");
7520  TEUCHOS_TEST_FOR_EXCEPTION(
7521  !destMat->checkSizes(*this), std::invalid_argument,
7522  "Tpetra::CrsMatrix::transferAndFillComplete: You provided a nonnull "
7523  "destination matrix, but checkSizes() indicates that it is not a legal "
7524  "legal target for redistribution from the source matrix (*this). This "
7525  "may mean that they do not have the same dimensions.");
7526  }
7527 
7528  // If forward mode (the default), then *this's (row) Map must be
7529  // the same as the source Map of the Transfer. If reverse mode,
7530  // then *this's (row) Map must be the same as the target Map of
7531  // the Transfer.
7532  //
7533  // FIXME (mfh 15 May 2014) At some point, we want to change graphs
7534  // and matrices so that their DistObject Map (this->getMap()) may
7535  // differ from their row Map. This will make redistribution for
7536  // 2-D distributions more efficient. I hesitate to change this
7537  // check, because I'm not sure how much the code here depends on
7538  // getMap() and getRowMap() being the same.
7539  TEUCHOS_TEST_FOR_EXCEPTION(
7540  !(reverseMode || getRowMap()->isSameAs(*rowTransfer.getSourceMap())),
7541  std::invalid_argument,
7542  "Tpetra::CrsMatrix::transferAndFillComplete: "
7543  "rowTransfer->getSourceMap() must match this->getRowMap() in forward mode.");
7544  TEUCHOS_TEST_FOR_EXCEPTION(
7545  !(!reverseMode || getRowMap()->isSameAs(*rowTransfer.getTargetMap())),
7546  std::invalid_argument,
7547  "Tpetra::CrsMatrix::transferAndFillComplete: "
7548  "rowTransfer->getTargetMap() must match this->getRowMap() in reverse mode.");
7549 
7550  // checks for domainTransfer
7551  TEUCHOS_TEST_FOR_EXCEPTION(
7552  !xferDomainAsImport.is_null() && !xferDomainAsImport->getTargetMap()->isSameAs(*domainMap),
7553  std::invalid_argument,
7554  "Tpetra::CrsMatrix::transferAndFillComplete: The target map of the 'domainTransfer' input "
7555  "argument must be the same as the rebalanced domain map 'domainMap'");
7556 
7557  TEUCHOS_TEST_FOR_EXCEPTION(
7558  !xferDomainAsExport.is_null() && !xferDomainAsExport->getSourceMap()->isSameAs(*domainMap),
7559  std::invalid_argument,
7560  "Tpetra::CrsMatrix::transferAndFillComplete: The source map of the 'domainTransfer' input "
7561  "argument must be the same as the rebalanced domain map 'domainMap'");
7562 
7563  // The basic algorithm here is:
7564  //
7565  // 1. Call the moral equivalent of "Distor.do" to handle the import.
7566  // 2. Copy all the Imported and Copy/Permuted data into the raw
7567  // CrsMatrix / CrsGraphData pointers, still using GIDs.
7568  // 3. Call an optimized version of MakeColMap that avoids the
7569  // Directory lookups (since the importer knows who owns all the
7570  // GIDs) AND reindexes to LIDs.
7571  // 4. Call expertStaticFillComplete()
7572 
7573  // Get information from the Importer
7574  const size_t NumSameIDs = rowTransfer.getNumSameIDs();
7575  ArrayView<const LO> ExportLIDs = reverseMode ? rowTransfer.getRemoteLIDs() : rowTransfer.getExportLIDs();
7576  auto RemoteLIDs = reverseMode ? rowTransfer.getExportLIDs_dv() : rowTransfer.getRemoteLIDs_dv();
7577  auto PermuteToLIDs = reverseMode ? rowTransfer.getPermuteFromLIDs_dv() : rowTransfer.getPermuteToLIDs_dv();
7578  auto PermuteFromLIDs = reverseMode ? rowTransfer.getPermuteToLIDs_dv() : rowTransfer.getPermuteFromLIDs_dv();
7579  Distributor& Distor = rowTransfer.getDistributor();
7580 
7581  // Owning PIDs
7582  Teuchos::Array<int> SourcePids;
7583 
7584  // Temp variables for sub-communicators
7585  RCP<const map_type> ReducedRowMap, ReducedColMap,
7586  ReducedDomainMap, ReducedRangeMap;
7587  RCP<const Comm<int>> ReducedComm;
7588 
7589  // If the user gave us a null destMat, then construct the new
7590  // destination matrix. We will replace its column Map later.
7591  if (destMat.is_null()) {
7592  destMat = rcp(new this_CRS_type(MyRowMap, 0, matrixparams));
7593  }
7594 
7595  /***************************************************/
7596  /***** 1) First communicator restriction phase ****/
7597  /***************************************************/
7598  if (restrictComm) {
7599 #ifdef HAVE_TPETRA_MMM_TIMINGS
7600  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrictComm")));
7601 #endif
7602  ReducedRowMap = MyRowMap->removeEmptyProcesses();
7603  ReducedComm = ReducedRowMap.is_null() ? Teuchos::null : ReducedRowMap->getComm();
7604  destMat->removeEmptyProcessesInPlace(ReducedRowMap);
7605 
7606  ReducedDomainMap = MyRowMap.getRawPtr() == MyDomainMap.getRawPtr() ? ReducedRowMap : MyDomainMap->replaceCommWithSubset(ReducedComm);
7607  ReducedRangeMap = MyRowMap.getRawPtr() == MyRangeMap.getRawPtr() ? ReducedRowMap : MyRangeMap->replaceCommWithSubset(ReducedComm);
7608 
7609  // Reset the "my" maps
7610  MyRowMap = ReducedRowMap;
7611  MyDomainMap = ReducedDomainMap;
7612  MyRangeMap = ReducedRangeMap;
7613 
7614  // Update my PID, if we've restricted the communicator
7615  if (!ReducedComm.is_null()) {
7616  MyPID = ReducedComm->getRank();
7617  } else {
7618  MyPID = -2; // For debugging
7619  }
7620  } else {
7621  ReducedComm = MyRowMap->getComm();
7622  }
7623 
7624  /***************************************************/
7625  /***** 2) From Tpetra::DistObject::doTransfer() ****/
7626  /***************************************************/
7627  // Get the owning PIDs
7628  RCP<const import_type> MyImporter = getGraph()->getImporter();
7629 
7630  // check whether domain maps of source matrix and base domain map is the same
7631  bool bSameDomainMap = BaseDomainMap->isSameAs(*getDomainMap());
7632 
7633  if (!restrictComm && !MyImporter.is_null() && bSameDomainMap) {
7634 #ifdef HAVE_TPETRA_MMM_TIMINGS
7635  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs same map")));
7636 #endif
7637  // Same domain map as source matrix
7638  //
7639  // NOTE: This won't work for restrictComm (because the Import
7640  // doesn't know the restricted PIDs), though writing an
7641  // optimized version for that case would be easy (Import an
7642  // IntVector of the new PIDs). Might want to add this later.
7643  Import_Util::getPids(*MyImporter, SourcePids, false);
7644  } else if (restrictComm && !MyImporter.is_null() && bSameDomainMap) {
7645  // Same domain map as source matrix (restricted communicator)
7646  // We need one import from the domain to the column map
7647 #ifdef HAVE_TPETRA_MMM_TIMINGS
7648  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs restricted comm")));
7649 #endif
7650  IntVectorType SourceDomain_pids(getDomainMap(), true);
7651  IntVectorType SourceCol_pids(getColMap());
7652  // SourceDomain_pids contains the restricted pids
7653  SourceDomain_pids.putScalar(MyPID);
7654 
7655  SourceCol_pids.doImport(SourceDomain_pids, *MyImporter, INSERT);
7656  SourcePids.resize(getColMap()->getLocalNumElements());
7657  SourceCol_pids.get1dCopy(SourcePids());
7658  } else if (MyImporter.is_null()) {
7659  // Matrix has no off-process entries
7660 #ifdef HAVE_TPETRA_MMM_TIMINGS
7661  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs all local entries")));
7662 #endif
7663  SourcePids.resize(getColMap()->getLocalNumElements());
7664  SourcePids.assign(getColMap()->getLocalNumElements(), MyPID);
7665  } else if (!MyImporter.is_null() &&
7666  !domainTransfer.is_null()) {
7667  // general implementation for rectangular matrices with
7668  // domain map different than SourceMatrix domain map.
7669  // User has to provide a DomainTransfer object. We need
7670  // to communications (import/export)
7671 #ifdef HAVE_TPETRA_MMM_TIMINGS
7672  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs rectangular case")));
7673 #endif
7674 
7675  // TargetDomain_pids lives on the rebalanced new domain map
7676  IntVectorType TargetDomain_pids(domainMap);
7677  TargetDomain_pids.putScalar(MyPID);
7678 
7679  // SourceDomain_pids lives on the non-rebalanced old domain map
7680  IntVectorType SourceDomain_pids(getDomainMap());
7681 
7682  // SourceCol_pids lives on the non-rebalanced old column map
7683  IntVectorType SourceCol_pids(getColMap());
7684 
7685  if (!reverseMode && !xferDomainAsImport.is_null()) {
7686  SourceDomain_pids.doExport(TargetDomain_pids, *xferDomainAsImport, INSERT);
7687  } else if (reverseMode && !xferDomainAsExport.is_null()) {
7688  SourceDomain_pids.doExport(TargetDomain_pids, *xferDomainAsExport, INSERT);
7689  } else if (!reverseMode && !xferDomainAsExport.is_null()) {
7690  SourceDomain_pids.doImport(TargetDomain_pids, *xferDomainAsExport, INSERT);
7691  } else if (reverseMode && !xferDomainAsImport.is_null()) {
7692  SourceDomain_pids.doImport(TargetDomain_pids, *xferDomainAsImport, INSERT);
7693  } else {
7694  TEUCHOS_TEST_FOR_EXCEPTION(
7695  true, std::logic_error,
7696  "Tpetra::CrsMatrix::"
7697  "transferAndFillComplete: Should never get here! "
7698  "Please report this bug to a Tpetra developer.");
7699  }
7700  SourceCol_pids.doImport(SourceDomain_pids, *MyImporter, INSERT);
7701  SourcePids.resize(getColMap()->getLocalNumElements());
7702  SourceCol_pids.get1dCopy(SourcePids());
7703  } else if (!MyImporter.is_null() &&
7704  BaseDomainMap->isSameAs(*BaseRowMap) &&
7705  getDomainMap()->isSameAs(*getRowMap())) {
7706  // We can use the rowTransfer + SourceMatrix's Import to find out who owns what.
7707 #ifdef HAVE_TPETRA_MMM_TIMINGS
7708  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs query import")));
7709 #endif
7710 
7711  IntVectorType TargetRow_pids(domainMap);
7712  IntVectorType SourceRow_pids(getRowMap());
7713  IntVectorType SourceCol_pids(getColMap());
7714 
7715  TargetRow_pids.putScalar(MyPID);
7716  if (!reverseMode && xferAsImport != nullptr) {
7717  SourceRow_pids.doExport(TargetRow_pids, *xferAsImport, INSERT);
7718  } else if (reverseMode && xferAsExport != nullptr) {
7719  SourceRow_pids.doExport(TargetRow_pids, *xferAsExport, INSERT);
7720  } else if (!reverseMode && xferAsExport != nullptr) {
7721  SourceRow_pids.doImport(TargetRow_pids, *xferAsExport, INSERT);
7722  } else if (reverseMode && xferAsImport != nullptr) {
7723  SourceRow_pids.doImport(TargetRow_pids, *xferAsImport, INSERT);
7724  } else {
7725  TEUCHOS_TEST_FOR_EXCEPTION(
7726  true, std::logic_error,
7727  "Tpetra::CrsMatrix::"
7728  "transferAndFillComplete: Should never get here! "
7729  "Please report this bug to a Tpetra developer.");
7730  }
7731 
7732  SourceCol_pids.doImport(SourceRow_pids, *MyImporter, INSERT);
7733  SourcePids.resize(getColMap()->getLocalNumElements());
7734  SourceCol_pids.get1dCopy(SourcePids());
7735  } else {
7736  TEUCHOS_TEST_FOR_EXCEPTION(
7737  true, std::invalid_argument,
7738  "Tpetra::CrsMatrix::"
7739  "transferAndFillComplete: This method only allows either domainMap == "
7740  "getDomainMap (), or (domainMap == rowTransfer.getTargetMap () and "
7741  "getDomainMap () == getRowMap ()).");
7742  }
7743 
7744  // Tpetra-specific stuff
7745  size_t constantNumPackets = destMat->constantNumberOfPackets();
7746  {
7747 #ifdef HAVE_TPETRA_MMM_TIMINGS
7748  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC reallocate buffers")));
7749 #endif
7750  if (constantNumPackets == 0) {
7751  destMat->reallocArraysForNumPacketsPerLid(ExportLIDs.size(),
7752  RemoteLIDs.view_host().size());
7753  } else {
7754  // There are a constant number of packets per element. We
7755  // already know (from the number of "remote" (incoming)
7756  // elements) how many incoming elements we expect, so we can
7757  // resize the buffer accordingly.
7758  const size_t rbufLen = RemoteLIDs.view_host().size() * constantNumPackets;
7759  destMat->reallocImportsIfNeeded(rbufLen, false, nullptr);
7760  }
7761  }
7762 
7763  // Pack & Prepare w/ owning PIDs
7764  {
7765 #ifdef HAVE_TPETRA_MMM_TIMINGS
7766  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC pack and prepare")));
7767 #endif
7768  if (debug) {
7769  using std::cerr;
7770  using std::endl;
7771  using Teuchos::outArg;
7772  using Teuchos::REDUCE_MAX;
7773  using Teuchos::reduceAll;
7774  RCP<const Teuchos::Comm<int>> comm = this->getComm();
7775  const int myRank = comm->getRank();
7776 
7777  std::ostringstream errStrm;
7778  int lclErr = 0;
7779  int gblErr = 0;
7780 
7781  Teuchos::ArrayView<size_t> numExportPacketsPerLID;
7782  try {
7783  // packAndPrepare* methods modify numExportPacketsPerLID_.
7784  destMat->numExportPacketsPerLID_.modify_host();
7785  numExportPacketsPerLID =
7786  getArrayViewFromDualView(destMat->numExportPacketsPerLID_);
7787  } catch (std::exception& e) {
7788  errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw: "
7789  << e.what() << std::endl;
7790  lclErr = 1;
7791  } catch (...) {
7792  errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw "
7793  "an exception not a subclass of std::exception"
7794  << std::endl;
7795  lclErr = 1;
7796  }
7797 
7798  if (!comm.is_null()) {
7799  reduceAll<int, int>(*comm, REDUCE_MAX, lclErr, outArg(gblErr));
7800  }
7801  if (gblErr != 0) {
7802  ::Tpetra::Details::gathervPrint(cerr, errStrm.str(), *comm);
7803  TEUCHOS_TEST_FOR_EXCEPTION(
7804  true, std::runtime_error,
7805  "getArrayViewFromDualView threw an "
7806  "exception on at least one process.");
7807  }
7808 
7809  if (verbose) {
7810  std::ostringstream os;
7811  os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
7812  << std::endl;
7813  std::cerr << os.str();
7814  }
7815  try {
7817  destMat->exports_,
7818  numExportPacketsPerLID,
7819  ExportLIDs,
7820  SourcePids,
7821  constantNumPackets);
7822  } catch (std::exception& e) {
7823  errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw: "
7824  << e.what() << std::endl;
7825  lclErr = 1;
7826  } catch (...) {
7827  errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw "
7828  "an exception not a subclass of std::exception"
7829  << std::endl;
7830  lclErr = 1;
7831  }
7832 
7833  if (verbose) {
7834  std::ostringstream os;
7835  os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
7836  << std::endl;
7837  std::cerr << os.str();
7838  }
7839 
7840  if (!comm.is_null()) {
7841  reduceAll<int, int>(*comm, REDUCE_MAX, lclErr, outArg(gblErr));
7842  }
7843  if (gblErr != 0) {
7844  ::Tpetra::Details::gathervPrint(cerr, errStrm.str(), *comm);
7845  TEUCHOS_TEST_FOR_EXCEPTION(
7846  true, std::runtime_error,
7847  "packCrsMatrixWithOwningPIDs threw an "
7848  "exception on at least one process.");
7849  }
7850  } else {
7851  // packAndPrepare* methods modify numExportPacketsPerLID_.
7852  destMat->numExportPacketsPerLID_.modify_host();
7853  Teuchos::ArrayView<size_t> numExportPacketsPerLID =
7854  getArrayViewFromDualView(destMat->numExportPacketsPerLID_);
7855  if (verbose) {
7856  std::ostringstream os;
7857  os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
7858  << std::endl;
7859  std::cerr << os.str();
7860  }
7862  destMat->exports_,
7863  numExportPacketsPerLID,
7864  ExportLIDs,
7865  SourcePids,
7866  constantNumPackets);
7867  if (verbose) {
7868  std::ostringstream os;
7869  os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
7870  << std::endl;
7871  std::cerr << os.str();
7872  }
7873  }
7874  }
7875 
7876  // Do the exchange of remote data.
7877  {
7878 #ifdef HAVE_TPETRA_MMM_TIMINGS
7879  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs exchange remote data")));
7880 #endif
7881  if (!communication_needed) {
7882  if (verbose) {
7883  std::ostringstream os;
7884  os << *verbosePrefix << "Communication not needed" << std::endl;
7885  std::cerr << os.str();
7886  }
7887  } else {
7888  if (reverseMode) {
7889  if (constantNumPackets == 0) { // variable number of packets per LID
7890  if (verbose) {
7891  std::ostringstream os;
7892  os << *verbosePrefix << "Reverse mode, variable # packets / LID"
7893  << std::endl;
7894  std::cerr << os.str();
7895  }
7896  // Make sure that host has the latest version, since we're
7897  // using the version on host. If host has the latest
7898  // version, syncing to host does nothing.
7899  destMat->numExportPacketsPerLID_.sync_host();
7900  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
7901  getArrayViewFromDualView(destMat->numExportPacketsPerLID_);
7902  destMat->numImportPacketsPerLID_.sync_host();
7903  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
7904  getArrayViewFromDualView(destMat->numImportPacketsPerLID_);
7905 
7906  if (verbose) {
7907  std::ostringstream os;
7908  os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
7909  << std::endl;
7910  std::cerr << os.str();
7911  }
7912  Distor.doReversePostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1,
7913  destMat->numImportPacketsPerLID_.view_host());
7914  if (verbose) {
7915  std::ostringstream os;
7916  os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
7917  << std::endl;
7918  std::cerr << os.str();
7919  }
7920 
7921  size_t totalImportPackets = 0;
7922  for (Array_size_type i = 0; i < numImportPacketsPerLID.size(); ++i) {
7923  totalImportPackets += numImportPacketsPerLID[i];
7924  }
7925 
7926  // Reallocation MUST go before setting the modified flag,
7927  // because it may clear out the flags.
7928  destMat->reallocImportsIfNeeded(totalImportPackets, verbose,
7929  verbosePrefix.get());
7930  destMat->imports_.modify_host();
7931  auto hostImports = destMat->imports_.view_host();
7932  // This is a legacy host pack/unpack path, so use the host
7933  // version of exports_.
7934  destMat->exports_.sync_host();
7935  auto hostExports = destMat->exports_.view_host();
7936  if (verbose) {
7937  std::ostringstream os;
7938  os << *verbosePrefix << "Calling 4-arg doReversePostsAndWaits"
7939  << std::endl;
7940  std::cerr << os.str();
7941  }
7942  Distor.doReversePostsAndWaits(hostExports,
7943  numExportPacketsPerLID,
7944  hostImports,
7945  numImportPacketsPerLID);
7946  if (verbose) {
7947  std::ostringstream os;
7948  os << *verbosePrefix << "Finished 4-arg doReversePostsAndWaits"
7949  << std::endl;
7950  std::cerr << os.str();
7951  }
7952  } else { // constant number of packets per LID
7953  if (verbose) {
7954  std::ostringstream os;
7955  os << *verbosePrefix << "Reverse mode, constant # packets / LID"
7956  << std::endl;
7957  std::cerr << os.str();
7958  }
7959  destMat->imports_.modify_host();
7960  auto hostImports = destMat->imports_.view_host();
7961  // This is a legacy host pack/unpack path, so use the host
7962  // version of exports_.
7963  destMat->exports_.sync_host();
7964  auto hostExports = destMat->exports_.view_host();
7965  if (verbose) {
7966  std::ostringstream os;
7967  os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
7968  << std::endl;
7969  std::cerr << os.str();
7970  }
7971  Distor.doReversePostsAndWaits(hostExports,
7972  constantNumPackets,
7973  hostImports);
7974  if (verbose) {
7975  std::ostringstream os;
7976  os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
7977  << std::endl;
7978  std::cerr << os.str();
7979  }
7980  }
7981  } else { // forward mode (the default)
7982  if (constantNumPackets == 0) { // variable number of packets per LID
7983  if (verbose) {
7984  std::ostringstream os;
7985  os << *verbosePrefix << "Forward mode, variable # packets / LID"
7986  << std::endl;
7987  std::cerr << os.str();
7988  }
7989  // Make sure that host has the latest version, since we're
7990  // using the version on host. If host has the latest
7991  // version, syncing to host does nothing.
7992  destMat->numExportPacketsPerLID_.sync_host();
7993  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
7994  getArrayViewFromDualView(destMat->numExportPacketsPerLID_);
7995  destMat->numImportPacketsPerLID_.sync_host();
7996  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
7997  getArrayViewFromDualView(destMat->numImportPacketsPerLID_);
7998  if (verbose) {
7999  std::ostringstream os;
8000  os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8001  << std::endl;
8002  std::cerr << os.str();
8003  }
8004  Distor.doPostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1,
8005  destMat->numImportPacketsPerLID_.view_host());
8006  if (verbose) {
8007  std::ostringstream os;
8008  os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8009  << std::endl;
8010  std::cerr << os.str();
8011  }
8012 
8013  size_t totalImportPackets = 0;
8014  for (Array_size_type i = 0; i < numImportPacketsPerLID.size(); ++i) {
8015  totalImportPackets += numImportPacketsPerLID[i];
8016  }
8017 
8018  // Reallocation MUST go before setting the modified flag,
8019  // because it may clear out the flags.
8020  destMat->reallocImportsIfNeeded(totalImportPackets, verbose,
8021  verbosePrefix.get());
8022  destMat->imports_.modify_host();
8023  auto hostImports = destMat->imports_.view_host();
8024  // This is a legacy host pack/unpack path, so use the host
8025  // version of exports_.
8026  destMat->exports_.sync_host();
8027  auto hostExports = destMat->exports_.view_host();
8028  if (verbose) {
8029  std::ostringstream os;
8030  os << *verbosePrefix << "Calling 4-arg doPostsAndWaits"
8031  << std::endl;
8032  std::cerr << os.str();
8033  }
8034  Distor.doPostsAndWaits(hostExports,
8035  numExportPacketsPerLID,
8036  hostImports,
8037  numImportPacketsPerLID);
8038  if (verbose) {
8039  std::ostringstream os;
8040  os << *verbosePrefix << "Finished 4-arg doPostsAndWaits"
8041  << std::endl;
8042  std::cerr << os.str();
8043  }
8044  } else { // constant number of packets per LID
8045  if (verbose) {
8046  std::ostringstream os;
8047  os << *verbosePrefix << "Forward mode, constant # packets / LID"
8048  << std::endl;
8049  std::cerr << os.str();
8050  }
8051  destMat->imports_.modify_host();
8052  auto hostImports = destMat->imports_.view_host();
8053  // This is a legacy host pack/unpack path, so use the host
8054  // version of exports_.
8055  destMat->exports_.sync_host();
8056  auto hostExports = destMat->exports_.view_host();
8057  if (verbose) {
8058  std::ostringstream os;
8059  os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8060  << std::endl;
8061  std::cerr << os.str();
8062  }
8063  Distor.doPostsAndWaits(hostExports,
8064  constantNumPackets,
8065  hostImports);
8066  if (verbose) {
8067  std::ostringstream os;
8068  os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8069  << std::endl;
8070  std::cerr << os.str();
8071  }
8072  }
8073  }
8074  }
8075  }
8076 
8077  /*********************************************************************/
8078  /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/
8079  /*********************************************************************/
8080 
8081  bool runOnHost = std::is_same_v<typename device_type::memory_space, Kokkos::HostSpace> && !useKokkosPath;
8082 
8083  Teuchos::Array<int> RemotePids;
8084  if (runOnHost) {
8085  Teuchos::Array<int> TargetPids;
8086  // Backwards compatibility measure. We'll use this again below.
8087 
8088  // TODO JHU Need to track down why numImportPacketsPerLID_ has not been correctly marked as modified on host (which it has been)
8089  // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
8090  // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
8091  destMat->numImportPacketsPerLID_.modify_host(); // FIXME
8092 
8093 #ifdef HAVE_TPETRA_MMM_TIMINGS
8094  RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
8095 #endif
8096  ArrayRCP<size_t> CSR_rowptr;
8097  ArrayRCP<GO> CSR_colind_GID;
8098  ArrayRCP<LO> CSR_colind_LID;
8099  ArrayRCP<Scalar> CSR_vals;
8100 
8101  destMat->imports_.sync_device();
8102  destMat->numImportPacketsPerLID_.sync_device();
8103 
8104  size_t N = BaseRowMap->getLocalNumElements();
8105 
8106  auto RemoteLIDs_d = RemoteLIDs.view_device();
8107  auto PermuteToLIDs_d = PermuteToLIDs.view_device();
8108  auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();
8109 
8111  *this,
8112  RemoteLIDs_d,
8113  destMat->imports_.view_device(), // hostImports
8114  destMat->numImportPacketsPerLID_.view_device(), // numImportPacketsPerLID
8115  NumSameIDs,
8116  PermuteToLIDs_d,
8117  PermuteFromLIDs_d,
8118  N,
8119  MyPID,
8120  CSR_rowptr,
8121  CSR_colind_GID,
8122  CSR_vals,
8123  SourcePids(),
8124  TargetPids);
8125 
8126  // If LO and GO are the same, we can reuse memory when
8127  // converting the column indices from global to local indices.
8128  if (typeid(LO) == typeid(GO)) {
8129  CSR_colind_LID = Teuchos::arcp_reinterpret_cast<LO>(CSR_colind_GID);
8130  } else {
8131  CSR_colind_LID.resize(CSR_colind_GID.size());
8132  }
8133  CSR_colind_LID.resize(CSR_colind_GID.size());
8134 
8135  // On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally
8136  // owned entries. Convert them to the actual PID.
8137  // JHU FIXME This can be done within unpackAndCombineIntoCrsArrays with a parallel_for.
8138  for (size_t i = 0; i < static_cast<size_t>(TargetPids.size()); i++) {
8139  if (TargetPids[i] == -1) TargetPids[i] = MyPID;
8140  }
8141 #ifdef HAVE_TPETRA_MMM_TIMINGS
8142  tmCopySPRdata = Teuchos::null;
8143 #endif
8144  /**************************************************************/
8145  /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8146  /**************************************************************/
8147  // Call an optimized version of makeColMap that avoids the
8148  // Directory lookups (since the Import object knows who owns all
8149  // the GIDs).
8150  if (verbose) {
8151  std::ostringstream os;
8152  os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8153  << std::endl;
8154  std::cerr << os.str();
8155  }
8156  {
8157 #ifdef HAVE_TPETRA_MMM_TIMINGS
8158  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC makeColMap")));
8159 #endif
8160  Import_Util::lowCommunicationMakeColMapAndReindexSerial(CSR_rowptr(),
8161  CSR_colind_LID(),
8162  CSR_colind_GID(),
8163  BaseDomainMap,
8164  TargetPids,
8165  RemotePids,
8166  MyColMap);
8167  }
8168 
8169  if (verbose) {
8170  std::ostringstream os;
8171  os << *verbosePrefix << "restrictComm="
8172  << (restrictComm ? "true" : "false") << std::endl;
8173  std::cerr << os.str();
8174  }
8175 
8176  /*******************************************************/
8177  /**** 4) Second communicator restriction phase ****/
8178  /*******************************************************/
8179  {
8180 #ifdef HAVE_TPETRA_MMM_TIMINGS
8181  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrict colmap")));
8182 #endif
8183  if (restrictComm) {
8184  ReducedColMap = (MyRowMap.getRawPtr() == MyColMap.getRawPtr()) ? ReducedRowMap : MyColMap->replaceCommWithSubset(ReducedComm);
8185  MyColMap = ReducedColMap; // Reset the "my" maps
8186  }
8187 
8188  // Replace the col map
8189  if (verbose) {
8190  std::ostringstream os;
8191  os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8192  std::cerr << os.str();
8193  }
8194  destMat->replaceColMap(MyColMap);
8195 
8196  // Short circuit if the processor is no longer in the communicator
8197  //
8198  // NOTE: Epetra modifies all "removed" processes so they
8199  // have a dummy (serial) Map that doesn't touch the original
8200  // communicator. Duplicating that here might be a good idea.
8201  if (ReducedComm.is_null()) {
8202  if (verbose) {
8203  std::ostringstream os;
8204  os << *verbosePrefix << "I am no longer in the communicator; "
8205  "returning"
8206  << std::endl;
8207  std::cerr << os.str();
8208  }
8209  return;
8210  }
8211  }
8212 
8213  /***************************************************/
8214  /**** 5) Sort ****/
8215  /***************************************************/
8216  if ((!reverseMode && xferAsImport != nullptr) ||
8217  (reverseMode && xferAsExport != nullptr)) {
8218  if (verbose) {
8219  std::ostringstream os;
8220  os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8221  std::cerr << os.str();
8222  }
8223 #ifdef HAVE_TPETRA_MMM_TIMINGS
8224  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortCrsEntries")));
8225 #endif
8226  Import_Util::sortCrsEntries(CSR_rowptr(),
8227  CSR_colind_LID(),
8228  CSR_vals());
8229  } else if ((!reverseMode && xferAsExport != nullptr) ||
8230  (reverseMode && xferAsImport != nullptr)) {
8231  if (verbose) {
8232  std::ostringstream os;
8233  os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8234  << endl;
8235  std::cerr << os.str();
8236  }
8237 #ifdef HAVE_TPETRA_MMM_TIMINGS
8238  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries")));
8239 #endif
8240  Import_Util::sortAndMergeCrsEntries(CSR_rowptr(),
8241  CSR_colind_LID(),
8242  CSR_vals());
8243  if (CSR_rowptr[N] != static_cast<size_t>(CSR_vals.size())) {
8244  CSR_colind_LID.resize(CSR_rowptr[N]);
8245  CSR_vals.resize(CSR_rowptr[N]);
8246  }
8247  } else {
8248  TEUCHOS_TEST_FOR_EXCEPTION(
8249  true, std::logic_error,
8250  "Tpetra::CrsMatrix::"
8251  "transferAndFillComplete: Should never get here! "
8252  "Please report this bug to a Tpetra developer.");
8253  }
8254  /***************************************************/
8255  /**** 6) Reset the colmap and the arrays ****/
8256  /***************************************************/
8257 
8258  if (verbose) {
8259  std::ostringstream os;
8260  os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8261  std::cerr << os.str();
8262  }
8263 
8264  // Call constructor for the new matrix (restricted as needed)
8265  //
8266  // NOTE (mfh 15 May 2014) This should work fine for the Kokkos
8267  // refactor version of CrsMatrix, though it reserves the right to
8268  // make a deep copy of the arrays.
8269  {
8270 #ifdef HAVE_TPETRA_MMM_TIMINGS
8271  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC setAllValues")));
8272 #endif
8273  destMat->setAllValues(CSR_rowptr, CSR_colind_LID, CSR_vals);
8274  }
8275 
8276  } else {
8277  // run on device
8278 
8279  // Backwards compatibility measure. We'll use this again below.
8280 
8281  // TODO JHU Need to track down why numImportPacketsPerLID_ has not been correctly marked as modified on host (which it has been)
8282  // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
8283  // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
8284  destMat->numImportPacketsPerLID_.modify_host(); // FIXME
8285 
8286 #ifdef HAVE_TPETRA_MMM_TIMINGS
8287  RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
8288 #endif
8289  ArrayRCP<size_t> CSR_rowptr;
8290  ArrayRCP<GO> CSR_colind_GID;
8291  ArrayRCP<LO> CSR_colind_LID;
8292  ArrayRCP<Scalar> CSR_vals;
8293 
8294  destMat->imports_.sync_device();
8295  destMat->numImportPacketsPerLID_.sync_device();
8296 
8297  size_t N = BaseRowMap->getLocalNumElements();
8298 
8299  auto RemoteLIDs_d = RemoteLIDs.view_device();
8300  auto PermuteToLIDs_d = PermuteToLIDs.view_device();
8301  auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();
8302 
8303  Kokkos::View<size_t*, device_type> CSR_rowptr_d;
8304  Kokkos::View<GO*, device_type> CSR_colind_GID_d;
8305  Kokkos::View<LO*, device_type> CSR_colind_LID_d;
8306  Kokkos::View<impl_scalar_type*, device_type> CSR_vals_d;
8307  Kokkos::View<int*, device_type> TargetPids_d;
8308 
8310  *this,
8311  RemoteLIDs_d,
8312  destMat->imports_.view_device(), // hostImports
8313  destMat->numImportPacketsPerLID_.view_device(), // numImportPacketsPerLID
8314  NumSameIDs,
8315  PermuteToLIDs_d,
8316  PermuteFromLIDs_d,
8317  N,
8318  MyPID,
8319  CSR_rowptr_d,
8320  CSR_colind_GID_d,
8321  CSR_vals_d,
8322  SourcePids(),
8323  TargetPids_d);
8324 
8325  Kokkos::resize(CSR_colind_LID_d, CSR_colind_GID_d.size());
8326 
8327 #ifdef HAVE_TPETRA_MMM_TIMINGS
8328  tmCopySPRdata = Teuchos::null;
8329 #endif
8330  /**************************************************************/
8331  /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8332  /**************************************************************/
8333  // Call an optimized version of makeColMap that avoids the
8334  // Directory lookups (since the Import object knows who owns all
8335  // the GIDs).
8336  if (verbose) {
8337  std::ostringstream os;
8338  os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8339  << std::endl;
8340  std::cerr << os.str();
8341  }
8342  {
8343 #ifdef HAVE_TPETRA_MMM_TIMINGS
8344  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC makeColMap")));
8345 #endif
8346  Import_Util::lowCommunicationMakeColMapAndReindex(CSR_rowptr_d,
8347  CSR_colind_LID_d,
8348  CSR_colind_GID_d,
8349  BaseDomainMap,
8350  TargetPids_d,
8351  RemotePids,
8352  MyColMap);
8353  }
8354 
8355  if (verbose) {
8356  std::ostringstream os;
8357  os << *verbosePrefix << "restrictComm="
8358  << (restrictComm ? "true" : "false") << std::endl;
8359  std::cerr << os.str();
8360  }
8361 
8362  /*******************************************************/
8363  /**** 4) Second communicator restriction phase ****/
8364  /*******************************************************/
8365  {
8366 #ifdef HAVE_TPETRA_MMM_TIMINGS
8367  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrict colmap")));
8368 #endif
8369  if (restrictComm) {
8370  ReducedColMap = (MyRowMap.getRawPtr() == MyColMap.getRawPtr()) ? ReducedRowMap : MyColMap->replaceCommWithSubset(ReducedComm);
8371  MyColMap = ReducedColMap; // Reset the "my" maps
8372  }
8373 
8374  // Replace the col map
8375  if (verbose) {
8376  std::ostringstream os;
8377  os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8378  std::cerr << os.str();
8379  }
8380  destMat->replaceColMap(MyColMap);
8381 
8382  // Short circuit if the processor is no longer in the communicator
8383  //
8384  // NOTE: Epetra modifies all "removed" processes so they
8385  // have a dummy (serial) Map that doesn't touch the original
8386  // communicator. Duplicating that here might be a good idea.
8387  if (ReducedComm.is_null()) {
8388  if (verbose) {
8389  std::ostringstream os;
8390  os << *verbosePrefix << "I am no longer in the communicator; "
8391  "returning"
8392  << std::endl;
8393  std::cerr << os.str();
8394  }
8395  return;
8396  }
8397  }
8398 
8399  /***************************************************/
8400  /**** 5) Sort ****/
8401  /***************************************************/
8402 
8403  if ((!reverseMode && xferAsImport != nullptr) ||
8404  (reverseMode && xferAsExport != nullptr)) {
8405  if (verbose) {
8406  std::ostringstream os;
8407  os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8408  std::cerr << os.str();
8409  }
8410 #ifdef HAVE_TPETRA_MMM_TIMINGS
8411  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortCrsEntries")));
8412 #endif
8413  Import_Util::sortCrsEntries(CSR_rowptr_d,
8414  CSR_colind_LID_d,
8415  CSR_vals_d);
8416  } else if ((!reverseMode && xferAsExport != nullptr) ||
8417  (reverseMode && xferAsImport != nullptr)) {
8418  if (verbose) {
8419  std::ostringstream os;
8420  os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8421  << endl;
8422  std::cerr << os.str();
8423  }
8424 #ifdef HAVE_TPETRA_MMM_TIMINGS
8425  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries")));
8426 #endif
8427  Import_Util::sortAndMergeCrsEntries(CSR_rowptr_d,
8428  CSR_colind_LID_d,
8429  CSR_vals_d);
8430  } else {
8431  TEUCHOS_TEST_FOR_EXCEPTION(
8432  true, std::logic_error,
8433  "Tpetra::CrsMatrix::"
8434  "transferAndFillComplete: Should never get here! "
8435  "Please report this bug to a Tpetra developer.");
8436  }
8437 
8438  /***************************************************/
8439  /**** 6) Reset the colmap and the arrays ****/
8440  /***************************************************/
8441 
8442  if (verbose) {
8443  std::ostringstream os;
8444  os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8445  std::cerr << os.str();
8446  }
8447 
8448  {
8449 #ifdef HAVE_TPETRA_MMM_TIMINGS
8450  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC setAllValues")));
8451 #endif
8452  destMat->setAllValues(CSR_rowptr_d, CSR_colind_LID_d, CSR_vals_d);
8453  }
8454 
8455  } // if (runOnHost) .. else ..
8456 
8457  /***************************************************/
8458  /**** 7) Build Importer & Call ESFC ****/
8459  /***************************************************/
8460 #ifdef HAVE_TPETRA_MMM_TIMINGS
8461  RCP<TimeMonitor> tmIESFC = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC build importer and esfc"))));
8462 #endif
8463  // Pre-build the importer using the existing PIDs
8464  Teuchos::ParameterList esfc_params;
8465 
8466  RCP<import_type> MyImport;
8467 
8468  // Fulfill the non-blocking allreduce on reduced_mismatch.
8469  if (iallreduceRequest.get() != nullptr) {
8470  if (verbose) {
8471  std::ostringstream os;
8472  os << *verbosePrefix << "Calling iallreduceRequest->wait()"
8473  << endl;
8474  std::cerr << os.str();
8475  }
8476  iallreduceRequest->wait();
8477  if (reduced_mismatch != 0) {
8478  isMM = false;
8479  }
8480  }
8481 
8482  if (isMM) {
8483 #ifdef HAVE_TPETRA_MMM_TIMINGS
8484  Teuchos::TimeMonitor MMisMM(*TimeMonitor::getNewTimer(prefix + std::string("isMM Block")));
8485 #endif
8486  // Combine all type1/2/3 lists, [filter them], then call the expert import constructor.
8487 
8488  if (verbose) {
8489  std::ostringstream os;
8490  os << *verbosePrefix << "Getting CRS pointers" << endl;
8491  std::cerr << os.str();
8492  }
8493 
8494  Teuchos::ArrayRCP<LocalOrdinal> type3LIDs;
8495  Teuchos::ArrayRCP<int> type3PIDs;
8496  auto rowptr = getCrsGraph()->getLocalRowPtrsHost();
8497  auto colind = getCrsGraph()->getLocalIndicesHost();
8498 
8499  if (verbose) {
8500  std::ostringstream os;
8501  os << *verbosePrefix << "Calling reverseNeighborDiscovery" << std::endl;
8502  std::cerr << os.str();
8503  }
8504 
8505  {
8506 #ifdef HAVE_TPETRA_MMM_TIMINGS
8507  TimeMonitor tm_rnd(*TimeMonitor::getNewTimer(prefix + std::string("isMMrevNeighDis")));
8508 #endif
8509  Import_Util::reverseNeighborDiscovery(*this,
8510  rowptr,
8511  colind,
8512  rowTransfer,
8513  MyImporter,
8514  MyDomainMap,
8515  type3PIDs,
8516  type3LIDs,
8517  ReducedComm);
8518  }
8519 
8520  if (verbose) {
8521  std::ostringstream os;
8522  os << *verbosePrefix << "Done with reverseNeighborDiscovery" << std::endl;
8523  std::cerr << os.str();
8524  }
8525 
8526  Teuchos::ArrayView<const int> EPID1 = MyImporter.is_null() ? Teuchos::ArrayView<const int>() : MyImporter->getExportPIDs();
8527  Teuchos::ArrayView<const LO> ELID1 = MyImporter.is_null() ? Teuchos::ArrayView<const LO>() : MyImporter->getExportLIDs();
8528 
8529  Teuchos::ArrayView<const int> TEPID2 = rowTransfer.getExportPIDs(); // row matrix
8530  Teuchos::ArrayView<const LO> TELID2 = rowTransfer.getExportLIDs();
8531 
8532  const int numCols = getGraph()->getColMap()->getLocalNumElements(); // may be dup
8533  // from EpetraExt_MMHelpers.cpp: build_type2_exports
8534  std::vector<bool> IsOwned(numCols, true);
8535  std::vector<int> SentTo(numCols, -1);
8536  if (!MyImporter.is_null()) {
8537  for (auto&& rlid : MyImporter->getRemoteLIDs()) { // the remoteLIDs must be from sourcematrix
8538  IsOwned[rlid] = false;
8539  }
8540  }
8541 
8542  std::vector<std::pair<int, GO>> usrtg;
8543  usrtg.reserve(TEPID2.size());
8544 
8545  {
8546  const auto& colMap = *(this->getColMap()); // *this is sourcematrix
8547  for (Array_size_type i = 0; i < TEPID2.size(); ++i) {
8548  const LO row = TELID2[i];
8549  const int pid = TEPID2[i];
8550  for (auto j = rowptr[row]; j < rowptr[row + 1]; ++j) {
8551  const int col = colind[j];
8552  if (IsOwned[col] && SentTo[col] != pid) {
8553  SentTo[col] = pid;
8554  GO gid = colMap.getGlobalElement(col);
8555  usrtg.push_back(std::pair<int, GO>(pid, gid));
8556  }
8557  }
8558  }
8559  }
8560 
8561  // This sort can _not_ be omitted.
8562  std::sort(usrtg.begin(), usrtg.end()); // default comparator does the right thing, now sorted in gid order
8563  auto eopg = std ::unique(usrtg.begin(), usrtg.end());
8564  // 25 Jul 2018: Could just ignore the entries at and after eopg.
8565  usrtg.erase(eopg, usrtg.end());
8566 
8567  const Array_size_type type2_us_size = usrtg.size();
8568  Teuchos::ArrayRCP<int> EPID2 = Teuchos::arcp(new int[type2_us_size], 0, type2_us_size, true);
8569  Teuchos::ArrayRCP<LO> ELID2 = Teuchos::arcp(new LO[type2_us_size], 0, type2_us_size, true);
8570 
8571  int pos = 0;
8572  for (auto&& p : usrtg) {
8573  EPID2[pos] = p.first;
8574  ELID2[pos] = this->getDomainMap()->getLocalElement(p.second);
8575  pos++;
8576  }
8577 
8578  Teuchos::ArrayView<int> EPID3 = type3PIDs();
8579  Teuchos::ArrayView<LO> ELID3 = type3LIDs();
8580  GO InfGID = std::numeric_limits<GO>::max();
8581  int InfPID = INT_MAX;
8582 #ifdef TPETRA_MIN3
8583 #undef TPETRA_MIN3
8584 #endif // TPETRA_MIN3
8585 #define TPETRA_MIN3(x, y, z) ((x) < (y) ? (std::min(x, z)) : (std::min(y, z)))
8586  int i1 = 0, i2 = 0, i3 = 0;
8587  int Len1 = EPID1.size();
8588  int Len2 = EPID2.size();
8589  int Len3 = EPID3.size();
8590 
8591  int MyLen = Len1 + Len2 + Len3;
8592  Teuchos::ArrayRCP<LO> userExportLIDs = Teuchos::arcp(new LO[MyLen], 0, MyLen, true);
8593  Teuchos::ArrayRCP<int> userExportPIDs = Teuchos::arcp(new int[MyLen], 0, MyLen, true);
8594  int iloc = 0; // will be the size of the userExportLID/PIDs
8595 
8596  while (i1 < Len1 || i2 < Len2 || i3 < Len3) {
8597  int PID1 = (i1 < Len1) ? (EPID1[i1]) : InfPID;
8598  int PID2 = (i2 < Len2) ? (EPID2[i2]) : InfPID;
8599  int PID3 = (i3 < Len3) ? (EPID3[i3]) : InfPID;
8600 
8601  GO GID1 = (i1 < Len1) ? getDomainMap()->getGlobalElement(ELID1[i1]) : InfGID;
8602  GO GID2 = (i2 < Len2) ? getDomainMap()->getGlobalElement(ELID2[i2]) : InfGID;
8603  GO GID3 = (i3 < Len3) ? getDomainMap()->getGlobalElement(ELID3[i3]) : InfGID;
8604 
8605  int MIN_PID = TPETRA_MIN3(PID1, PID2, PID3);
8606  GO MIN_GID = TPETRA_MIN3(((PID1 == MIN_PID) ? GID1 : InfGID), ((PID2 == MIN_PID) ? GID2 : InfGID), ((PID3 == MIN_PID) ? GID3 : InfGID));
8607 #ifdef TPETRA_MIN3
8608 #undef TPETRA_MIN3
8609 #endif // TPETRA_MIN3
8610  bool added_entry = false;
8611 
8612  if (PID1 == MIN_PID && GID1 == MIN_GID) {
8613  userExportLIDs[iloc] = ELID1[i1];
8614  userExportPIDs[iloc] = EPID1[i1];
8615  i1++;
8616  added_entry = true;
8617  iloc++;
8618  }
8619  if (PID2 == MIN_PID && GID2 == MIN_GID) {
8620  if (!added_entry) {
8621  userExportLIDs[iloc] = ELID2[i2];
8622  userExportPIDs[iloc] = EPID2[i2];
8623  added_entry = true;
8624  iloc++;
8625  }
8626  i2++;
8627  }
8628  if (PID3 == MIN_PID && GID3 == MIN_GID) {
8629  if (!added_entry) {
8630  userExportLIDs[iloc] = ELID3[i3];
8631  userExportPIDs[iloc] = EPID3[i3];
8632  iloc++;
8633  }
8634  i3++;
8635  }
8636  }
8637 
8638  if (verbose) {
8639  std::ostringstream os;
8640  os << *verbosePrefix << "Create Import" << std::endl;
8641  std::cerr << os.str();
8642  }
8643 
8644 #ifdef HAVE_TPETRA_MMM_TIMINGS
8645  auto ismmIctor(*TimeMonitor::getNewTimer(prefix + std::string("isMMIportCtor")));
8646 #endif
8647  Teuchos::RCP<Teuchos::ParameterList> plist = rcp(new Teuchos::ParameterList());
8648  // 25 Jul 2018: Test for equality with the non-isMM path's Import object.
8649  if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
8650  MyImport = rcp(new import_type(MyDomainMap,
8651  MyColMap,
8652  RemotePids,
8653  userExportLIDs.view(0, iloc).getConst(),
8654  userExportPIDs.view(0, iloc).getConst(),
8655  plist));
8656 
8657  if (verbose) {
8658  std::ostringstream os;
8659  os << *verbosePrefix << "Call expertStaticFillComplete" << std::endl;
8660  std::cerr << os.str();
8661  }
8662 
8663  {
8664 #ifdef HAVE_TPETRA_MMM_TIMINGS
8665  TimeMonitor esfc(*TimeMonitor::getNewTimer(prefix + std::string("isMM::destMat->eSFC")));
8666  esfc_params.set("Timer Label", label + std::string("isMM eSFC"));
8667 #endif
8668  if (!params.is_null())
8669  esfc_params.set("compute global constants", params->get("compute global constants", true));
8670  destMat->expertStaticFillComplete(MyDomainMap, MyRangeMap, MyImport, Teuchos::null, rcp(new Teuchos::ParameterList(esfc_params)));
8671  }
8672 
8673  } // if(isMM)
8674  else {
8675 #ifdef HAVE_TPETRA_MMM_TIMINGS
8676  TimeMonitor MMnotMMblock(*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMblock")));
8677 #endif
8678  if (verbose) {
8679  std::ostringstream os;
8680  os << *verbosePrefix << "Create Import" << std::endl;
8681  std::cerr << os.str();
8682  }
8683 
8684 #ifdef HAVE_TPETRA_MMM_TIMINGS
8685  TimeMonitor notMMIcTor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMCreateImporter")));
8686 #endif
8687  Teuchos::RCP<Teuchos::ParameterList> mypars = rcp(new Teuchos::ParameterList);
8688  mypars->set("Timer Label", "notMMFrom_tAFC");
8689  if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
8690  MyImport = rcp(new import_type(MyDomainMap, MyColMap, RemotePids, mypars));
8691 
8692  if (verbose) {
8693  std::ostringstream os;
8694  os << *verbosePrefix << "Call expertStaticFillComplete" << endl;
8695  std::cerr << os.str();
8696  }
8697 
8698 #ifdef HAVE_TPETRA_MMM_TIMINGS
8699  TimeMonitor esfcnotmm(*TimeMonitor::getNewTimer(prefix + std::string("notMMdestMat->expertStaticFillComplete")));
8700  esfc_params.set("Timer Label", prefix + std::string("notMM eSFC"));
8701 #else
8702  esfc_params.set("Timer Label", std::string("notMM eSFC"));
8703 #endif
8704 
8705  if (!params.is_null()) {
8706  esfc_params.set("compute global constants",
8707  params->get("compute global constants", true));
8708  }
8709  destMat->expertStaticFillComplete(MyDomainMap, MyRangeMap,
8710  MyImport, Teuchos::null,
8711  rcp(new Teuchos::ParameterList(esfc_params)));
8712  }
8713 
8714 #ifdef HAVE_TPETRA_MMM_TIMINGS
8715  tmIESFC = Teuchos::null;
8716 #endif
8717 
8718  if (verbose) {
8719  std::ostringstream os;
8720  os << *verbosePrefix << "Done" << endl;
8721  std::cerr << os.str();
8722  }
8723 } // transferAndFillComplete
8724 
// Import from this matrix into destMatrix using `importer`, and make the
// result fill complete.
//
// Thin forwarding wrapper around transferAndFillComplete: destMatrix,
// importer, domainMap, rangeMap and params are passed through unchanged, and
// Teuchos::null is supplied for the domain transfer argument (this overload
// has no separate domain importer).
8725 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8726 void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
8727  importAndFillComplete(Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>>& destMatrix,
8728  const import_type& importer,
8729  const Teuchos::RCP<const map_type>& domainMap,
8730  const Teuchos::RCP<const map_type>& rangeMap,
8731  const Teuchos::RCP<Teuchos::ParameterList>& params) const {
8732  transferAndFillComplete(destMatrix, importer, Teuchos::null, domainMap, rangeMap, params);
8733 }
8734 
// Import from this matrix into destMatrix using `rowImporter`, with a
// separate `domainImporter`, and make the result fill complete.
//
// Thin forwarding wrapper around transferAndFillComplete. domainImporter is
// received by reference and wrapped with Teuchos::rcpFromRef so it can be
// handed over as an RCP; every other argument is passed through unchanged.
8735 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8736 void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
8737  importAndFillComplete(Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>>& destMatrix,
8738  const import_type& rowImporter,
8739  const import_type& domainImporter,
8740  const Teuchos::RCP<const map_type>& domainMap,
8741  const Teuchos::RCP<const map_type>& rangeMap,
8742  const Teuchos::RCP<Teuchos::ParameterList>& params) const {
8743  transferAndFillComplete(destMatrix, rowImporter, Teuchos::rcpFromRef(domainImporter), domainMap, rangeMap, params);
8744 }
8745 
// Export from this matrix into destMatrix using `exporter`, and make the
// result fill complete.
//
// Thin forwarding wrapper around transferAndFillComplete: destMatrix,
// exporter, domainMap, rangeMap and params are passed through unchanged, and
// Teuchos::null is supplied for the domain transfer argument (this overload
// has no separate domain exporter).
8746 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8747 void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
8748  exportAndFillComplete(Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>>& destMatrix,
8749  const export_type& exporter,
8750  const Teuchos::RCP<const map_type>& domainMap,
8751  const Teuchos::RCP<const map_type>& rangeMap,
8752  const Teuchos::RCP<Teuchos::ParameterList>& params) const {
8753  transferAndFillComplete(destMatrix, exporter, Teuchos::null, domainMap, rangeMap, params);
8754 }
8755 
// Export from this matrix into destMatrix using `rowExporter`, with a
// separate `domainExporter`, and make the result fill complete.
//
// Thin forwarding wrapper around transferAndFillComplete. domainExporter is
// received by reference and wrapped with Teuchos::rcpFromRef so it can be
// handed over as an RCP; every other argument is passed through unchanged.
8756 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8757 void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
8758  exportAndFillComplete(Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>>& destMatrix,
8759  const export_type& rowExporter,
8760  const export_type& domainExporter,
8761  const Teuchos::RCP<const map_type>& domainMap,
8762  const Teuchos::RCP<const map_type>& rangeMap,
8763  const Teuchos::RCP<Teuchos::ParameterList>& params) const {
8764  transferAndFillComplete(destMatrix, rowExporter, Teuchos::rcpFromRef(domainExporter), domainMap, rangeMap, params);
8765 }
8766 
8767 } // namespace Tpetra
8768 
8769 //
8770 // Explicit instantiation macro
8771 //
8772 // Must be expanded from within the Tpetra namespace!
8773 //
8774 
// Expands to an explicit instantiation definition of the class template
// CrsMatrix for the scalar type SCALAR, local ordinal LO, global ordinal GO
// and node type NODE. CrsMatrix is named unqualified, so the macro has to be
// expanded where that name is visible (inside namespace Tpetra).
8775 #define TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR, LO, GO, NODE) \
8776  \
8777  template class CrsMatrix<SCALAR, LO, GO, NODE>;
8778 
// Expands to an explicit instantiation of the member template
// CrsMatrix<SI, LO, GO, NODE>::convert<SO>() const, whose return type is
// Teuchos::RCP<CrsMatrix<SO, LO, GO, NODE>>. SI is the scalar type of the
// matrix convert() is called on; SO is the scalar type of the returned matrix.
// Note the parameter order: SO comes first, then SI.
8779 #define TPETRA_CRSMATRIX_CONVERT_INSTANT(SO, SI, LO, GO, NODE) \
8780  \
8781  template Teuchos::RCP<CrsMatrix<SO, LO, GO, NODE>> \
8782  CrsMatrix<SI, LO, GO, NODE>::convert<SO>() const;
8783 
// Expands to a `template <>` declaration (no function body) of
// importAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in the
// single-importer form: (sourceMatrix, importer, domainMap, rangeMap, params),
// returning Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>.
// The Import and Map template arguments are spelled through the matrix's own
// local_ordinal_type / global_ordinal_type / node_type typedefs rather than
// through LO / GO / NODE directly.
8784 #define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
8785  template <> \
8786  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>> \
8787  importAndFillCompleteCrsMatrix(const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE>>& sourceMatrix, \
8788  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8789  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8790  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& importer, \
8791  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8792  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8793  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& domainMap, \
8794  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8795  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8796  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& rangeMap, \
8797  const Teuchos::RCP<Teuchos::ParameterList>& params);
8798 
// Expands to a `template <>` declaration (no function body) of
// importAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in the
// two-importer form: (sourceMatrix, rowImporter, domainImporter, domainMap,
// rangeMap, params), returning Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>.
// The Import and Map template arguments are spelled through the matrix's own
// local_ordinal_type / global_ordinal_type / node_type typedefs.
8799 #define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
8800  template <> \
8801  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>> \
8802  importAndFillCompleteCrsMatrix(const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE>>& sourceMatrix, \
8803  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8804  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8805  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowImporter, \
8806  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8807  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8808  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainImporter, \
8809  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8810  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8811  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& domainMap, \
8812  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8813  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8814  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& rangeMap, \
8815  const Teuchos::RCP<Teuchos::ParameterList>& params);
8816 
// Expands to a `template <>` declaration (no function body) of
// exportAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in the
// single-exporter form: (sourceMatrix, exporter, domainMap, rangeMap, params),
// returning Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>.
// The Export and Map template arguments are spelled through the matrix's own
// local_ordinal_type / global_ordinal_type / node_type typedefs.
8817 #define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
8818  template <> \
8819  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>> \
8820  exportAndFillCompleteCrsMatrix(const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE>>& sourceMatrix, \
8821  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8822  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8823  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& exporter, \
8824  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8825  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8826  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& domainMap, \
8827  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8828  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8829  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& rangeMap, \
8830  const Teuchos::RCP<Teuchos::ParameterList>& params);
8831 
// Expands to a `template <>` declaration (no function body) of
// exportAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in the
// two-exporter form: (sourceMatrix, rowExporter, domainExporter, domainMap,
// rangeMap, params), returning Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>.
// The Export and Map template arguments are spelled through the matrix's own
// local_ordinal_type / global_ordinal_type / node_type typedefs.
8832 #define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
8833  template <> \
8834  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>> \
8835  exportAndFillCompleteCrsMatrix(const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE>>& sourceMatrix, \
8836  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8837  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8838  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowExporter, \
8839  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8840  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8841  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainExporter, \
8842  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8843  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8844  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& domainMap, \
8845  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8846  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8847  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& rangeMap, \
8848  const Teuchos::RCP<Teuchos::ParameterList>& params);
8849 
// Convenience bundle for one (SCALAR, LO, GO, NODE) combination: expands, in
// this order, the class instantiation macro, the single-importer and
// single-exporter macros, and then the two-importer and two-exporter macros.
// TPETRA_CRSMATRIX_CONVERT_INSTANT is not part of this bundle (it takes two
// scalar types).
8850 #define TPETRA_CRSMATRIX_INSTANT(SCALAR, LO, GO, NODE) \
8851  TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR, LO, GO, NODE) \
8852  TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
8853  TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
8854  TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
8855  TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE)
8856 
8857 #endif // TPETRA_CRSMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Teuchos::RCP< const map_type > getRowMap() const override
Returns the Map that describes the row distribution in this graph.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getDomainMap() const =0
The Map associated with the domain of this operator, which must be compatible with X...
bool hasColMap() const override
Whether the matrix has a well-defined column Map.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
bool indicesAreSorted_
Whether the graph's indices are sorted in each row, on this process.
global_size_t getGlobalNumCols() const override
The number of global columns in the matrix.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Functor for the ABSMAX CombineMode of Import and Export operations.
void checkInternalState() const
Check that this object's state is sane; throw if it's not.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void copyOffsets(const OutputViewType &dst, const InputViewType &src)
Copy row offsets (in a sparse graph or matrix) from src to dst. The offsets may have different types...
CrsGraph< LocalOrdinal, GlobalOrdinal, Node > crs_graph_type
The CrsGraph specialization suitable for this CrsMatrix specialization.
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
virtual LocalOrdinal replaceGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceGlobalValues.
virtual bool supportsRowViews() const override
Return true if getLocalRowView() and getGlobalRowView() are valid for this object.
mag_type getNormInf() const
Compute and return the infinity norm of the matrix.
void replaceDomainMapAndImporter(const Teuchos::RCP< const map_type > &newDomainMap, Teuchos::RCP< const import_type > &newImporter)
Replace the current domain Map and Import with the given objects.
local_inds_dualv_type::t_host::const_type getLocalIndsViewHost(const RowInfo &rowinfo) const
Get a const, locally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myRo...
void merge2(IT1 &indResultOut, IT2 &valResultOut, IT1 indBeg, IT1 indEnd, IT2 valBeg, IT2)
Merge values in place, additively, with the same index.
static size_t mergeRowIndicesAndValues(size_t rowLen, local_ordinal_type *cols, impl_scalar_type *vals)
Merge duplicate row indices in the given row, along with their corresponding values.
void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row...
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
size_t insertGlobalIndicesImpl(const local_ordinal_type lclRow, const global_ordinal_type inputGblColInds[], const size_t numInputInds)
Insert global indices, using an input local row index.
std::map< GlobalOrdinal, std::pair< Teuchos::Array< GlobalOrdinal >, Teuchos::Array< Scalar > > > nonlocals_
Nonlocal data added using insertGlobalValues().
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given input buffer.
void sortAndMergeIndicesAndValues(const bool sorted, const bool merged)
Sort and merge duplicate local column indices in all rows on the calling process, along with their co...
typename device_type::execution_space execution_space
The Kokkos execution space.
void packNew(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< char *, buffer_device_type > &exports, const Kokkos::DualView< size_t *, buffer_device_type > &numPacketsPerLID, size_t &constantNumPackets) const
Pack this object's data for an Import or Export.
virtual void insertGlobalValuesImpl(crs_graph_type &graph, RowInfo &rowInfo, const GlobalOrdinal gblColInds[], const impl_scalar_type vals[], const size_t numInputEnt)
Common implementation detail of insertGlobalValues and insertGlobalValuesFiltered.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
void putScalar(const Scalar &value)
Set all values in the multivector to the given value.
size_t getNumVectors() const
Number of columns in the multivector.
void getGlobalRowView(GlobalOrdinal GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting view of a row of this matrix, using global row and column indices...
size_t getLocalLength() const
Local number of rows on the calling process.
Declaration of a function that prints strings from each process.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
mag_type getNorm1(bool assumeSymmetric=false) const
Compute and return the 1-norm of the matrix.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
Traits class for packing / unpacking data of type T.
void replaceRangeMapAndExporter(const Teuchos::RCP< const map_type > &newRangeMap, Teuchos::RCP< const export_type > &newExporter)
Replace the current Range Map and Export with the given objects.
virtual size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const =0
The current number of entries on the calling process in the specified local row.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
size_t getLocalNumCols() const override
The number of columns connected to the locally owned rows of this matrix.
Declare and define Tpetra::Details::copyOffsets, an implementation detail of Tpetra (in particular...
Declaration of Tpetra::Details::EquilibrationInfo.
void fillLocalGraphAndMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local graph and matrix.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
void leftScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the left with the given Vector.
bool noRedundancies_
Whether the graph's indices are non-redundant (merged) in each row, on this process.
GlobalOrdinal global_ordinal_type
The type of each global index in the matrix.
void padCrsArrays(const RowPtr &rowPtrBeg, const RowPtr &rowPtrEnd, Indices &indices_wdv, const Padding &padding, const int my_rank, const bool verbose)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries...
bool isDistributed() const
Whether this is a globally distributed object.
void reindexColumns(crs_graph_type *const graph, const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix's graph, as a CrsGraph.
global_inds_dualv_type::t_host::const_type getGlobalIndsViewHost(const RowInfo &rowinfo) const
Get a const, globally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myR...
static bool debug()
Whether Tpetra is in debug mode.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, over all processes in the matrix's communicator...
Kokkos::View< size_t *, Kokkos::LayoutLeft, device_type >::host_mirror_type num_row_entries_type
Row offsets for "1-D" storage.
void applyNonTranspose(const MV &X_in, MV &Y_in, Scalar alpha, Scalar beta) const
Special case of apply() for mode == Teuchos::NO_TRANS.
Teuchos::RCP< CrsMatrix< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another CrsMatrix with the same entries, but converted to a different Scalar type T...
Scalar scalar_type
The type of each entry in the matrix.
Allocation information for a locally owned row in a CrsGraph or CrsMatrix.
void swap(CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &matrix)
Swaps the data from *this with the data and maps from matrix.
void fillLocalMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local matrix.
local_inds_wdv_type lclIndsUnpacked_wdv
Local ordinals of column indices for all rows Valid when isLocallyIndexed is true If OptimizedStorage...
void verbosePrintArray(std::ostream &out, const ArrayType &x, const char name[], const size_t maxNumToPrint)
Print min(x.size(), maxNumToPrint) entries of x.
bool isGloballyIndexed() const override
Whether the graph's column indices are stored as global indices.
void leftScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Left-scale a KokkosSparse::CrsMatrix.
Details::EquilibrationInfo< typename Kokkos::ArithTraits< SC >::val_type, typename NT::device_type > computeRowOneNorms(const Tpetra::RowMatrix< SC, LO, GO, NT > &A)
Compute global row one-norms ("row sums") of the input sparse matrix A, in a way suitable for one-sid...
Teuchos_Ordinal Array_size_type
Size type for Teuchos Array objects.
void globalAssemble()
Communicate nonlocal contributions to other processes.
void getLocalDiagCopy(Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &diag) const override
Get a copy of the diagonal entries of the matrix.
size_t findGlobalIndices(const RowInfo &rowInfo, const Teuchos::ArrayView< const global_ordinal_type > &indices, std::function< void(const size_t, const size_t, const size_t)> fun) const
Finds indices in the given row.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
virtual LocalOrdinal sumIntoLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoLocalValues.
void sort(View &view, const size_t &size)
Convenience wrapper for std::sort for host-accessible views.
Details::EquilibrationInfo< typename Kokkos::ArithTraits< SC >::val_type, typename NT::device_type > computeRowAndColumnOneNorms(const Tpetra::RowMatrix< SC, LO, GO, NT > &A, const bool assumeSymmetric)
Compute global row and column one-norms ("row sums" and "column sums") of the input sparse matrix A...
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator, in rank order.
bool isFillActive() const
Whether the matrix is not fill complete.
Teuchos::RCP< MV > importMV_
Column Map MultiVector used in apply().
Declare and define Tpetra::Details::copyConvert, an implementation detail of Tpetra (in particular...
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRangeMap() const =0
The Map associated with the range of this operator, which must be compatible with Y...
size_t global_size_t
Global size_t object.
bool hasTransposeApply() const override
Whether apply() allows applying the transpose or conjugate transpose.
void reindexColumns(const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortIndicesInEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on host. This requires that ther...
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
void exportAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const export_type &exporter, const Teuchos::RCP< const map_type > &domainMap=Teuchos::null, const Teuchos::RCP< const map_type > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Export from this to the given destination matrix, and make the result fill complete.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
Insert new values that don't currently exist.
values_dualv_type::t_dev::const_type getValuesViewDevice(const RowInfo &rowinfo) const
Get a const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow)...
bool isFillComplete() const override
Whether the matrix is fill complete.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
void importAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const import_type &importer, const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Import from this to the given destination matrix, and make the result fill complete.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const =0
Get a copy of the given global row's entries.
Teuchos::RCP< const map_type > rowMap_
The Map describing the distribution of rows of the graph.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
num_row_entries_type k_numRowEntries_
The number of local entries in each locally owned row.
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
Functions for manipulating CRS arrays.
GlobalOrdinal getIndexBase() const override
The index base for global indices for this matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2, const bool stableSort=false)
Sort the first array, and apply the resulting permutation to the second array.
Teuchos::RCP< MV > getColumnMapMultiVector(const MV &X_domainMap, const bool force=false) const
Create a (or fetch a cached) column Map MultiVector.
#define TPETRA_ABUSE_WARNING(throw_exception_test, Exception, msg)
Handle an abuse warning, according to HAVE_TPETRA_THROW_ABUSE_WARNINGS and HAVE_TPETRA_PRINT_ABUSE_WA...
bool isNodeGlobalElement(global_ordinal_type globalIndex) const
Whether the given global index is owned by this Map on the calling process.
void packCrsMatrixNew(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports, const Kokkos::DualView< size_t *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &numPacketsPerLID, const Kokkos::DualView< const LO *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication, for "new" DistObject inter...
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const override
Print this object with the given verbosity level to the given output stream.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
static bool verbose()
Whether Tpetra is in verbose mode.
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< char *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode CM) override
Unpack the imported column indices and values, and combine into matrix.
bool isFillComplete() const override
Whether fillComplete() has been called and the graph is in compute mode.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > createOneToOne(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node >> &M)
Nonmember constructor for a one-to-one version of the given Map, in which each global index is owned by exactly one process.
RowInfo getRowInfoFromGlobalRowIndex(const global_ordinal_type gblRow) const
Get information about the locally owned row with global index gblRow.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Utility functions for packing and unpacking sparse matrix entries.
void copyConvert(const OutputViewType &dst, const InputViewType &src)
Copy values from the 1-D Kokkos::View src, to the 1-D Kokkos::View dst, of the same length...
virtual Teuchos::RCP< RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params) const override
Implementation of RowMatrix::add: return alpha*A + beta*this.
bool fillComplete_
Whether the matrix is fill complete.
local_ordinal_type replaceGlobalValues(const global_ordinal_type globalRow, const Kokkos::View< const global_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries' values, using global indices.
Replace old value with maximum of magnitudes of old and new values.
virtual LocalOrdinal sumIntoGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoGlobalValues.
Abstract base class for objects that can be the source of an Import or Export operation.
local_ordinal_type sumIntoLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using local row and column indices.
typename Node::device_type device_type
The Kokkos device type.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Number of entries in the sparse matrix in the given local row, on the calling (MPI) process...
static LocalMapType::local_ordinal_type getDiagCopyWithoutOffsets(const DiagType &D, const LocalMapType &rowMap, const LocalMapType &colMap, const CrsMatrixType &A)
Given a locally indexed, local sparse matrix, and corresponding local row and column Maps...
Teuchos::RCP< MV > getRowMapMultiVector(const MV &Y_rangeMap, const bool force=false) const
Create (or fetch a cached) row Map MultiVector.
std::string description() const override
A one-line description of this object.
void getLocalDiagOffsets(Teuchos::ArrayRCP< size_t > &offsets) const
Get offsets of the diagonal entries in the matrix.
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
Compute a sparse matrix-MultiVector multiply.
size_t getLocalNumEntries() const override
The local number of entries in this matrix.
Replace existing values with new values.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the range of this graph.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
Replace old values with zero.
const row_ptrs_host_view_type & getRowPtrsUnpackedHost() const
Get the unpacked row pointers on host. Lazily make a copy from device.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
static size_t rowImbalanceThreshold()
Threshold for deciding if a local matrix is "imbalanced" in the number of entries per row...
bool isLocallyComplete() const
Is this Export or Import locally complete?
virtual LocalOrdinal replaceLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceLocalValues.
Declaration and definition of Tpetra::Details::leftScaleLocalCrsMatrix.
RowInfo getRowInfo(const local_ordinal_type myRow) const
Get information about the locally owned row with local index myRow.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRowMap() const =0
The Map that describes the distribution of rows over processes.
values_dualv_type::t_dev getValuesViewDeviceNonConst(const RowInfo &rowinfo)
Get a non-const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myR...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap) override
Remove processes owning zero rows from the Maps and their communicator.
virtual bool checkSizes(const SrcDistObject &source) override
Compare the source and target (this) objects for compatibility.
local_map_type getLocalMap() const
Get the LocalMap for Kokkos-Kernels.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
A parallel distribution of indices over processes.
void getLocalRowCopy(LocalOrdinal LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row...
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object ("forward mode").
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Teuchos::ArrayView< typename DualViewType::t_dev::value_type > getArrayViewFromDualView(const DualViewType &x)
Get a Teuchos::ArrayView which views the host Kokkos::View of the input 1-D Kokkos::DualView.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.
void insertLocalValues(const LocalOrdinal localRow, const Teuchos::ArrayView< const LocalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const CombineMode CM=ADD)
Insert one or more entries into the matrix, using local column indices.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
Details::EStorageStatus storageStatus_
Status of the matrix's storage, when not in a fill-complete state.
A read-only, row-oriented interface to a sparse matrix.
void rightScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the right with the given Vector.
void replaceDomainMap(const Teuchos::RCP< const map_type > &newDomainMap)
Replace the current domain Map with the given Map.
Scalar operator()(const Scalar &x, const Scalar &y)
Return the maximum of the magnitudes (absolute values) of x and y.
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
void getLocalRowView(LocalOrdinal LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant view of a row of this matrix, using local row and column indices.
values_dualv_type::t_host::const_type getValuesViewHost(const RowInfo &rowinfo) const
Get a const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow).
bool isLocallyIndexed() const override
Whether the graph's column indices are stored as local indices.
A distributed dense vector.
Declaration of Tpetra::Details::iallreduce.
void reduce()
Sum values of a locally replicated multivector across all processes.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
std::shared_ptr< CommRequest > iallreduce(const InputViewType &sendbuf, const OutputViewType &recvbuf, const ::Teuchos::EReductionType op, const ::Teuchos::Comm< int > &comm)
Nonblocking all-reduce, for either rank-1 or rank-0 Kokkos::View objects.
void applyTranspose(const MV &X_in, MV &Y_in, const Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Special case of apply() for mode != Teuchos::NO_TRANS.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
void allocateValues(ELocalGlobal lg, GraphAllocationStatus gas, const bool verbose)
Allocate values (and optionally indices) using the Node.
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
size_t getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const override
Number of entries in the sparse matrix in the given global row, on the calling (MPI) process...
static size_t verbosePrintCountThreshold()
Number of entries below which arrays, lists, etc. will be printed in debug mode.
local_matrix_device_type::values_type::const_type getLocalValuesDevice(Access::ReadOnlyStruct s) const
Get the Kokkos local values on device, read only.
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
local_ordinal_type replaceLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries' values, using local row and column indices.
Declaration and definition of Tpetra::Details::rightScaleLocalCrsMatrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
void replaceRangeMap(const Teuchos::RCP< const map_type > &newRangeMap)
Replace the current range Map with the given Map.
Teuchos::RCP< const map_type > colMap_
The Map describing the distribution of columns of the graph.
LocalOrdinal local_ordinal_type
The type of each local index in the matrix.
mag_type getFrobeniusNorm() const override
Compute and return the Frobenius norm of the matrix.
std::unique_ptr< std::string > createPrefix(const int myRank, const char prefix[])
Create string prefix for each line of verbose output.
Definition: Tpetra_Util.cpp:69
Accumulate new values into existing values (may not be supported in all classes)
void localApply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, const Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar &alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar &beta=Teuchos::ScalarTraits< Scalar >::zero()) const
Compute the local part of a sparse matrix-(Multi)Vector multiply.
void rightScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Right-scale a KokkosSparse::CrsMatrix.
bool isStorageOptimized() const
Returns true if storage has been optimized.
Description of Tpetra's behavior.
virtual void copyAndPermute(const SrcDistObject &source, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
values_dualv_type::t_host getValuesViewHostNonConst(const RowInfo &rowinfo)
Get a non-const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
Teuchos::RCP< MV > exportMV_
Row Map MultiVector used in apply().
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
Declaration of Tpetra::computeRowAndColumnOneNorms.
global_size_t getGlobalNumEntries() const override
The global number of entries in this matrix.