Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Tpetra_CrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_CRSMATRIX_DEF_HPP
43 #define TPETRA_CRSMATRIX_DEF_HPP
44 
52 
53 #include "Tpetra_LocalCrsMatrixOperator.hpp"
54 #include "Tpetra_Import_Util.hpp"
55 #include "Tpetra_Import_Util2.hpp"
56 #include "Tpetra_RowMatrix.hpp"
57 
63 #include "Tpetra_Details_gathervPrint.hpp"
64 #include "Tpetra_Details_getDiagCopyWithoutOffsets.hpp"
65 #include "Tpetra_Details_leftScaleLocalCrsMatrix.hpp"
67 #include "Tpetra_Details_rightScaleLocalCrsMatrix.hpp"
68 #include "KokkosSparse_getDiagCopy.hpp"
69 #include "Tpetra_Details_copyConvert.hpp"
72 #include "Tpetra_Details_packCrsMatrix.hpp"
73 #include "Tpetra_Details_unpackCrsMatrixAndCombine.hpp"
75 #include "Teuchos_FancyOStream.hpp"
76 #include "Teuchos_RCP.hpp"
77 #include "Teuchos_DataAccess.hpp"
78 #include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
79 
80 #include <memory>
81 #include <sstream>
82 #include <typeinfo>
83 #include <utility>
84 #include <vector>
85 
86 using Teuchos::rcpFromRef;
87 
88 namespace Tpetra {
89 
90 namespace { // (anonymous)
91 
92  template<class T, class BinaryFunction>
93  T atomic_binary_function_update (volatile T* const dest,
94  const T& inputVal,
95  BinaryFunction f)
96  {
97  T oldVal = *dest;
98  T assume;
99 
100  // NOTE (mfh 30 Nov 2015) I do NOT need a fence here for IBM
101  // POWER architectures, because 'newval' depends on 'assume',
102  // which depends on 'oldVal', which depends on '*dest'. This
103  // sets up a chain of read dependencies that should ensure
104  // correct behavior given a sane memory model.
105  do {
106  assume = oldVal;
107  T newVal = f (assume, inputVal);
108  oldVal = Kokkos::atomic_compare_exchange (dest, assume, newVal);
109  } while (assume != oldVal);
110 
111  return oldVal;
112  }
113 } // namespace (anonymous)
114 
115 //
116 // Users must never rely on anything in the Details namespace.
117 //
118 namespace Details {
119 
129 template<class Scalar>
130 struct AbsMax {
132  Scalar operator() (const Scalar& x, const Scalar& y) {
133  typedef Teuchos::ScalarTraits<Scalar> STS;
134  return std::max (STS::magnitude (x), STS::magnitude (y));
135  }
136 };
137 
138 } // namespace Details
139 } // namespace Tpetra
140 
141 namespace Tpetra {
142 
143  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
144  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
145  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
146  size_t maxNumEntriesPerRow,
147  const ProfileType pftype,
148  const Teuchos::RCP<Teuchos::ParameterList>& params) :
149  dist_object_type (rowMap),
150  storageStatus_ (pftype == StaticProfile ?
151  ::Tpetra::Details::STORAGE_1D_UNPACKED :
152  ::Tpetra::Details::STORAGE_2D),
153  fillComplete_ (false),
154  frobNorm_ (-STM::one ())
155  {
156  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, size_t, "
157  "ProfileType[, RCP<ParameterList>]): ";
158  Teuchos::RCP<crs_graph_type> graph;
159  try {
160  graph = Teuchos::rcp (new crs_graph_type (rowMap, maxNumEntriesPerRow,
161  pftype, params));
162  }
163  catch (std::exception& e) {
164  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
165  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
166  "size_t, ProfileType[, RCP<ParameterList>]) threw an exception: "
167  << e.what ());
168  }
169  // myGraph_ not null means that the matrix owns the graph. That's
170  // different than the const CrsGraph constructor, where the matrix
171  // does _not_ own the graph.
172  myGraph_ = graph;
173  staticGraph_ = myGraph_;
174  resumeFill (params);
176  }
177 
178  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
180  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
181  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
182  const ProfileType pftype,
183  const Teuchos::RCP<Teuchos::ParameterList>& params) :
184  dist_object_type (rowMap),
185  storageStatus_ (pftype == StaticProfile ?
186  ::Tpetra::Details::STORAGE_1D_UNPACKED :
187  ::Tpetra::Details::STORAGE_2D),
188  fillComplete_ (false),
189  frobNorm_ (-STM::one ())
190  {
191  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
192  "ArrayView<const size_t>, ProfileType[, RCP<ParameterList>]): ";
193  Teuchos::RCP<crs_graph_type> graph;
194  try {
195  graph = Teuchos::rcp (new crs_graph_type (rowMap, numEntPerRowToAlloc,
196  pftype, params));
197  }
198  catch (std::exception &e) {
199  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
200  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
201  "ArrayView<const size_t>, ProfileType[, RCP<ParameterList>]) threw "
202  "an exception: " << e.what ());
203  }
204  // myGraph_ not null means that the matrix owns the graph. That's
205  // different than the const CrsGraph constructor, where the matrix
206  // does _not_ own the graph.
207  myGraph_ = graph;
208  staticGraph_ = graph;
209  resumeFill (params);
211  }
212 
213 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
214  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
216  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
217  const Teuchos::ArrayRCP<const size_t>& numEntPerRowToAlloc,
218  const ProfileType pftype,
219  const Teuchos::RCP<Teuchos::ParameterList>& params) :
220  dist_object_type (rowMap),
221  storageStatus_ (pftype == StaticProfile ?
222  ::Tpetra::Details::STORAGE_1D_UNPACKED :
223  ::Tpetra::Details::STORAGE_2D),
224  fillComplete_ (false),
225  frobNorm_ (-STM::one ())
226  {
227  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
228  "ArrayRCP<const size_t>, ProfileType[, RCP<ParameterList>]): ";
229  Teuchos::RCP<crs_graph_type> graph;
230  try {
231  graph = Teuchos::rcp (new crs_graph_type (rowMap, numEntPerRowToAlloc (),
232  pftype, params));
233  }
234  catch (std::exception &e) {
235  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
236  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
237  "ArrayView<const size_t>, ProfileType[, RCP<ParameterList>]) threw "
238  "an exception: " << e.what ());
239  }
240  // myGraph_ not null means that the matrix owns the graph. That's
241  // different than the const CrsGraph constructor, where the matrix
242  // does _not_ own the graph.
243  myGraph_ = graph;
244  staticGraph_ = graph;
245  resumeFill (params);
247  }
248 #endif // TPETRA_ENABLE_DEPRECATED_CODE
249 
250  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
251  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
252  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
253  const Teuchos::RCP<const map_type>& colMap,
254  const size_t maxNumEntPerRow,
255  const ProfileType pftype,
256  const Teuchos::RCP<Teuchos::ParameterList>& params) :
257  dist_object_type (rowMap),
258  storageStatus_ (pftype == StaticProfile ?
259  ::Tpetra::Details::STORAGE_1D_UNPACKED :
260  ::Tpetra::Details::STORAGE_2D),
261  fillComplete_ (false),
262  frobNorm_ (-STM::one ())
263  {
264  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, RCP<const Map>, "
265  "size_t, ProfileType[, RCP<ParameterList>]): ";
266 
267 #ifdef HAVE_TPETRA_DEBUG
268  // An artifact of debugging something a while back.
269  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
270  (! staticGraph_.is_null (), std::logic_error,
271  "staticGraph_ is not null at the beginning of the constructor. "
272  "Please report this bug to the Tpetra developers.");
273  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
274  (! myGraph_.is_null (), std::logic_error,
275  "myGraph_ is not null at the beginning of the constructor. "
276  "Please report this bug to the Tpetra developers.");
277 #endif // HAVE_TPETRA_DEBUG
278 
279  Teuchos::RCP<crs_graph_type> graph;
280  try {
281  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
282  maxNumEntPerRow,
283  pftype, params));
284  }
285  catch (std::exception &e) {
286  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
287  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
288  "RCP<const Map>, size_t, ProfileType[, RCP<ParameterList>]) threw an "
289  "exception: " << e.what ());
290  }
291  // myGraph_ not null means that the matrix owns the graph. That's
292  // different than the const CrsGraph constructor, where the matrix
293  // does _not_ own the graph.
294  myGraph_ = graph;
295  staticGraph_ = myGraph_;
296  resumeFill (params);
298  }
299 
300  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
302  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
303  const Teuchos::RCP<const map_type>& colMap,
304  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
305  const ProfileType pftype,
306  const Teuchos::RCP<Teuchos::ParameterList>& params) :
307  dist_object_type (rowMap),
308  storageStatus_ (pftype == StaticProfile ?
309  ::Tpetra::Details::STORAGE_1D_UNPACKED :
310  ::Tpetra::Details::STORAGE_2D),
311  fillComplete_ (false),
312  frobNorm_ (-STM::one ())
313  {
314  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, RCP<const Map>, "
315  "ArrayView<const size_t>, ProfileType[, RCP<ParameterList>]): ";
316  Teuchos::RCP<crs_graph_type> graph;
317  try {
318  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
319  numEntPerRowToAlloc,
320  pftype, params));
321  }
322  catch (std::exception &e) {
323  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
324  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
325  "RCP<const Map>, ArrayView<const size_t>, ProfileType[, "
326  "RCP<ParameterList>]) threw an exception: " << e.what ());
327  }
328  // myGraph_ not null means that the matrix owns the graph. That's
329  // different than the const CrsGraph constructor, where the matrix
330  // does _not_ own the graph.
331  myGraph_ = graph;
332  staticGraph_ = graph;
333  resumeFill (params);
335  }
336 
337 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
338  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
340  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
341  const Teuchos::RCP<const map_type>& colMap,
342  const Teuchos::ArrayRCP<const size_t>& numEntPerRowToAlloc,
343  const ProfileType pftype,
344  const Teuchos::RCP<Teuchos::ParameterList>& params) :
345  dist_object_type (rowMap),
346  storageStatus_ (pftype == StaticProfile ?
347  ::Tpetra::Details::STORAGE_1D_UNPACKED :
348  ::Tpetra::Details::STORAGE_2D),
349  fillComplete_ (false),
350  frobNorm_ (-STM::one ())
351  {
352  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, RCP<const Map>, "
353  "ArrayRCP<const size_t>, ProfileType[, RCP<ParameterList>]): ";
354  Teuchos::RCP<crs_graph_type> graph;
355  try {
356  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
357  numEntPerRowToAlloc (),
358  pftype, params));
359  }
360  catch (std::exception &e) {
361  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
362  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
363  "RCP<const Map>, ArrayView<const size_t>, ProfileType[, "
364  "RCP<ParameterList>]) threw an exception: " << e.what ());
365  }
366  // myGraph_ not null means that the matrix owns the graph. That's
367  // different than the const CrsGraph constructor, where the matrix
368  // does _not_ own the graph.
369  myGraph_ = graph;
370  staticGraph_ = graph;
371  resumeFill (params);
373  }
374 #endif // TPETRA_ENABLE_DEPRECATED_CODE
375 
376  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
377  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
378  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
379  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
380  dist_object_type (graph->getRowMap ()),
381  staticGraph_ (graph),
382  storageStatus_ (::Tpetra::Details::STORAGE_1D_PACKED),
383  fillComplete_ (false),
384  frobNorm_ (-STM::one ())
385  {
386  typedef typename local_matrix_type::values_type values_type;
387  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>[, "
388  "RCP<ParameterList>]): ";
389  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
390  (graph.is_null (), std::runtime_error, "Input graph is null.");
391  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
392  (! graph->isFillComplete (), std::runtime_error, "Input graph is not "
393  "fill complete. You must call fillComplete on the graph before using "
394  "it to construct a CrsMatrix. Note that calling resumeFill on the "
395  "graph makes it not fill complete, even if you had previously called "
396  "fillComplete. In that case, you must call fillComplete on the graph "
397  "again.");
398 
399  // The graph is fill complete, so it is locally indexed and has a
400  // fixed structure. This means we can allocate the (1-D) array of
401  // values and build the local matrix right now. Note that the
402  // local matrix's number of columns comes from the column Map, not
403  // the domain Map.
404 
405  const size_t numCols = graph->getColMap ()->getNodeNumElements ();
406  auto lclGraph = graph->getLocalGraph ();
407  const size_t numEnt = lclGraph.entries.extent (0);
408  values_type val ("Tpetra::CrsMatrix::val", numEnt);
409 
410  auto lclMat = std::make_shared<local_matrix_type>
411  ("Tpetra::CrsMatrix::lclMatrix_", numCols, val, lclGraph);
412  lclMatrix_ = std::make_shared<local_multiply_op_type> (lclMat);
413 
414  // FIXME (22 Jun 2016) I would very much like to get rid of
415  // k_values1D_ at some point. I find it confusing to have all
416  // these extra references lying around.
417  k_values1D_ = lclMat->values;
418 
420  }
421 
422  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
424  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
425  const typename local_matrix_type::values_type& values,
426  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
427  dist_object_type (graph->getRowMap ()),
428  staticGraph_ (graph),
429  storageStatus_ (::Tpetra::Details::STORAGE_1D_PACKED),
430  fillComplete_ (false),
431  frobNorm_ (-STM::one ())
432  {
433  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>,local_matrix_type::values_type,[, "
434  "RCP<ParameterList>]): ";
435  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
436  (graph.is_null (), std::runtime_error, "Input graph is null.");
437  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
438  (! graph->isFillComplete (), std::runtime_error, "Input graph is not "
439  "fill complete. You must call fillComplete on the graph before using "
440  "it to construct a CrsMatrix. Note that calling resumeFill on the "
441  "graph makes it not fill complete, even if you had previously called "
442  "fillComplete. In that case, you must call fillComplete on the graph "
443  "again.");
444 
445  // The graph is fill complete, so it is locally indexed and has a
446  // fixed structure. This means we can allocate the (1-D) array of
447  // values and build the local matrix right now. Note that the
448  // local matrix's number of columns comes from the column Map, not
449  // the domain Map.
450 
451  const size_t numCols = graph->getColMap ()->getNodeNumElements ();
452  auto lclGraph = graph->getLocalGraph ();
453 
454  auto lclMat = std::make_shared<local_matrix_type>
455  ("Tpetra::CrsMatrix::lclMatrix_", numCols, values, lclGraph);
456  lclMatrix_ = std::make_shared<local_multiply_op_type> (lclMat);
457 
458  // FIXME (22 Jun 2016) I would very much like to get rid of
459  // k_values1D_ at some point. I find it confusing to have all
460  // these extra references lying around.
461  k_values1D_ = lclMat->values;
462 
464  }
465 
466 
467 
468  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
470  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
471  const Teuchos::RCP<const map_type>& colMap,
472  const typename local_matrix_type::row_map_type& rowPointers,
473  const typename local_graph_type::entries_type::non_const_type& columnIndices,
474  const typename local_matrix_type::values_type& values,
475  const Teuchos::RCP<Teuchos::ParameterList>& params) :
476  dist_object_type (rowMap),
477  storageStatus_ (::Tpetra::Details::STORAGE_1D_PACKED),
478  fillComplete_ (false),
479  frobNorm_ (-STM::one ())
480  {
481  using Teuchos::RCP;
482  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
483  "RCP<const Map>, ptr, ind, val[, params]): ";
484  const char suffix[] = ". Please report this bug to the Tpetra developers.";
485 
486  // Check the user's input. Note that this might throw only on
487  // some processes but not others, causing deadlock. We prefer
488  // deadlock due to exceptions to segfaults, because users can
489  // catch exceptions.
490  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
491  (values.extent (0) != columnIndices.extent (0),
492  std::invalid_argument, "Input arrays don't have matching dimensions. "
493  "values.extent(0) = " << values.extent (0) << " != "
494  "columnIndices.extent(0) = " << columnIndices.extent (0) << ".");
495 #ifdef HAVE_TPETRA_DEBUG
496  if (rowPointers.extent (0) != 0) {
497  const size_t numEnt =
498  ::Tpetra::Details::getEntryOnHost (rowPointers, rowPointers.extent (0) - 1);
499  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
500  (numEnt != static_cast<size_t> (columnIndices.extent (0)) ||
501  numEnt != static_cast<size_t> (values.extent (0)),
502  std::invalid_argument, "Last entry of rowPointers says that the matrix"
503  " has " << numEnt << " entr" << (numEnt != 1 ? "ies" : "y") << ", but "
504  "the dimensions of columnIndices and values don't match this. "
505  "columnIndices.extent(0) = " << columnIndices.extent (0) <<
506  " and values.extent(0) = " << values.extent (0) << ".");
507  }
508 #endif // HAVE_TPETRA_DEBUG
509 
510  RCP<crs_graph_type> graph;
511  try {
512  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, rowPointers,
513  columnIndices, params));
514  }
515  catch (std::exception& e) {
516  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
517  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
518  "RCP<const Map>, ptr, ind[, params]) threw an exception: "
519  << e.what ());
520  }
521  // The newly created CrsGraph _must_ have a local graph at this
522  // point. We don't really care whether CrsGraph's constructor
523  // deep-copies or shallow-copies the input, but the dimensions
524  // have to be right. That's how we tell whether the CrsGraph has
525  // a local graph.
526  auto lclGraph = graph->getLocalGraph ();
527  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
528  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
529  lclGraph.entries.extent (0) != columnIndices.extent (0),
530  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, "
531  "ind[, params]) did not set the local graph correctly." << suffix);
532  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
533  (lclGraph.entries.extent (0) != values.extent (0),
534  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, ind[, "
535  "params]) did not set the local graph correctly. "
536  "lclGraph.entries.extent(0) = " << lclGraph.entries.extent (0)
537  << " != values.extent(0) = " << values.extent (0) << suffix);
538 
539  // myGraph_ not null means that the matrix owns the graph. This
540  // is true because the column indices come in as nonconst,
541  // implying shared ownership.
542  myGraph_ = graph;
543  staticGraph_ = graph;
544 
545  // The graph may not be fill complete yet. However, it is locally
546  // indexed (since we have a column Map) and has a fixed structure
547  // (due to the input arrays). This means we can allocate the
548  // (1-D) array of values and build the local matrix right now.
549  // Note that the local matrix's number of columns comes from the
550  // column Map, not the domain Map.
551 
552  const size_t numCols = graph->getColMap ()->getNodeNumElements ();
553 
554  auto lclMat = std::make_shared<local_matrix_type>
555  ("Tpetra::CrsMatrix::lclMatrix_", numCols, values, lclGraph);
556  lclMatrix_ = std::make_shared<local_multiply_op_type> (lclMat);
557 
558  auto newValues = lclMat->values;
559  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
560  (newValues.extent (0) != values.extent (0),
561  std::logic_error, "Local matrix's constructor did not set the "
562  "values correctly. newValues.extent(0) = " <<
563  newValues.extent (0) << " != values.extent(0) = " <<
564  values.extent (0) << suffix);
565 
566  // FIXME (22 Jun 2016) I would very much like to get rid of
567  // k_values1D_ at some point. I find it confusing to have all
568  // these extra references lying around.
569  this->k_values1D_ = newValues;
570 
572  }
573 
574  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
576  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
577  const Teuchos::RCP<const map_type>& colMap,
578  const Teuchos::ArrayRCP<size_t>& ptr,
579  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
580  const Teuchos::ArrayRCP<Scalar>& val,
581  const Teuchos::RCP<Teuchos::ParameterList>& params) :
582  dist_object_type (rowMap),
583  storageStatus_ (::Tpetra::Details::STORAGE_1D_PACKED),
584  fillComplete_ (false),
585  frobNorm_ (-STM::one ())
586  {
587  using Kokkos::Compat::getKokkosViewDeepCopy;
588  using Teuchos::av_reinterpret_cast;
589  using Teuchos::RCP;
590  typedef typename local_matrix_type::values_type values_type;
591  typedef impl_scalar_type IST;
592  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
593  "RCP<const Map>, ptr, ind, val[, params]): ";
594 
595  RCP<crs_graph_type> graph;
596  try {
597  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, ptr,
598  ind, params));
599  }
600  catch (std::exception& e) {
601  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
602  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
603  "RCP<const Map>, ArrayRCP<size_t>, ArrayRCP<LocalOrdinal>[, "
604  "RCP<ParameterList>]) threw an exception: " << e.what ());
605  }
606  // myGraph_ not null means that the matrix owns the graph. This
607  // is true because the column indices come in as nonconst,
608  // implying shared ownership.
609  myGraph_ = graph;
610  staticGraph_ = graph;
611 
612  // The graph may not be fill complete yet. However, it is locally
613  // indexed (since we have a column Map) and has a fixed structure
614  // (due to the input arrays). This means we can allocate the
615  // (1-D) array of values and build the local matrix right now.
616  // Note that the local matrix's number of columns comes from the
617  // column Map, not the domain Map.
618 
619  // The graph _must_ have a local graph at this point. We don't
620  // really care whether CrsGraph's constructor deep-copies or
621  // shallow-copies the input, but the dimensions have to be right.
622  // That's how we tell whether the CrsGraph has a local graph.
623  auto lclGraph = staticGraph_->getLocalGraph ();
624  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
625  (size_t (lclGraph.row_map.extent (0)) != size_t (ptr.size ()) ||
626  size_t (lclGraph.entries.extent (0)) != size_t (ind.size ()),
627  std::logic_error, "CrsGraph's constructor (rowMap, colMap, "
628  "ptr, ind[, params]) did not set the local graph correctly. "
629  "Please report this bug to the Tpetra developers.");
630 
631  const size_t numCols =
632  staticGraph_->getColMap ()->getNodeNumElements ();
633  values_type valIn =
634  getKokkosViewDeepCopy<device_type> (av_reinterpret_cast<IST> (val ()));
635 
636  auto lclMat = std::make_shared<local_matrix_type>
637  ("Tpetra::CrsMatrix::lclMatrix_", numCols, valIn, lclGraph);
638  lclMatrix_ = std::make_shared<local_multiply_op_type> (lclMat);
639 
640  // FIXME (22 Jun 2016) I would very much like to get rid of
641  // k_values1D_ at some point. I find it confusing to have all
642  // these extra references lying around.
643  this->k_values1D_ = lclMat->values;
644 
646  }
647 
648  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
650  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
651  const Teuchos::RCP<const map_type>& colMap,
652  const local_matrix_type& lclMatrix,
653  const Teuchos::RCP<Teuchos::ParameterList>& params) :
654  dist_object_type (rowMap),
655  lclMatrix_ (std::make_shared<local_multiply_op_type>
656  (std::make_shared<local_matrix_type> (lclMatrix))),
657  k_values1D_ (lclMatrix.values),
658  storageStatus_ (::Tpetra::Details::STORAGE_1D_PACKED),
659  fillComplete_ (true),
660  frobNorm_ (-STM::one ())
661  {
662  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
663  "RCP<const Map>, local_matrix_type[, RCP<ParameterList>]): ";
664  Teuchos::RCP<crs_graph_type> graph;
665  try {
666  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
667  lclMatrix.graph, params));
668  }
669  catch (std::exception& e) {
670  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
671  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
672  "RCP<const Map>, local_graph_type[, RCP<ParameterList>]) threw an "
673  "exception: " << e.what ());
674  }
675  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
676  (!graph->isFillComplete (), std::logic_error, "CrsGraph constructor (RCP"
677  "<const Map>, RCP<const Map>, local_graph_type[, RCP<ParameterList>]) "
678  "did not produce a fill-complete graph. Please report this bug to the "
679  "Tpetra developers.");
680  // myGraph_ not null means that the matrix owns the graph. This
681  // is true because the column indices come in as nonconst through
682  // the matrix, implying shared ownership.
683  myGraph_ = graph;
684  staticGraph_ = graph;
685 
686  const bool callComputeGlobalConstants = params.get () == nullptr ||
687  params->get ("compute global constants", true);
688  if (callComputeGlobalConstants) {
689  this->computeGlobalConstants ();
690  }
691 
692  // Sanity checks at the end.
693 #ifdef HAVE_TPETRA_DEBUG
694  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive (), std::logic_error,
695  "We're at the end of fillComplete(), but isFillActive() is true. "
696  "Please report this bug to the Tpetra developers.");
697  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete (), std::logic_error,
698  "We're at the end of fillComplete(), but isFillComplete() is false. "
699  "Please report this bug to the Tpetra developers.");
700 #endif // HAVE_TPETRA_DEBUG
702  }
703 
704  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
706  CrsMatrix (const local_matrix_type& lclMatrix,
707  const Teuchos::RCP<const map_type>& rowMap,
708  const Teuchos::RCP<const map_type>& colMap,
709  const Teuchos::RCP<const map_type>& domainMap,
710  const Teuchos::RCP<const map_type>& rangeMap,
711  const Teuchos::RCP<Teuchos::ParameterList>& params) :
712  dist_object_type (rowMap),
713  lclMatrix_ (std::make_shared<local_multiply_op_type>
714  (std::make_shared<local_matrix_type> (lclMatrix))),
715  k_values1D_ (lclMatrix.values),
716  storageStatus_ (::Tpetra::Details::STORAGE_1D_PACKED),
717  fillComplete_ (true),
718  frobNorm_ (-STM::one ())
719  {
720  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
721  "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_matrix_type[, "
722  "RCP<ParameterList>]): ";
723  Teuchos::RCP<crs_graph_type> graph;
724  try {
725  graph = Teuchos::rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
726  domainMap, rangeMap, params));
727  }
728  catch (std::exception& e) {
729  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
730  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
731  "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_type[, "
732  "RCP<ParameterList>]) threw an exception: " << e.what ());
733  }
734  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
735  (!graph->isFillComplete (), std::logic_error, "CrsGraph constructor (RCP"
736  "<const Map>, RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_type[, "
737  "RCP<ParameterList>]) did not produce a fill-complete graph. Please report this "
738  "bug to the Tpetra developers.");
739  // myGraph_ not null means that the matrix owns the graph. This
740  // is true because the column indices come in as nonconst through
741  // the matrix, implying shared ownership.
742  myGraph_ = graph;
743  staticGraph_ = graph;
744 
745  const bool callComputeGlobalConstants = params.get () == nullptr ||
746  params->get ("compute global constants", true);
747  if (callComputeGlobalConstants) {
748  this->computeGlobalConstants ();
749  }
750 
751  // Sanity checks at the end.
752 #ifdef HAVE_TPETRA_DEBUG
753  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive (), std::logic_error,
754  "We're at the end of fillComplete(), but isFillActive() is true. "
755  "Please report this bug to the Tpetra developers.");
756  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete (), std::logic_error,
757  "We're at the end of fillComplete(), but isFillComplete() is false. "
758  "Please report this bug to the Tpetra developers.");
759 #endif // HAVE_TPETRA_DEBUG
761  }
762 
763  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
765  CrsMatrix (const local_matrix_type& lclMatrix,
766  const Teuchos::RCP<const map_type>& rowMap,
767  const Teuchos::RCP<const map_type>& colMap,
768  const Teuchos::RCP<const map_type>& domainMap,
769  const Teuchos::RCP<const map_type>& rangeMap,
770  const Teuchos::RCP<const import_type>& importer,
771  const Teuchos::RCP<const export_type>& exporter,
772  const Teuchos::RCP<Teuchos::ParameterList>& params) :
773  dist_object_type (rowMap),
774  lclMatrix_ (std::make_shared<local_multiply_op_type>
775  (std::make_shared<local_matrix_type> (lclMatrix))),
776  k_values1D_ (lclMatrix.values),
777  storageStatus_ (::Tpetra::Details::STORAGE_1D_PACKED),
778  fillComplete_ (true),
779  frobNorm_ (-STM::one ())
780  {
781  using Teuchos::rcp;
782  const char tfecfFuncName[] = "Tpetra::CrsMatrix"
783  "(lclMat,Map,Map,Map,Map,Import,Export,params): ";
784 
785  Teuchos::RCP<crs_graph_type> graph;
786  try {
787  graph = rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
788  domainMap, rangeMap, importer,
789  exporter, params));
790  }
791  catch (std::exception& e) {
792  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
793  (true, std::runtime_error, "CrsGraph constructor "
794  "(local_graph_type, Map, Map, Map, Map, Import, Export, "
795  "params) threw: " << e.what ());
796  }
797  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
798  (!graph->isFillComplete (), std::logic_error, "CrsGraph "
799  "constructor (local_graph_type, Map, Map, Map, Map, Import, "
800  "Export, params) did not produce a fill-complete graph. "
801  "Please report this bug to the Tpetra developers.");
802  // myGraph_ not null means that the matrix owns the graph. This
803  // is true because the column indices come in as nonconst through
804  // the matrix, implying shared ownership.
805  myGraph_ = graph;
806  staticGraph_ = graph;
807 
808  const bool callComputeGlobalConstants = params.get () == nullptr ||
809  params->get ("compute global constants", true);
810  if (callComputeGlobalConstants) {
811  this->computeGlobalConstants ();
812  }
813 
814  // Sanity checks at the end.
815 #ifdef HAVE_TPETRA_DEBUG
816  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive (), std::logic_error,
817  "We're at the end of fillComplete(), but isFillActive() is true. "
818  "Please report this bug to the Tpetra developers.");
819  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete (), std::logic_error,
820  "We're at the end of fillComplete(), but isFillComplete() is false. "
821  "Please report this bug to the Tpetra developers.");
822 #endif // HAVE_TPETRA_DEBUG
824  }
825 
// Copy-or-view constructor: builds this matrix from another CrsMatrix
// (`source`) according to a Teuchos::DataAccess flag.  The declarator lines
// naming the class and the `source` parameter are missing from this listing.
//
// It first delegates to the (graph, values) constructor with source's
// CrsGraph and local values view, then:
//   - Teuchos::Copy: allocates an uninitialized View of the same extent,
//     deep-copies source's values into it, installs it as k_values1D_, and
//     calls fillComplete with source's domain and range Maps;
//   - Teuchos::View: does nothing further;
//   - any other value: throws std::invalid_argument.
// Throws std::invalid_argument if source is not fill complete.
//
// NOTE(review): the fill-complete check runs only after the delegated
// constructor has already executed, and its message says "Source graph"
// although it tests the source matrix -- confirm the intended wording.
826  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
829  const Teuchos::DataAccess copyOrView)
830  : CrsMatrix (source.getCrsGraph (), source.getLocalValuesView ())
831  {
832  const char tfecfFuncName[] = "Tpetra::CrsMatrix("
833  "const CrsMatrix&, const Teuchos::DataAccess): ";
834  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
835  (! source.isFillComplete (), std::invalid_argument,
836  "Source graph must be fillComplete().");
837 
838  if (copyOrView == Teuchos::Copy) {
839  using values_type = typename local_matrix_type::values_type;
840  values_type vals = source.getLocalValuesView ();
841  using Kokkos::view_alloc;
842  using Kokkos::WithoutInitializing;
       // Skip zero-initialization: deep_copy below overwrites every entry.
843  values_type newvals (view_alloc ("val", WithoutInitializing),
844  vals.extent (0));
845  Kokkos::deep_copy (newvals, vals);
846  k_values1D_ = newvals;
       // NOTE(review): this test appears redundant -- the guard above already
       // threw if source was not fill complete.
847  if (source.isFillComplete ()) {
848  fillComplete (source.getDomainMap (), source.getRangeMap ());
849  }
850  }
851  else if (copyOrView == Teuchos::View) {
852  return;
853  }
854  else {
855  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
856  (true, std::invalid_argument, "Second argument 'copyOrView' "
857  "has an invalid value " << copyOrView << ". Valid values "
858  "include Teuchos::Copy = " << Teuchos::Copy << " and "
859  "Teuchos::View = " << Teuchos::View << ".");
860  }
861  }
862 
863 
864 
// Exchanges the listed data members of this matrix with those of crs_matrix,
// one std::swap per member.  The declarator lines (class qualifier, method
// name, and the crs_matrix parameter) are missing from this listing.
// The trailing comment on each line records that member's type.
865  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
866  void
869  {
870  std::swap(crs_matrix.importMV_, this->importMV_); // mutable Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>>
871  std::swap(crs_matrix.exportMV_, this->exportMV_); // mutable Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>>
872  std::swap(crs_matrix.staticGraph_, this->staticGraph_); // Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node>>
873  std::swap(crs_matrix.myGraph_, this->myGraph_); // Teuchos::RCP< CrsGraph<LocalOrdinal, GlobalOrdinal, Node>>
874  std::swap(crs_matrix.lclMatrix_, this->lclMatrix_); // shared pointer to local_multiply_op_type (the constructors build it with std::make_shared<local_multiply_op_type>)
875  std::swap(crs_matrix.k_values1D_, this->k_values1D_); // local_matrix_type::values_type
876  std::swap(crs_matrix.values2D_, this->values2D_); // Teuchos::ArrayRCP<Teuchos::Array<Kokkos::Details::ArithTraits<Scalar>::val_type>>
877  std::swap(crs_matrix.storageStatus_, this->storageStatus_); // ::Tpetra::Details::EStorageStatus (enum f/m Tpetra_CrsGraph_decl.hpp)
878  std::swap(crs_matrix.fillComplete_, this->fillComplete_); // bool
879  std::swap(crs_matrix.nonlocals_, this->nonlocals_); // std::map<GO, pair<Teuchos::Array<GO>, Teuchos::Array<Scalar>>>
880  std::swap(crs_matrix.frobNorm_, this->frobNorm_); // mutable Kokkos::Details::ArithTraits<impl_scalar_type>::mag_type
881  }
882 
883 
// Returns the communicator reported by the matrix's graph
// (forwards to getCrsGraphRef().getComm()).
884  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
885  Teuchos::RCP<const Teuchos::Comm<int> >
887  getComm () const {
888  return getCrsGraphRef ().getComm ();
889  }
890 
891 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// Deprecated accessor (marked TPETRA_DEPRECATED): always returns
// Teuchos::null.  Only compiled when TPETRA_ENABLE_DEPRECATED_CODE is defined.
892  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
893  TPETRA_DEPRECATED
894  Teuchos::RCP<Node>
896  getNode () const {
897  return Teuchos::null;
898  }
899 #endif // TPETRA_ENABLE_DEPRECATED_CODE
900 
// Returns the graph's profile type (forwards to
// getCrsGraphRef().getProfileType()).  The return-type and class-qualifier
// lines are missing from this listing.
901  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
904  getProfileType () const {
905  return this->getCrsGraphRef ().getProfileType ();
906  }
907 
// Returns the matrix's fillComplete_ flag.
908  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
909  bool
911  isFillComplete () const {
912  return fillComplete_;
913  }
914 
// Returns the negation of fillComplete_, so it is always the exact opposite
// of isFillComplete().
915  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
916  bool
918  isFillActive () const {
919  return ! fillComplete_;
920  }
921 
// Forwards to getCrsGraphRef().isStorageOptimized().
// The method-name lines are missing from this listing; presumably this is
// CrsMatrix::isStorageOptimized() const -- TODO confirm.
922  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
923  bool
926  return this->getCrsGraphRef ().isStorageOptimized ();
927  }
928 
// Forwards to getCrsGraphRef().isLocallyIndexed().
// The method-name lines are missing from this listing; presumably this is
// CrsMatrix::isLocallyIndexed() const -- TODO confirm.
929  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
930  bool
933  return getCrsGraphRef ().isLocallyIndexed ();
934  }
935 
// Forwards to getCrsGraphRef().isGloballyIndexed().
// The method-name lines are missing from this listing; presumably this is
// CrsMatrix::isGloballyIndexed() const -- TODO confirm.
936  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
937  bool
940  return getCrsGraphRef ().isGloballyIndexed ();
941  }
942 
// Reports whether the matrix's graph has a column Map
// (forwards to getCrsGraphRef().hasColMap()).
943  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
944  bool
946  hasColMap () const {
947  return getCrsGraphRef ().hasColMap ();
948  }
949 
// Forwards to getCrsGraphRef().getGlobalNumEntries().
// The return-type and method-name lines are missing from this listing;
// presumably this is CrsMatrix::getGlobalNumEntries() const -- TODO confirm.
950  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
954  return getCrsGraphRef ().getGlobalNumEntries ();
955  }
956 
// Forwards to getCrsGraphRef().getNodeNumEntries().
// The method-name lines are missing from this listing; presumably this is
// CrsMatrix::getNodeNumEntries() const -- TODO confirm.
957  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
958  size_t
961  return getCrsGraphRef ().getNodeNumEntries ();
962  }
963 
// Forwards to getCrsGraphRef().getGlobalNumRows().
// The return-type and method-name lines are missing from this listing;
// presumably this is CrsMatrix::getGlobalNumRows() const -- TODO confirm.
964  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
968  return getCrsGraphRef ().getGlobalNumRows ();
969  }
970 
// Forwards to getCrsGraphRef().getGlobalNumCols().
// The return-type and method-name lines are missing from this listing;
// presumably this is CrsMatrix::getGlobalNumCols() const -- TODO confirm.
971  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
975  return getCrsGraphRef ().getGlobalNumCols ();
976  }
977 
// Returns the graph's node-local row count
// (forwards to getCrsGraphRef().getNodeNumRows()).
978  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
979  size_t
981  getNodeNumRows () const {
982  return getCrsGraphRef ().getNodeNumRows ();
983  }
984 
// Returns the graph's node-local column count
// (forwards to getCrsGraphRef().getNodeNumCols()).
985  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
986  size_t
988  getNodeNumCols () const {
989  return getCrsGraphRef ().getNodeNumCols ();
990  }
991 
992 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// Deprecated public wrapper (marked TPETRA_DEPRECATED): returns
// getGlobalNumDiagsImpl().  Only compiled when TPETRA_ENABLE_DEPRECATED_CODE
// is defined.
993  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
994  global_size_t TPETRA_DEPRECATED
996  getGlobalNumDiags () const {
997  return this->getGlobalNumDiagsImpl ();
998  }
999 
// Deprecated public wrapper (marked TPETRA_DEPRECATED): returns
// getNodeNumDiagsImpl().  Only compiled when TPETRA_ENABLE_DEPRECATED_CODE
// is defined.
1000  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1001  size_t TPETRA_DEPRECATED
1002  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1003  getNodeNumDiags () const {
1004  return this->getNodeNumDiagsImpl ();
1005  }
1006 
// Implementation behind getGlobalNumDiags(): views the matrix's graph through
// the HasDeprecatedMethods2630_WarningThisClassIsNotForUsers interface and
// returns that interface's getGlobalNumDiagsImpl().  The dynamic_cast is to a
// reference, so a failed cast throws std::bad_cast.
// The return-type line is missing from this listing.
1007  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1009  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1010  getGlobalNumDiagsImpl () const {
1011  const crs_graph_type& G = this->getCrsGraphRef ();
1012  using HDM = ::Tpetra::Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
1013  return dynamic_cast<const HDM&> (G).getGlobalNumDiagsImpl ();
1014  }
1015 
// Implementation behind getNodeNumDiags(): views the matrix's graph through
// the HasDeprecatedMethods2630_WarningThisClassIsNotForUsers interface and
// returns that interface's getNodeNumDiagsImpl().  The dynamic_cast is to a
// reference, so a failed cast throws std::bad_cast.
1016  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1017  size_t
1018  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1019  getNodeNumDiagsImpl () const {
1020  const crs_graph_type& G = this->getCrsGraphRef ();
1021  using HDM = ::Tpetra::Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
1022  return dynamic_cast<const HDM&> (G).getNodeNumDiagsImpl ();
1023  }
1024 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1025 
// Returns the graph's entry count for the row with global index globalRow
// (forwards to getCrsGraphRef().getNumEntriesInGlobalRow()).
1026  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1027  size_t
1029  getNumEntriesInGlobalRow (GlobalOrdinal globalRow) const {
1030  return getCrsGraphRef ().getNumEntriesInGlobalRow (globalRow);
1031  }
1032 
// Returns the graph's entry count for the row with local index localRow
// (forwards to getCrsGraphRef().getNumEntriesInLocalRow()).
1033  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1034  size_t
1036  getNumEntriesInLocalRow (LocalOrdinal localRow) const {
1037  return getCrsGraphRef ().getNumEntriesInLocalRow (localRow);
1038  }
1039 
// Forwards to getCrsGraphRef().getGlobalMaxNumRowEntries().
// The method-name lines are missing from this listing; presumably this is
// CrsMatrix::getGlobalMaxNumRowEntries() const -- TODO confirm.
1040  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1041  size_t
1044  return getCrsGraphRef ().getGlobalMaxNumRowEntries ();
1045  }
1046 
// Forwards to getCrsGraphRef().getNodeMaxNumRowEntries().
// The method-name lines are missing from this listing; presumably this is
// CrsMatrix::getNodeMaxNumRowEntries() const -- TODO confirm.
1047  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1048  size_t
1051  return getCrsGraphRef ().getNodeMaxNumRowEntries ();
1052  }
1053 
// Returns the index base of the matrix's row Map
// (getRowMap()->getIndexBase()).
1054  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1055  GlobalOrdinal
1057  getIndexBase () const {
1058  return getRowMap ()->getIndexBase ();
1059  }
1060 
// Returns the row Map of the matrix's graph
// (forwards to getCrsGraphRef().getRowMap()).
1061  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1062  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
1064  getRowMap () const {
1065  return getCrsGraphRef ().getRowMap ();
1066  }
1067 
// Returns the column Map of the matrix's graph
// (forwards to getCrsGraphRef().getColMap()).
1068  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1069  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
1071  getColMap () const {
1072  return getCrsGraphRef ().getColMap ();
1073  }
1074 
// Returns the domain Map of the matrix's graph
// (forwards to getCrsGraphRef().getDomainMap()).
1075  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1076  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
1078  getDomainMap () const {
1079  return getCrsGraphRef ().getDomainMap ();
1080  }
1081 
// Returns the range Map of the matrix's graph
// (forwards to getCrsGraphRef().getRangeMap()).
1082  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1083  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
1085  getRangeMap () const {
1086  return getCrsGraphRef ().getRangeMap ();
1087  }
1088 
// Returns the matrix's graph as an RCP to the RowGraph interface:
// staticGraph_ when it is non-null, otherwise myGraph_ (which may itself be
// null; no check is made here).
1089  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1090  Teuchos::RCP<const RowGraph<LocalOrdinal, GlobalOrdinal, Node> >
1092  getGraph () const {
1093  if (staticGraph_ != Teuchos::null) {
1094  return staticGraph_;
1095  }
1096  return myGraph_;
1097  }
1098 
// Returns the matrix's graph as an RCP to const CrsGraph: staticGraph_ when
// it is non-null, otherwise myGraph_ (which may itself be null; no check is
// made here).
1099  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1100  Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
1102  getCrsGraph () const {
1103  if (staticGraph_ != Teuchos::null) {
1104  return staticGraph_;
1105  }
1106  return myGraph_;
1107  }
1108 
// Returns the matrix's graph by reference rather than by RCP: *staticGraph_
// when it is non-null, otherwise *myGraph_.
// Debug builds (HAVE_TPETRA_DEBUG) throw std::logic_error if both pointers
// are null; other builds dereference myGraph_ without that check.
// The return-type and class-qualifier lines are missing from this listing.
1109  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1112  getCrsGraphRef () const {
1113  if (! this->staticGraph_.is_null ()) {
1114  return * (this->staticGraph_);
1115  }
1116  else {
1117 #ifdef HAVE_TPETRA_DEBUG
1118  const char tfecfFuncName[] = "getCrsGraphRef: ";
1119  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1120  (this->myGraph_.is_null (), std::logic_error,
1121  "Both staticGraph_ and myGraph_ are null. "
1122  "Please report this bug to the Tpetra developers.");
1123 #endif // HAVE_TPETRA_DEBUG
1124  return * (this->myGraph_);
1125  }
1126  }
1127 
// Returns the local matrix by value: a default-constructed local_matrix_type
// when lclMatrix_ is null, otherwise lclMatrix_->getLocalMatrix().
// The method-name lines are missing from this listing; presumably this is
// CrsMatrix::getLocalMatrix() const -- TODO confirm.
1128  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1129  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_type
1132  {
1133  return lclMatrix_.get () == nullptr ?
1134  local_matrix_type () :
1135  lclMatrix_->getLocalMatrix ();
1136  }
1137 
1138 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// Deprecated public wrapper (marked TPETRA_DEPRECATED): returns
// isLowerTriangularImpl().  Only compiled when TPETRA_ENABLE_DEPRECATED_CODE
// is defined.
1139  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1140  bool TPETRA_DEPRECATED
1142  isLowerTriangular () const {
1143  return this->isLowerTriangularImpl ();
1144  }
1145 
// Deprecated public wrapper (marked TPETRA_DEPRECATED): returns
// isUpperTriangularImpl().  Only compiled when TPETRA_ENABLE_DEPRECATED_CODE
// is defined.
1146  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1147  bool TPETRA_DEPRECATED
1148  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1149  isUpperTriangular () const {
1150  return this->isUpperTriangularImpl ();
1151  }
1152 
// Implementation behind isLowerTriangular(): views the matrix's graph through
// the HasDeprecatedMethods2630_WarningThisClassIsNotForUsers interface and
// returns that interface's isLowerTriangularImpl().  The dynamic_cast is to a
// reference, so a failed cast throws std::bad_cast.
1153  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1154  bool
1155  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1156  isLowerTriangularImpl () const {
1157  const crs_graph_type& G = this->getCrsGraphRef ();
1158  using HDM = ::Tpetra::Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
1159  return dynamic_cast<const HDM&> (G).isLowerTriangularImpl ();
1160  }
1161 
// Implementation behind isUpperTriangular(): views the matrix's graph through
// the HasDeprecatedMethods2630_WarningThisClassIsNotForUsers interface and
// returns that interface's isUpperTriangularImpl().  The dynamic_cast is to a
// reference, so a failed cast throws std::bad_cast.
1162  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1163  bool
1164  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1165  isUpperTriangularImpl () const {
1166  const crs_graph_type& G = this->getCrsGraphRef ();
1167  using HDM = ::Tpetra::Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
1168  return dynamic_cast<const HDM&> (G).isUpperTriangularImpl ();
1169  }
1170 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1171 
// Returns true exactly when myGraph_ is null.  Elsewhere in this file a
// non-null myGraph_ is described as meaning the matrix owns its graph, so
// "static graph" here means the matrix does not own one.
1172  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1173  bool
1175  isStaticGraph () const {
1176  return myGraph_.is_null ();
1177  }
1178 
// Capability query that unconditionally returns true.
// NOTE(review): the method-name lines are missing from this listing, so the
// name cannot be confirmed from here (possibly hasTransposeApply) -- verify
// against the class declaration.
1179  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1180  bool
1183  return true;
1184  }
1185 
// Capability query that unconditionally returns true.
// NOTE(review): the method-name lines are missing from this listing, so the
// name cannot be confirmed from here (possibly supportsRowViews) -- verify
// against the class declaration.
1186  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1187  bool
1190  return true;
1191  }
1192 
// Builds and returns 2-D ("array of arrays") value storage shaped like the
// graph's 2-D index storage: one inner array per local row.
// The method-name lines are missing from this listing; the name
// allocateValues2D is taken from tfecfFuncName below.
//
// Throws std::runtime_error if the graph's indices are not yet allocated, or
// if the graph's profile type is StaticProfile.
//
// Each row is resized to match graph.lclInds2D_ when that is non-null,
// otherwise graph.gblInds2D_ when that is non-null; if both are null the
// rows are left at their default size.
//
// NOTE(review): the using-declarations for arcp, Array and ArrayRCP look
// unused (the body spells out Teuchos::ArrayRCP / Teuchos::Array), and
// getNodeNumRows() is stored in an LO although the matrix-level accessor
// returns size_t -- confirm the narrowing is intended.
1193  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1194  Teuchos::ArrayRCP<Teuchos::Array<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type> >
1197  {
1198  using Teuchos::arcp;
1199  using Teuchos::Array;
1200  using Teuchos::ArrayRCP;
1201  typedef impl_scalar_type IST;
1202  typedef LocalOrdinal LO;
1203  const char tfecfFuncName[] = "allocateValues2D: ";
1204 
1205  const crs_graph_type& graph = this->getCrsGraphRef ();
1206  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1207  (! graph.indicesAreAllocated (), std::runtime_error,
1208  "Graph indices must be allocated before values.");
1209  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1210  (graph.getProfileType () == StaticProfile, std::runtime_error,
1211  "Graph indices must be allocated in a dynamic profile.");
1212 
1213  const LO lclNumRows = graph.getNodeNumRows ();
1214  Teuchos::ArrayRCP<Teuchos::Array<IST> > values2D (lclNumRows);
       // Mirror whichever 2-D index storage exists; local indices take priority.
1215  if (! graph.lclInds2D_.is_null ()) {
1216  for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
1217  values2D[lclRow].resize (graph.lclInds2D_[lclRow].size ());
1218  }
1219  }
1220  else if (! graph.gblInds2D_.is_null ()) {
1221  for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
1222  values2D[lclRow].resize (graph.gblInds2D_[lclRow].size ());
1223  }
1224  }
1225  return values2D;
1226  }
1227 
// Allocates storage for the matrix's values, first allocating the graph's
// indices if the caller says that has not happened yet.
//
// Parameters:
//   lg  - passed through to myGraph_->allocateIndices() when the graph still
//         needs its indices allocated.
//   gas - GraphNotYetAllocated makes this method call
//         myGraph_->allocateIndices(lg) first; debug builds also check that
//         gas agrees with staticGraph_->indicesAreAllocated().
//
// Storage chosen:
//   - StaticProfile: a 1-D values View (k_values1D_) whose length is the last
//     entry of the graph's row offsets array.
//   - otherwise: 2-D storage (values2D_) from allocateValues2D().
//
// Throws std::runtime_error if allocateIndices throws anything, and
// std::logic_error if, under StaticProfile, the row offsets array does not
// have lclNumRows+1 entries.  Debug builds (HAVE_TPETRA_DEBUG) add further
// std::logic_error consistency checks.
1228  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1229  void
1231  allocateValues (ELocalGlobal lg, GraphAllocationStatus gas)
1232  {
1233  using ::Tpetra::Details::ProfilingRegion;
1234  const char tfecfFuncName[] = "allocateValues: ";
1235  ProfilingRegion regionAllocateValues ("Tpetra::CrsMatrix::allocateValues");
1236 
1237 #ifdef HAVE_TPETRA_DEBUG
1238  const char suffix[] = " Please report this bug to the Tpetra developers.";
1239 
1240  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1241  (this->staticGraph_.is_null (), std::logic_error,
1242  "staticGraph_ is null." << suffix);
1243 
1244  // If the graph indices are already allocated, then gas should be
1245  // GraphAlreadyAllocated. Otherwise, gas should be
1246  // GraphNotYetAllocated.
1247  if ((gas == GraphAlreadyAllocated) != this->staticGraph_->indicesAreAllocated ()) {
        // The message is assembled from three fragments so that "not " can be
        // inserted at the position matching which side of the mismatch occurred.
1248  const char err1[] = "The caller has asserted that the graph is ";
1249  const char err2[] = "already allocated, but the static graph says "
1250  "that its indices are ";
1251  const char err3[] = "already allocated. Please report this bug to "
1252  "the Tpetra developers.";
1253  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1254  (gas == GraphAlreadyAllocated && ! this->staticGraph_->indicesAreAllocated (),
1255  std::logic_error, err1 << err2 << "not " << err3);
1256  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1257  (gas != GraphAlreadyAllocated && this->staticGraph_->indicesAreAllocated (),
1258  std::logic_error, err1 << "not " << err2 << err3);
1259  }
1260 
1261  // If the graph is unallocated, then it had better be a
1262  // matrix-owned graph. ("Matrix-owned graph" means that the
1263  // matrix gets to define the graph structure. If the CrsMatrix
1264  // constructor that takes an RCP<const CrsGraph> was used, then
1265  // the matrix does _not_ own the graph.)
1266  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1267  (! this->staticGraph_->indicesAreAllocated () &&
1268  this->myGraph_.is_null (), std::logic_error,
1269  "The static graph says that its indices are not allocated, "
1270  "but the graph is not owned by the matrix." << suffix);
1271 #endif // HAVE_TPETRA_DEBUG
1272 
1273  if (gas == GraphNotYetAllocated) {
1274 #ifdef HAVE_TPETRA_DEBUG
1275  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1276  (this->myGraph_.is_null (), std::logic_error,
1277  "gas = GraphNotYetAllocated, but myGraph_ is null." << suffix);
1278 #endif // HAVE_TPETRA_DEBUG
        // Any exception from allocateIndices is re-reported as a
        // std::runtime_error carrying this method's name prefix.
1279  try {
1280  this->myGraph_->allocateIndices (lg);
1281  }
1282  catch (std::exception& e) {
1283  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1284  (true, std::runtime_error, "CrsGraph::allocateIndices "
1285  "threw an exception: " << e.what ());
1286  }
1287  catch (...) {
1288  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1289  (true, std::runtime_error, "CrsGraph::allocateIndices "
1290  "threw an exception not a subclass of std::exception.");
1291  }
1292  }
1293 
1294  // Allocate matrix values.
1295  if (this->getProfileType () == StaticProfile) {
1296  // "Static profile" means that the number of matrix entries in
1297  // each row was fixed at the time the CrsMatrix constructor was
1298  // called. This lets us use 1-D storage for the matrix's
1299  // values. ("1-D storage" means the same as that used by the
1300  // three arrays in the compressed sparse row storage format.)
1301 
1302 #ifdef HAVE_TPETRA_DEBUG
1303  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1304  (this->staticGraph_.is_null (), std::logic_error,
1305  "this->getProfileType() == StaticProfile, but staticGraph_ is null."
1306  << suffix);
1307 #endif // HAVE_TPETRA_DEBUG
1308 
1309  const size_t lclNumRows = this->staticGraph_->getNodeNumRows ();
1310  typename Graph::local_graph_type::row_map_type k_ptrs =
1311  this->staticGraph_->k_rowPtrs_;
        // This length check is not debug-only; it runs in every build.
1312  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1313  (k_ptrs.extent (0) != lclNumRows+1, std::logic_error,
1314  "With StaticProfile, row offsets array has length "
1315  << k_ptrs.extent (0) << " != (lclNumRows+1) = "
1316  << (lclNumRows+1) << ".");
1317 
        // The last row offset is the total number of value slots needed.
1318  const size_t lclTotalNumEntries =
1319  ::Tpetra::Details::getEntryOnHost (k_ptrs, lclNumRows);
1320 
1321  // Allocate the 1-D array of matrix values, with one slot per
        // entry counted by the row offsets.
1322  typedef typename local_matrix_type::values_type values_type;
1323  this->k_values1D_ =
1324  values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1325  }
1326  else {
1327  // "Dynamic profile" means the number of matrix entries in each
1328  // row is not fixed and may expand. Thus, we store the matrix's
1329  // values in "2-D storage," meaning an array of arrays. The
1330  // outer array has as many inner arrays as there are rows in the
1331  // matrix, and each inner array stores the values in that row.
1332  this->values2D_ = this->allocateValues2D ();
1333  }
1334  }
1335 
// Hands back the matrix's three compressed-sparse-row arrays through the
// output parameters:
//   rowPointers   <- graph->getNodeRowPtrs()
//   columnIndices <- graph->getNodePackedIndices()
//   values        <- k_values1D_, exposed as an ArrayRCP and reinterpreted
//                    from impl_scalar_type to Scalar
//
// Throws std::runtime_error if getCrsGraph() is null, or if either graph
// accessor throws a std::exception (the original message is appended).
//
// NOTE(review): the first check compares columnIndices.size() with
// values.size() on entry, i.e. it reads the output parameters before this
// method assigns them below.  It looks like callers must pass in
// equally sized (e.g. both empty) arrays -- confirm whether this is intended.
1336  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1337  void
1339  getAllValues (Teuchos::ArrayRCP<const size_t>& rowPointers,
1340  Teuchos::ArrayRCP<const LocalOrdinal>& columnIndices,
1341  Teuchos::ArrayRCP<const Scalar>& values) const
1342  {
1343  using Teuchos::RCP;
1344  const char tfecfFuncName[] = "getAllValues: ";
1345  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1346  columnIndices.size () != values.size (), std::runtime_error,
1347  "Requires that columnIndices and values are the same size.");
1348 
1349  RCP<const crs_graph_type> relevantGraph = getCrsGraph ();
1350  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1351  relevantGraph.is_null (), std::runtime_error,
1352  "Requires that getCrsGraph() is not null.");
1353  try {
1354  rowPointers = relevantGraph->getNodeRowPtrs ();
1355  }
1356  catch (std::exception &e) {
1357  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1358  true, std::runtime_error,
1359  "Caught exception while calling graph->getNodeRowPtrs(): "
1360  << e.what ());
1361  }
1362  try {
1363  columnIndices = relevantGraph->getNodePackedIndices ();
1364  }
1365  catch (std::exception &e) {
1366  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1367  true, std::runtime_error,
1368  "Caught exception while calling graph->getNodePackedIndices(): "
1369  << e.what ());
1370  }
       // Wrap the 1-D values View as an ArrayRCP, then reinterpret the element
       // type from impl_scalar_type to Scalar.
1371  Teuchos::ArrayRCP<const impl_scalar_type> vals =
1372  Kokkos::Compat::persistingView (k_values1D_);
1373  values = Teuchos::arcp_reinterpret_cast<const Scalar> (vals);
1374  }
1375 
1376  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1377  void
1379  fillLocalGraphAndMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1380  {
1382  using ::Tpetra::Details::ProfilingRegion;
1383  using Kokkos::create_mirror_view;
1384  using Teuchos::arcp_const_cast;
1385  using Teuchos::Array;
1386  using Teuchos::ArrayRCP;
1387  using Teuchos::null;
1388  using Teuchos::RCP;
1389  using Teuchos::rcp;
1390  typedef typename local_matrix_type::row_map_type row_map_type;
1391  typedef typename Graph::local_graph_type::entries_type::non_const_type lclinds_1d_type;
1392  typedef typename local_matrix_type::values_type values_type;
1393  ProfilingRegion regionFLGAM ("Tpetra::CrsGraph::fillLocalGraphAndMatrix");
1394 
1395 #ifdef HAVE_TPETRA_DEBUG
1396  const char tfecfFuncName[] = "fillLocalGraphAndMatrix (called from "
1397  "fillComplete or expertStaticFillComplete): ";
1398 #endif // HAVE_TPETRA_DEBUG
1399 
1400 #ifdef HAVE_TPETRA_DEBUG
1401  // fillComplete() only calls fillLocalGraphAndMatrix() if the
1402  // matrix owns the graph, which means myGraph_ is not null.
1403  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1404  (myGraph_.is_null (), std::logic_error, "The nonconst graph (myGraph_) "
1405  "is null. This means that the matrix has a const (a.k.a. \"static\") "
1406  "graph. fillComplete or expertStaticFillComplete should never call "
1407  "fillLocalGraphAndMatrix in that case. "
1408  "Please report this bug to the Tpetra developers.");
1409 #endif // HAVE_TPETRA_DEBUG
1410 
1411  const size_t lclNumRows = this->getNodeNumRows ();
1412 
1413  // This method's goal is to fill in the three arrays (compressed
1414  // sparse row format) that define the sparse graph's and matrix's
1415  // structure, and the sparse matrix's values.
1416  //
1417  // Use the nonconst version of row_map_type for k_ptrs,
1418  // because row_map_type is const and we need to modify k_ptrs here.
1419  typename row_map_type::non_const_type k_ptrs;
1420  row_map_type k_ptrs_const;
1421  lclinds_1d_type k_inds;
1422  values_type k_vals;
1423 
1424  // Get references to the data in myGraph_, so we can modify them
1425  // as well. Note that we only call fillLocalGraphAndMatrix() if
1426  // the matrix owns the graph, which means myGraph_ is not null.
1427  lclinds_1d_type k_lclInds1D_ = myGraph_->k_lclInds1D_;
1428 
1429  typedef decltype (myGraph_->k_numRowEntries_) row_entries_type;
1430 
1431  if (getProfileType () != StaticProfile) {
1432  // Pack 2-D storage (DynamicProfile) into 1-D packed storage.
1433  //
1434  // DynamicProfile means that the matrix's column indices and
1435  // values are currently stored in a 2-D "unpacked" format, in
1436  // the arrays-of-arrays myGraph_->lclInds2D_ (for column
1437  // indices) and values2D_ (for values). We allocate 1-D storage
1438  // (k_inds resp. k_vals), and then copy from 2-D storage
1439  // (lclInds2D_ resp. values2D_) into 1-D storage (k_inds
1440  // resp. k_vals).
1441 
1442  // We're be packing on host. k_numRowEntries_ lives on host,
1443  // and computeOffsetsFromCounts accepts a host View for counts,
1444  // even if offsets is a device View. (Furthermore, the "host"
1445  // View may very well live in CudaUVMSpace, so doing this has no
1446  // penalty, other than requiring synchronization between Cuda
1447  // and host. UVM memory gets grumpy if both device and host
1448  // attempt to access it at the same time without an intervening
1449  // fence.)
1450  typename row_entries_type::const_type numRowEnt_h =
1451  myGraph_->k_numRowEntries_;
1452 #ifdef HAVE_TPETRA_DEBUG
1453  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1454  (static_cast<size_t> (numRowEnt_h.extent (0)) != lclNumRows,
1455  std::logic_error, "(DynamicProfile branch) numRowEnt_h has the "
1456  "wrong length. numRowEnt_h.extent(0) = "
1457  << numRowEnt_h.extent (0) << " != getNodeNumRows() = "
1458  << lclNumRows << ".");
1459 #endif // HAVE_TPETRA_DEBUG
1460 
1461  // We're packing on host (since we can't read Teuchos data
1462  // structures on device), so let's fill the packed row offsets
1463  // on host first.
1464  k_ptrs = typename row_map_type::non_const_type ("Tpetra::CrsGraph::ptr",
1465  lclNumRows+1);
1466  typename row_map_type::non_const_type::HostMirror h_ptrs =
1467  create_mirror_view (k_ptrs);
1468 
1469  // Pack the row offsets into k_ptrs, by doing a sum-scan of
1470  // the array of valid entry counts per row.
1471  //
1472  // Return value is the total number of entries in the matrix on
1473  // the calling process. It's cheap to compute and useful as a
1474  // sanity check.
1475  const size_t lclTotalNumEntries =
1476  computeOffsetsFromCounts (h_ptrs, numRowEnt_h);
1477 #ifdef HAVE_TPETRA_DEBUG
1478  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1479  (static_cast<size_t> (h_ptrs.extent (0)) != lclNumRows + 1,
1480  std::logic_error, "(DynamicProfile branch) After packing h_ptrs, "
1481  "h_ptrs.extent(0) = " << h_ptrs.extent (0) << " != "
1482  "(lclNumRows+1) = " << (lclNumRows+1) << ".");
1483  {
1484  const size_t h_ptrs_lastEnt = h_ptrs(lclNumRows); // it's a host View
1485  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1486  (h_ptrs_lastEnt != lclTotalNumEntries, std::logic_error,
1487  "(DynamicProfile branch) After packing h_ptrs, h_ptrs(lclNumRows="
1488  << lclNumRows << ") = " << h_ptrs_lastEnt << " != total number "
1489  "of entries on the calling process = " << lclTotalNumEntries << ".");
1490  }
1491 #endif // HAVE_TPETRA_DEBUG
1492 
1493  // Allocate the arrays of packed column indices and values.
1494  k_inds = lclinds_1d_type ("Tpetra::CrsGraph::ind", lclTotalNumEntries);
1495  k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1496 
1497  // We need host views of the above, since 2-D storage lives on host.
1498  typename lclinds_1d_type::HostMirror h_inds = create_mirror_view (k_inds);
1499  typename values_type::HostMirror h_vals = create_mirror_view (k_vals);
1500 
1501  // Pack the column indices and values on the host.
1502  ArrayRCP<Array<LocalOrdinal> > lclInds2D = myGraph_->lclInds2D_;
1503  for (size_t row = 0; row < lclNumRows; ++row) {
1504  const size_t numEnt = numRowEnt_h(row);
1505  std::copy (lclInds2D[row].begin(),
1506  lclInds2D[row].begin() + numEnt,
1507  h_inds.data() + h_ptrs(row));
1508  std::copy (values2D_[row].begin(),
1509  values2D_[row].begin() + numEnt,
1510  h_vals.data() + h_ptrs(row));
1511  }
1512 
1513  // Copy the packed column indices and values to the device.
1514  Kokkos::deep_copy (k_inds, h_inds);
1515  Kokkos::deep_copy (k_vals, h_vals);
1516  // Copy the packed row offsets to the device too.
1517  // We didn't actually need them on device before.
1518  Kokkos::deep_copy (k_ptrs, h_ptrs);
1519  k_ptrs_const = k_ptrs; // const version of k_ptrs
1520 
1521 #ifdef HAVE_TPETRA_DEBUG
1522  // Sanity check of packed row offsets.
1523  if (k_ptrs.extent (0) != 0) {
1524  const size_t numOffsets = static_cast<size_t> (k_ptrs.extent (0));
1525  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1526  (numOffsets != lclNumRows + 1, std::logic_error, "(DynamicProfile "
1527  "branch) After copying into k_ptrs, k_ptrs.extent(0) = " <<
1528  numOffsets << " != (lclNumRows+1) = " << (lclNumRows+1) << ".");
1529 
1530  const auto valToCheck = ::Tpetra::Details::getEntryOnHost (k_ptrs, numOffsets-1);
1531  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1532  (static_cast<size_t> (valToCheck) != k_vals.extent (0),
1533  std::logic_error, "(DynamicProfile branch) After packing, k_ptrs("
1534  << (numOffsets-1) << ") = " << valToCheck << " != "
1535  "k_vals.extent(0) = " << k_vals.extent (0) << ".");
1536  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1537  (static_cast<size_t> (valToCheck) != k_inds.extent (0),
1538  std::logic_error, "(DynamicProfile branch) After packing, k_ptrs("
1539  << (numOffsets-1) << ") = " << valToCheck << " != "
1540  "k_inds.extent(0) = " << k_inds.extent (0) << ".");
1541  }
1542 #endif // HAVE_TPETRA_DEBUG
1543  }
1544  else if (getProfileType () == StaticProfile) {
1545  // StaticProfile means that the matrix's column indices and
1546  // values are currently stored in a 1-D format, with row offsets
1547  // in k_rowPtrs_ and local column indices in k_lclInds1D_.
1548 
1549  // StaticProfile also means that the graph's array of row
1550  // offsets must already be allocated.
1551  typename Graph::local_graph_type::row_map_type curRowOffsets =
1552  myGraph_->k_rowPtrs_;
1553 
1554 #ifdef HAVE_TPETRA_DEBUG
1555  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1556  (curRowOffsets.extent (0) == 0, std::logic_error,
1557  "(StaticProfile branch) curRowOffsets.extent(0) == 0.");
1558  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1559  (curRowOffsets.extent (0) != lclNumRows + 1, std::logic_error,
1560  "(StaticProfile branch) curRowOffsets.extent(0) = "
1561  << curRowOffsets.extent (0) << " != lclNumRows + 1 = "
1562  << (lclNumRows + 1) << ".")
1563  {
1564  const size_t numOffsets = curRowOffsets.extent (0);
1565  const auto valToCheck =
1566  ::Tpetra::Details::getEntryOnHost (curRowOffsets, numOffsets - 1);
1567  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1568  (numOffsets != 0 &&
1569  myGraph_->k_lclInds1D_.extent (0) != valToCheck,
1570  std::logic_error, "(StaticProfile branch) numOffsets = " <<
1571  numOffsets << " != 0 and myGraph_->k_lclInds1D_.extent(0) = "
1572  << myGraph_->k_lclInds1D_.extent (0) << " != curRowOffsets("
1573  << numOffsets << ") = " << valToCheck << ".");
1574  }
1575 #endif // HAVE_TPETRA_DEBUG
1576 
1577  if (myGraph_->getNodeNumEntries () != myGraph_->getNodeAllocationSize ()) {
1578  // The matrix's current 1-D storage is "unpacked." This means
1579  // the row offsets may differ from what the final row offsets
1580  // should be. This could happen, for example, if the user
1581  // specified StaticProfile in the constructor and set an upper
1582  // bound on the number of entries per row, but didn't fill all
1583  // those entries.
1584 #ifdef HAVE_TPETRA_DEBUG
1585  if (curRowOffsets.extent (0) != 0) {
1586  const size_t numOffsets =
1587  static_cast<size_t> (curRowOffsets.extent (0));
1588  const auto valToCheck =
1589  ::Tpetra::Details::getEntryOnHost (curRowOffsets, numOffsets-1);
1590  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1591  (static_cast<size_t> (valToCheck) !=
1592  static_cast<size_t> (k_values1D_.extent (0)),
1593  std::logic_error, "(StaticProfile unpacked branch) Before "
1594  "allocating or packing, curRowOffsets(" << (numOffsets-1) << ") = "
1595  << valToCheck << " != k_values1D_.extent(0)"
1596  " = " << k_values1D_.extent (0) << ".");
1597  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1598  (static_cast<size_t> (valToCheck) !=
1599  static_cast<size_t> (myGraph_->k_lclInds1D_.extent (0)),
1600  std::logic_error, "(StaticProfile unpacked branch) Before "
1601  "allocating or packing, curRowOffsets(" << (numOffsets-1) << ") = "
1602  << valToCheck
1603  << " != myGraph_->k_lclInds1D_.extent(0) = "
1604  << myGraph_->k_lclInds1D_.extent (0) << ".");
1605  }
1606 #endif // HAVE_TPETRA_DEBUG
1607 
1608  // Pack the row offsets into k_ptrs, by doing a sum-scan of
1609  // the array of valid entry counts per row.
1610 
1611  // Total number of entries in the matrix on the calling
1612  // process. We will compute this in the loop below. It's
1613  // cheap to compute and useful as a sanity check.
1614  size_t lclTotalNumEntries = 0;
1615  // This will be a host view of packed row offsets.
1616  typename row_map_type::non_const_type::HostMirror h_ptrs;
1617  {
1618  // Allocate the packed row offsets array. We use a nonconst
1619  // temporary (packedRowOffsets) here, because k_ptrs is
1620  // const. We will assign packedRowOffsets to k_ptrs below.
1621  typename row_map_type::non_const_type
1622  packedRowOffsets ("Tpetra::CrsGraph::ptr", lclNumRows + 1);
1623  typename row_entries_type::const_type numRowEnt_h =
1624  myGraph_->k_numRowEntries_;
1625  // We're computing offsets on device. This function can
1626  // handle numRowEnt_h being a host View.
1627  lclTotalNumEntries =
1628  computeOffsetsFromCounts (packedRowOffsets, numRowEnt_h);
1629  // packedRowOffsets is modifiable; k_ptrs isn't, so we have
1630  // to use packedRowOffsets in the loop above and assign here.
1631  k_ptrs = packedRowOffsets;
1632  k_ptrs_const = k_ptrs;
1633  }
1634 
1635 #ifdef HAVE_TPETRA_DEBUG
1636  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1637  (static_cast<size_t> (k_ptrs.extent (0)) != lclNumRows + 1,
1638  std::logic_error,
1639  "(StaticProfile unpacked branch) After packing k_ptrs, "
1640  "k_ptrs.extent(0) = " << k_ptrs.extent (0) << " != "
1641  "lclNumRows+1 = " << (lclNumRows+1) << ".");
1642  {
1643  const auto valToCheck = ::Tpetra::Details::getEntryOnHost (k_ptrs, lclNumRows);
1644  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1645  (valToCheck != lclTotalNumEntries, std::logic_error,
1646  "(StaticProfile unpacked branch) After filling k_ptrs, "
1647  "k_ptrs(lclNumRows=" << lclNumRows << ") = " << valToCheck
1648  << " != total number of entries on the calling process = "
1649  << lclTotalNumEntries << ".");
1650  }
1651 #endif // HAVE_TPETRA_DEBUG
1652 
1653  // Allocate the arrays of packed column indices and values.
1654  k_inds = lclinds_1d_type ("Tpetra::CrsGraph::ind", lclTotalNumEntries);
1655  k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1656 
1657  // curRowOffsets (myGraph_->k_rowPtrs_) (???), k_lclInds1D_,
1658  // and k_values1D_ are currently unpacked. Pack them, using
1659  // the packed row offsets array k_ptrs that we created above.
1660  //
1661  // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
1662  // need to keep around the unpacked row offsets, column
1663  // indices, and values arrays.
1664 
1665  // Pack the column indices from unpacked k_lclInds1D_ into
1666  // packed k_inds. We will replace k_lclInds1D_ below.
1667  typedef pack_functor<typename Graph::local_graph_type::entries_type::non_const_type,
1668  typename Graph::local_graph_type::row_map_type>
1669  inds_packer_type;
1670  inds_packer_type indsPacker (k_inds, myGraph_->k_lclInds1D_,
1671  k_ptrs, curRowOffsets);
1672  typedef typename decltype (k_inds)::execution_space exec_space;
1673  typedef Kokkos::RangePolicy<exec_space, LocalOrdinal> range_type;
1674  Kokkos::parallel_for (range_type (0, lclNumRows), indsPacker);
1675 
1676  // Pack the values from unpacked k_values1D_ into packed
1677  // k_vals. We will replace k_values1D_ below.
1678  typedef pack_functor<values_type, row_map_type> vals_packer_type;
1679  vals_packer_type valsPacker (k_vals, this->k_values1D_,
1680  k_ptrs, curRowOffsets);
1681  Kokkos::parallel_for (range_type (0, lclNumRows), valsPacker);
1682 
1683 #ifdef HAVE_TPETRA_DEBUG
1684  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1685  (k_ptrs.extent (0) == 0, std::logic_error,
1686  "(StaticProfile \"Optimize Storage\" = "
1687  "true branch) After packing, k_ptrs.extent(0) = 0. This "
1688  "probably means that k_rowPtrs_ was never allocated.");
1689  if (k_ptrs.extent (0) != 0) {
1690  const size_t numOffsets = static_cast<size_t> (k_ptrs.extent (0));
1691  const auto valToCheck = ::Tpetra::Details::getEntryOnHost (k_ptrs, numOffsets - 1);
1692  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1693  (static_cast<size_t> (valToCheck) != k_vals.extent (0),
1694  std::logic_error,
1695  "(StaticProfile \"Optimize Storage\"=true branch) After packing, "
1696  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1697  " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
1698  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1699  (static_cast<size_t> (valToCheck) != k_inds.extent (0),
1700  std::logic_error,
1701  "(StaticProfile \"Optimize Storage\"=true branch) After packing, "
1702  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1703  " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
1704  }
1705 #endif // HAVE_TPETRA_DEBUG
1706  }
1707  else { // We don't have to pack, so just set the pointers.
1708  k_ptrs_const = myGraph_->k_rowPtrs_;
1709  k_inds = myGraph_->k_lclInds1D_;
1710  k_vals = this->k_values1D_;
1711 
1712 #ifdef HAVE_TPETRA_DEBUG
1713  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1714  (k_ptrs_const.extent (0) == 0, std::logic_error,
1715  "(StaticProfile \"Optimize Storage\"=false branch) "
1716  "k_ptrs_const.extent(0) = 0. This probably means that "
1717  "k_rowPtrs_ was never allocated.");
1718  if (k_ptrs_const.extent (0) != 0) {
1719  const size_t numOffsets = static_cast<size_t> (k_ptrs_const.extent (0));
1720  const auto valToCheck = ::Tpetra::Details::getEntryOnHost (k_ptrs_const, numOffsets - 1);
1721  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1722  (static_cast<size_t> (valToCheck) != k_vals.extent (0),
1723  std::logic_error,
1724  "(StaticProfile \"Optimize Storage\"=false branch) "
1725  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1726  << " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
1727  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1728  (static_cast<size_t> (valToCheck) != k_inds.extent (0),
1729  std::logic_error,
1730  "(StaticProfile \"Optimize Storage\" = false branch) "
1731  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1732  << " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
1733  }
1734 #endif // HAVE_TPETRA_DEBUG
1735  }
1736  }
1737 
1738 #ifdef HAVE_TPETRA_DEBUG
1739  // Extra sanity checks.
1740  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1741  (static_cast<size_t> (k_ptrs_const.extent (0)) != lclNumRows + 1,
1742  std::logic_error, "After packing, k_ptrs_const.extent(0) = " <<
1743  k_ptrs_const.extent (0) << " != lclNumRows+1 = " << (lclNumRows+1)
1744  << ".");
1745  if (k_ptrs_const.extent (0) != 0) {
1746  const size_t numOffsets = static_cast<size_t> (k_ptrs_const.extent (0));
1747  const size_t k_ptrs_const_numOffsetsMinus1 =
1748  ::Tpetra::Details::getEntryOnHost (k_ptrs_const, numOffsets - 1);
1749  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1750  (k_ptrs_const_numOffsetsMinus1 != k_vals.extent (0),
1751  std::logic_error, "After packing, k_ptrs_const(" << (numOffsets-1) <<
1752  ") = " << k_ptrs_const_numOffsetsMinus1 << " != k_vals.extent(0)"
1753  " = " << k_vals.extent (0) << ".");
1754  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1755  (k_ptrs_const_numOffsetsMinus1 != k_inds.extent (0),
1756  std::logic_error, "After packing, k_ptrs_const(" << (numOffsets-1) <<
1757  ") = " << k_ptrs_const_numOffsetsMinus1 << " != k_inds.extent(0)"
1758  " = " << k_inds.extent (0) << ".");
1759  }
1760 #endif // HAVE_TPETRA_DEBUG
1761 
1762  // May we ditch the old allocations for the packed (and otherwise
1763  // "optimized") allocations, later in this routine? Optimize
1764  // storage if the graph is not static, or if the graph already has
1765  // optimized storage.
1766  const bool defaultOptStorage =
1767  ! isStaticGraph () || staticGraph_->isStorageOptimized ();
1768  const bool requestOptimizedStorage =
1769  (! params.is_null () && params->get ("Optimize Storage", defaultOptStorage)) ||
1770  (params.is_null () && defaultOptStorage);
1771 
1772  // The graph has optimized storage when indices are allocated,
1773  // myGraph_->k_numRowEntries_ is empty, and there are more than
1774  // zero rows on this process. It's impossible for the graph to
1775  // have dynamic profile (getProfileType() == DynamicProfile) and
1776  // be optimized (isStorageOptimized()).
1777  if (requestOptimizedStorage) {
1778  // Free the old, unpacked, unoptimized allocations.
1779  // Change the graph from dynamic to static allocation profile
1780 
1781  // Free graph data structures that are only needed for 2-D or
1782  // unpacked 1-D storage.
1783  myGraph_->lclInds2D_ = null; // legacy KokkosClassic 2-D storage
1784  myGraph_->k_numRowEntries_ = row_entries_type ();
1785 
1786  // Free the matrix's 2-D storage.
1787  this->values2D_ = null;
1788 
1789  // Keep the new 1-D packed allocations.
1790  myGraph_->k_rowPtrs_ = k_ptrs_const;
1791  myGraph_->k_lclInds1D_ = k_inds;
1792  this->k_values1D_ = k_vals;
1793 
1794  // Whatever graph was before, it's StaticProfile now.
1795  myGraph_->pftype_ = StaticProfile;
1796  myGraph_->storageStatus_ = ::Tpetra::Details::STORAGE_1D_PACKED;
1797  this->storageStatus_ = ::Tpetra::Details::STORAGE_1D_PACKED;
1798  }
1799 
1800  // Make the local graph, using the arrays of row offsets and
1801  // column indices that we built above. The local graph should be
1802  // null, but we delete it first so that any memory can be freed
1803  // before we allocate the new one.
1804  //
1805  // FIXME (mfh 06,28 Aug 2014) It would make more sense for
1806  // Tpetra::CrsGraph to have a protected method that accepts k_inds
1807  // and k_ptrs, and creates the local graph lclGraph_.
1808  myGraph_->lclGraph_ =
1809  typename Graph::local_graph_type (k_inds, k_ptrs_const);
1810 
1811  // Make the local matrix, using the local graph and vals array.
1812  auto lclMat = std::make_shared<local_matrix_type>
1813  ("Tpetra::CrsMatrix::lclMatrix_", getNodeNumCols (),
1814  k_vals, myGraph_->lclGraph_);
1815  lclMatrix_ = std::make_shared<local_multiply_op_type> (lclMat);
1816  }
1817 
1818  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1819  void
1821  fillLocalMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1822  {
1823  using ::Tpetra::Details::ProfilingRegion;
1824  using Kokkos::create_mirror_view;
1825  using Teuchos::ArrayRCP;
1826  using Teuchos::Array;
1827  using Teuchos::null;
1828  using Teuchos::RCP;
1829  using Teuchos::rcp;
1830  typedef LocalOrdinal LO;
1831  typedef typename Graph::local_graph_type::row_map_type row_map_type;
1832  typedef typename row_map_type::non_const_type non_const_row_map_type;
1833  typedef typename local_matrix_type::values_type values_type;
1834 #ifdef HAVE_TPETRA_DEBUG
1835  const char tfecfFuncName[] = "fillLocalMatrix (called from fillComplete): ";
1836 #endif // HAVE_TPETRA_DEBUG
1837  ProfilingRegion regionFLM ("Tpetra::CrsMatrix::fillLocalMatrix");
1838 
1839  const size_t lclNumRows = getNodeNumRows();
1840 
1841  // The goals of this routine are first, to allocate and fill
1842  // packed 1-D storage (see below for an explanation) in the vals
1843  // array, and second, to give vals to the local matrix and
1844  // finalize the local matrix. We only need k_ptrs, the packed 1-D
1845  // row offsets, within the scope of this routine, since we're only
1846  // filling the local matrix here (use fillLocalGraphAndMatrix() to
1847  // fill both the graph and the matrix at the same time).
1848 
1849  // get data from staticGraph_
1850  ArrayRCP<Array<LO> > lclInds2D = staticGraph_->lclInds2D_;
1851  size_t nodeNumEntries = staticGraph_->getNodeNumEntries ();
1852  size_t nodeNumAllocated = staticGraph_->getNodeAllocationSize ();
1853  row_map_type k_rowPtrs_ = staticGraph_->lclGraph_.row_map;
1854 
1855  row_map_type k_ptrs; // "packed" row offsets array
1856  values_type k_vals; // "packed" values array
1857 
1858  // May we ditch the old allocations for the packed (and otherwise
1859  // "optimized") allocations, later in this routine? Request
1860  // optimized storage by default.
1861  bool requestOptimizedStorage = true;
1862  const bool default_OptimizeStorage =
1863  ! isStaticGraph () || staticGraph_->isStorageOptimized ();
1864  if (! params.is_null () && ! params->get ("Optimize Storage", default_OptimizeStorage)) {
1865  requestOptimizedStorage = false;
1866  }
1867  // If we're not allowed to change a static graph, then we can't
1868  // change the storage of the matrix, either. This means that if
1869  // the graph's storage isn't already optimized, we can't optimize
1870  // the matrix's storage either. Check and give warning, as
1871  // appropriate.
1872  if (! staticGraph_->isStorageOptimized () && requestOptimizedStorage) {
1873  TPETRA_ABUSE_WARNING(true, std::runtime_error,
1874  "You requested optimized storage by setting the"
1875  "\"Optimize Storage\" flag to \"true\" in the parameter list, or by virtue"
1876  "of default behavior. However, the associated CrsGraph was filled separately"
1877  "and requested not to optimize storage. Therefore, the CrsMatrix cannot"
1878  "optimize storage.");
1879  requestOptimizedStorage = false;
1880  }
1881 
1882  typedef decltype (staticGraph_->k_numRowEntries_) row_entries_type;
1883 
1884  if (getProfileType() != StaticProfile) {
1885  // Pack 2-D storage (DynamicProfile) into 1-D packed storage.
1886  //
1887  // DynamicProfile means that the matrix's values are currently
1888  // stored in a 2-D "unpacked" format, in the array-of-arrays
1889  // values2D_. We allocate 1-D storage and then copy from 2-D
1890  // storage in values2D_ into 1-D storage in k_vals. Since we're
1891  // only allocating the local matrix here, not the local graph,
1892  // we don't need to keep the row offsets array, but we do need
1893  // it here temporarily in order to convert to 1-D storage. (The
1894  // allocStorage() function needs it.) We'll free ptrs later in
1895  // this method.
1896  //
1897  // FIXME (mfh 08 Aug 2014) If we're in this method, then the
1898  // graph should already have packed 1-D storage. Why can't we
1899  // just use the graph's current row offsets array?
1900 
1901  // Pack the row offsets into k_ptrs, by doing a sum-scan of
1902  // the array of valid entry counts per row.
1903  //
1904  // Total number of entries in the matrix on the calling
1905  // process. We will compute this in the loop below. It's
1906  // cheap to compute and useful as a sanity check.
1907  size_t lclTotalNumEntries = 0;
1908  // This will be a host view of packed row offsets.
1909  typename non_const_row_map_type::HostMirror h_ptrs;
1910 
1911  typename row_entries_type::const_type numRowEnt_h =
1912  staticGraph_->k_numRowEntries_;
1913  {
1914  non_const_row_map_type packedRowOffsets ("Tpetra::CrsGraph::ptr",
1915  lclNumRows+1);
1916  // NOTE (mfh 27 Jun 2016) We need h_ptrs on host anyway, so
1917  // let's just compute offsets on host.
1918  h_ptrs = create_mirror_view (packedRowOffsets);
1920  lclTotalNumEntries = computeOffsetsFromCounts (h_ptrs, numRowEnt_h);
1921  Kokkos::deep_copy (packedRowOffsets, h_ptrs);
1922  k_ptrs = packedRowOffsets;
1923  }
1924 
1925 #ifdef HAVE_TPETRA_DEBUG
1926  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1927  (static_cast<size_t> (k_ptrs.extent (0)) != lclNumRows + 1,
1928  std::logic_error, "In DynamicProfile branch, after packing k_ptrs, "
1929  "k_ptrs.extent(0) = " << k_ptrs.extent (0) << " != "
1930  "(lclNumRows+1) = " << (lclNumRows+1) << ".");
1931  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1932  (static_cast<size_t> (h_ptrs.extent (0)) != lclNumRows + 1,
1933  std::logic_error, "In DynamicProfile branch, after packing h_ptrs, "
1934  "h_ptrs.extent(0) = " << h_ptrs.extent (0) << " != "
1935  "(lclNumRows+1) = " << (lclNumRows+1) << ".");
1936  {
1937  const auto valToCheck = ::Tpetra::Details::getEntryOnHost (k_ptrs, lclNumRows);
1938  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1939  (static_cast<size_t> (valToCheck) != lclTotalNumEntries,
1940  std::logic_error, "(DynamicProfile branch) After packing k_ptrs, "
1941  "k_ptrs(lclNumRows = " << lclNumRows << ") = " << valToCheck
1942  << " != total number of entries on the calling process = "
1943  << lclTotalNumEntries << ".");
1944  }
1945 #endif // HAVE_TPETRA_DEBUG
1946 
1947  // Allocate the array of packed values.
1948  k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1949  // We need a host view of the above, since 2-D storage lives on host.
1950  typename values_type::HostMirror h_vals = create_mirror_view (k_vals);
1951  // Pack the values on the host.
1952  for (size_t lclRow = 0; lclRow < lclNumRows; ++lclRow) {
1953  const size_t numEnt = numRowEnt_h(lclRow);
1954  std::copy (values2D_[lclRow].begin(),
1955  values2D_[lclRow].begin() + numEnt,
1956  h_vals.data() + h_ptrs(lclRow));
1957  }
1958  // Copy the packed values to the device.
1959  Kokkos::deep_copy (k_vals, h_vals);
1960 
1961 #ifdef HAVE_TPETRA_DEBUG
1962  // Sanity check of packed row offsets.
1963  if (k_ptrs.extent (0) != 0) {
1964  const size_t numOffsets = static_cast<size_t> (k_ptrs.extent (0));
1965  const auto valToCheck =
1966  ::Tpetra::Details::getEntryOnHost (k_ptrs, numOffsets - 1);
1967  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1968  (static_cast<size_t> (valToCheck) != k_vals.extent (0),
1969  std::logic_error, "(DynamicProfile branch) After packing, k_ptrs("
1970  << (numOffsets-1) << ") = " << valToCheck << " != "
1971  "k_vals.extent(0) = " << k_vals.extent (0) << ".");
1972  }
1973 #endif // HAVE_TPETRA_DEBUG
1974  }
1975  else if (getProfileType () == StaticProfile) {
1976  // StaticProfile means that the matrix's values are currently
1977  // stored in a 1-D format. However, this format is "unpacked";
1978  // it doesn't necessarily have the same row offsets as indicated
1979  // by the ptrs array returned by allocRowPtrs. This could
1980  // happen, for example, if the user specified StaticProfile in
1981  // the constructor and fixed the number of matrix entries in
1982  // each row, but didn't fill all those entries.
1983  //
1984  // As above, we don't need to keep the "packed" row offsets
1985  // array ptrs here, but we do need it here temporarily, so we
1986  // have to allocate it. We'll free ptrs later in this method.
1987  //
1988  // Note that this routine checks whether storage has already
1989  // been packed. This is a common case for solution of nonlinear
1990  // PDEs using the finite element method, as long as the
1991  // structure of the sparse matrix does not change between linear
1992  // solves.
1993  if (nodeNumEntries != nodeNumAllocated) {
1994  // We have to pack the 1-D storage, since the user didn't fill
1995  // up all requested storage.
1996  non_const_row_map_type tmpk_ptrs ("Tpetra::CrsGraph::ptr",
1997  lclNumRows+1);
1998  // Total number of entries in the matrix on the calling
1999  // process. We will compute this in the loop below. It's
2000  // cheap to compute and useful as a sanity check.
2001  size_t lclTotalNumEntries = 0;
2002  k_ptrs = tmpk_ptrs;
2003  {
2004  typename row_entries_type::const_type numRowEnt_d =
2005  staticGraph_->k_numRowEntries_;
2007  // This function can handle the counts being a host View.
2008  lclTotalNumEntries = computeOffsetsFromCounts (tmpk_ptrs, numRowEnt_d);
2009  }
2010 
2011  // Allocate the "packed" values array.
2012  // It has exactly the right number of entries.
2013  k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
2014 
2015  // Pack k_values1D_ into k_vals. We will replace k_values1D_ below.
2016  typedef pack_functor<values_type, row_map_type> packer_type;
2017  packer_type valsPacker (k_vals, k_values1D_, tmpk_ptrs, k_rowPtrs_);
2018 
2019  typedef typename decltype (k_vals)::execution_space exec_space;
2020  typedef Kokkos::RangePolicy<exec_space, LocalOrdinal> range_type;
2021  Kokkos::parallel_for (range_type (0, lclNumRows), valsPacker);
2022  }
2023  else { // We don't have to pack, so just set the pointer.
2024  k_vals = k_values1D_;
2025  }
2026  }
2027 
2028  // May we ditch the old allocations for the packed one?
2029  if (requestOptimizedStorage) {
2030  // The user requested optimized storage, so we can dump the
2031  // unpacked 2-D and 1-D storage, and keep the packed storage.
2032  values2D_ = null;
2033  k_values1D_ = k_vals;
2034  this->storageStatus_ = ::Tpetra::Details::STORAGE_1D_PACKED;
2035  }
2036 
2037  // Build the local sparse matrix object. At this point, the local
2038  // matrix certainly has a column Map. Remember that the local
2039  // matrix's number of columns comes from the column Map, not the
2040  // domain Map.
2041  auto lclMat = std::make_shared<local_matrix_type>
2042  ("Tpetra::CrsMatrix::lclMatrix_",
2043  getColMap ()->getNodeNumElements (),
2044  k_vals, staticGraph_->getLocalGraph ());
2045  lclMatrix_ = std::make_shared<local_multiply_op_type> (lclMat);
2046  }
2047 
2048  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2049  void
2051  insertIndicesAndValues (crs_graph_type& graph,
2052  RowInfo& rowInfo,
2053  const typename crs_graph_type::SLocalGlobalViews& newInds,
2054  const Teuchos::ArrayView<impl_scalar_type>& oldRowVals,
2055  const Teuchos::ArrayView<const impl_scalar_type>& newRowVals,
2056  const ELocalGlobal lg,
2057  const ELocalGlobal I)
2058  {
2059  const size_t oldNumEnt = rowInfo.numEntries;
2060  const size_t numInserted = graph.insertIndices (rowInfo, newInds, lg, I);
2061 
2062  // Use of memcpy here works around an issue with GCC >= 4.9.0,
2063  // that probably relates to scalar_type vs. impl_scalar_type
2064  // aliasing. See history of Tpetra_CrsGraph_def.hpp for
2065  // details; look for GCC_WORKAROUND macro definition.
2066  if (numInserted > 0) {
2067  const size_t startOffset = oldNumEnt;
2068  memcpy (&oldRowVals[startOffset], &newRowVals[0],
2069  numInserted * sizeof (impl_scalar_type));
2070  }
2071  }
2072 
2073  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2074  void
2076  insertLocalValues (const LocalOrdinal lclRow,
2077  const Teuchos::ArrayView<const LocalOrdinal>& indices,
2078  const Teuchos::ArrayView<const Scalar>& values)
2079  {
2080  using std::endl;
2081  typedef impl_scalar_type IST;
2082  const char tfecfFuncName[] = "insertLocalValues: ";
2083 
2084  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2085  (! this->isFillActive (), std::runtime_error,
2086  "Fill is not active. After calling fillComplete, you must call "
2087  "resumeFill before you may insert entries into the matrix again.");
2088  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2089  (this->isStaticGraph (), std::runtime_error,
2090  "Cannot insert indices with static graph; use replaceLocalValues() "
2091  "instead.");
2092  // At this point, we know that myGraph_ is nonnull.
2093  crs_graph_type& graph = * (this->myGraph_);
2094  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2095  (graph.colMap_.is_null (), std::runtime_error,
2096  "Cannot insert local indices without a column map.");
2097  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2098  (graph.isGloballyIndexed (),
2099  std::runtime_error, "Graph indices are global; use "
2100  "insertGlobalValues().");
2101  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2102  (values.size () != indices.size (), std::runtime_error,
2103  "values.size() = " << values.size ()
2104  << " != indices.size() = " << indices.size () << ".");
2105  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2106  ! graph.rowMap_->isNodeLocalElement (lclRow), std::runtime_error,
2107  "Local row index " << lclRow << " does not belong to this process.");
2108 
2109  if (! graph.indicesAreAllocated ()) {
2110  this->allocateValues (LocalIndices, GraphNotYetAllocated);
2111  }
2112 
2113  const size_t numEntriesToAdd = static_cast<size_t> (indices.size ());
2114 #ifdef HAVE_TPETRA_DEBUG
2115  // In a debug build, test whether any of the given column indices
2116  // are not in the column Map. Keep track of the invalid column
2117  // indices so we can tell the user about them.
2118  {
2119  using Teuchos::toString;
2120 
2121  const map_type& colMap = * (graph.colMap_);
2122  Teuchos::Array<LocalOrdinal> badColInds;
2123  bool allInColMap = true;
2124  for (size_t k = 0; k < numEntriesToAdd; ++k) {
2125  if (! colMap.isNodeLocalElement (indices[k])) {
2126  allInColMap = false;
2127  badColInds.push_back (indices[k]);
2128  }
2129  }
2130  if (! allInColMap) {
2131  std::ostringstream os;
2132  os << "You attempted to insert entries in owned row " << lclRow
2133  << ", at the following column indices: " << toString (indices)
2134  << "." << endl;
2135  os << "Of those, the following indices are not in the column Map on "
2136  "this process: " << toString (badColInds) << "." << endl << "Since "
2137  "the matrix has a column Map already, it is invalid to insert "
2138  "entries at those locations.";
2139  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2140  (true, std::invalid_argument, os.str ());
2141  }
2142  }
2143 #endif // HAVE_TPETRA_DEBUG
2144 
2145  RowInfo rowInfo = graph.getRowInfo (lclRow);
2146 
2147  if (this->getProfileType() == StaticProfile)
2148  {
2149  Teuchos::ArrayView<IST> valsView = this->getViewNonConst(rowInfo);
2150  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
2151  valsView[offset] += values[k]; };
2152  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2153  graph.insertLocalIndicesImpl(lclRow, indices, cb);
2154  }
2155  else
2156  {
2157  // NOTE (DYNAMICPROFILE_REMOVAL) (tjf Mar 2019) Remove with DynamicProfile
2158  const size_t curNumEnt = rowInfo.numEntries;
2159  const size_t newNumEnt = curNumEnt + numEntriesToAdd;
2160  if (newNumEnt > rowInfo.allocSize) {
2161  // This must be a nonconst reference, since we'll reallocate.
2162  Teuchos::Array<IST>& curVals = this->values2D_[lclRow];
2163  // Make space for the new matrix entries.
2164  // Teuchos::ArrayRCP::resize automatically copies over values on
2165  // reallocation.
2166  graph.lclInds2D_[rowInfo.localRow].resize (newNumEnt);
2167  curVals.resize (newNumEnt);
2168  rowInfo.allocSize = newNumEnt; // give rowInfo updated allocSize
2169  }
2170  typename crs_graph_type::SLocalGlobalViews indsView;
2171  indsView.linds = indices;
2172 
2173  Teuchos::ArrayView<IST> valsView = this->getViewNonConst (rowInfo);
2174  Teuchos::ArrayView<const IST> valsIn =
2175  Teuchos::av_reinterpret_cast<const IST> (values);
2176  this->insertIndicesAndValues (graph, rowInfo, indsView, valsView,
2177  valsIn, LocalIndices, LocalIndices);
2178 #ifdef HAVE_TPETRA_DEBUG
2179  const size_t chkNewNumEnt = graph.getNumEntriesInLocalRow (lclRow);
2180  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2181  (chkNewNumEnt != newNumEnt, std::logic_error,
2182  "The row should have " << newNumEnt << " entries after insert, but "
2183  "instead has " << chkNewNumEnt << ". Please report this bug to "
2184  "the Tpetra developers.");
2185 #endif // HAVE_TPETRA_DEBUG
2186  }
2187  }
2188 
2189  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2190  void
2192  insertLocalValues (const LocalOrdinal localRow,
2193  const LocalOrdinal numEnt,
2194  const Scalar vals[],
2195  const LocalOrdinal cols[])
2196  {
2197  Teuchos::ArrayView<const LocalOrdinal> colsT (cols, numEnt);
2198  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
2199  this->insertLocalValues (localRow, colsT, valsT);
2200  }
2201 
2202  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2203  void
2205  insertGlobalValuesImpl (crs_graph_type& graph,
2206  RowInfo& rowInfo,
2207  const GlobalOrdinal gblColInds[],
2208  const impl_scalar_type vals[],
2209  const size_t numInputEnt)
2210  {
2211  typedef impl_scalar_type IST;
2212  typedef GlobalOrdinal GO;
2213 #ifdef HAVE_TPETRA_DEBUG
2214  const char tfecfFuncName[] = "insertGlobalValuesImpl: ";
2215  const size_t origNumEnt = graph.getNumEntriesInLocalRow (rowInfo.localRow);
2216 #endif // HAVE_TPETRA_DEBUG
2217 
2218  size_t newNumEnt = 0;
2219  const size_t curNumEnt = rowInfo.numEntries;
2220 
2221  if (! graph.indicesAreAllocated ()) {
2222  this->allocateValues (GlobalIndices, GraphNotYetAllocated);
2223  // mfh 23 Jul 2017: allocateValues invalidates existing
2224  // getRowInfo results. Once we get rid of lazy graph
2225  // allocation, we'll be able to move the getRowInfo call outside
2226  // of this method.
2227  rowInfo = graph.getRowInfo (rowInfo.localRow);
2228  }
2229 
2230  if (this->getProfileType () == StaticProfile) {
2231  Teuchos::ArrayView<IST> valsView = this->getViewNonConst(rowInfo);
2232  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
2233  valsView[offset] += vals[k];
2234  };
2235  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2236  auto numInserted =
2237  graph.insertGlobalIndicesImpl(rowInfo, gblColInds, numInputEnt, cb);
2238  newNumEnt = curNumEnt + numInserted;
2239  }
2240  else {
2241  // NOTE (DYNAMICPROFILE_REMOVAL) remove this block
2242  newNumEnt = curNumEnt + numInputEnt;
2243  if (newNumEnt > rowInfo.allocSize) {
2244  // This needs to be a nonconst reference, in case we want to
2245  // reallocate it.
2246  Teuchos::Array<IST>& curVals = this->values2D_[rowInfo.localRow];
2247  // Teuchos::ArrayRCP::resize automatically copies over values on
2248  // reallocation.
2249  graph.gblInds2D_[rowInfo.localRow].resize (newNumEnt);
2250  curVals.resize (newNumEnt);
2251  rowInfo.allocSize = newNumEnt; // reassign for updated allocSize
2252  }
2253 
2254  using Teuchos::ArrayView;
2255  typename crs_graph_type::SLocalGlobalViews inputIndsAV;
2256  inputIndsAV.ginds = ArrayView<const GO> (gblColInds, numInputEnt);
2257  ArrayView<IST> curValsAV = this->getViewNonConst (rowInfo);
2258  ArrayView<const IST> inputValsAV (vals, numInputEnt);
2259 
2260  const ELocalGlobal curIndexingStatus =
2261  this->isGloballyIndexed () ? GlobalIndices : LocalIndices;
2262  // curIndexingStatus == GlobalIndices means the method calls
2263  // getGlobalViewNonConst() and does direct copying, which should
2264  // be reasonably fast. LocalIndices means the method calls the
2265  // Map's getLocalElement() method once per entry to insert. This
2266  // may be slow.
2267  this->insertIndicesAndValues (graph, rowInfo, inputIndsAV, curValsAV,
2268  inputValsAV, GlobalIndices,
2269  curIndexingStatus);
2270  }
2271 
2272 #ifdef HAVE_TPETRA_DEBUG
2273  const size_t chkNewNumEnt =
2274  graph.getNumEntriesInLocalRow (rowInfo.localRow);
2275  if (chkNewNumEnt != newNumEnt) {
2276  std::ostringstream os;
2277  os << std::endl << "newNumEnt = " << newNumEnt
2278  << " != graph.getNumEntriesInLocalRow(" << rowInfo.localRow
2279  << ") = " << chkNewNumEnt << "." << std::endl
2280  << "\torigNumEnt: " << origNumEnt << std::endl
2281  << "\tnumInputEnt: " << numInputEnt << std::endl
2282  << "\tgblColInds: [";
2283  for (size_t k = 0; k < numInputEnt; ++k) {
2284  os << gblColInds[k];
2285  if (k + size_t (1) < numInputEnt) {
2286  os << ",";
2287  }
2288  }
2289  os << "]" << std::endl
2290  << "\tvals: [";
2291  for (size_t k = 0; k < numInputEnt; ++k) {
2292  os << vals[k];
2293  if (k + size_t (1) < numInputEnt) {
2294  os << ",";
2295  }
2296  }
2297  os << "]" << std::endl;
2298 
2299  if (this->supportsRowViews ()) {
2300  Teuchos::ArrayView<const Scalar> vals2;
2301  if (this->isGloballyIndexed ()) {
2302  Teuchos::ArrayView<const GlobalOrdinal> gblColInds2;
2303  const GlobalOrdinal gblRow =
2304  graph.rowMap_->getGlobalElement (rowInfo.localRow);
2305  if (gblRow == Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid ()) {
2306  os << "Local row index " << rowInfo.localRow << " is invalid!" << std::endl;
2307  }
2308  else {
2309  bool getViewThrew = false;
2310  try {
2311  this->getGlobalRowView (gblRow, gblColInds2, vals2);
2312  }
2313  catch (std::exception& e) {
2314  getViewThrew = true;
2315  os << "getGlobalRowView threw exception:" << std::endl
2316  << e.what () << std::endl;
2317  }
2318  if (! getViewThrew) {
2319  os << "\tNew global column indices: "
2320  << Teuchos::toString (gblColInds2) << std::endl
2321  << "\tNew values: " << Teuchos::toString (vals2) << std::endl;
2322  }
2323  }
2324  }
2325  else if (this->isLocallyIndexed ()) {
2326  Teuchos::ArrayView<const LocalOrdinal> lclColInds2;
2327  this->getLocalRowView (rowInfo.localRow, lclColInds2, vals2);
2328  os << "\tNew local column indices: " << Teuchos::toString (lclColInds2)
2329  << std::endl;
2330  os << "\tNew values: " << Teuchos::toString (vals2) << std::endl;
2331  }
2332  }
2333 
2334  os << "Please report this bug to the Tpetra developers.";
2335  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2336  (true, std::logic_error, os.str ());
2337  }
2338 #endif // HAVE_TPETRA_DEBUG
2339  }
2340 
2341  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2342  void
2344  insertGlobalValues (const GlobalOrdinal gblRow,
2345  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2346  const Teuchos::ArrayView<const Scalar>& values)
2347  {
2348  using Teuchos::toString;
2349  using std::endl;
2350  typedef impl_scalar_type IST;
2351  typedef LocalOrdinal LO;
2352  typedef GlobalOrdinal GO;
2353  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2354  typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
2355  const char tfecfFuncName[] = "insertGlobalValues: ";
2356 
2357 #ifdef HAVE_TPETRA_DEBUG
2358  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2359  (values.size () != indices.size (), std::runtime_error,
2360  "values.size() = " << values.size () << " != indices.size() = "
2361  << indices.size () << ".");
2362 #endif // HAVE_TPETRA_DEBUG
2363 
2364  // getRowMap() is not thread safe, because it increments RCP's
2365  // reference count. getCrsGraphRef() is thread safe.
2366  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2367  const LO lclRow = rowMap.getLocalElement (gblRow);
2368 
2369  if (lclRow == OTLO::invalid ()) {
2370  // Input row is _not_ owned by the calling process.
2371  //
2372  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2373  // is not in the row Map, it doesn't matter whether or not the
2374  // graph is static; the data just get stashed for later use by
2375  // globalAssemble().
2376  this->insertNonownedGlobalValues (gblRow, indices, values);
2377  }
2378  else { // Input row _is_ owned by the calling process
2379  if (this->isStaticGraph ()) {
2380  // Uh oh! Not allowed to insert into owned rows in that case.
2381  const int myRank = rowMap.getComm ()->getRank ();
2382  const int numProcs = rowMap.getComm ()->getSize ();
2383  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2384  (true, std::runtime_error,
2385  "The matrix was constructed with a constant (\"static\") graph, "
2386  "yet the given global row index " << gblRow << " is in the row "
2387  "Map on the calling process (with rank " << myRank << ", of " <<
2388  numProcs << " process(es)). In this case, you may not insert "
2389  "new entries into rows owned by the calling process.");
2390  }
2391 
2392  crs_graph_type& graph = * (this->myGraph_);
2393  const IST* const inputVals =
2394  reinterpret_cast<const IST*> (values.getRawPtr ());
2395  const GO* const inputGblColInds = indices.getRawPtr ();
2396  const size_t numInputEnt = indices.size ();
2397  RowInfo rowInfo = graph.getRowInfo (lclRow);
2398 
2399  // If the matrix has a column Map, check at this point whether
2400  // the column indices belong to the column Map.
2401  //
2402  // FIXME (mfh 16 May 2013) We may want to consider deferring the
2403  // test to the CrsGraph method, since it may have to do this
2404  // anyway.
2405  if (! graph.colMap_.is_null ()) {
2406  const map_type& colMap = * (graph.colMap_);
2407  // In a debug build, keep track of the nonowned ("bad") column
2408  // indices, so that we can display them in the exception
2409  // message. In a release build, just ditch the loop early if
2410  // we encounter a nonowned column index.
2411 #ifdef HAVE_TPETRA_DEBUG
2412  Teuchos::Array<GO> badColInds;
2413 #endif // HAVE_TPETRA_DEBUG
2414  const size_type numEntriesToInsert = indices.size ();
2415  bool allInColMap = true;
2416  for (size_type k = 0; k < numEntriesToInsert; ++k) {
2417  if (! colMap.isNodeGlobalElement (indices[k])) {
2418  allInColMap = false;
2419 #ifdef HAVE_TPETRA_DEBUG
2420  badColInds.push_back (indices[k]);
2421 #else
2422  break;
2423 #endif // HAVE_TPETRA_DEBUG
2424  }
2425  }
2426  if (! allInColMap) {
2427  std::ostringstream os;
2428  os << "You attempted to insert entries in owned row " << gblRow
2429  << ", at the following column indices: " << toString (indices)
2430  << "." << endl;
2431 #ifdef HAVE_TPETRA_DEBUG
2432  os << "Of those, the following indices are not in the column Map "
2433  "on this process: " << toString (badColInds) << "." << endl
2434  << "Since the matrix has a column Map already, it is invalid "
2435  "to insert entries at those locations.";
2436 #else
2437  os << "At least one of those indices is not in the column Map "
2438  "on this process." << endl << "It is invalid to insert into "
2439  "columns not in the column Map on the process that owns the "
2440  "row.";
2441 #endif // HAVE_TPETRA_DEBUG
2442  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2443  (true, std::invalid_argument, os.str ());
2444  }
2445  }
2446 
2447  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2448  inputVals, numInputEnt);
2449  }
2450  }
2451 
2452 
2453  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2454  void
2456  insertGlobalValues (const GlobalOrdinal globalRow,
2457  const LocalOrdinal numEnt,
2458  const Scalar vals[],
2459  const GlobalOrdinal inds[])
2460  {
2461  Teuchos::ArrayView<const GlobalOrdinal> indsT (inds, numEnt);
2462  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
2463  this->insertGlobalValues (globalRow, indsT, valsT);
2464  }
2465 
2466 
2467  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2468  void
2470  insertGlobalValuesFiltered (const GlobalOrdinal gblRow,
2471  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2472  const Teuchos::ArrayView<const Scalar>& values)
2473  {
2474  typedef impl_scalar_type IST;
2475  typedef LocalOrdinal LO;
2476  typedef GlobalOrdinal GO;
2477  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2478  const char tfecfFuncName[] = "insertGlobalValuesFiltered: ";
2479 
2480 #ifdef HAVE_TPETRA_DEBUG
2481  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2482  (values.size () != indices.size (), std::runtime_error,
2483  "values.size() = " << values.size () << " != indices.size() = "
2484  << indices.size () << ".");
2485 #endif // HAVE_TPETRA_DEBUG
2486 
2487  // getRowMap() is not thread safe, because it increments RCP's
2488  // reference count. getCrsGraphRef() is thread safe.
2489  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2490  const LO lclRow = rowMap.getLocalElement (gblRow);
2491  if (lclRow == OTLO::invalid ()) {
2492  // Input row is _not_ owned by the calling process.
2493  //
2494  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2495  // is not in the row Map, it doesn't matter whether or not the
2496  // graph is static; the data just get stashed for later use by
2497  // globalAssemble().
2498  this->insertNonownedGlobalValues (gblRow, indices, values);
2499  }
2500  else { // Input row _is_ owned by the calling process
2501  if (this->isStaticGraph ()) {
2502  // Uh oh! Not allowed to insert into owned rows in that case.
2503  const int myRank = rowMap.getComm ()->getRank ();
2504  const int numProcs = rowMap.getComm ()->getSize ();
2505  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2506  (true, std::runtime_error,
2507  "The matrix was constructed with a constant (\"static\") graph, "
2508  "yet the given global row index " << gblRow << " is in the row "
2509  "Map on the calling process (with rank " << myRank << ", of " <<
2510  numProcs << " process(es)). In this case, you may not insert "
2511  "new entries into rows owned by the calling process.");
2512  }
2513 
2514  crs_graph_type& graph = * (this->myGraph_);
2515  const IST* const inputVals =
2516  reinterpret_cast<const IST*> (values.getRawPtr ());
2517  const GO* const inputGblColInds = indices.getRawPtr ();
2518  const size_t numInputEnt = indices.size ();
2519  RowInfo rowInfo = graph.getRowInfo (lclRow);
2520 
2521  if (!graph.colMap_.is_null() &&
2522  graph.isLocallyIndexed() &&
2523  this->getProfileType() == StaticProfile) {
2524  // This branch is similar in function to the following branch, but for
2525  // the special case that the target graph is locally indexed (and the
2526  // profile type is StaticProfile). In this case, we cannot simply filter
2527  // out global indices that don't exist on the receiving process and
2528  // insert the remaining (global) indices, but we must convert them (the
2529  // remaining global indices) to local and call `insertLocalValues`.
2530  const map_type& colMap = * (graph.colMap_);
2531  size_t curOffset = 0;
2532  while (curOffset < numInputEnt) {
2533  // Find a sequence of input indices that are in the column Map on the
2534  // calling process. Doing a sequence at a time, instead of one at a
2535  // time, amortizes some overhead.
2536  Teuchos::Array<LO> lclIndices;
2537  size_t endOffset = curOffset;
2538  for ( ; endOffset < numInputEnt; ++endOffset) {
2539  auto lclIndex = colMap.getLocalElement(inputGblColInds[endOffset]);
2540  if (lclIndex != OTLO::invalid())
2541  lclIndices.push_back(lclIndex);
2542  else
2543  break;
2544  }
2545  // curOffset, endOffset: half-exclusive range of indices in the column
2546  // Map on the calling process. If endOffset == curOffset, the range is
2547  // empty.
2548  const LO numIndInSeq = (endOffset - curOffset);
2549  if (numIndInSeq != 0) {
2550  this->insertLocalValues(lclRow, lclIndices(), values(curOffset, numIndInSeq));
2551  }
2552  // Invariant before the increment line: Either endOffset ==
2553  // numInputEnt, or inputGblColInds[endOffset] is not in the column Map
2554  // on the calling process.
2555 #ifdef HAVE_TPETRA_DEBUG
2556  const bool invariant = endOffset == numInputEnt ||
2557  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2558  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2559  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2560 #endif // HAVE_TPETRA_DEBUG
2561  curOffset = endOffset + 1;
2562  }
2563  }
2564  else if (! graph.colMap_.is_null ()) { // We have a column Map.
2565  const map_type& colMap = * (graph.colMap_);
2566  size_t curOffset = 0;
2567  while (curOffset < numInputEnt) {
2568  // Find a sequence of input indices that are in the column
2569  // Map on the calling process. Doing a sequence at a time,
2570  // instead of one at a time, amortizes some overhead.
2571  size_t endOffset = curOffset;
2572  for ( ; endOffset < numInputEnt &&
2573  colMap.getLocalElement (inputGblColInds[endOffset]) != OTLO::invalid ();
2574  ++endOffset)
2575  {}
2576  // curOffset, endOffset: half-exclusive range of indices in
2577  // the column Map on the calling process. If endOffset ==
2578  // curOffset, the range is empty.
2579  const LO numIndInSeq = (endOffset - curOffset);
2580  if (numIndInSeq != 0) {
2581  rowInfo = graph.getRowInfo(lclRow); // KDD 5/19 Need fresh RowInfo in each loop iteration
2582  this->insertGlobalValuesImpl (graph, rowInfo,
2583  inputGblColInds + curOffset,
2584  inputVals + curOffset,
2585  numIndInSeq);
2586  }
2587  // Invariant before the increment line: Either endOffset ==
2588  // numInputEnt, or inputGblColInds[endOffset] is not in the
2589  // column Map on the calling process.
2590 #ifdef HAVE_TPETRA_DEBUG
2591  const bool invariant = endOffset == numInputEnt ||
2592  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2593  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2594  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2595 #endif // HAVE_TPETRA_DEBUG
2596  curOffset = endOffset + 1;
2597  }
2598  }
2599  else { // we don't have a column Map.
2600  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2601  inputVals, numInputEnt);
2602  }
2603  }
2604  }
2605 
2606  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2607  LocalOrdinal
2608  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2609  replaceLocalValuesImpl (impl_scalar_type rowVals[],
2610  const crs_graph_type& graph,
2611  const RowInfo& rowInfo,
2612  const LocalOrdinal inds[],
2613  const impl_scalar_type newVals[],
2614  const LocalOrdinal numElts) const
2615  {
2616  typedef LocalOrdinal LO;
2617  typedef GlobalOrdinal GO;
2618  const bool sorted = graph.isSorted ();
2619 
2620  size_t hint = 0; // Guess for the current index k into rowVals
2621  LO numValid = 0; // number of valid local column indices
2622 
2623  // NOTE (mfh 11 Oct 2015) This method assumes UVM. More
2624  // accurately, it assumes that the host execution space can
2625  // access data in both InputMemorySpace and ValsMemorySpace.
2626 
2627  if (graph.isLocallyIndexed ()) {
2628  // Get a view of the column indices in the row. This amortizes
2629  // the cost of getting the view over all the entries of inds.
2630  auto colInds = graph.getLocalKokkosRowView (rowInfo);
2631 
2632  for (LO j = 0; j < numElts; ++j) {
2633  const LO lclColInd = inds[j];
2634  const size_t offset =
2635  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2636  lclColInd, hint, sorted);
2637  if (offset != rowInfo.numEntries) {
2638  rowVals[offset] = newVals[j];
2639  hint = offset + 1;
2640  ++numValid;
2641  }
2642  }
2643  }
2644  else if (graph.isGloballyIndexed ()) {
2645  if (graph.colMap_.is_null ()) {
2646  return Teuchos::OrdinalTraits<LO>::invalid ();
2647  }
2648  const map_type colMap = * (graph.colMap_);
2649 
2650  // Get a view of the column indices in the row. This amortizes
2651  // the cost of getting the view over all the entries of inds.
2652  auto colInds = graph.getGlobalKokkosRowView (rowInfo);
2653 
2654  for (LO j = 0; j < numElts; ++j) {
2655  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2656  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
2657  const size_t offset =
2658  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2659  gblColInd, hint, sorted);
2660  if (offset != rowInfo.numEntries) {
2661  rowVals[offset] = newVals[j];
2662  hint = offset + 1;
2663  ++numValid;
2664  }
2665  }
2666  }
2667  }
2668  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2669  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2670  // to be neither locally nor globally indexed on a process.
2671  // This means that the graph or matrix has no entries on that
2672  // process. Epetra also works like this. It's related to lazy
2673  // allocation (on first insertion, not at graph / matrix
2674  // construction). Lazy allocation will go away because it is
2675  // not thread scalable.
2676 
2677  return numValid;
2678  }
2679 
2680  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2681  LocalOrdinal
2683  replaceLocalValues (const LocalOrdinal localRow,
2684  const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
2685  const Teuchos::ArrayView<const Scalar>& vals) const
2686  {
2687  typedef LocalOrdinal LO;
2688 
2689  const LO numInputEnt = static_cast<LO> (lclCols.size ());
2690  if (static_cast<LO> (vals.size ()) != numInputEnt) {
2691  return Teuchos::OrdinalTraits<LO>::invalid ();
2692  }
2693  const LO* const inputInds = lclCols.getRawPtr ();
2694  const Scalar* const inputVals = vals.getRawPtr ();
2695  return this->replaceLocalValues (localRow, numInputEnt,
2696  inputVals, inputInds);
2697  }
2698 
2699  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2700  LocalOrdinal
2702  replaceLocalValues (const LocalOrdinal localRow,
2703  const LocalOrdinal numEnt,
2704  const Scalar inputVals[],
2705  const LocalOrdinal inputCols[]) const
2706  {
2707  typedef impl_scalar_type IST;
2708  typedef LocalOrdinal LO;
2709 
2710  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2711  // Fill must be active and the "nonconst" graph must exist.
2712  return Teuchos::OrdinalTraits<LO>::invalid ();
2713  }
2714  const crs_graph_type& graph = * (this->staticGraph_);
2715  const RowInfo rowInfo = graph.getRowInfo (localRow);
2716 
2717  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2718  // The calling process does not own this row, so it is not
2719  // allowed to modify its values.
2720  return static_cast<LO> (0);
2721  }
2722  auto curRowVals = this->getRowViewNonConst (rowInfo);
2723  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2724  return this->replaceLocalValuesImpl (curRowVals.data (), graph, rowInfo,
2725  inputCols, inVals, numEnt);
2726  }
2727 
2728  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2729  LocalOrdinal
2731  replaceGlobalValuesImpl (impl_scalar_type rowVals[],
2732  const crs_graph_type& graph,
2733  const RowInfo& rowInfo,
2734  const GlobalOrdinal inds[],
2735  const impl_scalar_type newVals[],
2736  const LocalOrdinal numElts) const
2737  {
2738  if (graph.getProfileType() == StaticProfile)
2739  {
2740  Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
2741  auto fun =
2742  [&](size_t const k, size_t const /*start*/, size_t const offset) {
2743  rowVals[offset] = newVals[k];
2744  };
2745  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2746  return graph.findGlobalIndices(rowInfo, indsT, cb);
2747  }
2748 
2749  // NOTE (DYNAMICPROFILE_REMOVAL) (tjf Mar 2019) from this point down can be
2750  // yanked once DynamicProfile is removed.
2751  typedef LocalOrdinal LO;
2752  typedef GlobalOrdinal GO;
2753 
2754  const bool sorted = graph.isSorted ();
2755 
2756  size_t hint = 0; // guess at the index's relative offset in the row
2757  LO numValid = 0; // number of valid input column indices
2758 
2759  // NOTE (mfh 11 Oct 2015) This method assumes UVM. More
2760  // accurately, it assumes that the host execution space can
2761  // access data in all the Views.
2762 
2763  if (graph.isLocallyIndexed ()) {
2764  // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2765  // pointer does NOT change its reference count. Thus, this
2766  // code is still thread safe.
2767  if (graph.colMap_.is_null ()) {
2768  // NO input column indices are valid in this case, since if
2769  // the column Map is null on the calling process, then the
2770  // calling process owns no graph entries.
2771  return numValid;
2772  }
2773  const map_type& colMap = * (graph.colMap_);
2774 
2775  // Get a view of the column indices in the row. This amortizes
2776  // the cost of getting the view over all the entries of inds.
2777  auto colInds = graph.getLocalKokkosRowView (rowInfo);
2778  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2779  for (LO j = 0; j < numElts; ++j) {
2780  const LO lclColInd = colMap.getLocalElement (inds[j]);
2781  if (lclColInd != LINV) {
2782  const size_t offset =
2783  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2784  lclColInd, hint, sorted);
2785  if (offset != rowInfo.numEntries) {
2786  rowVals[offset] = newVals[j];
2787  hint = offset + 1;
2788  numValid++;
2789  }
2790  }
2791  }
2792  }
2793  else if (graph.isGloballyIndexed ()) {
2794  // Get a view of the column indices in the row. This amortizes
2795  // the cost of getting the view over all the entries of inds.
2796  auto colInds = graph.getGlobalKokkosRowView (rowInfo);
2797 
2798  for (LO j = 0; j < numElts; ++j) {
2799  const GO gblColInd = inds[j];
2800  const size_t offset =
2801  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2802  gblColInd, hint, sorted);
2803  if (offset != rowInfo.numEntries) {
2804  rowVals[offset] = newVals[j];
2805  hint = offset + 1;
2806  numValid++;
2807  }
2808  }
2809  }
2810  // If the graph is neither locally nor globally indexed on the
2811  // calling process, that means the calling process has no graph
2812  // entries. Thus, none of the input column indices are valid.
2813 
2814  return numValid;
2815  }
2816 
2817  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2818  LocalOrdinal
2820  replaceGlobalValues (const GlobalOrdinal globalRow,
2821  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2822  const Teuchos::ArrayView<const Scalar>& inputVals) const
2823  {
2824  typedef LocalOrdinal LO;
2825 
2826  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2827  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2828  return Teuchos::OrdinalTraits<LO>::invalid ();
2829  }
2830  return this->replaceGlobalValues (globalRow, numInputEnt,
2831  inputVals.getRawPtr (),
2832  inputGblColInds.getRawPtr ());
2833  }
2834 
2835  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2836  LocalOrdinal
2838  replaceGlobalValues (const GlobalOrdinal globalRow,
2839  const LocalOrdinal numEnt,
2840  const Scalar inputVals[],
2841  const GlobalOrdinal inputGblColInds[]) const
2842  {
2843  typedef impl_scalar_type IST;
2844  typedef LocalOrdinal LO;
2845 
2846  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2847  // Fill must be active and the "nonconst" graph must exist.
2848  return Teuchos::OrdinalTraits<LO>::invalid ();
2849  }
2850  const crs_graph_type& graph = * (this->staticGraph_);
2851 
2852  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow);
2853  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2854  // The input local row is invalid on the calling process,
2855  // which means that the calling process summed 0 entries.
2856  return static_cast<LO> (0);
2857  }
2858 
2859  auto curRowVals = this->getRowViewNonConst (rowInfo);
2860  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2861  return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2862  inputGblColInds, inVals, numEnt);
2863  }
2864 
2865  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2866  LocalOrdinal
2868  sumIntoGlobalValuesImpl (impl_scalar_type rowVals[],
2869  const crs_graph_type& graph,
2870  const RowInfo& rowInfo,
2871  const GlobalOrdinal inds[],
2872  const impl_scalar_type newVals[],
2873  const LocalOrdinal numElts,
2874  const bool atomic) const
2875  {
2876  typedef LocalOrdinal LO;
2877  typedef GlobalOrdinal GO;
2878 
2879  const bool sorted = graph.isSorted ();
2880 
2881  size_t hint = 0; // guess at the index's relative offset in the row
2882  LO numValid = 0; // number of valid input column indices
2883 
2884  // NOTE (mfh 11 Oct 2015) This method assumes UVM. More
2885  // accurately, it assumes that the host execution space can
2886  // access data in both InputMemorySpace and ValsMemorySpace.
2887 
2888  if (graph.isLocallyIndexed ()) {
2889  // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2890  // pointer does NOT change its reference count. Thus, this
2891  // code is still thread safe.
2892  if (graph.colMap_.is_null ()) {
2893  // NO input column indices are valid in this case, since if
2894  // the column Map is null on the calling process, then the
2895  // calling process owns no graph entries.
2896  return numValid;
2897  }
2898  const map_type& colMap = * (graph.colMap_);
2899 
2900  // Get a view of the column indices in the row. This amortizes
2901  // the cost of getting the view over all the entries of inds.
2902  auto colInds = graph.getLocalKokkosRowView (rowInfo);
2903  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2904 
2905  for (LO j = 0; j < numElts; ++j) {
2906  const LO lclColInd = colMap.getLocalElement (inds[j]);
2907  if (lclColInd != LINV) {
2908  const size_t offset =
2909  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2910  lclColInd, hint, sorted);
2911  if (offset != rowInfo.numEntries) {
2912  if (atomic) {
2913  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2914  }
2915  else {
2916  rowVals[offset] += newVals[j];
2917  }
2918  hint = offset + 1;
2919  numValid++;
2920  }
2921  }
2922  }
2923  }
2924  else if (graph.isGloballyIndexed ()) {
2925  // Get a view of the column indices in the row. This amortizes
2926  // the cost of getting the view over all the entries of inds.
2927  auto colInds = graph.getGlobalKokkosRowView (rowInfo);
2928 
2929  for (LO j = 0; j < numElts; ++j) {
2930  const GO gblColInd = inds[j];
2931  const size_t offset =
2932  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2933  gblColInd, hint, sorted);
2934  if (offset != rowInfo.numEntries) {
2935  if (atomic) {
2936  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2937  }
2938  else {
2939  rowVals[offset] += newVals[j];
2940  }
2941  hint = offset + 1;
2942  numValid++;
2943  }
2944  }
2945  }
2946  // If the graph is neither locally nor globally indexed on the
2947  // calling process, that means the calling process has no graph
2948  // entries. Thus, none of the input column indices are valid.
2949 
2950  return numValid;
2951  }
2952 
2953  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2954  LocalOrdinal
2956  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2957  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2958  const Teuchos::ArrayView<const Scalar>& inputVals,
2959  const bool atomic)
2960  {
2961  typedef LocalOrdinal LO;
2962 
2963  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2964  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2965  return Teuchos::OrdinalTraits<LO>::invalid ();
2966  }
2967  return this->sumIntoGlobalValues (gblRow, numInputEnt,
2968  inputVals.getRawPtr (),
2969  inputGblColInds.getRawPtr (),
2970  atomic);
2971  }
2972 
2973  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2974  LocalOrdinal
2976  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2977  const LocalOrdinal numInputEnt,
2978  const Scalar inputVals[],
2979  const GlobalOrdinal inputGblColInds[],
2980  const bool atomic)
2981  {
2982  typedef impl_scalar_type IST;
2983  typedef LocalOrdinal LO;
2984  typedef GlobalOrdinal GO;
2985 
2986  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2987  // Fill must be active and the "nonconst" graph must exist.
2988  return Teuchos::OrdinalTraits<LO>::invalid ();
2989  }
2990  const crs_graph_type& graph = * (this->staticGraph_);
2991 
2992  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2993  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2994  // mfh 23 Mar 2017, 26 Jul 2017: This branch may not be not
2995  // thread safe in a debug build, in part because it uses
2996  // Teuchos::ArrayView, and in part because of the data structure
2997  // used to stash outgoing entries.
2998  using Teuchos::ArrayView;
2999  ArrayView<const GO> inputGblColInds_av (numInputEnt == 0 ? NULL :
3000  inputGblColInds, numInputEnt);
3001  ArrayView<const Scalar> inputVals_av (numInputEnt == 0 ? NULL :
3002  inputVals, numInputEnt);
3003  // gblRow is not in the row Map on the calling process, so stash
3004  // the given entries away in a separate data structure.
3005  // globalAssemble() (called during fillComplete()) will exchange
3006  // that data and sum it in using sumIntoGlobalValues().
3007  this->insertNonownedGlobalValues (gblRow, inputGblColInds_av,
3008  inputVals_av);
3009  // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
3010  // since we won't know whether the given indices were valid
3011  // until globalAssemble (called in fillComplete) is called.
3012  // That's why insertNonownedGlobalValues doesn't return
3013  // anything. Just for consistency, I'll return the number of
3014  // entries that the user gave us.
3015  return numInputEnt;
3016  }
3017  else { // input row is in the row Map on the calling process
3018  auto curRowVals = this->getRowViewNonConst (rowInfo);
3019  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
3020  return this->sumIntoGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
3021  inputGblColInds, inVals,
3022  numInputEnt, atomic);
3023  }
3024  }
3025 
3026  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3027  LocalOrdinal
3029  transformLocalValues (const LocalOrdinal lclRow,
3030  const LocalOrdinal numInputEnt,
3031  const impl_scalar_type inputVals[],
3032  const LocalOrdinal inputCols[],
3033  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
3034  const bool atomic) const
3035  {
3036  using Tpetra::Details::OrdinalTraits;
3037  typedef LocalOrdinal LO;
3038 
3039  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
3040  // Fill must be active and the "nonconst" graph must exist.
3041  return Teuchos::OrdinalTraits<LO>::invalid ();
3042  }
3043  const crs_graph_type& graph = * (this->staticGraph_);
3044  const RowInfo rowInfo = graph.getRowInfo (lclRow);
3045 
3046  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
3047  // The calling process does not own this row, so it is not
3048  // allowed to modify its values.
3049  return static_cast<LO> (0);
3050  }
3051  auto curRowVals = this->getRowViewNonConst (rowInfo);
3052  return this->transformLocalValues (curRowVals.data (), graph,
3053  rowInfo, inputCols, inputVals,
3054  numInputEnt, f, atomic);
3055  }
3056 
3057  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3058  LocalOrdinal
3059  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3060  transformGlobalValues (const GlobalOrdinal gblRow,
3061  const LocalOrdinal numInputEnt,
3062  const impl_scalar_type inputVals[],
3063  const GlobalOrdinal inputCols[],
3064  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
3065  const bool atomic) const
3066  {
3067  using Tpetra::Details::OrdinalTraits;
3068  typedef LocalOrdinal LO;
3069 
3070  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
3071  // Fill must be active and the "nonconst" graph must exist.
3072  return OrdinalTraits<LO>::invalid ();
3073  }
3074  const crs_graph_type& graph = * (this->staticGraph_);
3075  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
3076 
3077  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
3078  // The calling process does not own this row, so it is not
3079  // allowed to modify its values.
3080  return static_cast<LO> (0);
3081  }
3082  auto curRowVals = this->getRowViewNonConst (rowInfo);
3083  return this->transformGlobalValues (curRowVals.data (), graph,
3084  rowInfo, inputCols, inputVals,
3085  numInputEnt, f, atomic);
3086  }
3087 
// Raw-array kernel: apply the binary function f to the entries of one
// row whose LOCAL column indices appear in inds.
//
// For each j in [0, numElts), the position of inds[j] within the row's
// stored column indices is searched with KokkosSparse::findRelOffset.
// If it is found (offset != rowInfo.numEntries), rowVals[offset] is
// replaced by f(rowVals[offset], newVals[j]).  Input indices that are
// not found are silently skipped.
//
// rowVals  the row's values (modified in place)
// graph    graph that supplies the row's column indices
// rowInfo  row description; numEntries bounds the search
// inds     local column indices to modify
// newVals  input values, parallel to inds
// numElts  number of entries in inds and newVals
// f        binary function, called as f(current value, input value)
// atomic   if true, update through atomic_binary_function_update
//          instead of a plain read-modify-write
//
// Returns the number of input indices that were found in the row.
// Returns 0 if the graph is globally indexed but has no column Map,
// or if the graph is neither locally nor globally indexed.
3088  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3089  LocalOrdinal
3090  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3091  transformLocalValues (impl_scalar_type rowVals[],
3092  const crs_graph_type& graph,
3093  const RowInfo& rowInfo,
3094  const LocalOrdinal inds[],
3095  const impl_scalar_type newVals[],
3096  const LocalOrdinal numElts,
3097  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
3098  const bool atomic) const
3099  {
3100  typedef impl_scalar_type ST;
3101  typedef LocalOrdinal LO;
3102  typedef GlobalOrdinal GO;
3103 
      // The commented-out lines below are leftovers that refer to
      // inds / newVals as objects with extent(); here they are raw
      // arrays, and numElts is passed in by the caller.
3104  //if (newVals.extent (0) != inds.extent (0)) {
3105  // The sizes of the input arrays must match.
3106  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
3107  //}
3108  //const LO numElts = static_cast<LO> (inds.extent (0));
3109  const bool sorted = graph.isSorted ();
3110 
3111  LO numValid = 0; // number of valid input column indices
3112  size_t hint = 0; // Guess for the current index k into rowVals
3113 
3114  if (graph.isLocallyIndexed ()) {
3115  // Get a view of the column indices in the row. This amortizes
3116  // the cost of getting the view over all the entries of inds.
3117  auto colInds = graph.getLocalKokkosRowView (rowInfo);
3118 
3119  for (LO j = 0; j < numElts; ++j) {
3120  const LO lclColInd = inds[j];
3121  const size_t offset =
3122  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3123  lclColInd, hint, sorted);
      // offset == rowInfo.numEntries is treated as "not found".
3124  if (offset != rowInfo.numEntries) {
3125  if (atomic) {
3126  // NOTE (mfh 30 Nov 2015) The commented-out code is
3127  // wrong because another thread may have changed
3128  // rowVals[offset] between those two lines of code.
3129  //
3130  //const ST newVal = f (rowVals[offset], newVals[j]);
3131  //Kokkos::atomic_assign (&rowVals[offset], newVal);
3132 
3133  volatile ST* const dest = &rowVals[offset];
3134  (void) atomic_binary_function_update (dest, newVals[j], f);
3135  }
3136  else {
3137  // use binary function f
3138  rowVals[offset] = f (rowVals[offset], newVals[j]);
3139  }
      // Start the next search just past this hit.
3140  hint = offset + 1;
3141  ++numValid;
3142  }
3143  }
3144  }
3145  else if (graph.isGloballyIndexed ()) {
3146  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
3147  // pointer does NOT change its reference count. Thus, this
3148  // code is still thread safe.
3149  if (graph.colMap_.is_null ()) {
3150  // NO input column indices are valid in this case. Either
3151  // the column Map hasn't been set yet (so local indices
3152  // don't exist yet), or the calling process owns no graph
3153  // entries.
3154  return numValid;
3155  }
3156  const map_type& colMap = * (graph.colMap_);
3157  // Get a view of the column indices in the row. This amortizes
3158  // the cost of getting the view over all the entries of inds.
3159  auto colInds = graph.getGlobalKokkosRowView (rowInfo);
3160 
3161  const GO GINV = Teuchos::OrdinalTraits<GO>::invalid ();
3162  for (LO j = 0; j < numElts; ++j) {
      // The graph stores global indices, so translate the input
      // local index through the column Map before searching.
3163  const GO gblColInd = colMap.getGlobalElement (inds[j]);
3164  if (gblColInd != GINV) {
3165  const size_t offset =
3166  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3167  gblColInd, hint, sorted);
3168  if (offset != rowInfo.numEntries) {
3169  if (atomic) {
3170  // NOTE (mfh 30 Nov 2015) The commented-out code is
3171  // wrong because another thread may have changed
3172  // rowVals[offset] between those two lines of code.
3173  //
3174  //const ST newVal = f (rowVals[offset], newVals[j]);
3175  //Kokkos::atomic_assign (&rowVals[offset], newVal);
3176 
3177  volatile ST* const dest = &rowVals[offset];
3178  (void) atomic_binary_function_update (dest, newVals[j], f);
3179  }
3180  else {
3181  // use binary function f
3182  rowVals[offset] = f (rowVals[offset], newVals[j]);
3183  }
3184  hint = offset + 1;
3185  numValid++;
3186  }
3187  }
3188  }
3189  }
3190  // If the graph is neither locally nor globally indexed on the
3191  // calling process, that means the calling process has no graph
3192  // entries. Thus, none of the input column indices are valid.
3193 
3194  return numValid;
3195  }
3196 
// Raw-array kernel: apply the binary function f to the entries of one
// row whose GLOBAL column indices appear in inds.
//
// For each j in [0, numElts), the position of inds[j] within the row's
// stored column indices is searched with KokkosSparse::findRelOffset.
// If it is found (offset != rowInfo.numEntries), rowVals[offset] is
// replaced by f(rowVals[offset], newVals[j]).  Input indices that are
// not found are silently skipped.
//
// rowVals  the row's values (modified in place)
// graph    graph that supplies the row's column indices
// rowInfo  row description; numEntries bounds the search
// inds     global column indices to modify
// newVals  input values, parallel to inds
// numElts  number of entries in inds and newVals
// f        binary function, called as f(current value, input value)
// atomic   if true, update through atomic_binary_function_update
//          instead of a plain read-modify-write
//
// Returns the number of input indices that were found in the row.
// Returns 0 if the graph is locally indexed but has no column Map,
// or if the graph is neither locally nor globally indexed.
3197  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3198  LocalOrdinal
3199  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3200  transformGlobalValues (impl_scalar_type rowVals[],
3201  const crs_graph_type& graph,
3202  const RowInfo& rowInfo,
3203  const GlobalOrdinal inds[],
3204  const impl_scalar_type newVals[],
3205  const LocalOrdinal numElts,
3206  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
3207  const bool atomic) const
3208  {
3209  typedef impl_scalar_type ST;
3210  typedef LocalOrdinal LO;
3211  typedef GlobalOrdinal GO;
3212 
      // The commented-out lines below are leftovers that refer to
      // inds / newVals as objects with extent(); here they are raw
      // arrays, and numElts is passed in by the caller.
3213  //if (newVals.extent (0) != inds.extent (0)) {
3214  // The sizes of the input arrays must match.
3215  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
3216  //}
3217  //const LO numElts = static_cast<LO> (inds.extent (0));
3218  const bool sorted = graph.isSorted ();
3219 
3220  LO numValid = 0; // number of valid input column indices
3221  size_t hint = 0; // Guess for the current index k into rowVals
3222 
3223  if (graph.isGloballyIndexed ()) {
3224  // Get a view of the column indices in the row. This amortizes
3225  // the cost of getting the view over all the entries of inds.
3226  auto colInds = graph.getGlobalKokkosRowView (rowInfo);
3227 
3228  for (LO j = 0; j < numElts; ++j) {
3229  const GO gblColInd = inds[j];
3230  const size_t offset =
3231  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3232  gblColInd, hint, sorted);
      // offset == rowInfo.numEntries is treated as "not found".
3233  if (offset != rowInfo.numEntries) {
3234  if (atomic) {
3235  // NOTE (mfh 30 Nov 2015) The commented-out code is
3236  // wrong because another thread may have changed
3237  // rowVals[offset] between those two lines of code.
3238  //
3239  //const ST newVal = f (rowVals[offset], newVals[j]);
3240  //Kokkos::atomic_assign (&rowVals[offset], newVal);
3241 
3242  volatile ST* const dest = &rowVals[offset];
3243  (void) atomic_binary_function_update (dest, newVals[j], f);
3244  }
3245  else {
3246  // use binary function f
3247  rowVals[offset] = f (rowVals[offset], newVals[j]);
3248  }
      // Start the next search just past this hit.
3249  hint = offset + 1;
3250  ++numValid;
3251  }
3252  }
3253  }
3254  else if (graph.isLocallyIndexed ()) {
3255  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
3256  // pointer does NOT change its reference count. Thus, this
3257  // code is still thread safe.
3258  if (graph.colMap_.is_null ()) {
3259  // NO input column indices are valid in this case. Either the
3260  // column Map hasn't been set yet (so local indices don't
3261  // exist yet), or the calling process owns no graph entries.
3262  return numValid;
3263  }
3264  const map_type& colMap = * (graph.colMap_);
3265  // Get a view of the column indices in the row. This amortizes
3266  // the cost of getting the view over all the entries of inds.
3267  auto colInds = graph.getLocalKokkosRowView (rowInfo);
3268 
3269  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
3270  for (LO j = 0; j < numElts; ++j) {
      // The graph stores local indices, so translate the input
      // global index through the column Map before searching.
3271  const LO lclColInd = colMap.getLocalElement (inds[j]);
3272  if (lclColInd != LINV) {
3273  const size_t offset =
3274  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3275  lclColInd, hint, sorted);
3276  if (offset != rowInfo.numEntries) {
3277  if (atomic) {
3278  // NOTE (mfh 30 Nov 2015) The commented-out code is
3279  // wrong because another thread may have changed
3280  // rowVals[offset] between those two lines of code.
3281  //
3282  //const ST newVal = f (rowVals[offset], newVals[j]);
3283  //Kokkos::atomic_assign (&rowVals[offset], newVal);
3284 
3285  volatile ST* const dest = &rowVals[offset];
3286  (void) atomic_binary_function_update (dest, newVals[j], f);
3287  }
3288  else {
3289  // use binary function f
3290  rowVals[offset] = f (rowVals[offset], newVals[j]);
3291  }
3292  hint = offset + 1;
3293  numValid++;
3294  }
3295  }
3296  }
3297  }
3298  // If the graph is neither locally nor globally indexed on the
3299  // calling process, that means the calling process has no graph
3300  // entries. Thus, none of the input column indices are valid.
3301 
3302  return numValid;
3303  }
3304 
3305  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3306  LocalOrdinal
3307  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3308  sumIntoLocalValuesImpl (impl_scalar_type rowVals[],
3309  const crs_graph_type& graph,
3310  const RowInfo& rowInfo,
3311  const LocalOrdinal inds[],
3312  const impl_scalar_type newVals[],
3313  const LocalOrdinal numElts,
3314  const bool atomic) const
3315  {
3316  typedef LocalOrdinal LO;
3317  typedef GlobalOrdinal GO;
3318 
3319  const bool sorted = graph.isSorted ();
3320 
3321  size_t hint = 0; // Guess for the current index k into rowVals
3322  LO numValid = 0; // number of valid local column indices
3323 
3324  // NOTE (mfh 11 Oct 2015) This method assumes UVM. More
3325  // accurately, it assumes that the host execution space can
3326  // access data in both InputMemorySpace and ValsMemorySpace.
3327 
3328  if (graph.isLocallyIndexed ()) {
3329  // Get a view of the column indices in the row. This amortizes
3330  // the cost of getting the view over all the entries of inds.
3331  auto colInds = graph.getLocalKokkosRowView (rowInfo);
3332 
3333  for (LO j = 0; j < numElts; ++j) {
3334  const LO lclColInd = inds[j];
3335  const size_t offset =
3336  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3337  lclColInd, hint, sorted);
3338  if (offset != rowInfo.numEntries) {
3339  if (atomic) {
3340  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3341  }
3342  else {
3343  rowVals[offset] += newVals[j];
3344  }
3345  hint = offset + 1;
3346  ++numValid;
3347  }
3348  }
3349  }
3350  else if (graph.isGloballyIndexed ()) {
3351  if (graph.colMap_.is_null ()) {
3352  return Teuchos::OrdinalTraits<LO>::invalid ();
3353  }
3354  const map_type colMap = * (graph.colMap_);
3355 
3356  // Get a view of the column indices in the row. This amortizes
3357  // the cost of getting the view over all the entries of inds.
3358  auto colInds = graph.getGlobalKokkosRowView (rowInfo);
3359 
3360  for (LO j = 0; j < numElts; ++j) {
3361  const GO gblColInd = colMap.getGlobalElement (inds[j]);
3362  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
3363  const size_t offset =
3364  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3365  gblColInd, hint, sorted);
3366  if (offset != rowInfo.numEntries) {
3367  if (atomic) {
3368  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3369  }
3370  else {
3371  rowVals[offset] += newVals[j];
3372  }
3373  hint = offset + 1;
3374  ++numValid;
3375  }
3376  }
3377  }
3378  }
3379  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
3380  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
3381  // to be neither locally nor globally indexed on a process.
3382  // This means that the graph or matrix has no entries on that
3383  // process. Epetra also works like this. It's related to lazy
3384  // allocation (on first insertion, not at graph / matrix
3385  // construction). Lazy allocation will go away because it is
3386  // not thread scalable.
3387 
3388  return numValid;
3389  }
3390 
// Teuchos::ArrayView convenience overload of sumIntoLocalValues.
//
// Checks that indices and values have the same length, extracts their
// raw pointers, and forwards to the raw-pointer overload.
//
// localRow  local index of the row to modify
// indices   local column indices
// values    values to add, parallel to indices
// atomic    forwarded to the raw-pointer overload
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if the two arrays
// differ in length; otherwise whatever the raw-pointer overload
// returns.
//
// NOTE(review): the embedded listing numbers jump from 3392 to 3394;
// the class-qualifier line appears to be missing from this listing.
3391  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3392  LocalOrdinal
3394  sumIntoLocalValues (const LocalOrdinal localRow,
3395  const Teuchos::ArrayView<const LocalOrdinal>& indices,
3396  const Teuchos::ArrayView<const Scalar>& values,
3397  const bool atomic) const
3398  {
3399  typedef LocalOrdinal LO;
3400 
3401  const LO numInputEnt = static_cast<LO> (indices.size ());
3402  if (static_cast<LO> (values.size ()) != numInputEnt) {
3403  return Teuchos::OrdinalTraits<LO>::invalid ();
3404  }
3405  const LO* const inputInds = indices.getRawPtr ();
3406  const Scalar* const inputVals = values.getRawPtr ();
      // The raw-pointer overload takes values before column indices.
3407  return this->sumIntoLocalValues (localRow, numInputEnt,
3408  inputVals, inputInds, atomic);
3409  }
3410 
// Raw-pointer overload of sumIntoLocalValues.
//
// Looks up the RowInfo for localRow in staticGraph_, obtains a
// nonconst view of the row's values, and forwards to
// sumIntoLocalValuesImpl.
//
// localRow  local index of the row to modify
// numEnt    number of entries in vals and cols
// vals      values to add
// cols      local column indices, parallel to vals
// atomic    forwarded to sumIntoLocalValuesImpl
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if fill is not active
// or staticGraph_ is null, 0 if the row lookup yields an invalid
// local row, and otherwise whatever sumIntoLocalValuesImpl returns.
//
// NOTE(review): the embedded listing numbers jump from 3412 to 3414;
// the class-qualifier line appears to be missing from this listing.
3411  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3412  LocalOrdinal
3414  sumIntoLocalValues (const LocalOrdinal localRow,
3415  const LocalOrdinal numEnt,
3416  const Scalar vals[],
3417  const LocalOrdinal cols[],
3418  const bool atomic) const
3419  {
3420  typedef impl_scalar_type IST;
3421  typedef LocalOrdinal LO;
3422 
3423  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
3424  // Fill must be active and the graph (staticGraph_) must exist.
3425  return Teuchos::OrdinalTraits<LO>::invalid ();
3426  }
3427  const crs_graph_type& graph = * (this->staticGraph_);
3428  const RowInfo rowInfo = graph.getRowInfo (localRow);
3429 
3430  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
3431  // The calling process does not own this row, so it is not
3432  // allowed to modify its values.
3433  return static_cast<LO> (0);
3434  }
3435  auto curRowVals = this->getRowViewNonConst (rowInfo);
      // Reinterpret the user-facing Scalar array as the internal
      // impl_scalar_type.  Presumably the two types are layout
      // compatible -- TODO confirm.
3436  const IST* const inputVals = reinterpret_cast<const IST*> (vals);
3437  return this->sumIntoLocalValuesImpl (curRowVals.data (), graph, rowInfo,
3438  cols, inputVals, numEnt, atomic);
3439  }
3440 
// Return a const Teuchos::ArrayView of the values stored for one row.
//
// Three cases, tried in order:
//  1. 1-D storage (k_values1D_ nonempty) and rowinfo.allocSize > 0:
//     a view of entries [offset1D, offset1D + allocSize).
//  2. 2-D storage (values2D_ non-null): a view of
//     values2D_[rowinfo.localRow].
//  3. Otherwise: an empty view.
//
// In the 1-D case the view has length rowinfo.allocSize, not
// rowinfo.numEntries; getLocalRowView trims it to numEntries.
//
// In a HAVE_TPETRA_DEBUG build, the 1-D case raises std::range_error
// (via TEUCHOS_TEST_FOR_EXCEPTION) if the requested range extends
// past the end of k_values1D_.
//
// NOTE(review): the embedded listing numbers jump from 3442 to 3444;
// the class-qualifier line appears to be missing from this listing.
3441  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3442  Teuchos::ArrayView<const typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type>
3444  getView (RowInfo rowinfo) const
3445  {
3446  using Kokkos::MemoryUnmanaged;
3447  using Kokkos::View;
3448  using Teuchos::ArrayView;
3449  typedef impl_scalar_type ST;
3450  typedef std::pair<size_t, size_t> range_type;
3451 
3452  if (k_values1D_.extent (0) != 0 && rowinfo.allocSize > 0) {
3453 #ifdef HAVE_TPETRA_DEBUG
3454  TEUCHOS_TEST_FOR_EXCEPTION(
3455  rowinfo.offset1D + rowinfo.allocSize > k_values1D_.extent (0),
3456  std::range_error, "Tpetra::CrsMatrix::getView: Invalid access "
3457  "to 1-D storage of values." << std::endl << "rowinfo.offset1D (" <<
3458  rowinfo.offset1D << ") + rowinfo.allocSize (" << rowinfo.allocSize <<
3459  ") > k_values1D_.extent(0) (" << k_values1D_.extent (0) << ").");
3460 #endif // HAVE_TPETRA_DEBUG
3461  range_type range (rowinfo.offset1D, rowinfo.offset1D + rowinfo.allocSize);
3462  typedef View<const ST*, execution_space, MemoryUnmanaged> subview_type;
3463  // mfh 23 Nov 2015: Don't just create a subview of k_values1D_
3464  // directly, because that first creates a _managed_ subview,
3465  // then returns an unmanaged version of that. That touches the
3466  // reference count, which costs performance in a measurable way.
3467  // Instead, we create a temporary unmanaged view, then create
3468  // the subview from that.
3469  subview_type sv = Kokkos::subview (subview_type (k_values1D_), range);
      // The enclosing branch already requires allocSize > 0, so the
      // NULL alternative below cannot be taken here.
3470  const ST* const sv_raw = (rowinfo.allocSize == 0) ? NULL : sv.data ();
3471  return ArrayView<const ST> (sv_raw, rowinfo.allocSize);
3472  }
3473  else if (values2D_ != Teuchos::null) {
3474  return values2D_[rowinfo.localRow] ();
3475  }
3476  else {
3477  return ArrayView<impl_scalar_type> ();
3478  }
3479  }
3480 
3481 
// Get a raw const pointer to the values stored for one row, plus the
// number of slots it covers.
//
// NOTE(review): the embedded listing numbers jump from 3483 to 3486;
// the class qualifier, the function name and the first parameter are
// missing from this listing.  The callers visible in this file
// invoke getViewRawConst (vals, numEnt, rowinfo), so this is
// presumably getViewRawConst with an output parameter vals --
// confirm against the real source.
//
// Outputs:
//  vals    pointer to the row's values, or NULL if there are none
//  numEnt  rowinfo.allocSize for 1-D storage, the size of
//          values2D_[rowinfo.localRow] for 2-D storage, else 0
//
// Returns 0 on success.  Only in a HAVE_TPETRA_DEBUG build, returns
// Teuchos::OrdinalTraits<LocalOrdinal>::invalid() (with vals = NULL
// and numEnt = 0) if the row's range falls outside the storage.
3482  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3483  LocalOrdinal
3486  LocalOrdinal& numEnt,
3487  const RowInfo& rowinfo) const
3488  {
3489  if (k_values1D_.extent (0) != 0 && rowinfo.allocSize > 0) {
3490 #ifdef HAVE_TPETRA_DEBUG
3491  if (rowinfo.offset1D + rowinfo.allocSize > k_values1D_.extent (0)) {
3492  vals = NULL;
3493  numEnt = 0;
3494  return Teuchos::OrdinalTraits<LocalOrdinal>::invalid ();
3495  }
3496 #endif // HAVE_TPETRA_DEBUG
3497  vals = k_values1D_.data () + rowinfo.offset1D;
3498  numEnt = rowinfo.allocSize;
3499  }
3500  else if (! values2D_.is_null ()) {
3501 #ifdef HAVE_TPETRA_DEBUG
3502  if (rowinfo.localRow >= static_cast<size_t> (values2D_.size ())) {
3503  vals = NULL;
3504  numEnt = 0;
3505  return Teuchos::OrdinalTraits<LocalOrdinal>::invalid ();
3506  }
3507 #endif // HAVE_TPETRA_DEBUG
3508  // Use const reference so that we don't update ArrayRCP's
3509  // reference count, which is not thread safe.
3510  const auto& curRow = values2D_[rowinfo.localRow];
3511  vals = curRow.getRawPtr ();
3512  numEnt = curRow.size ();
3513  }
3514  else {
      // Neither 1-D nor 2-D storage holds anything for this row.
3515  vals = NULL;
3516  numEnt = 0;
3517  }
3518 
3519  return static_cast<LocalOrdinal> (0);
3520  }
3521 
// Nonconst counterpart of getViewRawConst: fetches the const pointer
// and row length from getViewRawConst, then casts away const on the
// pointer.  Returns the error code from getViewRawConst unchanged.
//
// NOTE(review): the embedded listing numbers jump from 3523 to 3526;
// the class qualifier, the function name and the first parameter are
// missing from this listing.  Presumably this is getViewRaw with an
// output parameter vals -- confirm against the real source.
3522  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3523  LocalOrdinal
3526  LocalOrdinal& numEnt,
3527  const RowInfo& rowinfo) const
3528  {
3529  const impl_scalar_type* valsConst;
3530  const LocalOrdinal err = this->getViewRawConst (valsConst, numEnt, rowinfo);
3531  vals = const_cast<impl_scalar_type*> (valsConst);
3532  return err;
3533  }
3534 
// Return an unmanaged, const Kokkos::View of the values stored for
// one row.
//
// Three cases, tried in order:
//  1. 1-D storage (k_values1D_ nonempty) and rowInfo.allocSize > 0:
//     a subview of entries [offset1D, offset1D + allocSize).
//  2. 2-D storage (values2D_ non-null): a view wrapping the raw
//     pointer and size of values2D_[rowInfo.localRow].
//  3. Otherwise: a default-constructed view.
//
// In a HAVE_TPETRA_DEBUG build, the 1-D case raises std::range_error
// (via TEUCHOS_TEST_FOR_EXCEPTION) if the requested range extends
// past the end of k_values1D_.
//
// NOTE(review): the embedded listing numbers skip 3537 and 3539; part
// of the return type and the class-qualifier line appear to be
// missing from this listing.
3535  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3536  Kokkos::View<const typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type*,
3538  Kokkos::MemoryUnmanaged>
3540  getRowView (const RowInfo& rowInfo) const
3541  {
3542  using Kokkos::MemoryUnmanaged;
3543  using Kokkos::View;
3544  typedef impl_scalar_type ST;
3545  typedef View<const ST*, execution_space, MemoryUnmanaged> subview_type;
3546  typedef std::pair<size_t, size_t> range_type;
3547 
3548  if (k_values1D_.extent (0) != 0 && rowInfo.allocSize > 0) {
3549 #ifdef HAVE_TPETRA_DEBUG
3550  TEUCHOS_TEST_FOR_EXCEPTION
3551  (rowInfo.offset1D + rowInfo.allocSize > this->k_values1D_.extent (0),
3552  std::range_error, "Tpetra::CrsMatrix::getRowView: Invalid access "
3553  "to 1-D storage of values. rowInfo.offset1D ("
3554  << rowInfo.offset1D << ") + rowInfo.allocSize (" << rowInfo.allocSize
3555  << ") > this->k_values1D_.extent(0) ("
3556  << this->k_values1D_.extent (0) << ").");
3557 #endif // HAVE_TPETRA_DEBUG
3558  range_type range (rowInfo.offset1D, rowInfo.offset1D + rowInfo.allocSize);
3559  // mfh 23 Nov 2015: Don't just create a subview of k_values1D_
3560  // directly, because that first creates a _managed_ subview,
3561  // then returns an unmanaged version of that. That touches the
3562  // reference count, which costs performance in a measurable way.
3563  // Instead, we create a temporary unmanaged view, then create
3564  // the subview from that.
3565  return Kokkos::subview (subview_type (this->k_values1D_), range);
3566  }
3567  else if (this->values2D_ != Teuchos::null) {
3568  // Use a reference, so that I don't touch the Teuchos::ArrayView
3569  // reference count in a debug build. (It has no reference count
3570  // in a release build.) This ensures thread safety.
3571  auto& rowView = this->values2D_[rowInfo.localRow];
3572  return subview_type (rowView.getRawPtr (), rowView.size ());
3573  }
3574  else {
3575  return subview_type ();
3576  }
3577  }
3578 
// Return an unmanaged, nonconst Kokkos::View of the values stored for
// one row.  Same three cases as getRowView, but the element type is
// not const, so callers may write through the view.
//
//  1. 1-D storage (k_values1D_ nonempty) and rowInfo.allocSize > 0:
//     a subview of entries [offset1D, offset1D + allocSize).
//  2. 2-D storage (values2D_ non-null): a view wrapping the raw
//     pointer and size of values2D_[rowInfo.localRow].
//  3. Otherwise: a default-constructed view.
//
// In a HAVE_TPETRA_DEBUG build, the 1-D case raises std::range_error
// (via TEUCHOS_TEST_FOR_EXCEPTION) if the requested range extends
// past the end of k_values1D_.
3579  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3580  Kokkos::View<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type*,
3581  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::execution_space,
3582  Kokkos::MemoryUnmanaged>
3583  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3584  getRowViewNonConst (const RowInfo& rowInfo) const
3585  {
3586  using Kokkos::MemoryUnmanaged;
3587  using Kokkos::View;
3588  typedef impl_scalar_type ST;
3589  typedef View<ST*, execution_space, MemoryUnmanaged> subview_type;
3590  typedef std::pair<size_t, size_t> range_type;
3591 
3592  if (k_values1D_.extent (0) != 0 && rowInfo.allocSize > 0) {
3593 #ifdef HAVE_TPETRA_DEBUG
3594  TEUCHOS_TEST_FOR_EXCEPTION
3595  (rowInfo.offset1D + rowInfo.allocSize > this->k_values1D_.extent (0),
3596  std::range_error, "Tpetra::CrsMatrix::getRowViewNonConst: Invalid "
3597  "access to 1-D storage of values. rowInfo.offset1D ("
3598  << rowInfo.offset1D << ") + rowInfo.allocSize (" << rowInfo.allocSize
3599  << ") > this->k_values1D_.extent(0) ("
3600  << this->k_values1D_.extent (0) << ").");
3601 #endif // HAVE_TPETRA_DEBUG
3602  range_type range (rowInfo.offset1D, rowInfo.offset1D + rowInfo.allocSize);
3603  // mfh 23 Nov 2015: Don't just create a subview of k_values1D_
3604  // directly, because that first creates a _managed_ subview,
3605  // then returns an unmanaged version of that. That touches the
3606  // reference count, which costs performance in a measurable way.
3607  // Instead, we create a temporary unmanaged view, then create
3608  // the subview from that.
3609  return Kokkos::subview (subview_type (this->k_values1D_), range);
3610  }
3611  else if (this->values2D_ != Teuchos::null) {
3612  // Use a reference, so that I don't touch the Teuchos::ArrayView
3613  // reference count in a debug build. (It has no reference count
3614  // in a release build.) This ensures thread safety.
3615  auto& rowView = this->values2D_[rowInfo.localRow];
3616  return subview_type (rowView.getRawPtr (), rowView.size ());
3617  }
3618  else {
3619  return subview_type ();
3620  }
3621  }
3622 
// Nonconst counterpart of getView: returns the same row view with
// const removed from the element type via Teuchos::av_const_cast.
//
// NOTE(review): the embedded listing numbers jump from 3624 to 3626;
// the class-qualifier line appears to be missing from this listing.
3623  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3624  Teuchos::ArrayView<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type>
3626  getViewNonConst (const RowInfo& rowinfo) const
3627  {
3628  return Teuchos::av_const_cast<impl_scalar_type> (this->getView (rowinfo));
3629  }
3630 
// Copy the entries of one row, addressed by local row index, into
// caller-provided arrays, with column indices reported as LOCAL
// indices.
//
// localRow    local index of the row to copy
// indices     output; receives the row's local column indices
// values      output; receives the row's values
// numEntries  output; set to the number of entries in the row
//
// Raises std::runtime_error (via
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC) if the matrix has no column
// Map, or if indices or values is shorter than the row's number of
// entries.
//
// numEntries is assigned before any copying.  If the row lookup
// yields an invalid local row, nothing is copied.  If the graph
// stores global indices, each one is converted with the column Map's
// getLocalElement.
//
// NOTE(review): the embedded listing numbers jump from 3632 to 3634;
// the class-qualifier line appears to be missing from this listing.
3631  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3632  void
3634  getLocalRowCopy (LocalOrdinal localRow,
3635  const Teuchos::ArrayView<LocalOrdinal>& indices,
3636  const Teuchos::ArrayView<Scalar>& values,
3637  size_t& numEntries) const
3638  {
3639  using Teuchos::ArrayView;
3640  using Teuchos::av_reinterpret_cast;
3641  const char tfecfFuncName[] = "getLocalRowCopy: ";
3642 
3643  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3644  (! this->hasColMap (), std::runtime_error,
3645  "The matrix does not have a column Map yet. This means we don't have "
3646  "local indices for columns yet, so it doesn't make sense to call this "
3647  "method. If the matrix doesn't have a column Map yet, you should call "
3648  "fillComplete on it first.");
3649 
3650  const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3651  const size_t theNumEntries = rowinfo.numEntries;
3652  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3653  (static_cast<size_t> (indices.size ()) < theNumEntries ||
3654  static_cast<size_t> (values.size ()) < theNumEntries,
3655  std::runtime_error, "Row with local index " << localRow << " has " <<
3656  theNumEntries << " entry/ies, but indices.size() = " <<
3657  indices.size () << " and values.size() = " << values.size () << ".");
3658  numEntries = theNumEntries; // first side effect
3659 
3660  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3661  if (staticGraph_->isLocallyIndexed ()) {
3662  const LocalOrdinal* curLclInds;
3663  const impl_scalar_type* curVals;
3664  LocalOrdinal numSpots; // includes both current entries and extra space
3665 
3666  // If we got this far, rowinfo should be correct and should
3667  // refer to a valid local row. Thus, these error checks are
3668  // superfluous, but we retain them in a debug build.
3669 #ifdef HAVE_TPETRA_DEBUG
3670  int err =
3671  staticGraph_->getLocalViewRawConst (curLclInds, numSpots, rowinfo);
3672  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3673  (err != static_cast<LocalOrdinal> (0), std::logic_error,
3674  "staticGraph_->getLocalViewRawConst returned nonzero error code "
3675  << err << ".");
3676  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3677  (static_cast<size_t> (numSpots) < theNumEntries, std::logic_error,
3678  "numSpots = " << numSpots << " < theNumEntries = " << theNumEntries
3679  << ".");
      // The graph and the matrix must report the same number of
      // slots for this row.
3680  const LocalOrdinal numSpotsBefore = numSpots;
3681  err = getViewRawConst (curVals, numSpots, rowinfo);
3682  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3683  (err != static_cast<LocalOrdinal> (0), std::logic_error,
3684  "getViewRaw returned nonzero error code " << err << ".");
3685  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3686  (numSpotsBefore != numSpots, std::logic_error,
3687  "numSpotsBefore = " << numSpotsBefore << " != numSpots = "
3688  << numSpots << ".");
3689 #else
3690  (void) staticGraph_->getLocalViewRawConst (curLclInds, numSpots, rowinfo);
3691  (void) getViewRawConst (curVals, numSpots, rowinfo);
3692 #endif // HAVE_TPETRA_DEBUG
3693 
      // Copy only the row's current entries, not the extra space.
3694  for (size_t j = 0; j < theNumEntries; ++j) {
3695  values[j] = curVals[j];
3696  indices[j] = curLclInds[j];
3697  }
3698  }
3699  else if (staticGraph_->isGloballyIndexed ()) {
3700  // Don't call getColMap(), because it touches RCP's reference count.
3701  const map_type& colMap = * (staticGraph_->colMap_);
3702  const GlobalOrdinal* curGblInds;
3703  const impl_scalar_type* curVals;
3704  LocalOrdinal numSpots; // includes both current entries and extra space
3705 
3706  // If we got this far, rowinfo should be correct and should
3707  // refer to a valid local row. Thus, these error checks are
3708  // superfluous, but we retain them in a debug build.
3709 #ifdef HAVE_TPETRA_DEBUG
3710  int err =
3711  staticGraph_->getGlobalViewRawConst (curGblInds, numSpots, rowinfo);
3712  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3713  (err != static_cast<LocalOrdinal> (0), std::logic_error,
3714  "staticGraph_->getGlobalViewRawConst returned nonzero error code "
3715  << err << ".");
3716  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3717  (static_cast<size_t> (numSpots) < theNumEntries, std::logic_error,
3718  "numSpots = " << numSpots << " < theNumEntries = " << theNumEntries
3719  << ".");
3720  const LocalOrdinal numSpotsBefore = numSpots;
3721  err = getViewRawConst (curVals, numSpots, rowinfo);
3722  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3723  (err != static_cast<LocalOrdinal> (0), std::logic_error,
3724  "getViewRawConst returned nonzero error code " << err << ".");
3725  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3726  (numSpotsBefore != numSpots, std::logic_error,
3727  "numSpotsBefore = " << numSpotsBefore << " != numSpots = "
3728  << numSpots << ".");
3729 #else
3730  (void) staticGraph_->getGlobalViewRawConst (curGblInds, numSpots, rowinfo);
3731  (void) getViewRawConst (curVals, numSpots, rowinfo);
3732 #endif //HAVE_TPETRA_DEBUG
3733 
      // Stored indices are global; convert each to a local index.
3734  for (size_t j = 0; j < theNumEntries; ++j) {
3735  values[j] = curVals[j];
3736  indices[j] = colMap.getLocalElement (curGblInds[j]);
3737  }
3738  }
3739  }
3740  }
3741 
// Copy the entries of one row, addressed by global row index, into
// caller-provided arrays, with column indices reported as GLOBAL
// indices.
//
// globalRow   global index of the row to copy
// indices     output; receives the row's global column indices
// values      output; receives the row's values
// numEntries  output; set to the number of entries in the row
//
// Raises std::runtime_error (via
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC) if indices or values is
// shorter than the row's number of entries.
//
// numEntries is assigned before any copying.  If the row lookup
// yields an invalid local row, nothing is copied.  If the graph
// stores local indices, each one is converted with the column Map's
// getGlobalElement.
//
// NOTE(review): the embedded listing numbers jump from 3743 to 3745;
// the class-qualifier line appears to be missing from this listing.
3742  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3743  void
3745  getGlobalRowCopy (GlobalOrdinal globalRow,
3746  const Teuchos::ArrayView<GlobalOrdinal>& indices,
3747  const Teuchos::ArrayView<Scalar>& values,
3748  size_t& numEntries) const
3749  {
3750  using Teuchos::ArrayView;
3751  using Teuchos::av_reinterpret_cast;
3752  const char tfecfFuncName[] = "getGlobalRowCopy: ";
3753 
3754  const RowInfo rowinfo =
3755  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3756  const size_t theNumEntries = rowinfo.numEntries;
3757  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3758  static_cast<size_t> (indices.size ()) < theNumEntries ||
3759  static_cast<size_t> (values.size ()) < theNumEntries,
3760  std::runtime_error, "Row with global index " << globalRow << " has "
3761  << theNumEntries << " entry/ies, but indices.size() = " <<
3762  indices.size () << " and values.size() = " << values.size () << ".");
3763  numEntries = theNumEntries; // first side effect
3764 
3765  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3766  if (staticGraph_->isLocallyIndexed ()) {
      // colMap_ is dereferenced without a null check here;
      // presumably a locally indexed graph always has a column
      // Map -- TODO confirm.
3767  const map_type& colMap = * (staticGraph_->colMap_);
3768  const LocalOrdinal* curLclInds;
3769  const impl_scalar_type* curVals;
3770  LocalOrdinal numSpots; // includes both current entries and extra space
3771 
3772  // If we got this far, rowinfo should be correct and should
3773  // refer to a valid local row. Thus, these error checks are
3774  // superfluous, but we retain them in a debug build.
3775 #ifdef HAVE_TPETRA_DEBUG
3776  int err =
3777  staticGraph_->getLocalViewRawConst (curLclInds, numSpots, rowinfo);
3778  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3779  (err != static_cast<LocalOrdinal> (0), std::logic_error,
3780  "staticGraph_->getLocalViewRawConst returned nonzero error code "
3781  << err << ".");
3782  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3783  (static_cast<size_t> (numSpots) < theNumEntries, std::logic_error,
3784  "numSpots = " << numSpots << " < theNumEntries = " << theNumEntries
3785  << ".");
      // The graph and the matrix must report the same number of
      // slots for this row.
3786  const LocalOrdinal numSpotsBefore = numSpots;
3787  err = getViewRawConst (curVals, numSpots, rowinfo);
3788  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3789  (err != static_cast<LocalOrdinal> (0), std::logic_error,
3790  "getViewRaw returned nonzero error code " << err << ".");
3791  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3792  (numSpotsBefore != numSpots, std::logic_error,
3793  "numSpotsBefore = " << numSpotsBefore << " != numSpots = "
3794  << numSpots << ".");
3795 #else
3796  (void) staticGraph_->getLocalViewRawConst (curLclInds, numSpots, rowinfo);
3797  (void) getViewRawConst (curVals, numSpots, rowinfo);
3798 #endif //HAVE_TPETRA_DEBUG
3799 
      // Stored indices are local; convert each to a global index.
3800  for (size_t j = 0; j < theNumEntries; ++j) {
3801  values[j] = curVals[j];
3802  indices[j] = colMap.getGlobalElement (curLclInds[j]);
3803  }
3804  }
3805  else if (staticGraph_->isGloballyIndexed ()) {
3806  const GlobalOrdinal* curGblInds;
3807  const impl_scalar_type* curVals;
3808  LocalOrdinal numSpots; // includes both current entries and extra space
3809 
3810  // If we got this far, rowinfo should be correct and should
3811  // refer to a valid local row. Thus, these error checks are
3812  // superfluous, but we retain them in a debug build.
3813 #ifdef HAVE_TPETRA_DEBUG
3814  int err =
3815  staticGraph_->getGlobalViewRawConst (curGblInds, numSpots, rowinfo);
3816  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3817  (err != static_cast<LocalOrdinal> (0), std::logic_error,
3818  "staticGraph_->getGlobalViewRawConst returned nonzero error code "
3819  << err << ".");
3820  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3821  (static_cast<size_t> (numSpots) < theNumEntries, std::logic_error,
3822  "numSpots = " << numSpots << " < theNumEntries = " << theNumEntries
3823  << ".");
3824  const LocalOrdinal numSpotsBefore = numSpots;
3825  err = getViewRawConst (curVals, numSpots, rowinfo);
3826  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3827  (err != static_cast<LocalOrdinal> (0), std::logic_error,
3828  "getViewRawConst returned nonzero error code " << err << ".");
3829  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3830  (numSpotsBefore != numSpots, std::logic_error,
3831  "numSpotsBefore = " << numSpotsBefore << " != numSpots = "
3832  << numSpots << ".");
3833 #else
3834  (void) staticGraph_->getGlobalViewRawConst (curGblInds, numSpots, rowinfo);
3835  (void) getViewRawConst (curVals, numSpots, rowinfo);
3836 #endif //HAVE_TPETRA_DEBUG
3837 
      // Copy only the row's current entries, not the extra space.
3838  for (size_t j = 0; j < theNumEntries; ++j) {
3839  values[j] = curVals[j];
3840  indices[j] = curGblInds[j];
3841  }
3842  }
3843  }
3844  }
3845 
// Get views (not copies) of the local column indices and values of
// one row, addressed by local row index.
//
// localRow  local index of the row
// indices   output; view of the row's local column indices
// values    output; view of the row's values
//
// Raises std::runtime_error (via
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC) if the matrix is globally
// indexed, since no local-index view exists in that case.
//
// Both outputs are first reset to null.  They are filled only when
// the row lookup yields a valid local row with at least one entry,
// and are then trimmed to rowinfo.numEntries (the underlying views
// may cover extra allocated space).
//
// In a HAVE_TPETRA_DEBUG build, consistency checks at the end raise
// std::logic_error if the two views differ in length or disagree
// with the row's entry count.
//
// NOTE(review): the embedded listing numbers jump from 3847 to 3849;
// the class-qualifier line appears to be missing from this listing.
3846  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3847  void
3849  getLocalRowView (LocalOrdinal localRow,
3850  Teuchos::ArrayView<const LocalOrdinal>& indices,
3851  Teuchos::ArrayView<const Scalar>& values) const
3852  {
3853  using Teuchos::ArrayView;
3854  using Teuchos::av_reinterpret_cast;
3855  typedef LocalOrdinal LO;
3856  const char tfecfFuncName[] = "getLocalRowView: ";
3857 
      // The message used to point at getGlobalRowCopy() as the way
      // to get a *view* with global indices; the view method is
      // getGlobalRowView().
3858  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3859  isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
3860  "its indices as global indices, so you cannot get a view with local "
3861  "column indices. If the matrix has a column Map, you may call "
3862  "getLocalRowCopy() to get local column indices; otherwise, you may get "
3863  "a view with global column indices by calling getGlobalRowView().");
3864  indices = Teuchos::null;
3865  values = Teuchos::null;
3866  const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3867  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3868  rowinfo.numEntries > 0) {
3869  ArrayView<const LO> indTmp = staticGraph_->getLocalView (rowinfo);
3870  ArrayView<const Scalar> valTmp =
3871  av_reinterpret_cast<const Scalar> (this->getView (rowinfo));
      // Trim both views to the row's current entries.
3872  indices = indTmp (0, rowinfo.numEntries);
3873  values = valTmp (0, rowinfo.numEntries);
3874  }
3875 
3876 #ifdef HAVE_TPETRA_DEBUG
3877  const char suffix[] = ". This should never happen. Please report this "
3878  "bug to the Tpetra developers.";
3879  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3880  (static_cast<size_t> (indices.size ()) !=
3881  static_cast<size_t> (values.size ()), std::logic_error,
3882  "At the end of this method, for local row " << localRow << ", "
3883  "indices.size() = " << indices.size () << " != values.size () = "
3884  << values.size () << suffix);
3885  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3886  (static_cast<size_t> (indices.size ()) !=
3887  static_cast<size_t> (rowinfo.numEntries), std::logic_error,
3888  "At the end of this method, for local row " << localRow << ", "
3889  "indices.size() = " << indices.size () << " != rowinfo.numEntries = "
3890  << rowinfo.numEntries << suffix);
3891  const size_t expectedNumEntries = getNumEntriesInLocalRow (localRow);
3892  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3893  (rowinfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3894  "of this method, for local row " << localRow << ", rowinfo.numEntries = "
3895  << rowinfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " <<
3896  expectedNumEntries << suffix);
3897 #endif // HAVE_TPETRA_DEBUG
3898  }
3899 
3900  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3901  LocalOrdinal
3903  getLocalRowView (const LocalOrdinal lclRow,
3904  LocalOrdinal& numEnt,
3905  const impl_scalar_type*& val,
3906  const LocalOrdinal*& ind) const
3907  {
3908  typedef LocalOrdinal LO;
3909 
3910  // Don't call getCrsGraph(), because that modfies an RCP reference
3911  // count, which is not thread safe. Checking whether an RCP is
3912  // null does NOT modify its reference count, and is therefore
3913  // thread safe. Note that isGloballyIndexed() calls
3914  // getCrsGraph(), so we have to go to the graph directly.
3915  if (staticGraph_.is_null () || staticGraph_->isGloballyIndexed ()) {
3916  return Tpetra::Details::OrdinalTraits<LO>::invalid ();
3917  }
3918  else {
3919  const RowInfo rowInfo = staticGraph_->getRowInfo (lclRow);
3920  if (rowInfo.localRow == Tpetra::Details::OrdinalTraits<size_t>::invalid ()) {
3921  numEnt = 0; // no valid entries in this row on the calling process
3922  val = NULL;
3923  ind = NULL;
3924  // First argument (lclRow) invalid, so make 1 the error code.
3925  return static_cast<LO> (1);
3926  }
3927  else {
3928  numEnt = static_cast<LO> (rowInfo.numEntries);
3929  auto lclColInds = staticGraph_->getLocalKokkosRowView (rowInfo);
3930  ind = lclColInds.data (); // FIXME (mfh 18 Jul 2016) UVM
3931  const LO err = this->getViewRawConst (val, numEnt, rowInfo);
3932  return err;
3933  }
3934  }
3935  }
3936 
3937  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3938  LocalOrdinal
3940  getLocalRowViewRaw (const LocalOrdinal lclRow,
3941  LocalOrdinal& numEnt,
3942  const LocalOrdinal*& lclColInds,
3943  const Scalar*& vals) const
3944  {
3945  const impl_scalar_type* vals_ist = NULL;
3946  const LocalOrdinal errCode =
3947  this->getLocalRowView (lclRow, numEnt, vals_ist, lclColInds);
3948  vals = reinterpret_cast<const Scalar*> (vals_ist);
3949  return errCode;
3950  }
3951 
3952  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3953  void
3955  getGlobalRowView (GlobalOrdinal globalRow,
3956  Teuchos::ArrayView<const GlobalOrdinal>& indices,
3957  Teuchos::ArrayView<const Scalar>& values) const
3958  {
3959  using Teuchos::ArrayView;
3960  using Teuchos::av_reinterpret_cast;
3961  typedef GlobalOrdinal GO;
3962  const char tfecfFuncName[] = "getGlobalRowView: ";
3963 
3964  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3965  isLocallyIndexed (), std::runtime_error,
3966  "The matrix is locally indexed, so we cannot return a view of the row "
3967  "with global column indices. Use getGlobalRowCopy() instead.");
3968  indices = Teuchos::null;
3969  values = Teuchos::null;
3970  const RowInfo rowinfo =
3971  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3972  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3973  rowinfo.numEntries > 0) {
3974  ArrayView<const GO> indTmp = staticGraph_->getGlobalView (rowinfo);
3975  ArrayView<const Scalar> valTmp =
3976  av_reinterpret_cast<const Scalar> (this->getView (rowinfo));
3977 #ifdef HAVE_TPETRA_DEBUG
3978  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3979  (static_cast<size_t> (indTmp.size ()) < rowinfo.numEntries ||
3980  static_cast<size_t> (valTmp.size ()) < rowinfo.numEntries,
3981  std::logic_error, std::endl << "rowinfo.numEntries not accurate. "
3982  << std::endl << "indTmp.size() = " << indTmp.size ()
3983  << ", valTmp.size() = " << valTmp.size ()
3984  << ", rowinfo.numEntries = " << rowinfo.numEntries << ".");
3985 #endif // HAVE_TPETRA_DEBUG
3986  indices = indTmp (0, rowinfo.numEntries);
3987  values = valTmp (0, rowinfo.numEntries);
3988  }
3989 
3990 #ifdef HAVE_TPETRA_DEBUG
3991  const char suffix[] = ". This should never happen. Please report this "
3992  "bug to the Tpetra developers.";
3993  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3994  (static_cast<size_t> (indices.size ()) !=
3995  static_cast<size_t> (values.size ()), std::logic_error,
3996  "At the end of this method, for global row " << globalRow << ", "
3997  "indices.size() = " << indices.size () << " != values.size () = "
3998  << values.size () << suffix);
3999  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4000  (static_cast<size_t> (indices.size ()) !=
4001  static_cast<size_t> (rowinfo.numEntries), std::logic_error,
4002  "At the end of this method, for global row " << globalRow << ", "
4003  "indices.size() = " << indices.size () << " != rowinfo.numEntries = "
4004  << rowinfo.numEntries << suffix);
4005  const size_t expectedNumEntries = getNumEntriesInGlobalRow (globalRow);
4006  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4007  (rowinfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
4008  "of this method, for global row " << globalRow << ", rowinfo.numEntries "
4009  "= " << rowinfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
4010  " " << expectedNumEntries << suffix);
4011 #endif // HAVE_TPETRA_DEBUG
4012  }
4013 
4014  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4015  void
4017  scale (const Scalar& alpha)
4018  {
4019  typedef LocalOrdinal LO;
4020  typedef typename Teuchos::Array<Scalar>::size_type size_type;
4021  const char tfecfFuncName[] = "scale: ";
4022  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
4023 
4024  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4025  ! isFillActive (), std::runtime_error,
4026  "Fill must be active before you may call this method. "
4027  "Please call resumeFill() to make fill active.");
4028 
4029  const size_t nlrs = staticGraph_->getNodeNumRows ();
4030  const size_t numEntries = staticGraph_->getNodeNumEntries ();
4031  if (! staticGraph_->indicesAreAllocated () ||
4032  nlrs == 0 || numEntries == 0) {
4033  // do nothing
4034  }
4035  else {
4036  auto lclMat = this->getLocalMatrix ();
4037 
4038  if (staticGraph_->getProfileType () == StaticProfile) {
4039  const LO lclNumRows = lclMat.numRows ();
4040  for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
4041  auto row_i = lclMat.row (lclRow);
4042  for (LO k = 0; k < row_i.length; ++k) {
4043  // FIXME (mfh 02 Jan 2015) This assumes CUDA UVM.
4044  row_i.value (k) *= theAlpha;
4045  }
4046  }
4047  }
4048  else if (staticGraph_->getProfileType () != StaticProfile) {
4049  for (size_t row = 0; row < nlrs; ++row) {
4050  const size_type numEnt = getNumEntriesInLocalRow (row);
4051  Teuchos::ArrayView<impl_scalar_type> rowVals = values2D_[row] ();
4052  for (size_type k = 0; k < numEnt; ++k) {
4053  rowVals[k] *= theAlpha;
4054  }
4055  }
4056  }
4057  }
4058  }
4059 
4060  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4061  void
4063  setAllToScalar (const Scalar& alpha)
4064  {
4065  const char tfecfFuncName[] = "setAllToScalar: ";
4066  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
4067  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4068  ! isFillActive (), std::runtime_error,
4069  "Fill must be active before you may call this method. "
4070  "Please call resumeFill() to make fill active.");
4071 
4072  // replace all values in the matrix
4073  // it is easiest to replace all allocated values, instead of replacing only the ones with valid entries
4074  // however, if there are no valid entries, we can short-circuit
4075  // furthermore, if the values aren't allocated, we can short-circuit (no entry have been inserted so far)
4076  const size_t nlrs = staticGraph_->getNodeNumRows();
4077  const size_t numEntries = staticGraph_->getNodeNumEntries();
4078  if (! staticGraph_->indicesAreAllocated () || numEntries == 0) {
4079  // do nothing
4080  }
4081  else {
4082  const ProfileType profType = staticGraph_->getProfileType ();
4083  if (profType == StaticProfile) {
4084  // FIXME (mfh 24 Dec 2014) Once CrsMatrix implements DualView
4085  // semantics, this would be the place to mark memory as
4086  // modified.
4087  Kokkos::deep_copy (k_values1D_, theAlpha);
4088  }
4089  else if (profType != StaticProfile) {
4090  for (size_t row = 0; row < nlrs; ++row) {
4091  std::fill (values2D_[row].begin (), values2D_[row].end (), theAlpha);
4092  }
4093  }
4094  }
4095  }
4096 
4097  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4098  void
4100  setAllValues (const typename local_matrix_type::row_map_type& rowPointers,
4101  const typename local_graph_type::entries_type::non_const_type& columnIndices,
4102  const typename local_matrix_type::values_type& values)
4103  {
4104  const char tfecfFuncName[] = "setAllValues: ";
4105  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4106  (columnIndices.size () != values.size (), std::invalid_argument,
4107  "columnIndices.size() = " << columnIndices.size () << " != values.size()"
4108  " = " << values.size () << ".");
4109  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4110  (myGraph_.is_null (), std::runtime_error, "myGraph_ must not be null.");
4111 
4112  try {
4113  myGraph_->setAllIndices (rowPointers, columnIndices);
4114  }
4115  catch (std::exception &e) {
4116  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4117  (true, std::runtime_error, "myGraph_->setAllIndices() threw an "
4118  "exception: " << e.what ());
4119  }
4120  // Make sure that myGraph_ now has a local graph. It may not be
4121  // fillComplete yet, so it's important to check. We don't care
4122  // whether setAllIndices() did a shallow copy or a deep copy, so a
4123  // good way to check is to compare dimensions.
4124  auto lclGraph = myGraph_->getLocalGraph ();
4125  const size_t numEnt = lclGraph.entries.extent (0);
4126  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4127  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
4128  numEnt != static_cast<size_t> (columnIndices.extent (0)),
4129  std::logic_error, "myGraph_->setAllIndices() did not correctly create "
4130  "local graph. Please report this bug to the Tpetra developers.");
4131 
4132  const size_t numCols = myGraph_->getColMap ()->getNodeNumElements ();
4133 
4134  auto lclMat = std::make_shared<local_matrix_type>
4135  ("Tpetra::CrsMatrix::lclMatrix_", numCols, values, lclGraph);
4136  lclMatrix_ = std::make_shared<local_multiply_op_type> (lclMat);
4137 
4138  // FIXME (22 Jun 2016) I would very much like to get rid of
4139  // k_values1D_ at some point. I find it confusing to have all
4140  // these extra references lying around.
4141  k_values1D_ = lclMat->values;
4142 
4143  // Storage MUST be packed, since the interface doesn't give any
4144  // way to indicate any extra space at the end of each row.
4145  this->storageStatus_ = ::Tpetra::Details::STORAGE_1D_PACKED;
4146 
4147  checkInternalState ();
4148  }
4149 
4150  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4151  void
4153  setAllValues (const Teuchos::ArrayRCP<size_t>& ptr,
4154  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
4155  const Teuchos::ArrayRCP<Scalar>& val)
4156  {
4157  using Kokkos::Compat::getKokkosViewDeepCopy;
4158  using Teuchos::ArrayRCP;
4159  using Teuchos::av_reinterpret_cast;
4160  typedef device_type DT;
4161  typedef impl_scalar_type IST;
4162  typedef typename local_matrix_type::row_map_type row_map_type;
4163  //typedef typename row_map_type::non_const_value_type row_offset_type;
4164  const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
4165 
4166  // The row offset type may depend on the execution space. It may
4167  // not necessarily be size_t. If it's not, we need to make a deep
4168  // copy. We need to make a deep copy anyway so that Kokkos can
4169  // own the memory. Regardless, ptrIn gets the copy.
4170  typename row_map_type::non_const_type ptrNative ("ptr", ptr.size ());
4171  Kokkos::View<const size_t*,
4172  typename row_map_type::array_layout,
4173  Kokkos::HostSpace,
4174  Kokkos::MemoryUnmanaged> ptrSizeT (ptr.getRawPtr (), ptr.size ());
4175  ::Tpetra::Details::copyOffsets (ptrNative, ptrSizeT);
4176 
4177  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4178  (ptrNative.extent (0) != ptrSizeT.extent (0),
4179  std::logic_error, "ptrNative.extent(0) = " <<
4180  ptrNative.extent (0) << " != ptrSizeT.extent(0) = "
4181  << ptrSizeT.extent (0) << ". Please report this bug to the "
4182  "Tpetra developers.");
4183 
4184  auto indIn = getKokkosViewDeepCopy<DT> (ind ());
4185  auto valIn = getKokkosViewDeepCopy<DT> (av_reinterpret_cast<IST> (val ()));
4186  this->setAllValues (ptrNative, indIn, valIn);
4187  }
4188 
4189  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4190  void
4192  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
4193  {
4194  const char tfecfFuncName[] = "getLocalDiagOffsets: ";
4195  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4196  (staticGraph_.is_null (), std::runtime_error, "The matrix has no graph.");
4197 
4198  // mfh 11 May 2016: We plan to deprecate the ArrayRCP version of
4199  // this method in CrsGraph too, so don't call it (otherwise build
4200  // warnings will show up and annoy users). Instead, copy results
4201  // in and out, if the memory space requires it.
4202 
4203  const size_t lclNumRows = staticGraph_->getNodeNumRows ();
4204  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
4205  offsets.resize (lclNumRows);
4206  }
4207 
4208  // The input ArrayRCP must always be a host pointer. Thus, if
4209  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
4210  // to write to that allocation directly as a Kokkos::View.
4211  typedef typename device_type::memory_space memory_space;
4212  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
4213  // It is always syntactically correct to assign a raw host
4214  // pointer to a device View, so this code will compile correctly
4215  // even if this branch never runs.
4216  typedef Kokkos::View<size_t*, device_type,
4217  Kokkos::MemoryUnmanaged> output_type;
4218  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
4219  staticGraph_->getLocalDiagOffsets (offsetsOut);
4220  }
4221  else {
4222  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
4223  staticGraph_->getLocalDiagOffsets (offsetsTmp);
4224  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
4225  Kokkos::MemoryUnmanaged> output_type;
4226  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
4227  Kokkos::deep_copy (offsetsOut, offsetsTmp);
4228  }
4229  }
4230 
// Copy the matrix's diagonal entries into the Vector `diag`, without
// precomputed offsets (tfecfFuncName below: "getLocalDiagCopy (1-arg)").
//
// Throws std::runtime_error if the matrix has no graph or no column
// Map.  Returns without touching `diag` on processes whose row Map or
// row Map communicator is null.
//
// NOTE(review): the declarator lines (class qualifier, function name,
// parameter list) and the lines just before the two helper calls are
// absent from this listing; `diag` is presumably a Tpetra Vector
// passed by non-const reference -- confirm against the header.
4231  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4232  void
4235  {
4236  using Teuchos::ArrayRCP;
4237  using Teuchos::ArrayView;
4238  using Teuchos::av_reinterpret_cast;
4239  const char tfecfFuncName[] = "getLocalDiagCopy (1-arg): ";
4240  typedef local_ordinal_type LO;
4241 
4242 
4243  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4244  staticGraph_.is_null (), std::runtime_error,
4245  "This method requires that the matrix have a graph.");
4246  auto rowMapPtr = this->getRowMap ();
4247  if (rowMapPtr.is_null () || rowMapPtr->getComm ().is_null ()) {
4248  // Processes on which the row Map or its communicator is null
4249  // don't participate. Users shouldn't even call this method on
4250  // those processes.
4251  return;
4252  }
4253  auto colMapPtr = this->getColMap ();
4254  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4255  (! this->hasColMap () || colMapPtr.is_null (), std::runtime_error,
4256  "This method requires that the matrix have a column Map.");
4257  const map_type& rowMap = * rowMapPtr;
4258  const map_type& colMap = * colMapPtr;
4259  const LO myNumRows = static_cast<LO> (this->getNodeNumRows ());
4260 
4261 #ifdef HAVE_TPETRA_DEBUG
4262  // isCompatible() requires an all-reduce, and thus this check
4263  // should only be done in debug mode.
4264  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4265  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
4266  "The input Vector's Map must be compatible with the CrsMatrix's row "
4267  "Map. You may check this by using Map's isCompatible method: "
4268  "diag.getMap ()->isCompatible (A.getRowMap ());");
4269 #endif // HAVE_TPETRA_DEBUG
4270 
4271  if (this->isFillComplete ()) {
// Fill-complete case: mark diag as modified on device_type, then let
// the helper fill the first myNumRows entries of its device view from
// the local matrix.  The helper's return value is discarded.
4272  diag.template modify<device_type> ();
4273  const auto D_lcl = diag.template getLocalView<device_type> ();
4274  // 1-D subview of the first (and only) column of D_lcl.
4275  const auto D_lcl_1d =
4276  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
4277 
4278  const auto lclRowMap = rowMap.getLocalMap ();
4279  const auto lclColMap = colMap.getLocalMap ();
4281  (void) getDiagCopyWithoutOffsets (D_lcl_1d, lclRowMap,
4282  lclColMap,
4283  lclMatrix_->getLocalMatrix ());
4284  }
4285  else {
// Not fill complete: delegate to the helper that takes the whole
// matrix.  Its return value is discarded as well.
4287  (void) getLocalDiagCopyWithoutOffsetsNotFillComplete (diag, *this);
4288  }
4289  }
4290 
// Copy the matrix's diagonal entries into `diag`, using precomputed
// diagonal offsets given as an unmanaged Kokkos::View on device_type.
//
// offsets: per-row offsets of the diagonal entries, passed through to
//   KokkosSparse::getDiagCopy.
//
// NOTE(review): the class-qualifier line and the first parameter line
// (which declares `diag`) are absent from this listing; `diag` is
// presumably a Tpetra Vector passed by non-const reference -- confirm
// against the header.
4291  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4292  void
4295  const Kokkos::View<const size_t*, device_type,
4296  Kokkos::MemoryUnmanaged>& offsets) const
4297  {
4298  typedef LocalOrdinal LO;
4299 
4300 #ifdef HAVE_TPETRA_DEBUG
4301  const char tfecfFuncName[] = "getLocalDiagCopy: ";
4302  const map_type& rowMap = * (this->getRowMap ());
4303  // isCompatible() requires an all-reduce, and thus this check
4304  // should only be done in debug mode.
4305  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4306  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
4307  "The input Vector's Map must be compatible with (in the sense of Map::"
4308  "isCompatible) the CrsMatrix's row Map.");
4309 #endif // HAVE_TPETRA_DEBUG
4310 
4311  // For now, we fill the Vector on the host and sync to device.
4312  // Later, we may write a parallel kernel that works entirely on
4313  // device.
4314  //
4315  // NOTE (mfh 21 Jan 2016): The host kernel here assumes UVM. Once
4316  // we write a device kernel, it will not need to assume UVM.
// NOTE(review): the statements below mark diag as modified on
// device_type and write through its device_type local view via
// KokkosSparse::getDiagCopy, with no explicit sync afterwards; the
// "fill on the host and sync" remark above looks out of date --
// confirm.
4317 
4318  diag.template modify<device_type> ();
4319  auto D_lcl = diag.template getLocalView<device_type> ();
4320  const LO myNumRows = static_cast<LO> (this->getNodeNumRows ());
4321  // Get 1-D subview of the first (and only) column of D_lcl.
4322  auto D_lcl_1d =
4323  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
4324 
4325  KokkosSparse::getDiagCopy (D_lcl_1d, offsets,
4326  lclMatrix_->getLocalMatrix ());
4327  }
4328 
// Copy the matrix's diagonal entries into `diag`, using precomputed
// diagonal offsets given as a host Teuchos::ArrayView.
//
// offsets: one offset per local row; an entry equal to
//   OrdinalTraits<size_t>::invalid() means "no diagonal entry in that
//   row", in which case the output for that row is zero.
//
// The work is done in a host parallel_for writing the host view of
// diag, which is then synced to device.
//
// NOTE(review): offsets is indexed for every row in
// [0, getNodeNumRows()) without a length check here -- callers
// presumably must supply at least that many entries; confirm.
// NOTE(review): the class-qualifier line and the first parameter line
// (which declares `diag`) are absent from this listing.
4329  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4330  void
4333  const Teuchos::ArrayView<const size_t>& offsets) const
4334  {
4335  using LO = LocalOrdinal;
4336  using host_execution_space = Kokkos::DefaultHostExecutionSpace;
4337  using IST = impl_scalar_type;
4338 
4339 #ifdef HAVE_TPETRA_DEBUG
4340  const char tfecfFuncName[] = "getLocalDiagCopy: ";
4341  const map_type& rowMap = * (this->getRowMap ());
4342  // isCompatible() requires an all-reduce, and thus this check
4343  // should only be done in debug mode.
4344  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4345  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
4346  "The input Vector's Map must be compatible with (in the sense of Map::"
4347  "isCompatible) the CrsMatrix's row Map.");
4348 #endif // HAVE_TPETRA_DEBUG
4349 
4350  // See #1510. In case diag has already been marked modified on
4351  // device, we need to clear that flag, since the code below works
4352  // on host.
4353  diag.clear_sync_state ();
4354 
4355  // For now, we fill the Vector on the host and sync to device.
4356  // Later, we may write a parallel kernel that works entirely on
4357  // device.
4358  diag.modify_host ();
4359  auto lclVecHost = diag.getLocalViewHost ();
4360  // 1-D subview of the first (and only) column of lclVecHost.
4361  auto lclVecHost1d = Kokkos::subview (lclVecHost, Kokkos::ALL (), 0);
4362 
// Wrap the caller's offsets array in an unmanaged host View (no copy).
4363  using host_offsets_view_type =
4364  Kokkos::View<const size_t*, Kokkos::HostSpace,
4365  Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
4366  host_offsets_view_type h_offsets (offsets.getRawPtr (), offsets.size ());
4367  // Find the diagonal entries and put them in lclVecHost1d.
4368  using range_type = Kokkos::RangePolicy<host_execution_space, LO>;
4369  const LO myNumRows = static_cast<LO> (this->getNodeNumRows ());
// Sentinel offset value marking a row without a diagonal entry.
4370  const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid ();
4371 
4372  local_matrix_type lclMat = lclMatrix_->getLocalMatrix ();
// The lambda captures by reference ([&]) and runs in the host
// execution space selected by range_type.
4373  Kokkos::parallel_for
4374  ("Tpetra::CrsMatrix::getLocalDiagCopy",
4375  range_type (0, myNumRows),
4376  [&] (const LO lclRow) {
4377  lclVecHost1d(lclRow) = STS::zero (); // default value if no diag entry
4378  if (h_offsets[lclRow] != INV) {
4379  auto curRow = lclMat.rowConst (lclRow);
4380  lclVecHost1d(lclRow) = static_cast<IST> (curRow.value(h_offsets[lclRow]));
4381  }
4382  });
4383  diag.sync_device ();
4384  }
4385 
4386 
// Scale the matrix on the left by the entries of the vector x: every
// stored value in local row i is multiplied by the i-th entry of x
// (after x has been brought to the row Map, if needed).
//
// x's Map must be the same as either the range Map or the row Map of
// the matrix; otherwise std::invalid_argument is thrown.
//
// NOTE(review): the declarator lines (class qualifier, function name,
// parameter list) and the line that presumably defines vec_type are
// absent from this listing; `x` is presumably a const reference to a
// Tpetra Vector -- confirm against the header.
4387  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4388  void
4391  {
4392  using ::Tpetra::Details::ProfilingRegion;
4393  using Teuchos::ArrayRCP;
4394  using Teuchos::ArrayView;
4395  using Teuchos::null;
4396  using Teuchos::RCP;
4397  using Teuchos::rcp;
4398  using Teuchos::rcpFromRef;
4399  using LO = local_ordinal_type;
4401  const char tfecfFuncName[] = "leftScale: ";
4402 
4403  ProfilingRegion region ("Tpetra::CrsMatrix::leftScale");
4404 
// xp ends up pointing either at x itself (non-owning) or at a
// temporary vector on the row Map filled from x.
4405  RCP<const vec_type> xp;
4406  if (this->getRangeMap ()->isSameAs (* (x.getMap ()))) {
4407  // Take from Epetra: If we have a non-trivial exporter, we must
4408  // import elements that are permuted or are on other processors.
4409  auto exporter = this->getCrsGraphRef ().getExporter ();
4410  if (exporter.get () != nullptr) {
4411  RCP<vec_type> tempVec (new vec_type (this->getRowMap ()));
4412  tempVec->doImport (x, *exporter, REPLACE); // reverse mode
4413  xp = tempVec;
4414  }
4415  else {
4416  xp = rcpFromRef (x);
4417  }
4418  }
4419  else if (this->getRowMap ()->isSameAs (* (x.getMap ()))) {
4420  xp = rcpFromRef (x);
4421  }
4422  else {
4423  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4424  (true, std::invalid_argument, "x's Map must be the same as "
4425  "either the row Map or the range Map of the CrsMatrix.");
4426  }
4427 
4428  // Check whether A has a valid local matrix. It might not if it
4429  // was not created with a local matrix, and if fillComplete has
4430  // never been called on it before. A never-initialized (and thus
4431  // invalid) local matrix has zero rows, because it was default
4432  // constructed.
4433  const LO lclNumRows =
4434  static_cast<LO> (this->getRowMap ()->getNodeNumElements ());
4435  const bool validLocalMatrix = lclMatrix_.get () != nullptr &&
4436  lclMatrix_->getLocalMatrix ().numRows () == lclNumRows;
4437 
4438  if (validLocalMatrix) {
// Valid local matrix: sync xp to device memory if needed (casting
// away const to do so), then scale the local matrix with the helper.
4439  using dev_memory_space = typename device_type::memory_space;
4440  if (xp->template need_sync<dev_memory_space> ()) {
4441  using Teuchos::rcp_const_cast;
4442  rcp_const_cast<vec_type> (xp)->template sync<dev_memory_space> ();
4443  }
4444  auto x_lcl = xp->template getLocalView<dev_memory_space> ();
4445  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
4446  using ::Tpetra::Details::leftScaleLocalCrsMatrix;
4447  leftScaleLocalCrsMatrix (lclMatrix_->getLocalMatrix (),
4448  x_lcl_1d, false, false);
4449  }
4450  else {
// No valid local matrix: scale row by row through the matrix's own
// row views.
4451  execution_space().fence (); // for UVM's sake
4452 
4453  ArrayRCP<const Scalar> vectorVals = xp->getData (0);
4454  ArrayView<impl_scalar_type> rowValues = Teuchos::null;
4455  for (LocalOrdinal i = 0; i < lclNumRows; ++i) {
4456  const RowInfo rowinfo = this->staticGraph_->getRowInfo (i);
4457  rowValues = this->getViewNonConst (rowinfo);
4458  const impl_scalar_type scaleValue = static_cast<impl_scalar_type> (vectorVals[i]);
4459  for (size_t j = 0; j < rowinfo.numEntries; ++j) {
4460  rowValues[j] *= scaleValue;
4461  }
4462  }
4463  execution_space().fence (); // for UVM's sake
4464  }
4465  }
4466 
// Scale the matrix on the right by the entries of the vector x: every
// stored value is multiplied by the entry of x that corresponds to
// its local column index (after x has been brought to the column Map,
// if needed).
//
// x's Map must be the same as either the domain Map or the column Map
// of the matrix; otherwise std::runtime_error is thrown.
//
// NOTE(review): the declarator lines (class qualifier, function name,
// parameter list) and the line that presumably defines vec_type are
// absent from this listing; `x` is presumably a const reference to a
// Tpetra Vector -- confirm against the header.
4467  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4468  void
4471  {
4472  using ::Tpetra::Details::ProfilingRegion;
4473  using Teuchos::ArrayRCP;
4474  using Teuchos::ArrayView;
4475  using Teuchos::null;
4476  using Teuchos::RCP;
4477  using Teuchos::rcp;
4478  using Teuchos::rcpFromRef;
4479  using LO = local_ordinal_type;
4481  const char tfecfFuncName[] = "rightScale: ";
4482 
4483  ProfilingRegion region ("Tpetra::CrsMatrix::rightScale");
4484 
// xp ends up pointing either at x itself (non-owning) or at a
// temporary vector on the column Map filled from x.
4485  RCP<const vec_type> xp;
4486  if (this->getDomainMap ()->isSameAs (* (x.getMap ()))) {
4487  // Take from Epetra: If we have a non-trivial importer, we must
4488  // import elements that are permuted or are on other processors.
4489  auto importer = this->getCrsGraphRef ().getImporter ();
4490  if (importer.get () != nullptr) {
4491  RCP<vec_type> tempVec (new vec_type (this->getColMap ()));
4492  tempVec->doImport (x, *importer, REPLACE);
4493  xp = tempVec;
4494  }
4495  else {
4496  xp = rcpFromRef (x);
4497  }
4498  }
4499  else if (this->getColMap ()->isSameAs (* (x.getMap ()))) {
4500  xp = rcpFromRef (x);
4501  } else {
4502  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4503  (true, std::runtime_error, "x's Map must be the same as "
4504  "either the domain Map or the column Map of the CrsMatrix.");
4505  }
4506 
4507  // Check whether A has a valid local matrix. It might not if it
4508  // was not created with a local matrix, and if fillComplete has
4509  // never been called on it before. A never-initialized (and thus
4510  // invalid) local matrix has zero rows, because it was default
4511  // constructed.
4512  const LO lclNumRows =
4513  static_cast<LO> (this->getRowMap ()->getNodeNumElements ());
4514  const bool validLocalMatrix = lclMatrix_.get () != nullptr &&
4515  lclMatrix_->getLocalMatrix ().numRows () == lclNumRows;
4516 
4517  if (validLocalMatrix) {
// Valid local matrix: sync xp to device memory if needed (casting
// away const to do so), then scale the local matrix with the helper.
4518  using dev_memory_space = typename device_type::memory_space;
4519  if (xp->template need_sync<dev_memory_space> ()) {
4520  using Teuchos::rcp_const_cast;
4521  rcp_const_cast<vec_type> (xp)->template sync<dev_memory_space> ();
4522  }
4523  auto x_lcl = xp->template getLocalView<dev_memory_space> ();
4524  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
4525  using ::Tpetra::Details::rightScaleLocalCrsMatrix;
4526  rightScaleLocalCrsMatrix (lclMatrix_->getLocalMatrix (),
4527  x_lcl_1d, false, false);
4528  }
4529  else {
// No valid local matrix: scale row by row, looking up each entry's
// scaling factor by its local column index.
4530  execution_space().fence (); // for UVM's sake
4531 
4532  ArrayRCP<const Scalar> vectorVals = xp->getData (0);
4533  ArrayView<impl_scalar_type> rowValues = null;
4534  for (LO i = 0; i < lclNumRows; ++i) {
4535  const RowInfo rowinfo = this->staticGraph_->getRowInfo (i);
4536  rowValues = this->getViewNonConst (rowinfo);
4537  ArrayView<const LO> colInds;
4538  this->getCrsGraphRef ().getLocalRowView (i, colInds);
4539  for (size_t j = 0; j < rowinfo.numEntries; ++j) {
4540  rowValues[j] *= static_cast<impl_scalar_type> (vectorVals[colInds[j]]);
4541  }
4542  }
4543  execution_space().fence (); // for UVM's sake
4544  }
4545  }
4546 
// Compute the Frobenius norm of the matrix: the square root of the
// sum, over all processes in the matrix's communicator, of the
// squared magnitudes of the locally stored entries.
//
// The cached value frobNorm_ is returned if it is not the sentinel
// -1; a freshly computed value is cached only if the matrix is fill
// complete.  The computation includes a reduceAll over getComm().
//
// NOTE(review): the declarator lines (return type, class qualifier,
// function name) are absent from this listing; the name is presumably
// getFrobeniusNorm and the return type mag_type -- confirm against
// the header.
4547  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4551  {
4552  using Teuchos::ArrayView;
4553  using Teuchos::outArg;
4554  using Teuchos::REDUCE_SUM;
4555  using Teuchos::reduceAll;
4556  typedef typename Teuchos::ArrayRCP<const impl_scalar_type>::size_type size_type;
4557 
4558  // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
4559  // local part of this computation. It could make sense to put
4560  // this operation in the Kokkos::CrsMatrix.
4561 
4562  // check the cache first
4563  mag_type frobNorm = frobNorm_;
4564  if (frobNorm == -STM::one ()) {
4565  mag_type mySum = STM::zero ();
4566  if (getNodeNumEntries() > 0) {
4567  if (isStorageOptimized ()) {
4568  // "Optimized" storage is packed storage. That means we can
4569  // iterate in one pass through the 1-D values array.
4570  const size_type numEntries =
4571  static_cast<size_type> (getNodeNumEntries ());
4572  for (size_type k = 0; k < numEntries; ++k) {
4573  // FIXME (mfh 05 Aug 2014) This assumes UVM.
4574  const impl_scalar_type val = k_values1D_(k);
4575  // Note (etp 06 Jan 2015) We need abs() here for composite types
4576  // (in general, if mag_type is on the left-hand-side, we need
4577  // abs() on the right-hand-side)
4578  const mag_type val_abs = STS::abs (val);
4579  mySum += val_abs * val_abs;
4580  }
4581  }
4582  else {
// Storage is not packed: accumulate row by row over each row's
// valid entries.
// NOTE(review): this branch dereferences myGraph_, whereas other
// methods in this file read row info through staticGraph_ and treat
// a null myGraph_ as "the matrix has a const graph"; presumably
// myGraph_ is never null when this branch runs -- confirm.
4583  const LocalOrdinal numRows =
4584  static_cast<LocalOrdinal> (this->getNodeNumRows ());
4585  for (LocalOrdinal r = 0; r < numRows; ++r) {
4586  const RowInfo rowInfo = myGraph_->getRowInfo (r);
4587  const size_type numEntries =
4588  static_cast<size_type> (rowInfo.numEntries);
4589  ArrayView<const impl_scalar_type> A_r =
4590  this->getView (rowInfo).view (0, numEntries);
4591  for (size_type k = 0; k < numEntries; ++k) {
4592  const impl_scalar_type val = A_r[k];
4593  const mag_type val_abs = STS::abs (val);
4594  mySum += val_abs * val_abs;
4595  }
4596  }
4597  }
4598  }
// Sum the local contributions over all processes, then take the
// square root.
4599  mag_type totalSum = STM::zero ();
4600  reduceAll<int, mag_type> (* (getComm ()), REDUCE_SUM,
4601  mySum, outArg (totalSum));
4602  frobNorm = STM::sqrt (totalSum);
4603  }
4604  if (isFillComplete ()) {
4605  // Only cache the result if the matrix is fill complete.
4606  // Otherwise, the values might still change. resumeFill clears
4607  // the cache.
4608  frobNorm_ = frobNorm;
4609  }
4610  return frobNorm;
4611  }
4612 
4613  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4614  void
4616  replaceColMap (const Teuchos::RCP<const map_type>& newColMap)
4617  {
4618  const char tfecfFuncName[] = "replaceColMap: ";
4619  // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
4620  // Then replacing the column Map might mean that we need to
4621  // reindex the column indices.
4622  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4623  myGraph_.is_null (), std::runtime_error,
4624  "This method does not work if the matrix has a const graph. The whole "
4625  "idea of a const graph is that you are not allowed to change it, but "
4626  "this method necessarily must modify the graph, since the graph owns "
4627  "the matrix's column Map.");
4628  myGraph_->replaceColMap (newColMap);
4629  }
4630 
// Reindex the column indices of the graph to a new column Map, and
// optionally sort each row's column indices together with the
// matrix's values.
//
// graph: graph to reindex; if NULL, the matrix's own graph (myGraph_)
//   is used instead.
// newColMap, newImport: forwarded to the graph's reindexColumns.
// sortEachRow: if true, and the graph is locally indexed and not yet
//   sorted after reindexing, sort every local row.
//
// Throws std::invalid_argument if graph is NULL and the matrix does
// not own its graph.
//
// NOTE(review): the class-qualifier line and the first parameter line
// (which declares `graph`) are absent from this listing; `graph` is
// presumably a crs_graph_type pointer -- confirm against the header.
4631  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4632  void
4635  const Teuchos::RCP<const map_type>& newColMap,
4636  const Teuchos::RCP<const import_type>& newImport,
4637  const bool sortEachRow)
4638  {
4639  const char tfecfFuncName[] = "reindexColumns: ";
4640  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4641  graph == NULL && myGraph_.is_null (), std::invalid_argument,
4642  "The input graph is NULL, but the matrix does not own its graph.");
4643 
4644  crs_graph_type& theGraph = (graph == NULL) ? *myGraph_ : *graph;
4645  const bool sortGraph = false; // we'll sort graph & matrix together below
4646  theGraph.reindexColumns (newColMap, newImport, sortGraph);
4647  if (sortEachRow && theGraph.isLocallyIndexed () && ! theGraph.isSorted ()) {
4648  const LocalOrdinal lclNumRows =
4649  static_cast<LocalOrdinal> (theGraph.getNodeNumRows ());
4650  for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
4651  const RowInfo rowInfo = theGraph.getRowInfo (row);
4652  auto lclColInds = theGraph.getLocalKokkosRowViewNonConst (rowInfo);
4653  auto vals = this->getRowViewNonConst (rowInfo);
4654  // FIXME (mfh 09 May 2017) This assumes CUDA UVM, at least for
4655  // lclColInds, if not also for values.
// Sort the row's first numEntries column indices, passing the values
// array alongside (sort2 presumably applies the same permutation to
// it -- confirm).
4656  sort2 (lclColInds.data (),
4657  lclColInds.data () + rowInfo.numEntries,
4658  vals.data ());
4659  }
// Record on the graph that its indices are now sorted.
4660  theGraph.indicesAreSorted_ = true;
4661  }
4662  }
4663 
4664  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4665  void
4667  replaceDomainMapAndImporter (const Teuchos::RCP<const map_type>& newDomainMap,
4668  Teuchos::RCP<const import_type>& newImporter)
4669  {
4670  const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
4671  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4672  myGraph_.is_null (), std::runtime_error,
4673  "This method does not work if the matrix has a const graph. The whole "
4674  "idea of a const graph is that you are not allowed to change it, but this"
4675  " method necessarily must modify the graph, since the graph owns the "
4676  "matrix's domain Map and Import objects.");
4677  myGraph_->replaceDomainMapAndImporter (newDomainMap, newImporter);
4678  }
4679 
4680  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4681  void
4683  insertNonownedGlobalValues (const GlobalOrdinal globalRow,
4684  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
4685  const Teuchos::ArrayView<const Scalar>& values)
4686  {
4687  using Teuchos::Array;
4688  typedef GlobalOrdinal GO;
4689  typedef typename Array<GO>::size_type size_type;
4690 
4691  const size_type numToInsert = indices.size ();
4692  // Add the new data to the list of nonlocals.
4693  // This creates the arrays if they don't exist yet.
4694  std::pair<Array<GO>, Array<Scalar> >& curRow = nonlocals_[globalRow];
4695  Array<GO>& curRowInds = curRow.first;
4696  Array<Scalar>& curRowVals = curRow.second;
4697  const size_type newCapacity = curRowInds.size () + numToInsert;
4698  curRowInds.reserve (newCapacity);
4699  curRowVals.reserve (newCapacity);
4700  for (size_type k = 0; k < numToInsert; ++k) {
4701  curRowInds.push_back (indices[k]);
4702  curRowVals.push_back (values[k]);
4703  }
4704  }
4705 
4706  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4707  void
4710  {
4711  using ::Tpetra::Details::ProfilingRegion;
4712  using Teuchos::Comm;
4713  using Teuchos::outArg;
4714  using Teuchos::RCP;
4715  using Teuchos::rcp;
4716  using Teuchos::REDUCE_MAX;
4717  using Teuchos::REDUCE_MIN;
4718  using Teuchos::reduceAll;
4720  //typedef LocalOrdinal LO;
4721  typedef GlobalOrdinal GO;
4722  typedef typename Teuchos::Array<GO>::size_type size_type;
4723  const char tfecfFuncName[] = "globalAssemble: "; // for exception macro
4724  ProfilingRegion regionGlobalAssemble ("Tpetra::CrsMatrix::globalAssemble");
4725 
4726  RCP<const Comm<int> > comm = getComm ();
4727 
4728  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4729  (! isFillActive (), std::runtime_error, "Fill must be active before "
4730  "you may call this method.");
4731 
4732  const size_t myNumNonlocalRows = nonlocals_.size ();
4733 
4734  // If no processes have nonlocal rows, then we don't have to do
4735  // anything. Checking this is probably cheaper than constructing
4736  // the Map of nonlocal rows (see below) and noticing that it has
4737  // zero global entries.
4738  {
4739  const int iHaveNonlocalRows = (myNumNonlocalRows == 0) ? 0 : 1;
4740  int someoneHasNonlocalRows = 0;
4741  reduceAll<int, int> (*comm, REDUCE_MAX, iHaveNonlocalRows,
4742  outArg (someoneHasNonlocalRows));
4743  if (someoneHasNonlocalRows == 0) {
4744  return; // no process has nonlocal rows, so nothing to do
4745  }
4746  }
4747 
4748  // 1. Create a list of the "nonlocal" rows on each process. this
4749  // requires iterating over nonlocals_, so while we do this,
4750  // deduplicate the entries and get a count for each nonlocal
4751  // row on this process.
4752  // 2. Construct a new row Map corresponding to those rows. This
4753  // Map is likely overlapping. We know that the Map is not
4754  // empty on all processes, because the above all-reduce and
4755  // return exclude that case.
4756 
4757  RCP<const map_type> nonlocalRowMap;
4758  // Keep this for CrsGraph's constructor, so we can use StaticProfile.
4759  Teuchos::Array<size_t> numEntPerNonlocalRow (myNumNonlocalRows);
4760  {
4761  Teuchos::Array<GO> myNonlocalGblRows (myNumNonlocalRows);
4762  size_type curPos = 0;
4763  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4764  ++mapIter, ++curPos) {
4765  myNonlocalGblRows[curPos] = mapIter->first;
4766  // Get the values and column indices by reference, since we
4767  // intend to change them in place (that's what "erase" does).
4768  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4769  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4770 
4771  // Sort both arrays jointly, using the column indices as keys,
4772  // then merge them jointly. "Merge" here adds values
4773  // corresponding to the same column indices. The first 2 args
4774  // of merge2 are output arguments that work just like the
4775  // return value of std::unique.
4776  sort2 (gblCols.begin (), gblCols.end (), vals.begin ());
4777  typename Teuchos::Array<GO>::iterator gblCols_newEnd;
4778  typename Teuchos::Array<Scalar>::iterator vals_newEnd;
4779  merge2 (gblCols_newEnd, vals_newEnd,
4780  gblCols.begin (), gblCols.end (),
4781  vals.begin (), vals.end ());
4782  gblCols.erase (gblCols_newEnd, gblCols.end ());
4783  vals.erase (vals_newEnd, vals.end ());
4784  numEntPerNonlocalRow[curPos] = gblCols.size ();
4785  }
4786 
4787  // Currently, Map requires that its indexBase be the global min
4788  // of all its global indices. Map won't compute this for us, so
4789  // we must do it. If our process has no nonlocal rows, set the
4790  // "min" to the max possible GO value. This ensures that if
4791  // some process has at least one nonlocal row, then it will pick
4792  // that up as the min. We know that at least one process has a
4793  // nonlocal row, since the all-reduce and return at the top of
4794  // this method excluded that case.
4795  GO myMinNonlocalGblRow = std::numeric_limits<GO>::max ();
4796  {
4797  auto iter = std::min_element (myNonlocalGblRows.begin (),
4798  myNonlocalGblRows.end ());
4799  if (iter != myNonlocalGblRows.end ()) {
4800  myMinNonlocalGblRow = *iter;
4801  }
4802  }
4803  GO gblMinNonlocalGblRow = 0;
4804  reduceAll<int, GO> (*comm, REDUCE_MIN, myMinNonlocalGblRow,
4805  outArg (gblMinNonlocalGblRow));
4806  const GO indexBase = gblMinNonlocalGblRow;
4807  const global_size_t INV = Teuchos::OrdinalTraits<global_size_t>::invalid ();
4808  nonlocalRowMap = rcp (new map_type (INV, myNonlocalGblRows (), indexBase, comm));
4809  }
4810 
4811  // 3. Use the values and column indices for each nonlocal row, as
4812  // stored in nonlocals_, to construct a CrsMatrix corresponding
4813  // to nonlocal rows. We may use StaticProfile, since we have
4814  // exact counts of the number of entries in each nonlocal row.
4815 
4816  RCP<crs_matrix_type> nonlocalMatrix =
4817  rcp (new crs_matrix_type (nonlocalRowMap, numEntPerNonlocalRow (),
4818  StaticProfile));
4819  {
4820  size_type curPos = 0;
4821  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4822  ++mapIter, ++curPos) {
4823  const GO gblRow = mapIter->first;
4824  // Get values & column indices by ref, just to avoid copy.
4825  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4826  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4827  //const LO numEnt = static_cast<LO> (numEntPerNonlocalRow[curPos]);
4828  nonlocalMatrix->insertGlobalValues (gblRow, gblCols (), vals ());
4829  }
4830  }
4831  // There's no need to fill-complete the nonlocals matrix.
4832  // We just use it as a temporary container for the Export.
4833 
4834  // 4. If the original row Map is one to one, then we can Export
4835  // directly from nonlocalMatrix into this. Otherwise, we have
4836  // to create a temporary matrix with a one-to-one row Map,
4837  // Export into that, then Import from the temporary matrix into
4838  // *this.
4839 
4840  auto origRowMap = this->getRowMap ();
4841  const bool origRowMapIsOneToOne = origRowMap->isOneToOne ();
4842 
4843  int isLocallyComplete = 1; // true by default
4844 
4845  if (origRowMapIsOneToOne) {
4846  export_type exportToOrig (nonlocalRowMap, origRowMap);
4847  if (! exportToOrig.isLocallyComplete ()) {
4848  isLocallyComplete = 0;
4849  }
4850  this->doExport (*nonlocalMatrix, exportToOrig, Tpetra::ADD);
4851  // We're done at this point!
4852  }
4853  else {
4854  // If you ask a Map whether it is one to one, it does some
4855  // communication and stashes intermediate results for later use
4856  // by createOneToOne. Thus, calling createOneToOne doesn't cost
4857  // much more then the original cost of calling isOneToOne.
4858  auto oneToOneRowMap = Tpetra::createOneToOne (origRowMap);
4859  export_type exportToOneToOne (nonlocalRowMap, oneToOneRowMap);
4860  if (! exportToOneToOne.isLocallyComplete ()) {
4861  isLocallyComplete = 0;
4862  }
4863 
4864  // Create a temporary matrix with the one-to-one row Map.
4865  //
4866  // TODO (mfh 09 Sep 2016, 12 Sep 2016) Estimate # entries in
4867  // each row, to avoid reallocation during the Export operation.
4868  crs_matrix_type oneToOneMatrix (oneToOneRowMap, 0);
4869  // Export from matrix of nonlocals into the temp one-to-one matrix.
4870  oneToOneMatrix.doExport (*nonlocalMatrix, exportToOneToOne, Tpetra::ADD);
4871 
4872  // We don't need the matrix of nonlocals anymore, so get rid of
4873  // it, to keep the memory high-water mark down.
4874  nonlocalMatrix = Teuchos::null;
4875 
4876  // Import from the one-to-one matrix to the original matrix.
4877  import_type importToOrig (oneToOneRowMap, origRowMap);
4878  this->doImport (oneToOneMatrix, importToOrig, Tpetra::ADD);
4879  }
4880 
4881  // It's safe now to clear out nonlocals_, since we've already
4882  // committed side effects to *this. The standard idiom for
4883  // clearing a Container like std::map, is to swap it with an empty
4884  // Container and let the swapped Container fall out of scope.
4885  decltype (nonlocals_) newNonlocals;
4886  std::swap (nonlocals_, newNonlocals);
4887 
4888  // FIXME (mfh 12 Sep 2016) I don't like this all-reduce, and I
4889  // don't like throwing an exception here. A local return value
4890  // would likely be more useful to users. However, if users find
4891  // themselves exercising nonlocal inserts often, then they are
4892  // probably novice users who need the help. See Gibhub Issues
4893  // #603 and #601 (esp. the latter) for discussion.
4894 
4895  int isGloballyComplete = 0; // output argument of reduceAll
4896  reduceAll<int, int> (*comm, REDUCE_MIN, isLocallyComplete,
4897  outArg (isGloballyComplete));
4898  TEUCHOS_TEST_FOR_EXCEPTION
4899  (isGloballyComplete != 1, std::runtime_error, "On at least one process, "
4900  "you called insertGlobalValues with a global row index which is not in "
4901  "the matrix's row Map on any process in its communicator.");
4902  }
4903 
4904  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4905  void
4907  resumeFill (const Teuchos::RCP<Teuchos::ParameterList>& params)
4908  {
4909  if (! isStaticGraph ()) { // Don't resume fill of a nonowned graph.
4910  myGraph_->resumeFill (params);
4911  }
4912  clearGlobalConstants ();
4913  fillComplete_ = false;
4914  }
4915 
4916  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4917  void
4920  {
4921  // This method doesn't do anything. The analogous method in
4922  // CrsGraph does actually compute something.
4923  //
4924  // Oddly enough, clearGlobalConstants() clears frobNorm_ (by
4925  // setting it to -1), but computeGlobalConstants() does _not_
4926  // compute the Frobenius norm; this is done on demand in
4927  // getFrobeniusNorm(), and the result is cached there.
4928  }
4929 
// Returns whatever the graph's haveGlobalConstants() reports; the
// matrix keeps no separate flag of its own here.
//
// NOTE(review): this listing has lost the lines between the return
// type and the body (the class qualifier, the function name with any
// cv-qualifier, and the opening brace).  Presumably this is
// CrsMatrix<...>::haveGlobalConstants -- confirm against the original
// header before relying on it.
4930  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4931  bool
4934  return getCrsGraphRef ().haveGlobalConstants ();
4935  }
4936 
4937  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4938  void
4941  // We use -1 to indicate that the Frobenius norm needs to be
4942  // recomputed, since the values might change between now and the
4943  // next fillComplete call.
4944  //
4945  // Oddly enough, clearGlobalConstants() clears frobNorm_, but
4946  // computeGlobalConstants() does _not_ compute the Frobenius norm;
4947  // this is done on demand in getFrobeniusNorm(), and the result is
4948  // cached there.
4949  frobNorm_ = -STM::one ();
4950  }
4951 
4952  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4953  void
4955  fillComplete (const Teuchos::RCP<Teuchos::ParameterList>& params)
4956  {
4957  const char tfecfFuncName[] = "fillComplete(params): ";
4958 
4959  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4960  (this->getCrsGraph ().is_null (), std::logic_error,
4961  "getCrsGraph() returns null. This should not happen at this point. "
4962  "Please report this bug to the Tpetra developers.");
4963 
4964  const crs_graph_type& graph = this->getCrsGraphRef ();
4965  if (this->isStaticGraph () && graph.isFillComplete ()) {
4966  // If this matrix's graph is fill complete and the user did not
4967  // supply a domain or range Map, use the graph's domain and
4968  // range Maps.
4969  this->fillComplete (graph.getDomainMap (), graph.getRangeMap (), params);
4970  }
4971  else { // assume that user's row Map is the domain and range Map
4972  Teuchos::RCP<const map_type> rangeMap = graph.getRowMap ();
4973  Teuchos::RCP<const map_type> domainMap = rangeMap;
4974  this->fillComplete (domainMap, rangeMap, params);
4975  }
4976  }
4977 
4978  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4979  void
4981  fillComplete (const Teuchos::RCP<const map_type>& domainMap,
4982  const Teuchos::RCP<const map_type>& rangeMap,
4983  const Teuchos::RCP<Teuchos::ParameterList>& params)
4984  {
4985  using ::Tpetra::Details::ProfilingRegion;
4986  using Teuchos::ArrayRCP;
4987  using Teuchos::RCP;
4988  using Teuchos::rcp;
4989  const char tfecfFuncName[] = "fillComplete: ";
4990  ProfilingRegion regionFillComplete ("Tpetra::CrsMatrix::fillComplete");
4991 
4992  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4993  (! this->isFillActive () || this->isFillComplete (), std::runtime_error,
4994  "Matrix fill state must be active (isFillActive() "
4995  "must be true) before you may call fillComplete().");
4996  const int numProcs = this->getComm ()->getSize ();
4997 
4998  //
4999  // Read parameters from the input ParameterList.
5000  //
5001 
5002  // If true, the caller promises that no process did nonlocal
5003  // changes since the last call to fillComplete.
5004  bool assertNoNonlocalInserts = false;
5005  // If true, makeColMap sorts remote GIDs (within each remote
5006  // process' group).
5007  bool sortGhosts = true;
5008 
5009  if (! params.is_null ()) {
5010  assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
5011  assertNoNonlocalInserts);
5012  if (params->isParameter ("sort column map ghost gids")) {
5013  sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
5014  }
5015  else if (params->isParameter ("Sort column Map ghost GIDs")) {
5016  sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
5017  }
5018  }
5019  // We also don't need to do global assembly if there is only one
5020  // process in the communicator.
5021  const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
5022  // This parameter only matters if this matrix owns its graph.
5023  if (! this->myGraph_.is_null ()) {
5024  this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
5025  }
5026 
5027  if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
5028  if (this->hasColMap ()) {
5029  // We have a column Map, so use local indices.
5030  this->allocateValues (LocalIndices, GraphNotYetAllocated);
5031  } else {
5032  // We don't have a column Map, so use global indices.
5033  this->allocateValues (GlobalIndices, GraphNotYetAllocated);
5034  }
5035  }
5036  // Global assemble, if we need to. This call only costs a single
5037  // all-reduce if we didn't need global assembly after all.
5038  if (needGlobalAssemble) {
5039  this->globalAssemble ();
5040  }
5041  else {
5042  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5043  (numProcs == 1 && nonlocals_.size() > 0,
5044  std::runtime_error, "Cannot have nonlocal entries on a serial run. "
5045  "An invalid entry (i.e., with row index not in the row Map) must have "
5046  "been submitted to the CrsMatrix.");
5047  }
5048 
5049  if (this->isStaticGraph ()) {
5050  // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
5051  // checks below only in debug mode. It would be nicer to do a
5052  // local check, then propagate the error state in a deferred
5053  // way, whenever communication happens. That would reduce the
5054  // cost of checking, to the point where it may make sense to
5055  // enable it even in release mode.
5056 #ifdef HAVE_TPETRA_DEBUG
5057  // FIXME (mfh 18 Jun 2014) This check for correctness of the
5058  // input Maps incurs a penalty of two all-reduces for the
5059  // otherwise optimal const graph case.
5060  //
5061  // We could turn these (max) 2 all-reduces into (max) 1, by
5062  // fusing them. We could do this by adding a "locallySameAs"
5063  // method to Map, which would return one of four states:
5064  //
5065  // a. Certainly globally the same
5066  // b. Certainly globally not the same
5067  // c. Locally the same
5068  // d. Locally not the same
5069  //
5070  // The first two states don't require further communication.
5071  // The latter two states require an all-reduce to communicate
5072  // globally, but we only need one all-reduce, since we only need
5073  // to check whether at least one of the Maps is wrong.
5074  const bool domainMapsMatch =
5075  this->staticGraph_->getDomainMap ()->isSameAs (*domainMap);
5076  const bool rangeMapsMatch =
5077  this->staticGraph_->getRangeMap ()->isSameAs (*rangeMap);
5078 
5079  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5080  (! domainMapsMatch, std::runtime_error,
5081  "The CrsMatrix's domain Map does not match the graph's domain Map. "
5082  "The graph cannot be changed because it was given to the CrsMatrix "
5083  "constructor as const. You can fix this by passing in the graph's "
5084  "domain Map and range Map to the matrix's fillComplete call.");
5085 
5086  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5087  (! rangeMapsMatch, std::runtime_error,
5088  "The CrsMatrix's range Map does not match the graph's range Map. "
5089  "The graph cannot be changed because it was given to the CrsMatrix "
5090  "constructor as const. You can fix this by passing in the graph's "
5091  "domain Map and range Map to the matrix's fillComplete call.");
5092 #endif // HAVE_TPETRA_DEBUG
5093 
5094  // The matrix does _not_ own the graph, and the graph's
5095  // structure is already fixed, so just fill the local matrix.
5096  this->fillLocalMatrix (params);
5097  }
5098  else {
5099  // Set the graph's domain and range Maps. This will clear the
5100  // Import if the domain Map has changed (is a different
5101  // pointer), and the Export if the range Map has changed (is a
5102  // different pointer).
5103  this->myGraph_->setDomainRangeMaps (domainMap, rangeMap);
5104 
5105  // Make the graph's column Map, if necessary.
5106  Teuchos::Array<int> remotePIDs (0);
5107  const bool mustBuildColMap = ! this->hasColMap ();
5108  if (mustBuildColMap) {
5109  this->myGraph_->makeColMap (remotePIDs);
5110  }
5111 
5112  // Make indices local, if necessary. The method won't do
5113  // anything if the graph is already locally indexed.
5114  const std::pair<size_t, std::string> makeIndicesLocalResult =
5115  this->myGraph_->makeIndicesLocal ();
5116  // TODO (mfh 20 Jul 2017) Instead of throwing here, pass along
5117  // the error state to makeImportExport or
5118  // computeGlobalConstants, which may do all-reduces and thus may
5119  // have the opportunity to communicate that error state.
5120  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5121  (makeIndicesLocalResult.first != 0, std::runtime_error,
5122  makeIndicesLocalResult.second);
5123 
5124  const bool sorted = this->myGraph_->isSorted ();
5125  const bool merged = this->myGraph_->isMerged ();
5126  this->sortAndMergeIndicesAndValues (sorted, merged);
5127 
5128  // Make Import and Export objects, if they haven't been made
5129  // already. If we made a column Map above, reuse information
5130  // from that process to avoid communiation in the Import setup.
5131  this->myGraph_->makeImportExport (remotePIDs, mustBuildColMap);
5132 
5133  // The matrix _does_ own the graph, so fill the local graph at
5134  // the same time as the local matrix.
5135  this->fillLocalGraphAndMatrix (params);
5136 
5137  const bool callGraphComputeGlobalConstants = params.get () == nullptr ||
5138  params->get ("compute global constants", true);
5139  const bool computeLocalTriangularConstants = params.get () == nullptr ||
5140  params->get ("compute local triangular constants", true);
5141  if (callGraphComputeGlobalConstants) {
5142  this->myGraph_->computeGlobalConstants (computeLocalTriangularConstants);
5143  }
5144  else {
5145  this->myGraph_->computeLocalConstants (computeLocalTriangularConstants);
5146  }
5147  this->myGraph_->fillComplete_ = true;
5148  this->myGraph_->checkInternalState ();
5149  }
5150 
5151  const bool callComputeGlobalConstants = params.get () == nullptr ||
5152  params->get ("compute global constants", true);
5153  if (callComputeGlobalConstants) {
5154  this->computeGlobalConstants ();
5155  }
5156 
5157  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
5158 
5159  this->fillComplete_ = true; // Now we're fill complete!
5160  this->checkInternalState ();
5161  }
5162 
5163  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5164  void
5166  expertStaticFillComplete (const Teuchos::RCP<const map_type> & domainMap,
5167  const Teuchos::RCP<const map_type> & rangeMap,
5168  const Teuchos::RCP<const import_type>& importer,
5169  const Teuchos::RCP<const export_type>& exporter,
5170  const Teuchos::RCP<Teuchos::ParameterList> &params)
5171  {
5172 #ifdef HAVE_TPETRA_MMM_TIMINGS
5173  std::string label;
5174  if(!params.is_null())
5175  label = params->get("Timer Label",label);
5176  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
5177  using Teuchos::TimeMonitor;
5178 
5179  Teuchos::TimeMonitor all(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-all")));
5180 #endif
5181 
5182  const char tfecfFuncName[] = "expertStaticFillComplete: ";
5183  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( ! isFillActive() || isFillComplete(),
5184  std::runtime_error, "Matrix fill state must be active (isFillActive() "
5185  "must be true) before calling fillComplete().");
5186  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
5187  myGraph_.is_null (), std::logic_error, "myGraph_ is null. This is not allowed.");
5188 
5189  {
5190 #ifdef HAVE_TPETRA_MMM_TIMINGS
5191  Teuchos::TimeMonitor graph(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-Graph")));
5192 #endif
5193  // We will presume globalAssemble is not needed, so we do the ESFC on the graph
5194  myGraph_->expertStaticFillComplete (domainMap, rangeMap, importer, exporter,params);
5195  }
5196 
5197  const bool callComputeGlobalConstants = params.get () == nullptr ||
5198  params->get ("compute global constants", true);
5199  if (callComputeGlobalConstants) {
5200  this->computeGlobalConstants ();
5201  }
5202 
5203  {
5204 #ifdef HAVE_TPETRA_MMM_TIMINGS
5205  TimeMonitor fLGAM(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-fLGAM")));
5206 #endif
5207  // Fill the local graph and matrix
5208  fillLocalGraphAndMatrix (params);
5209  }
5210  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
5211 
5212  // Now we're fill complete!
5213  fillComplete_ = true;
5214 
5215  // Sanity checks at the end.
5216 #ifdef HAVE_TPETRA_DEBUG
5217  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
5218  ": We're at the end of fillComplete(), but isFillActive() is true. "
5219  "Please report this bug to the Tpetra developers.");
5220  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete(), std::logic_error,
5221  ": We're at the end of fillComplete(), but isFillActive() is true. "
5222  "Please report this bug to the Tpetra developers.");
5223 #endif // HAVE_TPETRA_DEBUG
5224  {
5225 #ifdef HAVE_TPETRA_MMM_TIMINGS
5226  Teuchos::TimeMonitor cIS(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-M-cIS")));
5227 #endif
5228 
5229  checkInternalState();
5230  }
5231  }
5232 
5233  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5234  size_t
5237  const RowInfo& rowInfo)
5238  {
5239 #ifdef HAVE_TPETRA_DEBUG
5240  const char tfecfFuncName[] = "mergeRowIndicesAndValues: ";
5241 #endif // HAVE_TPETRA_DEBUG
5242 
5243  auto rowValues = this->getRowViewNonConst (rowInfo);
5244  typedef typename std::decay<decltype (rowValues[0]) >::type value_type;
5245  value_type* rowValueIter = rowValues.data ();
5246  auto inds_view = graph.getLocalKokkosRowViewNonConst (rowInfo);
5247 
5248  // beg,end define a half-exclusive interval over which to iterate.
5249  LocalOrdinal* beg = inds_view.data ();
5250  LocalOrdinal* end = inds_view.data () + rowInfo.numEntries;
5251 
5252 #ifdef HAVE_TPETRA_DEBUG
5253  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5254  (rowInfo.allocSize != static_cast<size_t> (inds_view.extent (0)) ||
5255  rowInfo.allocSize != static_cast<size_t> (rowValues.extent (0)),
5256  std::runtime_error, "rowInfo.allocSize = " << rowInfo.allocSize
5257  << " != inds_view.extent(0) = " << inds_view.extent (0)
5258  << " || rowInfo.allocSize = " << rowInfo.allocSize
5259  << " != rowValues.extent(0) = " << rowValues.extent (0) << ".");
5260 #endif // HAVE_TPETRA_DEBUG
5261 
5262  LocalOrdinal* newend = beg;
5263  if (beg != end) {
5264  LocalOrdinal* cur = beg + 1;
5265  value_type* vcur = rowValueIter + 1;
5266  value_type* vend = rowValueIter;
5267  cur = beg+1;
5268  while (cur != end) {
5269  if (*cur != *newend) {
5270  // new entry; save it
5271  ++newend;
5272  ++vend;
5273  (*newend) = (*cur);
5274  (*vend) = (*vcur);
5275  }
5276  else {
5277  // old entry; merge it
5278  //(*vend) = f (*vend, *vcur);
5279  (*vend) += *vcur;
5280  }
5281  ++cur;
5282  ++vcur;
5283  }
5284  ++newend; // one past the last entry, per typical [beg,end) semantics
5285  }
5286  const size_t mergedEntries = newend - beg;
5287  graph.k_numRowEntries_(rowInfo.localRow) = mergedEntries;
5288  const size_t numDups = rowInfo.numEntries - mergedEntries;
5289  return numDups;
5290  }
5291 
5292  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5293  void
5295  sortAndMergeIndicesAndValues (const bool sorted, const bool merged)
5296  {
5297  using ::Tpetra::Details::ProfilingRegion;
5298  typedef LocalOrdinal LO;
5299  typedef typename Kokkos::View<LO*, device_type>::HostMirror::execution_space
5300  host_execution_space;
5301  typedef Kokkos::RangePolicy<host_execution_space, LO> range_type;
5302  //typedef Kokkos::RangePolicy<Kokkos::Serial, LO> range_type;
5303  const char tfecfFuncName[] = "sortAndMergeIndicesAndValues: ";
5304  ProfilingRegion regionSAM ("Tpetra::CrsMatrix::sortAndMergeIndicesAndValues");
5305 
5306  if (! sorted || ! merged) {
5307  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5308  (this->isStaticGraph (), std::runtime_error, "Cannot sort or merge with "
5309  "\"static\" (const) graph, since the matrix does not own the graph.");
5310  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5311  (this->myGraph_.is_null (), std::logic_error, "myGraph_ is null, but "
5312  "this matrix claims ! isStaticGraph(). "
5313  "Please report this bug to the Tpetra developers.");
5314  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5315  (this->isStorageOptimized (), std::logic_error, "It is invalid to call "
5316  "this method if the graph's storage has already been optimized. "
5317  "Please report this bug to the Tpetra developers.");
5318 
5319  crs_graph_type& graph = * (this->myGraph_);
5320  const LO lclNumRows = static_cast<LO> (this->getNodeNumRows ());
5321  size_t totalNumDups = 0;
5322  // FIXME (mfh 10 May 2017) This may assume CUDA UVM.
5323  Kokkos::parallel_reduce (range_type (0, lclNumRows),
5324  [this, &graph, sorted, merged] (const LO& lclRow, size_t& numDups) {
5325  const RowInfo rowInfo = graph.getRowInfo (lclRow);
5326  if (! sorted) {
5327  auto lclColInds = graph.getLocalKokkosRowViewNonConst (rowInfo);
5328  auto vals = this->getRowViewNonConst (rowInfo);
5329  // FIXME (mfh 09 May 2017) This assumes CUDA UVM, at least
5330  // for lclColInds, if not also for values.
5331  sort2 (lclColInds.data (),
5332  lclColInds.data () + rowInfo.numEntries,
5333  vals.data ());
5334  }
5335  if (! merged) {
5336  numDups += this->mergeRowIndicesAndValues (graph, rowInfo);
5337  }
5338  }, totalNumDups);
5339  if (! sorted) {
5340  graph.indicesAreSorted_ = true; // we just sorted every row
5341  }
5342  if (! merged) {
5343  graph.noRedundancies_ = true; // we just merged every row
5344  }
5345  }
5346  }
5347 
// Non-transpose apply path. The Export step below states the update as
// Y_in = beta*Y_in + alpha*A*X_in. Visible steps: alpha == 0 shortcut,
// optional Import of X_in into a column Map MV, local multiply, optional
// Export (ADD) into Y_in, and a final reduce() when Y_in is replicated.
// NOTE(review): the qualified function name and the X_in / Y_in parameter
// lines are missing from this listing. apply() calls this code path as
// this->applyNonTranspose (X, Y, alpha, beta) -- presumably this definition;
// confirm against the original source file.
5348  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5349  void
5353  Scalar alpha,
5354  Scalar beta) const
5355  {
5357  using Teuchos::RCP;
5358  using Teuchos::rcp;
5359  using Teuchos::rcp_const_cast;
5360  using Teuchos::rcpFromRef;
5361  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
5362  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one ();
5363 
5364  // mfh 05 Jun 2014: Special case for alpha == 0. I added this to
5365  // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
5366  // failing only for the Kokkos refactor version of Tpetra. It's a
5367  // good idea regardless to have the bypass.
5368  if (alpha == ZERO) {
5369  if (beta == ZERO) {
5370  Y_in.putScalar (ZERO);
5371  } else if (beta != ONE) {
5372  Y_in.scale (beta);
5373  }
5374  return;
5375  }
5376 
5377  // It's possible that X is a view of Y or vice versa. We don't
5378  // allow this (apply() requires that X and Y not alias one
5379  // another), but it's helpful to detect and work around this case.
5380  // We don't try to detect the more subtle cases (e.g., one is a
5381  // subview of the other, but their initial pointers differ). We
5382  // only need to do this if this matrix's Import is trivial;
5383  // otherwise, we don't actually apply the operator from X into Y.
5384 
5385  RCP<const import_type> importer = this->getGraph ()->getImporter ();
5386  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
5387 
5388  // If beta == 0, then the output MV will be overwritten; none of
5389  // its entries should be read. (Sparse BLAS semantics say that we
5390  // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
5391  // This matters if we need to do an Export operation; see below.
      // Captured before beta is possibly zeroed below for ranks > 0.
5392  const bool Y_is_overwritten = (beta == ZERO);
5393 
5394  // We treat the case of a replicated MV output specially.
5395  const bool Y_is_replicated =
5396  (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
5397 
5398  // This is part of the special case for replicated MV output.
5399  // We'll let each process do its thing, but do an all-reduce at
5400  // the end to sum up the results. Setting beta=0 on all processes
5401  // but Proc 0 makes the math work out for the all-reduce. (This
5402  // assumes that the replicated data is correctly replicated, so
5403  // that the data are the same on all processes.)
5404  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
5405  beta = ZERO;
5406  }
5407 
5408  // Temporary MV for Import operation. After the block of code
5409  // below, this will be an (Imported if necessary) column Map MV
5410  // ready to give to localApply(...).
5411  RCP<const MV> X_colMap;
5412  if (importer.is_null ()) {
5413  if (! X_in.isConstantStride ()) {
5414  // Not all sparse mat-vec kernels can handle an input MV with
5415  // nonconstant stride correctly, so we have to copy it in that
5416  // case into a constant stride MV. To make a constant stride
5417  // copy of X_in, we force creation of the column (== domain)
5418  // Map MV (if it hasn't already been created, else fetch the
5419  // cached copy). This avoids creating a new MV each time.
5420  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
5421  Tpetra::deep_copy (*X_colMapNonConst, X_in);
5422  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
5423  }
5424  else {
5425  // The domain and column Maps are the same, so do the local
5426  // multiply using the domain Map input MV X_in.
5427  X_colMap = rcpFromRef (X_in);
5428  }
5429  }
5430  else { // need to Import source (multi)vector
5431  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply: Import");
5432 
5433  // We're doing an Import anyway, which will copy the relevant
5434  // elements of the domain Map MV X_in into a separate column Map
5435  // MV. Thus, we don't have to worry whether X_in is constant
5436  // stride.
5437  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
5438 
5439  // Import from the domain Map MV to the column Map MV.
5440  X_colMapNonConst->doImport (X_in, *importer, INSERT);
5441  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
5442  }
5443 
5444  // Temporary MV for doExport (if needed), or for copying a
5445  // nonconstant stride output MV into a constant stride MV. This
5446  // is null if we don't need the temporary MV, that is, if the
5447  // Export is trivial (null).
5448  RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
5449 
5450  // If we have a nontrivial Export object, we must perform an
5451  // Export. In that case, the local multiply result will go into
5452  // the row Map multivector. We don't have to make a
5453  // constant-stride version of Y_in in this case, because we had to
5454  // make a constant stride Y_rowMap MV and do an Export anyway.
5455  if (! exporter.is_null ()) {
      // The local multiply uses ZERO as its beta here; the user's beta
      // is applied to Y_in just before the Export below.
5456  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, ZERO);
5457  {
5458  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply: Export");
5459 
5460  // If we're overwriting the output MV Y_in completely (beta ==
5461  // 0), then make sure that it is filled with zeros before we
5462  // do the Export. Otherwise, the ADD combine mode will use
5463  // data in Y_in, which is supposed to be zero.
5464  if (Y_is_overwritten) {
5465  Y_in.putScalar (ZERO);
5466  }
5467  else {
5468  // Scale output MV by beta, so that doExport sums in the
5469  // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
5470  Y_in.scale (beta);
5471  }
5472  // Do the Export operation.
5473  Y_in.doExport (*Y_rowMap, *exporter, ADD);
5474  }
5475  }
5476  else { // Don't do an Export: row Map and range Map are the same.
5477  //
5478  // If Y_in does not have constant stride, or if the column Map
5479  // MV aliases Y_in, then we can't let the kernel write directly
5480  // to Y_in. Instead, we have to use the cached row (== range)
5481  // Map MV as temporary storage.
5482  //
5483  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
5484  // the user passed in the same MultiVector for both X and Y. It
5485  // won't detect whether one MultiVector views the other. We
5486  // should also check the MultiVectors' raw data pointers.
5487  if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
5488  // Force creating the MV if it hasn't been created already.
5489  // This will reuse a previously created cached MV.
5490  Y_rowMap = getRowMapMultiVector (Y_in, true);
5491 
5492  // If beta == 0, we don't need to copy Y_in into Y_rowMap,
5493  // since we're overwriting it anyway.
5494  if (beta != ZERO) {
5495  Tpetra::deep_copy (*Y_rowMap, Y_in);
5496  }
5497  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, beta);
      // Copy the result from the temporary back into the caller's MV.
5498  Tpetra::deep_copy (Y_in, *Y_rowMap);
5499  }
5500  else {
5501  this->localApply (*X_colMap, Y_in, Teuchos::NO_TRANS, alpha, beta);
5502  }
5503  }
5504 
5505  // If the range Map is a locally replicated Map, sum up
5506  // contributions from each process. We set beta = 0 on all
5507  // processes but Proc 0 initially, so this will handle the scaling
5508  // factor beta correctly.
5509  if (Y_is_replicated) {
5510  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply: Reduce Y");
5511  Y_in.reduce ();
5512  }
5513  }
5514 
5515  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5516  void
5520  const Teuchos::ETransp mode,
5521  Scalar alpha,
5522  Scalar beta) const
5523  {
5525  using Teuchos::null;
5526  using Teuchos::RCP;
5527  using Teuchos::rcp;
5528  using Teuchos::rcp_const_cast;
5529  using Teuchos::rcpFromRef;
5530  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
5531 
5532  // Take shortcuts for alpha == 0.
5533  if (alpha == ZERO) {
5534  // Follow the Sparse BLAS convention by ignoring both the matrix
5535  // and X_in, in this case.
5536  if (beta == ZERO) {
5537  // Follow the Sparse BLAS convention by overwriting any Inf or
5538  // NaN values in Y_in, in this case.
5539  Y_in.putScalar (ZERO);
5540  }
5541  else {
5542  Y_in.scale (beta);
5543  }
5544  return;
5545  }
5546 
5547  const size_t numVectors = X_in.getNumVectors ();
5548 
5549  // We don't allow X_in and Y_in to alias one another. It's hard
5550  // to check this, because advanced users could create views from
5551  // raw pointers. However, if X_in and Y_in reference the same
5552  // object, we will do the user a favor by copying X into new
5553  // storage (with a warning). We only need to do this if we have
5554  // trivial importers; otherwise, we don't actually apply the
5555  // operator from X into Y.
5556  RCP<const import_type> importer = this->getGraph ()->getImporter ();
5557  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
5558  // access X indirectly, in case we need to create temporary storage
5559  RCP<const MV> X;
5560 
5561  // some parameters for below
5562  const bool Y_is_replicated = ! Y_in.isDistributed ();
5563  const bool Y_is_overwritten = (beta == ZERO);
5564  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
5565  beta = ZERO;
5566  }
5567 
5568  // The kernels do not allow input or output with nonconstant stride.
5569  if (! X_in.isConstantStride () && importer.is_null ()) {
5570  X = rcp (new MV (X_in, Teuchos::Copy)); // Constant-stride copy of X_in
5571  } else {
5572  X = rcpFromRef (X_in); // Reference to X_in
5573  }
5574 
5575  // Set up temporary multivectors for Import and/or Export.
5576  if (importer != Teuchos::null) {
5577  if (importMV_ != Teuchos::null && importMV_->getNumVectors() != numVectors) {
5578  importMV_ = null;
5579  }
5580  if (importMV_ == null) {
5581  importMV_ = rcp (new MV (this->getColMap (), numVectors));
5582  }
5583  }
5584  if (exporter != Teuchos::null) {
5585  if (exportMV_ != Teuchos::null && exportMV_->getNumVectors() != numVectors) {
5586  exportMV_ = null;
5587  }
5588  if (exportMV_ == null) {
5589  exportMV_ = rcp (new MV (this->getRowMap (), numVectors));
5590  }
5591  }
5592 
5593  // If we have a non-trivial exporter, we must import elements that
5594  // are permuted or are on other processors.
5595  if (! exporter.is_null ()) {
5596  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply (transpose): Import");
5597  exportMV_->doImport (X_in, *exporter, INSERT);
5598  X = exportMV_; // multiply out of exportMV_
5599  }
5600 
5601  // If we have a non-trivial importer, we must export elements that
5602  // are permuted or belong to other processors. We will compute
5603  // solution into the to-be-exported MV; get a view.
5604  if (importer != Teuchos::null) {
5605  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply (transpose): Export");
5606 
5607  // FIXME (mfh 18 Apr 2015) Temporary fix suggested by Clark
5608  // Dohrmann on Fri 17 Apr 2015. At some point, we need to go
5609  // back and figure out why this helps. importMV_ SHOULD be
5610  // completely overwritten in the localApply(...) call
5611  // below, because beta == ZERO there.
5612  importMV_->putScalar (ZERO);
5613  // Do the local computation.
5614  this->localApply (*X, *importMV_, mode, alpha, ZERO);
5615  if (Y_is_overwritten) {
5616  Y_in.putScalar (ZERO);
5617  } else {
5618  Y_in.scale (beta);
5619  }
5620  Y_in.doExport (*importMV_, *importer, ADD);
5621  }
5622  // otherwise, multiply into Y
5623  else {
5624  // can't multiply in-situ; can't multiply into non-strided multivector
5625  //
5626  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
5627  // the user passed in the same MultiVector for both X and Y. It
5628  // won't detect whether one MultiVector views the other. We
5629  // should also check the MultiVectors' raw data pointers.
5630  if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
5631  // Make a deep copy of Y_in, into which to write the multiply result.
5632  MV Y (Y_in, Teuchos::Copy);
5633  this->localApply (*X, Y, mode, alpha, beta);
5634  Tpetra::deep_copy (Y_in, Y);
5635  } else {
5636  this->localApply (*X, Y_in, mode, alpha, beta);
5637  }
5638  }
5639 
5640  // If the range Map is a locally replicated map, sum the
5641  // contributions from each process. (That's why we set beta=0
5642  // above for all processes but Proc 0.)
5643  if (Y_is_replicated) {
5644  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply (transpose): Reduce Y");
5645  Y_in.reduce ();
5646  }
5647  }
5648 
5649  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5650  void
5654  const Teuchos::ETransp mode,
5655  const Scalar& alpha,
5656  const Scalar& beta) const
5657  {
5659  using Teuchos::NO_TRANS;
5660  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localApply");
5661 
5662  auto X_lcl = X.getLocalViewDevice ();
5663  auto Y_lcl = Y.getLocalViewDevice ();
5664  // TODO (24 Jul 2019) uncomment later; this line of code wasn't
5665  // here before, so we need to test it separately before pushing.
5666  //
5667  // Y.modify_device ();
5668 
5669  const bool debug = ::Tpetra::Details::Behavior::debug ();
5670  if (debug) {
5671  const char tfecfFuncName[] = "localApply: ";
5672  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5673  (lclMatrix_.get () == nullptr, std::logic_error,
5674  "lclMatrix_ not created yet.");
5675  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5676  (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
5677  "X.getNumVectors() = " << X.getNumVectors () << " != "
5678  "Y.getNumVectors() = " << Y.getNumVectors () << ".");
5679  const bool transpose = (mode != Teuchos::NO_TRANS);
5680  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5681  (! transpose && X.getLocalLength () !=
5682  getColMap ()->getNodeNumElements (), std::runtime_error,
5683  "NO_TRANS case: X has the wrong number of local rows. "
5684  "X.getLocalLength() = " << X.getLocalLength () << " != "
5685  "getColMap()->getNodeNumElements() = " <<
5686  getColMap ()->getNodeNumElements () << ".");
5687  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5688  (! transpose && Y.getLocalLength () !=
5689  getRowMap ()->getNodeNumElements (), std::runtime_error,
5690  "NO_TRANS case: Y has the wrong number of local rows. "
5691  "Y.getLocalLength() = " << Y.getLocalLength () << " != "
5692  "getRowMap()->getNodeNumElements() = " <<
5693  getRowMap ()->getNodeNumElements () << ".");
5694  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5695  (transpose && X.getLocalLength () !=
5696  getRowMap ()->getNodeNumElements (), std::runtime_error,
5697  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5698  "rows. X.getLocalLength() = " << X.getLocalLength ()
5699  << " != getRowMap()->getNodeNumElements() = "
5700  << getRowMap ()->getNodeNumElements () << ".");
5701  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5702  (transpose && Y.getLocalLength () !=
5703  getColMap ()->getNodeNumElements (), std::runtime_error,
5704  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5705  "rows. Y.getLocalLength() = " << Y.getLocalLength ()
5706  << " != getColMap()->getNodeNumElements() = "
5707  << getColMap ()->getNodeNumElements () << ".");
5708  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5709  (! isFillComplete (), std::runtime_error, "The matrix is not "
5710  "fill complete. You must call fillComplete() (possibly with "
5711  "domain and range Map arguments) without an intervening "
5712  "resumeFill() call before you may call this method.");
5713  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5714  (! X.isConstantStride () || ! Y.isConstantStride (),
5715  std::runtime_error, "X and Y must be constant stride.");
5716  // If the two pointers are NULL, then they don't alias one
5717  // another, even though they are equal.
5718  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5719  (X_lcl.data () == Y_lcl.data () && X_lcl.data () != nullptr,
5720  std::runtime_error, "X and Y may not alias one another.");
5721  }
5722 
5723  lclMatrix_->apply (X_lcl, Y_lcl, mode, alpha, beta);
5724  }
5725 
// Public apply entry point (named by fnName below). Throws
// std::runtime_error unless isFillComplete() is true, then dispatches on
// mode: Teuchos::NO_TRANS goes to applyNonTranspose, any other mode goes to
// applyTranspose after Y is zeroed when beta == 0.
// NOTE(review): the qualified function name and the X / Y parameter lines
// are missing from this listing; confirm against the original source file.
5726  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5727  void
5731  Teuchos::ETransp mode,
5732  Scalar alpha,
5733  Scalar beta) const
5734  {
5736  const char fnName[] = "Tpetra::CrsMatrix::apply";
5737 
5738  TEUCHOS_TEST_FOR_EXCEPTION
5739  (! isFillComplete (), std::runtime_error,
5740  fnName << ": Cannot call apply() until fillComplete() "
5741  "has been called.");
5742 
5743  if (mode == Teuchos::NO_TRANS) {
5744  ProfilingRegion regionNonTranspose (fnName);
5745  this->applyNonTranspose (X, Y, alpha, beta);
5746  }
5747  else {
5748  ProfilingRegion regionTranspose ("Tpetra::CrsMatrix::apply (transpose)");
5749 
5750  // Thyra was implicitly assuming that Y gets set to zero, or is
5751  // overwritten, when beta == 0. This was not the case with transpose in a
5752  // multithreaded environment where a multiplication with subsequent
5753  // atomic_adds is used, since 0 is effectively not special-cased. Doing the
5754  // explicit set to zero here catches cases where Y is NaN or Inf.
5755  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
5756  if (beta == ZERO) {
5757  Y.putScalar (ZERO);
5758  }
5759  this->applyTranspose (X, Y, mode, alpha, beta);
5760  }
5761  }
5762 
// Convenience overload: forwards to reorderedGaussSeidel, passing
// Teuchos::null in the row-indices position and every other argument
// unchanged.
// NOTE(review): the qualified function name and the B / X / D parameter
// lines are missing from this listing (presumably gaussSeidel); confirm
// against the original source file.
5763  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5764  void
5769  const Scalar& dampingFactor,
5770  const ESweepDirection direction,
5771  const int numSweeps) const
5772  {
5773  reorderedGaussSeidel (B, X, D, Teuchos::null, dampingFactor, direction, numSweeps);
5774  }
5775 
5776  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5777  void
5782  const Teuchos::ArrayView<LocalOrdinal>& rowIndices,
5783  const Scalar& dampingFactor,
5784  const ESweepDirection direction,
5785  const int numSweeps) const
5786  {
5787  using Teuchos::null;
5788  using Teuchos::RCP;
5789  using Teuchos::rcp;
5790  using Teuchos::rcp_const_cast;
5791  using Teuchos::rcpFromRef;
5792  typedef Scalar ST;
5793 
5794  TEUCHOS_TEST_FOR_EXCEPTION(
5795  isFillComplete() == false, std::runtime_error,
5796  "Tpetra::CrsMatrix::gaussSeidel: cannot call this method until "
5797  "fillComplete() has been called.");
5798  TEUCHOS_TEST_FOR_EXCEPTION(
5799  numSweeps < 0,
5800  std::invalid_argument,
5801  "Tpetra::CrsMatrix::gaussSeidel: The number of sweeps must be , "
5802  "nonnegative but you provided numSweeps = " << numSweeps << " < 0.");
5803 
5804  // Translate from global to local sweep direction.
5805  // While doing this, validate the input.
5806  ESweepDirection localDirection;
5807  if (direction == Forward) {
5808  localDirection = Forward;
5809  }
5810  else if (direction == Backward) {
5811  localDirection = Backward;
5812  }
5813  else if (direction == Symmetric) {
5814  // We'll control local sweep direction manually.
5815  localDirection = Forward;
5816  }
5817  else {
5818  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
5819  "Tpetra::CrsMatrix::gaussSeidel: The 'direction' enum does not have "
5820  "any of its valid values: Forward, Backward, or Symmetric.");
5821  }
5822 
5823  if (numSweeps == 0) {
5824  return; // Nothing to do.
5825  }
5826 
5827  // We don't need the Export object because this method assumes
5828  // that the row, domain, and range Maps are the same. We do need
5829  // the Import object, if there is one, though.
5830  RCP<const import_type> importer = this->getGraph()->getImporter();
5831  RCP<const export_type> exporter = this->getGraph()->getExporter();
5832  TEUCHOS_TEST_FOR_EXCEPTION(
5833  ! exporter.is_null (), std::runtime_error,
5834  "Tpetra's gaussSeidel implementation requires that the row, domain, "
5835  "and range Maps be the same. This cannot be the case, because the "
5836  "matrix has a nontrivial Export object.");
5837 
5838  RCP<const map_type> domainMap = this->getDomainMap ();
5839  RCP<const map_type> rangeMap = this->getRangeMap ();
5840  RCP<const map_type> rowMap = this->getGraph ()->getRowMap ();
5841  RCP<const map_type> colMap = this->getGraph ()->getColMap ();
5842 
5843 #ifdef HAVE_TEUCHOS_DEBUG
5844  {
5845  // The relation 'isSameAs' is transitive. It's also a
5846  // collective, so we don't have to do a "shared" test for
5847  // exception (i.e., a global reduction on the test value).
5848  TEUCHOS_TEST_FOR_EXCEPTION(
5849  ! X.getMap ()->isSameAs (*domainMap),
5850  std::runtime_error,
5851  "Tpetra::CrsMatrix::gaussSeidel requires that the input "
5852  "multivector X be in the domain Map of the matrix.");
5853  TEUCHOS_TEST_FOR_EXCEPTION(
5854  ! B.getMap ()->isSameAs (*rangeMap),
5855  std::runtime_error,
5856  "Tpetra::CrsMatrix::gaussSeidel requires that the input "
5857  "B be in the range Map of the matrix.");
5858  TEUCHOS_TEST_FOR_EXCEPTION(
5859  ! D.getMap ()->isSameAs (*rowMap),
5860  std::runtime_error,
5861  "Tpetra::CrsMatrix::gaussSeidel requires that the input "
5862  "D be in the row Map of the matrix.");
5863  TEUCHOS_TEST_FOR_EXCEPTION(
5864  ! rowMap->isSameAs (*rangeMap),
5865  std::runtime_error,
5866  "Tpetra::CrsMatrix::gaussSeidel requires that the row Map and the "
5867  "range Map be the same (in the sense of Tpetra::Map::isSameAs).");
5868  TEUCHOS_TEST_FOR_EXCEPTION(
5869  ! domainMap->isSameAs (*rangeMap),
5870  std::runtime_error,
5871  "Tpetra::CrsMatrix::gaussSeidel requires that the domain Map and "
5872  "the range Map of the matrix be the same.");
5873  }
5874 #else
5875  // Forestall any compiler warnings for unused variables.
5876  (void) rangeMap;
5877  (void) rowMap;
5878 #endif // HAVE_TEUCHOS_DEBUG
5879 
5880  // If B is not constant stride, copy it into a constant stride
5881  // multivector. We'l handle the right-hand side B first and deal
5882  // with X right before the sweeps, to improve locality of the
5883  // first sweep. (If the problem is small enough, then that will
5884  // hopefully keep more of the entries of X in cache. This
5885  // optimizes for the typical case of a small number of sweeps.)
5886  RCP<const MV> B_in;
5887  if (B.isConstantStride()) {
5888  B_in = rcpFromRef (B);
5889  }
5890  else {
5891  // The range Map and row Map are the same in this case, so we
5892  // can use the (possibly cached) row Map multivector to store a
5893  // constant stride copy of B. We don't have to copy back, since
5894  // Gauss-Seidel won't modify B.
5895  RCP<MV> B_in_nonconst = getRowMapMultiVector (B, true);
5896  deep_copy (*B_in_nonconst, B); // Copy from B into B_in(_nonconst).
5897  B_in = rcp_const_cast<const MV> (B_in_nonconst);
5898 
5900  ! B.isConstantStride (),
5901  std::runtime_error,
5902  "gaussSeidel: The current implementation of the Gauss-Seidel kernel "
5903  "requires that X and B both have constant stride. Since B does not "
5904  "have constant stride, we had to make a copy. This is a limitation of "
5905  "the current implementation and not your fault, but we still report it "
5906  "as an efficiency warning for your information.");
5907  }
5908 
5909  // If X is not constant stride, copy it into a constant stride
5910  // multivector. Also, make the column Map multivector X_colMap,
5911  // and its domain Map view X_domainMap. (X actually must be a
5912  // domain Map view of a column Map multivector; exploit this, if X
5913  // has constant stride.)
5914 
5915  RCP<MV> X_domainMap;
5916  RCP<MV> X_colMap;
5917  bool copiedInput = false;
5918 
5919  if (importer.is_null ()) { // Domain and column Maps are the same.
5920  if (X.isConstantStride ()) {
5921  X_domainMap = rcpFromRef (X);
5922  X_colMap = X_domainMap;
5923  copiedInput = false;
5924  }
5925  else {
5926  // Get a temporary column Map multivector, make a domain Map
5927  // view of it, and copy X into the domain Map view. We have
5928  // to copy here because we won't be doing Import operations.
5929  X_colMap = getColumnMapMultiVector (X, true);
5930  X_domainMap = X_colMap; // Domain and column Maps are the same.
5931  deep_copy (*X_domainMap, X); // Copy X into the domain Map view.
5932  copiedInput = true;
5934  ! X.isConstantStride (), std::runtime_error,
5935  "Tpetra::CrsMatrix::gaussSeidel: The current implementation of the "
5936  "Gauss-Seidel kernel requires that X and B both have constant "
5937  "stride. Since X does not have constant stride, we had to make a "
5938  "copy. This is a limitation of the current implementation and not "
5939  "your fault, but we still report it as an efficiency warning for "
5940  "your information.");
5941  }
5942  }
5943  else { // We will be doing Import operations in the sweeps.
5944  if (X.isConstantStride ()) {
5945  X_domainMap = rcpFromRef (X);
5946  // This kernel assumes that X is a domain Map view of a column
5947  // Map multivector. We will only check if this is valid if
5948  // the CMake configure Teuchos_ENABLE_DEBUG is ON.
5949  X_colMap = X_domainMap->offsetViewNonConst (colMap, 0);
5950 
5951  // FIXME (mfh 19 Mar 2013) Do we need to fill the remote
5952  // entries of X_colMap with zeros? Do we need to fill all of
5953  // X_domainMap initially with zeros? Ifpack
5954  // (Ifpack_PointRelaxation.cpp, line 906) creates an entirely
5955  // new MultiVector each time.
5956 
5957  // Do the first Import for the first sweep. This simplifies
5958  // the logic in the sweeps.
5959  X_colMap->doImport (X, *importer, INSERT);
5960  copiedInput = false;
5961  }
5962  else {
5963  // Get a temporary column Map multivector X_colMap, and make a
5964  // domain Map view X_domainMap of it. Instead of copying, we
5965  // do an Import from X into X_domainMap. This saves us a
5966  // copy, since the Import has to copy the data anyway.
5967  X_colMap = getColumnMapMultiVector (X, true);
5968  X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0);
5969  X_colMap->doImport (X, *importer, INSERT);
5970  copiedInput = true;
5972  ! X.isConstantStride (), std::runtime_error,
5973  "Tpetra::CrsMatrix::gaussSeidel: The current implementation of the "
5974  "Gauss-Seidel kernel requires that X and B both have constant stride. "
5975  "Since X does not have constant stride, we had to make a copy. "
5976  "This is a limitation of the current implementation and not your fault, "
5977  "but we still report it as an efficiency warning for your information.");
5978  }
5979  }
5980 
5981  for (int sweep = 0; sweep < numSweeps; ++sweep) {
5982  if (! importer.is_null () && sweep > 0) {
5983  // We already did the first Import for the zeroth sweep.
5984  X_colMap->doImport (*X_domainMap, *importer, INSERT);
5985  }
5986 
5987  // Do local Gauss-Seidel.
5988  if (direction != Symmetric) {
5989  if (rowIndices.is_null ()) {
5990  this->template localGaussSeidel<ST, ST> (*B_in, *X_colMap, D,
5991  dampingFactor,
5992  localDirection);
5993  }
5994  else {
5995  this->template reorderedLocalGaussSeidel<ST, ST> (*B_in, *X_colMap,
5996  D, rowIndices,
5997  dampingFactor,
5998  localDirection);
5999  }
6000  }
6001  else { // direction == Symmetric
6002  const bool doImportBetweenDirections = false;
6003  if (rowIndices.is_null ()) {
6004  this->template localGaussSeidel<ST, ST> (*B_in, *X_colMap, D,
6005  dampingFactor,
6006  Forward);
6007  // mfh 18 Mar 2013: Aztec's implementation of "symmetric
6008  // Gauss-Seidel" does _not_ do an Import between the forward
6009  // and backward sweeps. This makes sense, because Aztec
6010  // considers "symmetric Gauss-Seidel" a subdomain solver.
6011  if (doImportBetweenDirections) {
6012  // Communicate again before the Backward sweep.
6013  if (! importer.is_null ()) {
6014  X_colMap->doImport (*X_domainMap, *importer, INSERT);
6015  }
6016  }
6017  this->template localGaussSeidel<ST, ST> (*B_in, *X_colMap, D,
6018  dampingFactor,
6019  Backward);
6020  }
6021  else {
6022  this->template reorderedLocalGaussSeidel<ST, ST> (*B_in, *X_colMap,
6023  D, rowIndices,
6024  dampingFactor,
6025  Forward);
6026  if (doImportBetweenDirections) {
6027  // Communicate again before the Backward sweep.
6028  if (! importer.is_null ()) {
6029  X_colMap->doImport (*X_domainMap, *importer, INSERT);
6030  }
6031  }
6032  this->template reorderedLocalGaussSeidel<ST, ST> (*B_in, *X_colMap,
6033  D, rowIndices,
6034  dampingFactor,
6035  Backward);
6036  }
6037  }
6038  }
6039 
6040  if (copiedInput) {
6041  deep_copy (X, *X_domainMap); // Copy back from X_domainMap to X.
6042  }
6043  }
6044 
// Convenience overload: forwards to reorderedGaussSeidelCopy, passing
// Teuchos::null in the row-indices position and every other argument
// unchanged. Note that the forwarded argument order here is X, B, D.
// NOTE(review): the qualified function name and the X / B / D parameter
// lines are missing from this listing (presumably gaussSeidelCopy); confirm
// against the original source file.
6045  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6046  void
6051  const Scalar& dampingFactor,
6052  const ESweepDirection direction,
6053  const int numSweeps,
6054  const bool zeroInitialGuess) const
6055  {
6056  reorderedGaussSeidelCopy (X, B, D, Teuchos::null, dampingFactor, direction,
6057  numSweeps, zeroInitialGuess);
6058  }
6059 
6060  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6061  void
6066  const Teuchos::ArrayView<LocalOrdinal>& rowIndices,
6067  const Scalar& dampingFactor,
6068  const ESweepDirection direction,
6069  const int numSweeps,
6070  const bool zeroInitialGuess) const
6071  {
6072  using Teuchos::null;
6073  using Teuchos::RCP;
6074  using Teuchos::rcp;
6075  using Teuchos::rcpFromRef;
6076  using Teuchos::rcp_const_cast;
6077  typedef Scalar ST;
6078  const char prefix[] = "Tpetra::CrsMatrix::(reordered)gaussSeidelCopy: ";
6079  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
6080 
6081  TEUCHOS_TEST_FOR_EXCEPTION(
6082  ! isFillComplete (), std::runtime_error,
6083  prefix << "The matrix is not fill complete.");
6084  TEUCHOS_TEST_FOR_EXCEPTION(
6085  numSweeps < 0, std::invalid_argument,
6086  prefix << "The number of sweeps must be nonnegative, "
6087  "but you provided numSweeps = " << numSweeps << " < 0.");
6088 
6089  // Translate from global to local sweep direction.
6090  // While doing this, validate the input.
6091  ESweepDirection localDirection;
6092  if (direction == Forward) {
6093  localDirection = Forward;
6094  }
6095  else if (direction == Backward) {
6096  localDirection = Backward;
6097  }
6098  else if (direction == Symmetric) {
6099  // We'll control local sweep direction manually.
6100  localDirection = Forward;
6101  }
6102  else {
6103  TEUCHOS_TEST_FOR_EXCEPTION(
6104  true, std::invalid_argument,
6105  prefix << "The 'direction' enum does not have any of its valid "
6106  "values: Forward, Backward, or Symmetric.");
6107  }
6108 
6109  if (numSweeps == 0) {
6110  return;
6111  }
6112 
6113  RCP<const import_type> importer = this->getGraph ()->getImporter ();
6114  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
6115  TEUCHOS_TEST_FOR_EXCEPTION(
6116  ! exporter.is_null (), std::runtime_error,
6117  "This method's implementation currently requires that the matrix's row, "
6118  "domain, and range Maps be the same. This cannot be the case, because "
6119  "the matrix has a nontrivial Export object.");
6120 
6121  RCP<const map_type> domainMap = this->getDomainMap ();
6122  RCP<const map_type> rangeMap = this->getRangeMap ();
6123  RCP<const map_type> rowMap = this->getGraph ()->getRowMap ();
6124  RCP<const map_type> colMap = this->getGraph ()->getColMap ();
6125 
6126 #ifdef HAVE_TEUCHOS_DEBUG
6127  {
6128  // The relation 'isSameAs' is transitive. It's also a
6129  // collective, so we don't have to do a "shared" test for
6130  // exception (i.e., a global reduction on the test value).
6131  TEUCHOS_TEST_FOR_EXCEPTION(
6132  ! X.getMap ()->isSameAs (*domainMap), std::runtime_error,
6133  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input "
6134  "multivector X be in the domain Map of the matrix.");
6135  TEUCHOS_TEST_FOR_EXCEPTION(
6136  ! B.getMap ()->isSameAs (*rangeMap), std::runtime_error,
6137  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input "
6138  "B be in the range Map of the matrix.");
6139  TEUCHOS_TEST_FOR_EXCEPTION(
6140  ! D.getMap ()->isSameAs (*rowMap), std::runtime_error,
6141  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input "
6142  "D be in the row Map of the matrix.");
6143  TEUCHOS_TEST_FOR_EXCEPTION(
6144  ! rowMap->isSameAs (*rangeMap), std::runtime_error,
6145  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the row Map and the "
6146  "range Map be the same (in the sense of Tpetra::Map::isSameAs).");
6147  TEUCHOS_TEST_FOR_EXCEPTION(
6148  ! domainMap->isSameAs (*rangeMap), std::runtime_error,
6149  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the domain Map and "
6150  "the range Map of the matrix be the same.");
6151  }
6152 #else
6153  // Forestall any compiler warnings for unused variables.
6154  (void) rangeMap;
6155  (void) rowMap;
6156 #endif // HAVE_TEUCHOS_DEBUG
6157 
6158  // Fetch a (possibly cached) temporary column Map multivector
6159  // X_colMap, and a domain Map view X_domainMap of it. Both have
6160  // constant stride by construction. We know that the domain Map
6161  // must include the column Map, because our Gauss-Seidel kernel
6162  // requires that the row Map, domain Map, and range Map are all
6163  // the same, and that each process owns all of its own diagonal
6164  // entries of the matrix.
6165 
6166  RCP<MV> X_colMap;
6167  RCP<MV> X_domainMap;
6168  bool copyBackOutput = false;
6169  if (importer.is_null ()) {
6170  if (X.isConstantStride ()) {
6171  X_colMap = rcpFromRef (X);
6172  X_domainMap = rcpFromRef (X);
6173  // Column Map and domain Map are the same, so there are no
6174  // remote entries. Thus, if we are not setting the initial
6175  // guess to zero, we don't have to worry about setting remote
6176  // entries to zero, even though we are not doing an Import in
6177  // this case.
6178  if (zeroInitialGuess) {
6179  X_colMap->putScalar (ZERO);
6180  }
6181  // No need to copy back to X at end.
6182  }
6183  else { // We must copy X into a constant stride multivector.
6184  // Just use the cached column Map multivector for that.
6185  // force=true means fill with zeros, so no need to fill
6186  // remote entries (not in domain Map) with zeros.
6187  X_colMap = getColumnMapMultiVector (X, true);
6188  // X_domainMap is always a domain Map view of the column Map
6189  // multivector. In this case, the domain and column Maps are
6190  // the same, so X_domainMap _is_ X_colMap.
6191  X_domainMap = X_colMap;
6192  if (! zeroInitialGuess) { // Don't copy if zero initial guess
6193  try {
6194  deep_copy (*X_domainMap , X); // Copy X into constant stride MV
6195  } catch (std::exception& e) {
6196  std::ostringstream os;
6197  os << "Tpetra::CrsMatrix::reorderedGaussSeidelCopy: "
6198  "deep_copy(*X_domainMap, X) threw an exception: "
6199  << e.what () << ".";
6200  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, e.what ());
6201  }
6202  }
6203  copyBackOutput = true; // Don't forget to copy back at end.
6205  ! X.isConstantStride (),
6206  std::runtime_error,
6207  "gaussSeidelCopy: The current implementation of the Gauss-Seidel "
6208  "kernel requires that X and B both have constant stride. Since X "
6209  "does not have constant stride, we had to make a copy. This is a "
6210  "limitation of the current implementation and not your fault, but we "
6211  "still report it as an efficiency warning for your information.");
6212  }
6213  }
6214  else { // Column Map and domain Map are _not_ the same.
6215  X_colMap = getColumnMapMultiVector (X);
6216  X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0);
6217 
6218 #ifdef HAVE_TPETRA_DEBUG
6219  auto X_colMap_host_view = X_colMap->getLocalViewHost ();
6220  auto X_domainMap_host_view = X_domainMap->getLocalViewHost ();
6221 
6222  if (X_colMap->getLocalLength () != 0 && X_domainMap->getLocalLength ()) {
6223  TEUCHOS_TEST_FOR_EXCEPTION
6224  (X_colMap_host_view.data () != X_domainMap_host_view.data (),
6225  std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: Pointer to "
6226  "start of column Map view of X is not equal to pointer to start of "
6227  "(domain Map view of) X. This may mean that Tpetra::MultiVector::"
6228  "offsetViewNonConst is broken. "
6229  "Please report this bug to the Tpetra developers.");
6230  }
6231 
6232  TEUCHOS_TEST_FOR_EXCEPTION(
6233  X_colMap_host_view.extent (0) < X_domainMap_host_view.extent (0) ||
6234  X_colMap->getLocalLength () < X_domainMap->getLocalLength (),
6235  std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: "
6236  "X_colMap has fewer local rows than X_domainMap. "
6237  "X_colMap_host_view.extent(0) = " << X_colMap_host_view.extent (0)
6238  << ", X_domainMap_host_view.extent(0) = "
6239  << X_domainMap_host_view.extent (0)
6240  << ", X_colMap->getLocalLength() = " << X_colMap->getLocalLength ()
6241  << ", and X_domainMap->getLocalLength() = "
6242  << X_domainMap->getLocalLength ()
6243  << ". This means that Tpetra::MultiVector::offsetViewNonConst "
6244  "is broken. Please report this bug to the Tpetra developers.");
6245 
6246  TEUCHOS_TEST_FOR_EXCEPTION(
6247  X_colMap->getNumVectors () != X_domainMap->getNumVectors (),
6248  std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: "
6249  "X_colMap has a different number of columns than X_domainMap. "
6250  "X_colMap->getNumVectors() = " << X_colMap->getNumVectors ()
6251  << " != X_domainMap->getNumVectors() = "
6252  << X_domainMap->getNumVectors ()
6253  << ". This means that Tpetra::MultiVector::offsetViewNonConst "
6254  "is broken. Please report this bug to the Tpetra developers.");
6255 #endif // HAVE_TPETRA_DEBUG
6256 
6257  if (zeroInitialGuess) {
6258  // No need for an Import, since we're filling with zeros.
6259  X_colMap->putScalar (ZERO);
6260  } else {
6261  // We could just copy X into X_domainMap. However, that
6262  // wastes a copy, because the Import also does a copy (plus
6263  // communication). Since the typical use case for
6264  // Gauss-Seidel is a small number of sweeps (2 is typical), we
6265  // don't want to waste that copy. Thus, we do the Import
6266  // here, and skip the first Import in the first sweep.
6267  // Importing directly from X effects the copy into X_domainMap
6268  // (which is a view of X_colMap).
6269  X_colMap->doImport (X, *importer, INSERT);
6270  }
6271  copyBackOutput = true; // Don't forget to copy back at end.
6272  } // if column and domain Maps are (not) the same
6273 
6274  // The Gauss-Seidel / SOR kernel expects multivectors of constant
6275  // stride. X_colMap is by construction, but B might not be. If
6276  // it's not, we have to make a copy.
6277  RCP<const MV> B_in;
6278  if (B.isConstantStride ()) {
6279  B_in = rcpFromRef (B);
6280  }
6281  else {
6282  // Range Map and row Map are the same in this case, so we can
6283  // use the cached row Map multivector to store a constant stride
6284  // copy of B.
6285  RCP<MV> B_in_nonconst = getRowMapMultiVector (B, true);
6286  try {
6287  deep_copy (*B_in_nonconst, B);
6288  } catch (std::exception& e) {
6289  std::ostringstream os;
6290  os << "Tpetra::CrsMatrix::reorderedGaussSeidelCopy: "
6291  "deep_copy(*B_in_nonconst, B) threw an exception: "
6292  << e.what () << ".";
6293  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, e.what ());
6294  }
6295  B_in = rcp_const_cast<const MV> (B_in_nonconst);
6296 
6298  ! B.isConstantStride (),
6299  std::runtime_error,
6300  "gaussSeidelCopy: The current implementation requires that B have "
6301  "constant stride. Since B does not have constant stride, we had to "
6302  "copy it into a separate constant-stride multivector. This is a "
6303  "limitation of the current implementation and not your fault, but we "
6304  "still report it as an efficiency warning for your information.");
6305  }
6306 
6307  for (int sweep = 0; sweep < numSweeps; ++sweep) {
6308  if (! importer.is_null () && sweep > 0) {
6309  // We already did the first Import for the zeroth sweep above,
6310  // if it was necessary.
6311  X_colMap->doImport (*X_domainMap, *importer, INSERT);
6312  }
6313 
6314  // Do local Gauss-Seidel.
6315  if (direction != Symmetric) {
6316  if (rowIndices.is_null ()) {
6317  this->template localGaussSeidel<ST, ST> (*B_in, *X_colMap, D,
6318  dampingFactor,
6319  localDirection);
6320  }
6321  else {
6322  this->template reorderedLocalGaussSeidel<ST, ST> (*B_in, *X_colMap,
6323  D, rowIndices,
6324  dampingFactor,
6325  localDirection);
6326  }
6327  }
6328  else { // direction == Symmetric
6329  if (rowIndices.is_null ()) {
6330  this->template localGaussSeidel<ST, ST> (*B_in, *X_colMap, D,
6331  dampingFactor,
6332  Forward);
6333  // mfh 18 Mar 2013: Aztec's implementation of "symmetric
6334  // Gauss-Seidel" does _not_ do an Import between the forward
6335  // and backward sweeps. This makes symmetric Gauss-Seidel a
6336  // symmetric preconditioner if the matrix A is symmetric. We
6337  // imitate Aztec's behavior here.
6338  this->template localGaussSeidel<ST, ST> (*B_in, *X_colMap, D,
6339  dampingFactor,
6340  Backward);
6341  }
6342  else {
6343  this->template reorderedLocalGaussSeidel<ST, ST> (*B_in, *X_colMap,
6344  D, rowIndices,
6345  dampingFactor,
6346  Forward);
6347  this->template reorderedLocalGaussSeidel<ST, ST> (*B_in, *X_colMap,
6348  D, rowIndices,
6349  dampingFactor,
6350  Backward);
6351 
6352  }
6353  }
6354  }
6355 
6356  if (copyBackOutput) {
6357  try {
6358  deep_copy (X , *X_domainMap); // Copy result back into X.
6359  } catch (std::exception& e) {
6360  TEUCHOS_TEST_FOR_EXCEPTION(
6361  true, std::runtime_error, prefix << "deep_copy(X, *X_domainMap) "
6362  "threw an exception: " << e.what ());
6363  }
6364  }
6365  }
6366 
// Build a new CrsMatrix whose entries have scalar type T instead of Scalar.
//
// The new matrix is constructed from this matrix's graph (getCrsGraph()),
// its values are filled by an element-wise converting copy of this
// matrix's values, and it is fill-completed with this matrix's domain and
// range Maps before being returned.
//
// Raises std::runtime_error if this matrix is not fill complete, and
// std::logic_error if it is fill complete but does not report a static
// graph.
6367  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6368  template<class T>
6369  Teuchos::RCP<CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> >
6371  convert () const
6372  {
6373  using Teuchos::RCP;
6374  typedef CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> output_matrix_type;
6375  const char tfecfFuncName[] = "convert: ";
6376 
      // Precondition checks on the source matrix (this).
6377  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6378  (! this->isFillComplete (), std::runtime_error, "This matrix (the source "
6379  "of the conversion) is not fill complete. You must first call "
6380  "fillComplete() (possibly with the domain and range Map) without an "
6381  "intervening call to resumeFill(), before you may call this method.");
6382  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6383  (! this->isStaticGraph (), std::logic_error, "This matrix (the source "
6384  "of the conversion) claims to be fill complete, but does not have a "
6385  "static (i.e., constant) graph. Please report this bug to the Tpetra "
6386  "developers.");
6387 
      // The output matrix is created from the source matrix's graph.
6388  RCP<output_matrix_type> newMatrix
6389  (new output_matrix_type (this->getCrsGraph ()));
6390  // Copy old values into new values. impl_scalar_type and T may
6391  // differ, so we can't use Kokkos::deep_copy.
6392  using ::Tpetra::Details::copyConvert;
6393  copyConvert (newMatrix->lclMatrix_->getLocalMatrix ().values,
6394  this->lclMatrix_->getLocalMatrix ().values);
6395  // Since newMatrix has a static (const) graph, the graph already has
6396  // a column Map, and Import and Export objects already exist (if
6397  // applicable). Thus, calling fillComplete is cheap.
6398  newMatrix->fillComplete (this->getDomainMap (), this->getRangeMap ());
6399 
6400  return newMatrix;
6401  }
6402 
6403 
// Debug-only consistency check of the matrix's internal state
// (the method name used in error messages is "checkInternalState").
//
// The whole body is compiled only when HAVE_TPETRA_DEBUG is defined;
// otherwise this function does nothing. Each check below raises
// std::logic_error when its condition holds.
6404  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6405  void
6408  {
6409 #ifdef HAVE_TPETRA_DEBUG
6410  const char tfecfFuncName[] = "checkInternalState: ";
      // Common message text shared by all of the checks below.
6411  const char err[] = "Internal state is not consistent. "
6412  "Please report this bug to the Tpetra developers.";
6413 
6414  // This version of the graph (RCP<const crs_graph_type>) must
6415  // always be nonnull.
6416  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6417  staticGraph_.is_null (),
6418  std::logic_error, err);
6419  // myGraph == null means that the matrix has a const ("static")
6420  // graph. Otherwise, the matrix has a dynamic graph (it owns its
6421  // graph).
      // If myGraph_ is set, it must refer to the same graph as staticGraph_.
6422  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6423  ! myGraph_.is_null () && myGraph_ != staticGraph_,
6424  std::logic_error, err);
6425  // if matrix is fill complete, then graph must be fill complete
6426  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6427  isFillComplete () && ! staticGraph_->isFillComplete (),
6428  std::logic_error, err << " Specifically, the matrix is fill complete, "
6429  "but its graph is NOT fill complete.");
6430  // if matrix is storage optimized, it should have a 1D allocation
      // (i.e., the 2-D storage values2D_ must be null)
6431  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6432  isStorageOptimized () && ! values2D_.is_null (),
6433  std::logic_error, err);
6434  // if matrix/graph are static profile, then 2D allocation should not be present
6435  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6436  getProfileType() == StaticProfile && values2D_ != Teuchos::null,
6437  std::logic_error, err);
6438  // if matrix/graph are dynamic profile, then 1D allocation should not be present
6439  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6440  getProfileType() != StaticProfile && k_values1D_.extent (0) > 0,
6441  std::logic_error, err);
6442  // if values are allocated and they are non-zero in number, then
6443  // one of the allocations should be present
6444  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6445  staticGraph_->indicesAreAllocated () &&
6446  staticGraph_->getNodeAllocationSize() > 0 &&
6447  staticGraph_->getNodeNumRows() > 0
6448  && values2D_.is_null () &&
6449  k_values1D_.extent (0) == 0,
6450  std::logic_error, err);
6451  // we cannot have both a 1D and 2D allocation
6452  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6453  k_values1D_.extent (0) > 0 && values2D_ != Teuchos::null,
6454  std::logic_error, err << " Specifically, k_values1D_ is allocated (has "
6455  "size " << k_values1D_.extent (0) << " > 0) and values2D_ is also "
6456  "allocated. CrsMatrix is not suppose to have both a 1-D and a 2-D "
6457  "allocation at the same time.");
6458 #endif
6459  }
6460 
// Return a one-line, brace-delimited summary of this matrix as a string.
// NOTE(review): the signature line is not visible here; presumably this is
// the description() method -- confirm.
//
// The summary contains the object label (only if nonempty), whether the
// matrix is fill complete, and the global dimensions. The global number
// of entries is included only when the matrix is fill complete.
6461  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6462  std::string
6465  {
6466  std::ostringstream os;
6467 
6468  os << "Tpetra::CrsMatrix (Kokkos refactor): {";
      // Omit the label entirely when it is the empty string.
6469  if (this->getObjectLabel () != "") {
6470  os << "Label: \"" << this->getObjectLabel () << "\", ";
6471  }
6472  if (isFillComplete ()) {
6473  os << "isFillComplete: true"
6474  << ", global dimensions: [" << getGlobalNumRows () << ", "
6475  << getGlobalNumCols () << "]"
6476  << ", global number of entries: " << getGlobalNumEntries ()
6477  << "}";
6478  }
6479  else {
      // Not fill complete: the entry count is not reported in this branch.
6480  os << "isFillComplete: false"
6481  << ", global dimensions: [" << getGlobalNumRows () << ", "
6482  << getGlobalNumCols () << "]}";
6483  }
6484  return os.str ();
6485  }
6486 
// Print a description of this matrix to `out`, with the amount of detail
// selected by `verbLevel`.
//
//   out        Output stream to print to.
//   verbLevel  Verbosity level. VERB_DEFAULT is treated as VERB_LOW, and
//              VERB_NONE prints nothing.
//
// Below VERB_MEDIUM, only the process with rank 0 prints (class name,
// label, template parameters, fill-complete status, global dimensions).
// At VERB_MEDIUM and above, the row, column, domain, and range Maps are
// described and every process prints its own entry counts in rank order.
// At VERB_HIGH and above, each process also prints one line per local row,
// and at VERB_EXTREME each such line includes the (index, value) pairs.
//
// The per-process loops call comm->barrier(), so at VERB_MEDIUM and above
// all processes in the matrix's communicator must reach this method.
6487  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6488  void
6490  describe (Teuchos::FancyOStream &out,
6491  const Teuchos::EVerbosityLevel verbLevel) const
6492  {
6493  using std::endl;
6494  using std::setw;
6495  using Teuchos::ArrayView;
6496  using Teuchos::Comm;
6497  using Teuchos::RCP;
6498  using Teuchos::TypeNameTraits;
6499  using Teuchos::VERB_DEFAULT;
6500  using Teuchos::VERB_NONE;
6501  using Teuchos::VERB_LOW;
6502  using Teuchos::VERB_MEDIUM;
6503  using Teuchos::VERB_HIGH;
6504  using Teuchos::VERB_EXTREME;
6505 
      // Resolve the default verbosity to VERB_LOW.
6506  const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
6507 
6508  if (vl == VERB_NONE) {
6509  return; // Don't print anything at all
6510  }
6511 
6512  // By convention, describe() always begins with a tab.
6513  Teuchos::OSTab tab0 (out);
6514 
6515  RCP<const Comm<int> > comm = this->getComm();
6516  const int myRank = comm->getRank();
6517  const int numProcs = comm->getSize();
      // Column width for the per-row table printed at VERB_HIGH and above:
      // it grows with the global number of rows, is at least 11, and then
      // gets 2 extra characters.
6518  size_t width = 1;
6519  for (size_t dec=10; dec<getGlobalNumRows(); dec *= 10) {
6520  ++width;
6521  }
6522  width = std::max<size_t> (width, static_cast<size_t> (11)) + 2;
6523 
6524  // none: print nothing
6525  // low: print O(1) info from node 0
6526  // medium: print O(P) info, num entries per process
6527  // high: print O(N) info, num entries per row
6528  // extreme: print O(NNZ) info: print indices and values
6529  //
6530  // for medium and higher, print constituent objects at specified verbLevel
6531  if (myRank == 0) {
6532  out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
6533  }
6534  Teuchos::OSTab tab1 (out);
6535 
      // O(1) summary, printed by rank 0 only.
6536  if (myRank == 0) {
6537  if (this->getObjectLabel () != "") {
6538  out << "Label: \"" << this->getObjectLabel () << "\", ";
6539  }
6540  {
6541  out << "Template parameters:" << endl;
6542  Teuchos::OSTab tab2 (out);
6543  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
6544  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
6545  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
6546  << "Node: " << TypeNameTraits<Node>::name () << endl;
6547  }
6548  if (isFillComplete()) {
6549  out << "isFillComplete: true" << endl
6550  << "Global dimensions: [" << getGlobalNumRows () << ", "
6551  << getGlobalNumCols () << "]" << endl
6552  << "Global number of entries: " << getGlobalNumEntries () << endl
6553  << endl << "Global max number of entries in a row: "
6554  << getGlobalMaxNumRowEntries () << endl;
6555  }
6556  else {
6557  out << "isFillComplete: false" << endl
6558  << "Global dimensions: [" << getGlobalNumRows () << ", "
6559  << getGlobalNumCols () << "]" << endl;
6560  }
6561  }
6562 
6563  if (vl < VERB_MEDIUM) {
6564  return; // all done!
6565  }
6566 
6567  // Describe the row Map.
6568  if (myRank == 0) {
6569  out << endl << "Row Map:" << endl;
6570  }
6571  if (getRowMap ().is_null ()) {
6572  if (myRank == 0) {
6573  out << "null" << endl;
6574  }
6575  }
6576  else {
6577  if (myRank == 0) {
6578  out << endl;
6579  }
6580  getRowMap ()->describe (out, vl);
6581  }
6582 
6583  // Describe the column Map.
      // A Map that compares equal (as an RCP) to one already printed is
      // reported by reference ("same as ...") instead of being described
      // again; the domain and range Maps below follow the same pattern.
6584  if (myRank == 0) {
6585  out << "Column Map: ";
6586  }
6587  if (getColMap ().is_null ()) {
6588  if (myRank == 0) {
6589  out << "null" << endl;
6590  }
6591  } else if (getColMap () == getRowMap ()) {
6592  if (myRank == 0) {
6593  out << "same as row Map" << endl;
6594  }
6595  } else {
6596  if (myRank == 0) {
6597  out << endl;
6598  }
6599  getColMap ()->describe (out, vl);
6600  }
6601 
6602  // Describe the domain Map.
6603  if (myRank == 0) {
6604  out << "Domain Map: ";
6605  }
6606  if (getDomainMap ().is_null ()) {
6607  if (myRank == 0) {
6608  out << "null" << endl;
6609  }
6610  } else if (getDomainMap () == getRowMap ()) {
6611  if (myRank == 0) {
6612  out << "same as row Map" << endl;
6613  }
6614  } else if (getDomainMap () == getColMap ()) {
6615  if (myRank == 0) {
6616  out << "same as column Map" << endl;
6617  }
6618  } else {
6619  if (myRank == 0) {
6620  out << endl;
6621  }
6622  getDomainMap ()->describe (out, vl);
6623  }
6624 
6625  // Describe the range Map.
6626  if (myRank == 0) {
6627  out << "Range Map: ";
6628  }
6629  if (getRangeMap ().is_null ()) {
6630  if (myRank == 0) {
6631  out << "null" << endl;
6632  }
6633  } else if (getRangeMap () == getDomainMap ()) {
6634  if (myRank == 0) {
6635  out << "same as domain Map" << endl;
6636  }
6637  } else if (getRangeMap () == getRowMap ()) {
6638  if (myRank == 0) {
6639  out << "same as row Map" << endl;
6640  }
6641  } else {
6642  if (myRank == 0) {
6643  out << endl;
6644  }
6645  getRangeMap ()->describe (out, vl);
6646  }
6647 
6648  // O(P) data
      // Each process prints in turn, in increasing rank order.
6649  for (int curRank = 0; curRank < numProcs; ++curRank) {
6650  if (myRank == curRank) {
6651  out << "Process rank: " << curRank << endl;
6652  Teuchos::OSTab tab2 (out);
6653  if (! staticGraph_->indicesAreAllocated ()) {
6654  out << "Graph indices not allocated" << endl;
6655  }
6656  else {
6657  out << "Number of allocated entries: "
6658  << staticGraph_->getNodeAllocationSize () << endl;
6659  }
6660  out << "Number of entries: " << getNodeNumEntries () << endl
6661  << "Max number of entries per row: " << getNodeMaxNumRowEntries ()
6662  << endl;
6663  }
6664  // Give output time to complete by executing some barriers.
6665  comm->barrier ();
6666  comm->barrier ();
6667  comm->barrier ();
6668  }
6669 
6670  if (vl < VERB_HIGH) {
6671  return; // all done!
6672  }
6673 
6674  // O(N) and O(NNZ) data
6675  for (int curRank = 0; curRank < numProcs; ++curRank) {
6676  if (myRank == curRank) {
      // Table header for this process's rows.
6677  out << std::setw(width) << "Proc Rank"
6678  << std::setw(width) << "Global Row"
6679  << std::setw(width) << "Num Entries";
6680  if (vl == VERB_EXTREME) {
6681  out << std::setw(width) << "(Index,Value)";
6682  }
6683  out << endl;
6684  for (size_t r = 0; r < getNodeNumRows (); ++r) {
6685  const size_t nE = getNumEntriesInLocalRow(r);
6686  GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
6687  out << std::setw(width) << myRank
6688  << std::setw(width) << gid
6689  << std::setw(width) << nE;
6690  if (vl == VERB_EXTREME) {
      // Column indices are always printed as global indices: locally
      // indexed rows are translated through the column Map.
6691  if (isGloballyIndexed()) {
6692  ArrayView<const GlobalOrdinal> rowinds;
6693  ArrayView<const Scalar> rowvals;
6694  getGlobalRowView (gid, rowinds, rowvals);
6695  for (size_t j = 0; j < nE; ++j) {
6696  out << " (" << rowinds[j]
6697  << ", " << rowvals[j]
6698  << ") ";
6699  }
6700  }
6701  else if (isLocallyIndexed()) {
6702  ArrayView<const LocalOrdinal> rowinds;
6703  ArrayView<const Scalar> rowvals;
6704  getLocalRowView (r, rowinds, rowvals);
6705  for (size_t j=0; j < nE; ++j) {
6706  out << " (" << getColMap()->getGlobalElement(rowinds[j])
6707  << ", " << rowvals[j]
6708  << ") ";
6709  }
6710  } // globally or locally indexed
6711  } // vl == VERB_EXTREME
6712  out << endl;
6713  } // for each row r on this process
6714  } // if (myRank == curRank)
6715 
6716  // Give output time to complete
6717  comm->barrier ();
6718  comm->barrier ();
6719  comm->barrier ();
6720  } // for each process p
6721  }
6722 
// Report whether `source` is an acceptable source object for this matrix.
// NOTE(review): the signature line is not visible here; presumably this is
// checkSizes(const SrcDistObject& source) -- confirm.
//
// Returns true exactly when `source` can be dynamic_cast to
// row_matrix_type; no size comparison is performed.
6723  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6724  bool
6727  {
6728  // It's not clear what kind of compatibility checks on sizes can
6729  // be performed here. Epetra_CrsGraph doesn't check any sizes for
6730  // compatibility.
6731 
6732  // Currently, the source object must be a RowMatrix with the same
6733  // four template parameters as the target CrsMatrix. We might
6734  // relax this requirement later.
6736  const row_matrix_type* srcRowMat =
6737  dynamic_cast<const row_matrix_type*> (&source);
      // A failed pointer dynamic_cast yields a null pointer.
6738  return (srcRowMat != NULL);
6739  }
6740 
// Grow this matrix's 1-D storage (row offsets, column indices, and values)
// according to `padding`.
//
//   padding  Map handed unchanged to padCrsArrays.
//            NOTE(review): presumably keyed by local row index with the
//            amount of extra space as the value -- confirm against
//            computeCrsPadding / padCrsArrays.
//
// If the graph's indices are not yet allocated, values are allocated first
// (with global indices). If `padding` is empty, nothing else happens.
// Otherwise the row offsets, indices, and values are copied into fresh
// views, padded by padCrsArrays, and stored back into myGraph_ and
// k_values1D_. Raises std::logic_error if the padded values and indices
// end up with different sizes.
6741  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6742  void
6744  applyCrsPadding(const Kokkos::UnorderedMap<LocalOrdinal, size_t, device_type>& padding)
6745  {
6746  // const char tfecfFuncName[] = "applyCrsPadding";
6747  using execution_space = typename device_type::execution_space;
6748  using row_ptrs_type = typename local_graph_type::row_map_type::non_const_type;
6749  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LocalOrdinal>>;
6751 
6752  const char tfecfFuncName[] = "applyCrsPadding: ";
6753 
6754  if (! myGraph_->indicesAreAllocated ()) {
6755  this->allocateValues (GlobalIndices, GraphNotYetAllocated);
6756  }
6757 
      // Nothing to pad.
6758  if (padding.size() == 0)
6759  return;
6760 
6761  // Making copies here because k_rowPtrs_ has a const type. Otherwise, we
6762  // would use it directly.
6763 
6764  row_ptrs_type row_ptr_beg("row_ptr_beg", myGraph_->k_rowPtrs_.extent(0));
6765  Kokkos::deep_copy(row_ptr_beg, myGraph_->k_rowPtrs_);
6766 
      // N is the number of rows: one less than the number of row offsets,
      // or zero if there are no offsets at all.
6767  const size_t N = (row_ptr_beg.extent(0) == 0 ? 0 : row_ptr_beg.extent(0) - 1);
6768  row_ptrs_type row_ptr_end("row_ptr_end", N);
6769 
      // row_ptr_end(i) is filled with the offset one past the last entry
      // of row i.
6770  bool refill_num_row_entries = false;
6771  if (myGraph_->k_numRowEntries_.extent(0) > 0) {
6772  // Case 1: Unpacked storage
      // The end of each row is its start plus its stored entry count.
6773  refill_num_row_entries = true;
6774  auto num_row_entries = myGraph_->k_numRowEntries_;
6775  Kokkos::parallel_for("Fill end row pointers", range_policy(0, N),
6776  KOKKOS_LAMBDA(const size_t i){
6777  row_ptr_end(i) = row_ptr_beg(i) + num_row_entries(i);
6778  }
6779  );
6780 
6781  } else {
6782  // mfh If packed storage, don't need row_ptr_end to be separate allocation;
6783  // could just have it alias row_ptr_beg+1.
6784  // Case 2: Packed storage
      // Each row ends where the next one begins.
6785  Kokkos::parallel_for("Fill end row pointers", range_policy(0, N),
6786  KOKKOS_LAMBDA(const size_t i){
6787  row_ptr_end(i) = row_ptr_beg(i+1);
6788  }
6789  );
6790  }
6791 
      // Work on a copy of the values; it replaces k_values1D_ at the end.
6792  using values_type = typename local_matrix_type::values_type;
6793  values_type values("values", k_values1D_.size());
6794  Kokkos::deep_copy(values, k_values1D_);
6795 
      // Pad the indices and values together. The two branches differ only
      // in which index array (global or local) is copied and stored back.
6796  if(myGraph_->isGloballyIndexed()) {
6797  using indices_type = typename crs_graph_type::t_GlobalOrdinal_1D;
6798  indices_type indices("indices", myGraph_->k_gblInds1D_.extent(0));
6799  Kokkos::deep_copy(indices, myGraph_->k_gblInds1D_);
6800  using padding_type = Kokkos::UnorderedMap<LocalOrdinal, size_t, device_type>;
6801  padCrsArrays<row_ptrs_type,indices_type,values_type,padding_type>(row_ptr_beg, row_ptr_end, indices, values, padding);
6802  myGraph_->k_gblInds1D_ = indices;
6803  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6804  values.size() != indices.size(),
6805  std::logic_error,
6806  "After padding, values and indices should be same size");
6807  }
6808  else {
6809  using indices_type = typename local_graph_type::entries_type::non_const_type;
6810  indices_type indices("indices", myGraph_->k_lclInds1D_.extent(0));
6811  Kokkos::deep_copy(indices, myGraph_->k_lclInds1D_);
6812  using padding_type = Kokkos::UnorderedMap<LocalOrdinal, size_t, device_type>;
6813  padCrsArrays<row_ptrs_type,indices_type,values_type,padding_type>(row_ptr_beg, row_ptr_end, indices, values, padding);
6814  myGraph_->k_lclInds1D_ = indices;
6815  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6816  values.size() != indices.size(),
6817  std::logic_error,
6818  "After padding, values and indices should be same size");
6819  }
6820 
      // For unpacked storage, recompute the per-row entry counts from the
      // row start and end offsets as they stand after padCrsArrays.
6821  if (refill_num_row_entries) {
6822  auto num_row_entries = myGraph_->k_numRowEntries_;
6823  Kokkos::parallel_for("Fill num entries", range_policy(0, N),
6824  KOKKOS_LAMBDA(const size_t i){
6825  num_row_entries(i) = row_ptr_end(i) - row_ptr_beg(i);
6826  }
6827  );
6828  }
      // Install the row offsets and values produced above.
6829  myGraph_->k_rowPtrs_ = row_ptr_beg;
6830  k_values1D_ = values;
6831  }
6832 
// Copy rows of `srcMat` into this matrix, first the rows that keep the same
// local index and then the rows that are permuted.
//
//   srcMat           Source matrix to read rows from.
//   numSameIDs       Number of leading local row indices [0, numSameIDs)
//                    that are copied to the same global row in the target.
//   permuteToLIDs    Target local row indices, one per permuted row.
//   permuteFromLIDs  Source local row indices, one per permuted row.
//   numPermutes      Number of entries read from the two permute arrays.
//
// For each row, the source data are obtained with getGlobalRowCopy() when
// the source is locally indexed and with getGlobalRowView() otherwise, and
// are then combined into this matrix with REPLACE if this matrix has a
// static graph and INSERT if it does not.
6833  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6834  void
6835  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6836  copyAndPermuteImpl (const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
6837  const size_t numSameIDs,
6838  const LocalOrdinal permuteToLIDs[],
6839  const LocalOrdinal permuteFromLIDs[],
6840  const size_t numPermutes)
6841  {
6843  using Teuchos::Array;
6844  using Teuchos::ArrayView;
6845  typedef LocalOrdinal LO;
6846  typedef GlobalOrdinal GO;
6847 #ifdef HAVE_TPETRA_DEBUG
6848  // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
6849  const char tfecfFuncName[] = "copyAndPermuteImpl: ";
6850 #endif // HAVE_TPETRA_DEBUG
6851 
6852  ProfilingRegion regionCAP ("Tpetra::CrsMatrix::copyAndPermuteImpl");
6853 
6854  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
6855  //
6856  // Copy the first numSame rows from source to target (this matrix).
6857  // This involves copying rows corresponding to LIDs [0, numSame-1].
6858  //
6859  const map_type& srcRowMap = * (srcMat.getRowMap ());
      // Scratch buffers for row copies. They are shared by both loops
      // below and only ever grow.
6860  Array<GO> rowInds;
6861  Array<Scalar> rowVals;
6862  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
6863  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
6864  // Global ID for the current row index in the source matrix.
6865  // The first numSameIDs GIDs in the two input lists are the
6866  // same, so sourceGID == targetGID in this case.
6867  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
6868  const GO targetGID = sourceGID;
6869 
6870  // Input views for the combineGlobalValues() call below.
6871  ArrayView<const GO> rowIndsConstView;
6872  ArrayView<const Scalar> rowValsConstView;
6873 
6874  if (sourceIsLocallyIndexed) {
6875  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
6876  if (rowLength > static_cast<size_t> (rowInds.size())) {
6877  rowInds.resize (rowLength);
6878  rowVals.resize (rowLength);
6879  }
6880  // Resizing invalidates an Array's views, so we must make new
6881  // ones, even if rowLength hasn't changed.
6882  ArrayView<GO> rowIndsView = rowInds.view (0, rowLength);
6883  ArrayView<Scalar> rowValsView = rowVals.view (0, rowLength);
6884 
6885  // The source matrix is locally indexed, so we have to get a
6886  // copy. Really it's the GIDs that have to be copied (because
6887  // they have to be converted from LIDs).
6888  size_t checkRowLength = 0;
6889  srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView, checkRowLength);
6890 
6891 #ifdef HAVE_TPETRA_DEBUG
6892  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowLength != checkRowLength,
6893  std::logic_error, "For global row index " << sourceGID << ", the source"
6894  " matrix's getNumEntriesInGlobalRow() method returns a row length of "
6895  << rowLength << ", but the getGlobalRowCopy() method reports that "
6896  "the row length is " << checkRowLength << ". Please report this bug "
6897  "to the Tpetra developers.");
6898 #endif // HAVE_TPETRA_DEBUG
6899 
6900  rowIndsConstView = rowIndsView.view (0, rowLength);
6901  rowValsConstView = rowValsView.view (0, rowLength);
6902  }
6903  else { // source matrix is globally indexed.
6904  srcMat.getGlobalRowView (sourceGID, rowIndsConstView, rowValsConstView);
6905  }
6906 
6907  // Combine the data into the target matrix.
6908  if (this->isStaticGraph ()) {
6909  // Applying a permutation to a matrix with a static graph
6910  // means REPLACE-ing entries.
6911  combineGlobalValues (targetGID, rowIndsConstView, rowValsConstView, REPLACE);
6912  }
6913  else {
6914  // Applying a permutation to a matrix with a dynamic graph
6915  // means INSERT-ing entries. This has the same effect as
6916  // ADD, if the target graph already has an entry there.
6917  combineGlobalValues (targetGID, rowIndsConstView, rowValsConstView, INSERT);
6918  }
6919  } // For each of the consecutive source and target IDs that are the same
6920 
6921  //
6922  // Permute the remaining rows.
6923  //
      // Unlike the loop above, the source and target global row indices
      // are looked up separately, each through its own matrix's row Map.
6924  const map_type& tgtRowMap = * (this->getRowMap ());
6925  for (size_t p = 0; p < numPermutes; ++p) {
6926  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
6927  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
6928 
6929  // Input views for the combineGlobalValues() call below.
6930  ArrayView<const GO> rowIndsConstView;
6931  ArrayView<const Scalar> rowValsConstView;
6932 
6933  if (sourceIsLocallyIndexed) {
6934  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
6935  if (rowLength > static_cast<size_t> (rowInds.size ())) {
6936  rowInds.resize (rowLength);
6937  rowVals.resize (rowLength);
6938  }
6939  // Resizing invalidates an Array's views, so we must make new
6940  // ones, even if rowLength hasn't changed.
6941  ArrayView<GO> rowIndsView = rowInds.view (0, rowLength);
6942  ArrayView<Scalar> rowValsView = rowVals.view (0, rowLength);
6943 
6944  // The source matrix is locally indexed, so we have to get a
6945  // copy. Really it's the GIDs that have to be copied (because
6946  // they have to be converted from LIDs).
6947  size_t checkRowLength = 0;
6948  srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView, checkRowLength);
6949 
6950 #ifdef HAVE_TPETRA_DEBUG
6951  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowLength != checkRowLength,
6952  std::logic_error, "For the source matrix's global row index "
6953  << sourceGID << ", the source matrix's getNumEntriesInGlobalRow() "
6954  "method returns a row length of " << rowLength << ", but the "
6955  "getGlobalRowCopy() method reports that the row length is "
6956  << checkRowLength << ". Please report this bug to the Tpetra "
6957  "developers.");
6958 #endif // HAVE_TPETRA_DEBUG
6959 
6960  rowIndsConstView = rowIndsView.view (0, rowLength);
6961  rowValsConstView = rowValsView.view (0, rowLength);
6962  }
6963  else {
6964  srcMat.getGlobalRowView (sourceGID, rowIndsConstView, rowValsConstView);
6965  }
6966 
6967  // Combine the data into the target matrix.
6968  if (isStaticGraph()) {
6969  this->combineGlobalValues (targetGID, rowIndsConstView,
6970  rowValsConstView, REPLACE);
6971  }
6972  else {
6973  this->combineGlobalValues (targetGID, rowIndsConstView,
6974  rowValsConstView, INSERT);
6975  }
6976  } // For each ID to permute
6977  }
6978 
// DualView-based entry point for the copy-and-permute step; it validates
// its inputs and then delegates to copyAndPermuteImpl().
//
// The method is named copyAndPermuteNew when TPETRA_ENABLE_DEPRECATED_CODE
// is defined, and copyAndPermute otherwise.
//
//   srcObj           Source object; it is dynamic_cast to RMT.
//                    NOTE(review): RMT's definition is not visible here;
//                    presumably it is the RowMatrix type -- confirm.
//   numSameIDs       Forwarded to computeCrsPadding and copyAndPermuteImpl.
//   permuteToLIDs    Target local row indices; the host view is used.
//   permuteFromLIDs  Source local row indices; the host view is used.
//
// Raises std::invalid_argument if the two permute lists differ in length.
// Both DualViews are asserted (TEUCHOS_ASSERT) not to need a host sync.
// When Tpetra::Details::Behavior::verbose() is true, progress messages are
// written to std::cerr.
6979  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6980  void
6981  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6982 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
6983  copyAndPermuteNew
6984 #else // TPETRA_ENABLE_DEPRECATED_CODE
6985  copyAndPermute
6986 #endif // TPETRA_ENABLE_DEPRECATED_CODE
6987  (const SrcDistObject& srcObj,
6988  const size_t numSameIDs,
6989  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
6990  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs)
6991  {
6994  using std::endl;
6995 
6996  // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
6997  const char tfecfFuncName[] = "copyAndPermute: ";
6998  ProfilingRegion regionCAP ("Tpetra::CrsMatrix::copyAndPermute");
6999 
7000  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
      // The prefix string is only created in verbose mode; it is
      // dereferenced only inside `if (verbose)` blocks.
7001  std::unique_ptr<std::string> prefix;
7002  if (verbose) {
      // myRank stays -1 if the Map or its communicator is null.
7003  int myRank = -1;
7004  auto map = this->getMap ();
7005  if (! map.is_null ()) {
7006  auto comm = map->getComm ();
7007  if (! comm.is_null ()) {
7008  myRank = comm->getRank ();
7009  }
7010  }
7011  prefix = [myRank] () {
7012  std::ostringstream pfxStrm;
7013  pfxStrm << "Proc " << myRank << ": Tpetra::CrsMatrix::copyAndPermute: ";
7014  return std::unique_ptr<std::string> (new std::string (pfxStrm.str ()));
7015  } ();
      // Build the whole message first, then write it with a single
      // insertion into std::cerr.
7016  std::ostringstream os;
7017  os << *prefix << endl
7018  << *prefix << " "
7019  << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
7020  << *prefix << " "
7021  << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
7022  std::cerr << os.str ();
7023  }
7024 
      // The two permute lists must have the same length.
7025  const auto numPermute = permuteToLIDs.extent (0);
7026  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7027  (numPermute != permuteFromLIDs.extent (0),
7028  std::invalid_argument, "permuteToLIDs.extent(0) = "
7029  << numPermute << "!= permuteFromLIDs.extent(0) = "
7030  << permuteFromLIDs.extent (0) << ".");
7031 
      // The host views are read below, so neither DualView may need a
      // host sync at this point.
7032  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
7033  auto permuteToLIDs_h = permuteToLIDs.view_host ();
7034  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
7035  auto permuteFromLIDs_h = permuteFromLIDs.view_host ();
7036 
7037  // This dynamic cast should succeed, because we've already tested
7038  // it in checkSizes().
7040  const RMT& srcMat = dynamic_cast<const RMT&> (srcObj);
7041 
      // Only a matrix that does not have a static graph and uses
      // StaticProfile gets padding computed and applied before the copy.
7042  if (!this->isStaticGraph () && this->getProfileType () == StaticProfile) {
7043  auto padding =
7044  this->myGraph_->computeCrsPadding(*srcMat.getGraph(), numSameIDs, permuteToLIDs, permuteFromLIDs);
7045  if (padding.size() > 0)
7046  this->applyCrsPadding(padding);
7047  }
7048 
7049 
7050  if (verbose) {
7051  std::ostringstream os;
7052  os << *prefix << "Call copyAndPermuteImpl" << endl;
7053  std::cerr << os.str ();
7054  }
      // Hand the raw host pointers to the implementation.
7055  this->copyAndPermuteImpl (srcMat, numSameIDs, permuteToLIDs_h.data (),
7056  permuteFromLIDs_h.data (), numPermute);
7057  }
7058 
7059  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7060  void
7061  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7062 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
7063  packAndPrepareNew
7064 #else // TPETRA_ENABLE_DEPRECATED_CODE
7065  packAndPrepare
7066 #endif // TPETRA_ENABLE_DEPRECATED_CODE
7067  (const SrcDistObject& source,
7068  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
7069  Kokkos::DualView<char*, buffer_device_type>& exports,
7070  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7071  size_t& constantNumPackets,
7072  Distributor& distor)
7073  {
7076  using Teuchos::outArg;
7077  using Teuchos::REDUCE_MAX;
7078  using Teuchos::reduceAll;
7079  using std::endl;
7080  typedef LocalOrdinal LO;
7081  typedef GlobalOrdinal GO;
7082  const char tfecfFuncName[] = "packAndPrepare: ";
7083  ProfilingRegion regionPAP ("Tpetra::CrsMatrix::packAndPrepare");
7084 
7085  const bool debug = ::Tpetra::Details::Behavior::debug ();
7086  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
7087 
7088  // Processes on which the communicator is null should not participate.
7089  Teuchos::RCP<const Teuchos::Comm<int> > pComm = this->getComm ();
7090  if (pComm.is_null ()) {
7091  return;
7092  }
7093  const Teuchos::Comm<int>& comm = *pComm;
7094  const int myRank = comm.getSize ();
7095 
7096  std::unique_ptr<std::string> prefix;
7097  if (verbose) {
7098  prefix = [myRank] () {
7099  std::ostringstream pfxStrm;
7100  pfxStrm << "Proc " << myRank << ": Tpetra::CrsMatrix::packAndPrepare: ";
7101  return std::unique_ptr<std::string> (new std::string (pfxStrm.str ()));
7102  } ();
7103  std::ostringstream os;
7104  os << *prefix << "Start" << endl
7105  << *prefix << " "
7106  << dualViewStatusToString (exportLIDs, "exportLIDs")
7107  << endl
7108  << *prefix << " "
7109  << dualViewStatusToString (exports, "exports")
7110  << endl
7111  << *prefix << " "
7112  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
7113  << endl;
7114  std::cerr << os.str ();
7115  }
7116 
7117  // Attempt to cast the source object to CrsMatrix. If successful,
7118  // use the source object's packNew() method to pack its data for
7119  // communication. Otherwise, attempt to cast to RowMatrix; if
7120  // successful, use the source object's pack() method. Otherwise,
7121  // the source object doesn't have the right type.
7122  //
7123  // FIXME (mfh 30 Jun 2013, 11 Sep 2017) We don't even need the
7124  // RowMatrix to have the same Node type. Unfortunately, we don't
7125  // have a way to ask if the RowMatrix is "a RowMatrix with any
7126  // Node type," since RowMatrix doesn't have a base class. A
7127  // hypothetical RowMatrixBase<Scalar, LO, GO> class, which does
7128  // not currently exist, would satisfy this requirement.
7129  //
7130  // Why RowMatrixBase<Scalar, LO, GO>? The source object's Scalar
7131  // type doesn't technically need to match the target object's
7132  // Scalar type, so we could just have RowMatrixBase<LO, GO>. LO
7133  // and GO need not be the same, as long as there is no overflow of
7134  // the indices. However, checking for index overflow is global
7135  // and therefore undesirable.
7136 
7137  std::ostringstream msg; // for collecting error messages
7138  int lclBad = 0; // to be set below
7139 
7140  using crs_matrix_type = CrsMatrix<Scalar, LO, GO, Node>;
7141  const crs_matrix_type* srcCrsMat =
7142  dynamic_cast<const crs_matrix_type*> (&source);
7143  if (srcCrsMat != nullptr) {
7144  if (verbose) {
7145  std::ostringstream os;
7146  os << *prefix << "Source matrix same (CrsMatrix) type as target; "
7147  "calling packNew" << endl;
7148  std::cerr << os.str ();
7149  }
7150  try {
7151  srcCrsMat->packNew (exportLIDs, exports, numPacketsPerLID,
7152  constantNumPackets, distor);
7153  }
7154  catch (std::exception& e) {
7155  lclBad = 1;
7156  msg << "Proc " << myRank << ": " << e.what () << std::endl;
7157  }
7158  }
7159  else {
7160  using Kokkos::HostSpace;
7161  using Kokkos::subview;
7162  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
7163  using range_type = Kokkos::pair<size_t, size_t>;
7164 
7165  if (verbose) {
7166  std::ostringstream os;
7167  os << *prefix << "Source matrix NOT same (CrsMatrix) type as target"
7168  << endl;
7169  std::cerr << os.str ();
7170  }
7171 
7172  using row_matrix_type = RowMatrix<Scalar, LO, GO, Node>;
7173  const row_matrix_type* srcRowMat =
7174  dynamic_cast<const row_matrix_type*> (&source);
7175  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7176  (srcRowMat == nullptr, std::invalid_argument,
7177  "The source object of the Import or Export operation is neither a "
7178  "CrsMatrix (with the same template parameters as the target object), "
7179  "nor a RowMatrix (with the same first four template parameters as the "
7180  "target object).");
7181 
7182  // For the RowMatrix case, we need to convert from
7183  // Kokkos::DualView to Teuchos::Array*. This doesn't need to be
7184  // so terribly efficient, since packing a non-CrsMatrix
7185  // RowMatrix for Import/Export into a CrsMatrix is not a
7186  // critical case. Thus, we may allocate Teuchos::Array objects
7187  // here and copy to and from Kokkos::*View.
7188 
7189  // View exportLIDs's host data as a Teuchos::ArrayView.
7190  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
7191  auto exportLIDs_h = exportLIDs.view_host ();
7192  Teuchos::ArrayView<const LO> exportLIDs_av (exportLIDs_h.data (),
7193  exportLIDs_h.size ());
7194 
7195  // pack() will allocate exports_a as needed. We'll copy back
7196  // into exports (after (re)allocating exports if needed) below.
7197  Teuchos::Array<char> exports_a;
7198 
7199  // View exportLIDs' host data as a Teuchos::ArrayView. We don't
7200  // need to sync, since we're doing write-only access, but we do
7201  // need to mark the DualView as modified on host.
7202 
7203  numPacketsPerLID.clear_sync_state (); // write-only access
7204  numPacketsPerLID.modify_host ();
7205  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
7206  Teuchos::ArrayView<size_t> numPacketsPerLID_av (numPacketsPerLID_h.data (),
7207  numPacketsPerLID_h.size ());
7208 
7209  // Invoke RowMatrix's legacy pack() interface, using above
7210  // Teuchos::Array* objects.
7211  try {
7212  srcRowMat->pack (exportLIDs_av, exports_a, numPacketsPerLID_av,
7213  constantNumPackets, distor);
7214  }
7215  catch (std::exception& e) {
7216  lclBad = 1;
7217  msg << "Proc " << myRank << ": " << e.what () << std::endl;
7218  }
7219 
7220  // Allocate 'exports', and copy exports_a back into it.
7221  const size_t newAllocSize = static_cast<size_t> (exports_a.size ());
7222  if (static_cast<size_t> (exports.extent (0)) < newAllocSize) {
7223  const std::string oldLabel = exports.d_view.label ();
7224  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
7225  exports = exports_type (newLabel, newAllocSize);
7226  }
7227  // It's safe to assume that we're working on host anyway, so
7228  // just keep exports sync'd to host.
7229  // ignore current device contents
7230  exports.modify_host();
7231 
7232  auto exports_h = exports.view_host ();
7233  auto exports_h_sub = subview (exports_h, range_type (0, newAllocSize));
7234 
7235  // Kokkos::deep_copy needs a Kokkos::View input, so turn
7236  // exports_a into a nonowning Kokkos::View first before copying.
7237  typedef typename exports_type::t_host::execution_space HES;
7238  typedef Kokkos::Device<HES, HostSpace> host_device_type;
7239  Kokkos::View<const char*, host_device_type>
7240  exports_a_kv (exports_a.getRawPtr (), newAllocSize);
7241  Kokkos::deep_copy (exports_h_sub, exports_a_kv);
7242  }
7243 
7244  if (debug) {
7245  int gblBad = 0; // output argument; to be set below
7246  reduceAll<int, int> (comm, REDUCE_MAX, lclBad, outArg (gblBad));
7247  if (gblBad != 0) {
7248  Tpetra::Details::gathervPrint (std::cerr, msg.str (), comm);
7249  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7250  (true, std::logic_error, "packNew() or pack() threw an exception on "
7251  "one or more participating processes.");
7252  }
7253  }
7254  else {
7255  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7256  (lclBad != 0, std::logic_error, "packNew threw an exception on one "
7257  "or more participating processes. Here is this process' error "
7258  "message: " << msg.str ());
7259  }
7260 
7261  if (verbose) {
7262  std::ostringstream os;
7263  os << *prefix << "packAndPrepare: Done!" << endl
7264  << *prefix << " "
7265  << dualViewStatusToString (exportLIDs, "exportLIDs")
7266  << endl
7267  << *prefix << " "
7268  << dualViewStatusToString (exports, "exports")
7269  << endl
7270  << *prefix << " "
7271  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
7272  << endl;
7273  std::cerr << os.str ();
7274  }
7275  }
7276 
7277  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7278  size_t
7279  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7280  packRow (char exports[],
7281  const size_t offset,
7282  const size_t numEnt,
7283  const GlobalOrdinal gidsIn[],
7284  const impl_scalar_type valsIn[],
7285  const size_t numBytesPerValue) const
7286  {
7287  using Kokkos::View;
7288  using Kokkos::subview;
7290  typedef LocalOrdinal LO;
7291  typedef GlobalOrdinal GO;
7292  typedef impl_scalar_type ST;
7293  typedef typename View<int*, device_type>::HostMirror::execution_space HES;
7294 
7295  if (numEnt == 0) {
7296  // Empty rows always take zero bytes, to ensure sparsity.
7297  return 0;
7298  }
7299 
7300  const GO gid = 0; // packValueCount wants this
7301  const LO numEntLO = static_cast<size_t> (numEnt);
7302 
7303  const size_t numEntBeg = offset;
7304  const size_t numEntLen = PackTraits<LO, HES>::packValueCount (numEntLO);
7305  const size_t gidsBeg = numEntBeg + numEntLen;
7306  const size_t gidsLen = numEnt * PackTraits<GO, HES>::packValueCount (gid);
7307  const size_t valsBeg = gidsBeg + gidsLen;
7308  const size_t valsLen = numEnt * numBytesPerValue;
7309 
7310  char* const numEntOut = exports + numEntBeg;
7311  char* const gidsOut = exports + gidsBeg;
7312  char* const valsOut = exports + valsBeg;
7313 
7314  size_t numBytesOut = 0;
7315  int errorCode = 0;
7316  numBytesOut += PackTraits<LO, HES>::packValue (numEntOut, numEntLO);
7317 
7318  {
7319  Kokkos::pair<int, size_t> p;
7320  p = PackTraits<GO, HES>::packArray (gidsOut, gidsIn, numEnt);
7321  errorCode += p.first;
7322  numBytesOut += p.second;
7323 
7324  p = PackTraits<ST, HES>::packArray (valsOut, valsIn, numEnt);
7325  errorCode += p.first;
7326  numBytesOut += p.second;
7327  }
7328 
7329  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
7330  TEUCHOS_TEST_FOR_EXCEPTION
7331  (numBytesOut != expectedNumBytes, std::logic_error, "packRow: "
7332  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
7333  << expectedNumBytes << ".");
7334  TEUCHOS_TEST_FOR_EXCEPTION
7335  (errorCode != 0, std::runtime_error, "packRow: "
7336  "PackTraits::packArray returned a nonzero error code");
7337 
7338  return numBytesOut;
7339  }
7340 
// Unpack one packed row from the byte buffer `imports`, starting at byte
// `offset`, into gidsOut and valsOut.
//
// The row is read as three consecutive pieces: the entry count (as a
// LocalOrdinal), then numEnt global column indices, then numEnt values.
//
// gidsOut          - receives numEnt global column indices.
// valsOut          - receives numEnt values.
// imports          - source byte buffer.
// offset           - byte position in `imports` where this row begins.
// numBytes         - number of bytes the caller reports for this row.
// numEnt           - number of entries the caller reports for this row.
// numBytesPerValue - used only to compute the expected byte count.
//
// Returns the number of bytes read; 0 when numBytes is 0.  Throws
// std::logic_error when numBytes, numEnt, the entry count stored in the
// buffer, or the resulting byte counts disagree, and std::runtime_error when
// PackTraits::unpackArray reports a nonzero error code.
7341  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7342  size_t
7343  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7344  unpackRow (GlobalOrdinal gidsOut[],
7345  impl_scalar_type valsOut[],
7346  const char imports[],
7347  const size_t offset,
7348  const size_t numBytes,
7349  const size_t numEnt,
7350  const size_t numBytesPerValue)
7351  {
7352  using Kokkos::View;
7353  using Kokkos::subview;
7355  typedef LocalOrdinal LO;
7356  typedef GlobalOrdinal GO;
7357  typedef impl_scalar_type ST;
7358  typedef typename View<int*, device_type>::HostMirror::execution_space HES;
7359 
7360  if (numBytes == 0) {
7361  // Rows with zero bytes should always have zero entries.
7362  if (numEnt != 0) {
7363  const int myRank = this->getMap ()->getComm ()->getRank ();
7364  TEUCHOS_TEST_FOR_EXCEPTION
7365  (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
7366  "unpackRow: The number of bytes to unpack numBytes=0, but the "
7367  "number of entries to unpack (as reported by numPacketsPerLID) "
7368  "for this row numEnt=" << numEnt << " != 0.");
7369  }
7370  return 0;
7371  }
7372 
      // numBytes is nonzero from here on (the zero case returned above), so
      // a zero entry count is inconsistent.
7373  if (numEnt == 0 && numBytes != 0) {
7374  const int myRank = this->getMap ()->getComm ()->getRank ();
7375  TEUCHOS_TEST_FOR_EXCEPTION
7376  (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
7377  "unpackRow: The number of entries to unpack (as reported by "
7378  "numPacketsPerLID) numEnt=0, but the number of bytes to unpack "
7379  "numBytes=" << numBytes << " != 0.");
7380  }
7381 
7382  const GO gid = 0; // packValueCount wants this
7383  const LO lid = 0; // packValueCount wants this
7384 
      // Byte offsets and lengths of the three pieces inside `imports`.
7385  const size_t numEntBeg = offset;
7386  const size_t numEntLen = PackTraits<LO, HES>::packValueCount (lid);
7387  const size_t gidsBeg = numEntBeg + numEntLen;
7388  const size_t gidsLen = numEnt * PackTraits<GO, HES>::packValueCount (gid);
7389  const size_t valsBeg = gidsBeg + gidsLen;
7390  const size_t valsLen = numEnt * numBytesPerValue;
7391 
7392  const char* const numEntIn = imports + numEntBeg;
7393  const char* const gidsIn = imports + gidsBeg;
7394  const char* const valsIn = imports + valsBeg;
7395 
7396  size_t numBytesOut = 0;
7397  int errorCode = 0;
7398  LO numEntOut;
7399  numBytesOut += PackTraits<LO, HES>::unpackValue (numEntOut, numEntIn);
      // The entry count stored in the buffer must equal numEnt and must be
      // nonzero.  Both problems are reported in a single message when both
      // occur.
7400  if (static_cast<size_t> (numEntOut) != numEnt ||
7401  numEntOut == static_cast<LO> (0)) {
7402  const int myRank = this->getMap ()->getComm ()->getRank ();
7403  std::ostringstream os;
7404  os << "(Proc " << myRank << ") CrsMatrix::unpackRow: ";
7405  bool firstErrorCondition = false;
7406  if (static_cast<size_t> (numEntOut) != numEnt) {
7407  os << "Number of entries from numPacketsPerLID numEnt=" << numEnt
7408  << " does not equal number of entries unpacked from imports "
7409  "buffer numEntOut=" << numEntOut << ".";
7410  firstErrorCondition = true;
7411  }
7412  if (numEntOut == static_cast<LO> (0)) {
7413  if (firstErrorCondition) {
7414  os << " Also, ";
7415  }
7416  os << "Number of entries unpacked from imports buffer numEntOut=0, "
7417  "but number of bytes to unpack for this row numBytes=" << numBytes
7418  << " != 0. This should never happen, since packRow should only "
7419  "ever pack rows with a nonzero number of entries. In this case, "
7420  "the number of entries from numPacketsPerLID is numEnt=" << numEnt
7421  << ".";
7422  }
7423  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, os.str ());
7424  }
7425 
7426  {
      // unpackArray returns (error code, bytes read); accumulate both.
7427  Kokkos::pair<int, size_t> p;
7428  p = PackTraits<GO, HES>::unpackArray (gidsOut, gidsIn, numEnt);
7429  errorCode += p.first;
7430  numBytesOut += p.second;
7431 
7432  p = PackTraits<ST, HES>::unpackArray (valsOut, valsIn, numEnt);
7433  errorCode += p.first;
7434  numBytesOut += p.second;
7435  }
7436 
      // The bytes read must match the caller's count and also the size
      // computed from the row layout.
7437  TEUCHOS_TEST_FOR_EXCEPTION
7438  (numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
7439  << numBytesOut << " != numBytes = " << numBytes << ".");
7440 
7441  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
7442  TEUCHOS_TEST_FOR_EXCEPTION
7443  (numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
7444  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
7445  << expectedNumBytes << ".");
7446 
7447  TEUCHOS_TEST_FOR_EXCEPTION
7448  (errorCode != 0, std::runtime_error, "unpackRow: "
7449  "PackTraits::unpackArray returned a nonzero error code");
7450 
7451  return numBytesOut;
7452  }
7453 
// Count the matrix entries in the rows listed in exportLIDs and make sure the
// pack buffer `exports` is large enough to hold them.
//
// exports         - pack buffer; replaced by a new DualView of allocSize
//                   bytes only when its current extent is smaller.
// totalNumEntries - output: sum of getNumEntriesInLocalRow over exportLIDs.
//                   A row for which that call returns
//                   OrdinalTraits<size_t>::invalid() counts as zero.
// exportLIDs      - local row indices to be packed; must not need a host
//                   sync.
7454  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7455  void
7456  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7457  allocatePackSpaceNew (Kokkos::DualView<char*, buffer_device_type>& exports,
7458  size_t& totalNumEntries,
7459  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs) const
7460  {
7462  using std::endl;
7463  typedef impl_scalar_type IST;
7464  typedef LocalOrdinal LO;
7465  typedef GlobalOrdinal GO;
7466  //const char tfecfFuncName[] = "allocatePackSpaceNew: ";
7467 
7468  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
7469  // output to std::cerr on every MPI process. This is unwise for
7470  // runs with large numbers of MPI processes.
7471  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
7472  std::unique_ptr<std::string> prefix;
7473  if (verbose) {
      // myRank stays 0 when the Map or its communicator is null.
7474  int myRank = 0;
7475  auto map = this->getMap ();
7476  if (! map.is_null ()) {
7477  auto comm = map->getComm ();
7478  if (! comm.is_null ()) {
7479  myRank = comm->getRank ();
7480  }
7481  }
7482  // Restrict pfxStrm to inner scope to reduce high-water memory usage.
7483  prefix = [myRank] () {
7484  std::ostringstream pfxStrm;
7485  pfxStrm << "Proc " << myRank << ": Tpetra::CrsMatrix::allocatePackSpaceNew: ";
7486  return std::unique_ptr<std::string> (new std::string (pfxStrm.str ()));
7487  } ();
7488 
7489  std::ostringstream os;
7490  os << *prefix << "Before:"
7491  << endl
7492  << *prefix << " "
7493  << dualViewStatusToString (exports, "exports")
7494  << endl
7495  << *prefix << " "
7496  << dualViewStatusToString (exportLIDs, "exportLIDs")
7497  << endl;
7498  std::cerr << os.str ();
7499  }
7500 
7501  // The number of export LIDs must fit in LocalOrdinal, assuming
7502  // that the LIDs are distinct and valid on the calling process.
7503  const LO numExportLIDs = static_cast<LO> (exportLIDs.extent (0));
7504 
7505  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
7506  auto exportLIDs_h = exportLIDs.view_host ();
7507 
7508  // Count the total number of matrix entries to send.
7509  totalNumEntries = 0;
7510  for (LO i = 0; i < numExportLIDs; ++i) {
7511  const LO lclRow = exportLIDs_h[i];
7512  size_t curNumEntries = this->getNumEntriesInLocalRow (lclRow);
7513  // FIXME (mfh 25 Jan 2015) We should actually report invalid row
7514  // indices as an error. Just consider them nonowned for now.
7515  if (curNumEntries == Teuchos::OrdinalTraits<size_t>::invalid ()) {
7516  curNumEntries = 0;
7517  }
7518  totalNumEntries += curNumEntries;
7519  }
7520 
7521  // FIXME (mfh 24 Feb 2013, 24 Mar 2017) This code is only correct
7522  // if sizeof(IST) is a meaningful representation of the amount of
7523  // data in a Scalar instance. (LO and GO are always built-in
7524  // integer types.)
7525  //
7526  // Allocate the exports array. It does NOT need padding for
7527  // alignment, since we use memcpy to write to / read from send /
7528  // receive buffers.
      // Size: one LO per exported row plus one (value, GO) pair per entry.
7529  const size_t allocSize =
7530  static_cast<size_t> (numExportLIDs) * sizeof (LO) +
7531  totalNumEntries * (sizeof (IST) + sizeof (GO));
7532  if (static_cast<size_t> (exports.extent (0)) < allocSize) {
7533  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
7534 
      // Keep the existing label; fall back to "exports" when it is empty.
7535  const std::string oldLabel = exports.d_view.label ();
7536  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
7537  exports = exports_type (newLabel, allocSize);
7538  }
7539 
7540  if (verbose) {
7541  std::ostringstream os;
7542  os << *prefix << "After:"
7543  << endl
7544  << *prefix << " "
7545  << dualViewStatusToString (exports, "exports")
7546  << endl
7547  << *prefix << " "
7548  << dualViewStatusToString (exportLIDs, "exportLIDs")
7549  << endl;
7550  std::cerr << os.str ();
7551  }
7552  }
7553 
// Pack the rows listed in exportLIDs by delegating to one of two routines:
// ::Tpetra::Details::packCrsMatrixNew when this matrix has a static graph,
// and packNonStaticNew otherwise.  All arguments are forwarded unchanged.
//
// NOTE(review): the line between the return type and the function name
// (listing line 7556, presumably the CrsMatrix<...>:: qualifier) is absent
// from this listing -- confirm it against the real source file.
7554  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7555  void
7557  packNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
7558  Kokkos::DualView<char*, buffer_device_type>& exports,
7559  const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
7560  size_t& constantNumPackets,
7561  Distributor& dist) const
7562  {
7563  // The call to packNew in packAndPrepare catches and handles any exceptions.
7564  if (this->isStaticGraph ()) {
7565  using ::Tpetra::Details::packCrsMatrixNew;
7566  packCrsMatrixNew (*this, exports, numPacketsPerLID, exportLIDs,
7567  constantNumPackets, dist);
7568  }
7569  else {
7570  this->packNonStaticNew (exportLIDs, exports, numPacketsPerLID,
7571  constantNumPackets, dist);
7572  }
7573  }
7574 
// Pack the rows listed in exportLIDs, one row at a time, on the host.
//
// exportLIDs         - local row indices to pack; its length must equal that
//                      of numPacketsPerLID (std::invalid_argument otherwise).
// exports            - output byte buffer; sized by allocatePackSpaceNew and
//                      then marked as modified on host.
// numPacketsPerLID   - output: entry i receives the number of bytes packed
//                      for row exportLIDs[i] (0 for an empty row).  It is
//                      passed as const but modified through const_cast.
// constantNumPackets - output: always set to 0.
// (unnamed Distributor) - not used.
//
// Throws std::logic_error when a row's packed bytes extend past the end of
// `exports`.
//
// NOTE(review): several listing lines inside this definition are absent
// (7577, 7585-7587, 7717), so the class qualifier, some using-declarations
// and the name of the call that initialises valsIn_k are not visible here --
// confirm them against the real source file.
7575  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7576  void
7578  packNonStaticNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
7579  Kokkos::DualView<char*, buffer_device_type>& exports,
7580  const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
7581  size_t& constantNumPackets,
7582  Distributor& /* distor */) const
7583  {
7584  using Kokkos::View;
7588  using std::endl;
7589  typedef LocalOrdinal LO;
7590  typedef GlobalOrdinal GO;
7591  typedef impl_scalar_type ST;
7592  typedef typename View<int*, device_type>::HostMirror::execution_space HES;
7593  const char tfecfFuncName[] = "packNonStaticNew: ";
7594 
7595  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
7596  // output to std::cerr on every MPI process. This is unwise for
7597  // runs with large numbers of MPI processes.
7598  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
7599  std::unique_ptr<std::string> prefix;
7600  if (verbose) {
      // myRank stays 0 when the Map or its communicator is null.
7601  int myRank = 0;
7602  auto map = this->getMap ();
7603  if (! map.is_null ()) {
7604  auto comm = map->getComm ();
7605  if (! comm.is_null ()) {
7606  myRank = comm->getRank ();
7607  }
7608  }
7609  // Restrict pfxStrm to inner scope to reduce high-water memory usage.
7610  prefix = [myRank] () {
7611  std::ostringstream pfxStrm;
7612  pfxStrm << "(Proc " << myRank << ") ";
7613  return std::unique_ptr<std::string> (new std::string (pfxStrm.str ()));
7614  } ();
7615 
7616  std::ostringstream os;
7617  os << *prefix << "Tpetra::CrsMatrix::packNonStaticNew:" << endl;
7618  std::cerr << os.str ();
7619  }
7620 
7621  const size_t numExportLIDs = static_cast<size_t> (exportLIDs.extent (0));
7622  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7623  (numExportLIDs != static_cast<size_t> (numPacketsPerLID.extent (0)),
7624  std::invalid_argument, "exportLIDs.size() = " << numExportLIDs
7625  << " != numPacketsPerLID.size() = " << numPacketsPerLID.extent (0)
7626  << ".");
7627 
7628  // Setting this to zero tells the caller to expect a possibly
7629  // different ("nonconstant") number of packets per local index
7630  // (i.e., a possibly different number of entries per row).
7631  constantNumPackets = 0;
7632 
7633  // The pack buffer 'exports' enters this method possibly
7634  // unallocated. Do the first two parts of "Count, allocate, fill,
7635  // compute."
7636  size_t totalNumEntries = 0;
7637  this->allocatePackSpaceNew (exports, totalNumEntries, exportLIDs);
7638  const size_t bufSize = static_cast<size_t> (exports.extent (0));
7639 
7640  // Write-only host access
7641  exports.clear_sync_state();
7642  exports.modify_host();
7643  auto exports_h = exports.view_host ();
7644  if (verbose) {
7645  std::ostringstream os;
7646  os << *prefix << "After marking exports as modified on host, "
7647  << dualViewStatusToString (exports, "exports") << endl;
7648  std::cerr << os.str ();
7649  }
7650 
7651  // Read-only host access
7652  auto exportLIDs_h = exportLIDs.view_host ();
7653 
7654  // Write-only host access
      // numPacketsPerLID is a const reference here, so its sync state is
      // changed through const_cast.
7655  const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->clear_sync_state();
7656  const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->modify_host();
7657  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
7658 
7659  // Compute the number of "packets" (in this case, bytes) per
7660  // export LID (in this case, local index of the row to send), and
7661  // actually pack the data.
7662  size_t offset = 0; // current index into 'exports' array.
7663  for (size_t i = 0; i < numExportLIDs; ++i) {
7664  const LO lclRow = exportLIDs_h[i];
7665 
7666  size_t numEnt;
7667  numEnt = this->getNumEntriesInLocalRow (lclRow);
7668 
7669  // Only pack this row's data if it has a nonzero number of
7670  // entries. We can do this because receiving processes get the
7671  // number of packets, and will know that zero packets means zero
7672  // entries.
7673  if (numEnt == 0) {
7674  numPacketsPerLID_h[i] = 0;
7675  continue;
7676  }
7677 
7678  // Temporary buffer for global column indices.
7679  View<GO*, HES> gidsIn_k;
7680  {
7681  GO gid = 0;
7682  gidsIn_k = PackTraits<GO, HES>::allocateArray(gid, numEnt, "gids");
7683  }
7684 
7685  Teuchos::ArrayView<const Scalar> valsIn;
7686  if (this->isLocallyIndexed ()) {
7687  // If the matrix is locally indexed on the calling process, we
7688  // have to use its column Map (which it _must_ have in this
7689  // case) to convert to global indices.
7690  Teuchos::ArrayView<const LO> lidsIn;
7691  this->getLocalRowView (lclRow, lidsIn, valsIn);
7692  const map_type& colMap = * (this->getColMap ());
7693  for (size_t k = 0; k < numEnt; ++k) {
7694  gidsIn_k[k] = colMap.getGlobalElement (lidsIn[k]);
7695  }
7696  }
7697  else if (this->isGloballyIndexed ()) {
7698  // If the matrix is globally indexed on the calling process,
7699  // then we can use the column indices directly. However, we
7700  // have to get the global row index. The calling process must
7701  // have a row Map, since otherwise it shouldn't be participating
7702  // in packing operations.
7703  Teuchos::ArrayView<const GO> gblIndView;;
7704  const map_type& rowMap = * (this->getRowMap ());
7705  const GO gblRow = rowMap.getGlobalElement (lclRow);
7706  this->getGlobalRowView (gblRow, gblIndView, valsIn);
7707  for (size_t k = 0; k < numEnt; ++k) {
7708  gidsIn_k[k] = gblIndView[k];
7709  }
7710  }
7711  // mfh 11 Sep 2017: Currently, if the matrix is neither globally
7712  // nor locally indexed, then it has no entries. Therefore,
7713  // there is nothing to pack. No worries!
7714 
      // NOTE(review): the initialiser of valsIn_k is incomplete in this
      // listing (listing line 7717, which holds the called function's name
      // and first argument, is absent).
7715  typename HES::device_type outputDevice;
7716  auto valsIn_k =
7718  reinterpret_cast<const ST*> (valsIn.getRawPtr ()),
7719  valsIn.size (),
7720  true, "valsIn");
      // numEnt is nonzero here (empty rows were skipped above).
7721  const size_t numBytesPerValue =
7722  PackTraits<ST,HES>::packValueCount (valsIn[0]);
7723  const size_t numBytes =
7724  this->packRow (exports_h.data (), offset, numEnt, gidsIn_k.data (),
7725  valsIn_k.data (), numBytesPerValue);
      // NOTE(review): this bounds check runs after packRow has been called
      // on the buffer for this row.
7726  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7727  (offset > bufSize || offset + numBytes > bufSize, std::logic_error,
7728  "First invalid offset into 'exports' pack buffer at index i = " << i
7729  << ". exportLIDs_h[i]: " << exportLIDs_h[i] << ", bufSize: " <<
7730  bufSize << ", offset: " << offset << ", numBytes: " << numBytes <<
7731  ".");
7732  // numPacketsPerLID_h[i] is the number of "packets" in the
7733  // current local row i. Packet=char (really "byte") so use the
7734  // number of bytes of the packed data for that row.
7735  numPacketsPerLID_h[i] = numBytes;
7736  offset += numBytes;
7737  }
7738 
7739  if (verbose) {
7740  std::ostringstream os;
7741  os << *prefix << "Tpetra::CrsMatrix::packNonStaticNew: After:" << endl
7742  << *prefix << " "
7743  << dualViewStatusToString (exports, "exports")
7744  << endl
7745  << *prefix << " "
7746  << dualViewStatusToString (exportLIDs, "exportLIDs")
7747  << endl;
7748  std::cerr << os.str ();
7749  }
7750  }
7751 
// Raw-pointer front end to combineGlobalValues.
//
// Converts lclRow to a global row index through the graph's row Map, wraps
// `cols` and `vals` in Teuchos::ArrayView objects of length numEnt, and
// forwards them with combineMode to combineGlobalValues.
//
// lclRow      - local index of the target row.
// numEnt      - number of entries in `vals` and `cols`.
// vals        - entry values; reinterpreted as const Scalar*.
// cols        - global column indices.
// combineMode - forwarded unchanged.
//
// Returns numEnt.  Errors are reported by exceptions thrown from
// combineGlobalValues, not by the return value.
7752  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7753  LocalOrdinal
7754  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7755  combineGlobalValuesRaw (const LocalOrdinal lclRow,
7756  const LocalOrdinal numEnt,
7757  const impl_scalar_type vals[],
7758  const GlobalOrdinal cols[],
7759  const Tpetra::CombineMode combineMode)
7760  {
7761  typedef GlobalOrdinal GO;
7762  //const char tfecfFuncName[] = "combineGlobalValuesRaw: ";
7763 
7764  // mfh 23 Mar 2017: This branch is not thread safe in a debug
7765  // build, due to use of Teuchos::ArrayView; see #229.
7766  const GO gblRow = this->myGraph_->rowMap_->getGlobalElement (lclRow);
      // An empty view is built from a null pointer rather than from the
      // caller's pointer.
7767  Teuchos::ArrayView<const GO> cols_av (numEnt == 0 ? nullptr : cols, numEnt);
7768  Teuchos::ArrayView<const Scalar> vals_av (numEnt == 0 ? nullptr : reinterpret_cast<const Scalar*> (vals), numEnt);
7769 
7770  // FIXME (mfh 23 Mar 2017) This is a work-around for less common
7771  // combine modes. combineGlobalValues throws on error; it does
7772  // not return an error code. Thus, if it returns, it succeeded.
7773  this->combineGlobalValues (gblRow, cols_av, vals_av, combineMode);
7774  return numEnt;
7775  }
7776 
7777  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7778  void
7779  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7780  combineGlobalValues (const GlobalOrdinal globalRowIndex,
7781  const Teuchos::ArrayView<const GlobalOrdinal>& columnIndices,
7782  const Teuchos::ArrayView<const Scalar>& values,
7783  const Tpetra::CombineMode combineMode)
7784  {
7785  const char tfecfFuncName[] = "combineGlobalValues: ";
7786 
7787  if (isStaticGraph ()) {
7788  // INSERT doesn't make sense for a static graph, since you
7789  // aren't allowed to change the structure of the graph.
7790  // However, all the other combine modes work.
7791  if (combineMode == ADD) {
7792  sumIntoGlobalValues (globalRowIndex, columnIndices, values);
7793  }
7794  else if (combineMode == REPLACE) {
7795  replaceGlobalValues (globalRowIndex, columnIndices, values);
7796  }
7797  else if (combineMode == ABSMAX) {
7798  using ::Tpetra::Details::AbsMax;
7799  AbsMax<Scalar> f;
7800  this->template transformGlobalValues<AbsMax<Scalar> > (globalRowIndex,
7801  columnIndices,
7802  values, f);
7803  }
7804  else if (combineMode == INSERT) {
7805  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
7806  isStaticGraph () && combineMode == INSERT, std::invalid_argument,
7807  "INSERT combine mode is not allowed if the matrix has a static graph "
7808  "(i.e., was constructed with the CrsMatrix constructor that takes a "
7809  "const CrsGraph pointer).");
7810  }
7811  else {
7812  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
7813  true, std::logic_error, "Invalid combine mode; should never get "
7814  "here! Please report this bug to the Tpetra developers.");
7815  }
7816  }
7817  else { // The matrix has a dynamic graph.
7818  if (combineMode == ADD || combineMode == INSERT) {
7819  // For a dynamic graph, all incoming column indices are
7820  // inserted into the target graph. Duplicate indices will
7821  // have their values summed. In this context, ADD and INSERT
7822  // are equivalent. We need to call insertGlobalValues()
7823  // anyway if the column indices don't yet exist in this row,
7824  // so we just call insertGlobalValues() for both cases.
7825  try {
7826  this->insertGlobalValuesFiltered (globalRowIndex, columnIndices,
7827  values);
7828  }
7829  catch (std::exception& e) {
7830  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7831  (true, std::runtime_error, std::endl
7832  << "insertGlobalValuesFiltered(" << globalRowIndex << ", "
7833  << std::endl << Teuchos::toString (columnIndices) << ", "
7834  << std::endl << Teuchos::toString (values)
7835  << ") threw an exception: " << std::endl << e.what ());
7836  }
7837  }
7838  // FIXME (mfh 14 Mar 2012):
7839  //
7840  // Implementing ABSMAX or REPLACE for a dynamic graph would
7841  // require modifying assembly to attach a possibly different
7842  // combine mode to each inserted (i, j, A_ij) entry. For
7843  // example, consider two different Export operations to the same
7844  // target CrsMatrix, the first with ABSMAX combine mode and the
7845  // second with REPLACE. This isn't a common use case, so we
7846  // won't mess with it for now.
7847  else if (combineMode == ABSMAX) {
7848  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
7849  ! isStaticGraph () && combineMode == ABSMAX, std::logic_error,
7850  "ABSMAX combine mode when the matrix has a dynamic graph is not yet "
7851  "implemented.");
7852  }
7853  else if (combineMode == REPLACE) {
7854  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
7855  ! isStaticGraph () && combineMode == REPLACE, std::logic_error,
7856  "REPLACE combine mode when the matrix has a dynamic graph is not yet "
7857  "implemented.");
7858  }
7859  else {
7860  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
7861  true, std::logic_error, "Should never get here! Please report this "
7862  "bug to the Tpetra developers.");
7863  }
7864  }
7865  }
7866 
// Unpack received data and combine it into this matrix.
//
// The method is named unpackAndCombineNew when
// TPETRA_ENABLE_DEPRECATED_CODE is defined, and unpackAndCombine
// otherwise.
//
// importLIDs, imports, numPacketsPerLID, constantNumPackets and distor
// are forwarded unchanged to unpackAndCombineImpl.  combineMode == ZERO
// returns before any unpacking is attempted.
//
// Steps, in order:
//   1. If verbose, print the DualView statuses to std::cerr.
//   2. If debug, reject any combineMode outside
//      {ADD, REPLACE, ABSMAX, INSERT, ZERO} (std::invalid_argument).
//   3. Return early for ZERO.
//   4. For a non-static graph with StaticProfile, ask the graph for
//      padding and apply it if it is nonempty.
//   5. Call unpackAndCombineImpl.  In debug mode, a std::exception
//      thrown there is caught, the failure flag is max-reduced over the
//      communicator, and if any process failed, the per-process messages
//      are gathered and reported through a std::logic_error.
//   6. If verbose, print the DualView statuses again.
7867  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7868  void
7869  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7870 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
7871  unpackAndCombineNew
7872 #else // TPETRA_ENABLE_DEPRECATED_CODE
7873  unpackAndCombine
7874 #endif // TPETRA_ENABLE_DEPRECATED_CODE
7875  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
7876  Kokkos::DualView<char*, buffer_device_type> imports,
7877  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7878  const size_t constantNumPackets,
7879  Distributor& distor,
7880  const CombineMode combineMode)
7881  {
7884  using std::endl;
7885  const char tfecfFuncName[] = "unpackAndCombine: ";
7886  ProfilingRegion regionUAC ("Tpetra::CrsMatrix::unpackAndCombine");
7887 
7888  const bool debug = ::Tpetra::Details::Behavior::debug ();
7889  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
      // The accepted combine modes and their printable names; the two
      // arrays are index-aligned and are only consulted in debug mode.
7890  constexpr int numValidModes = 5;
7891  const CombineMode validModes[numValidModes] =
7892  {ADD, REPLACE, ABSMAX, INSERT, ZERO};
7893  const char* validModeNames[numValidModes] =
7894  {"ADD", "REPLACE", "ABSMAX", "INSERT", "ZERO"};
7895 
      // 'prefix' stays null unless verbose is true; every dereference
      // of it below is inside an 'if (verbose)' block.
7896  std::unique_ptr<std::string> prefix;
7897  int myRank = 0;
7898  if (verbose) {
      // The rank defaults to 0 if the Map or its communicator is null.
7899  auto map = this->getMap ();
7900  if (! map.is_null ()) {
7901  auto comm = map->getComm ();
7902  if (! comm.is_null ()) {
7903  myRank = comm->getRank ();
7904  }
7905  }
7906  prefix = [myRank] () {
7907  std::ostringstream pfxStrm;
7908  pfxStrm << "Proc " << myRank << ": Tpetra::CrsMatrix::unpackAndCombine: ";
7909  return std::unique_ptr<std::string> (new std::string (pfxStrm.str ()));
7910  } ();
      // Build the whole message first, then write it to std::cerr
      // with a single insertion.
7911  std::ostringstream os;
7912  os << *prefix << "Start:" << endl
7913  << *prefix << " "
7914  << dualViewStatusToString (importLIDs, "importLIDs")
7915  << endl
7916  << *prefix << " "
7917  << dualViewStatusToString (imports, "imports")
7918  << endl
7919  << *prefix << " "
7920  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
7921  << endl
7922  << *prefix << " constantNumPackets: " << constantNumPackets
7923  << endl
7924  << *prefix << " combineMode: " << combineModeToString (combineMode)
7925  << endl;
7926  std::cerr << os.str ();
7927  }
7928 
      // Debug-only validation of combineMode against validModes.
7929  if (debug) {
7930  if (std::find (validModes, validModes+numValidModes, combineMode) ==
7931  validModes+numValidModes) {
7932  std::ostringstream os;
7933  os << "Invalid combine mode. Valid modes are {";
7934  for (int k = 0; k < numValidModes; ++k) {
7935  os << validModeNames[k];
7936  if (k < numValidModes - 1) {
7937  os << ", ";
7938  }
7939  }
7940  os << "}.";
7941  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7942  (true, std::invalid_argument, os.str ());
7943  }
7944  }
7945 
7946  if (combineMode == ZERO) {
7947  return; // nothing to do
7948  }
7949 
      // Only a matrix that owns a non-static graph with StaticProfile
      // gets padded before unpacking.
7950  if (!this->isStaticGraph() && this->getProfileType() == StaticProfile) {
7951  auto padding = myGraph_->computeCrsPadding(importLIDs, numPacketsPerLID);
7952  if (padding.size() > 0)
7953  this->applyCrsPadding(padding);
7954  }
7955 
7956  if (debug) {
7957  using Teuchos::reduceAll;
7958  std::unique_ptr<std::ostringstream> msg (new std::ostringstream ());
7959  int lclBad = 0;
      // Catch locally so that every process still reaches the
      // reduceAll below, whether or not it failed.
7960  try {
7961  this->unpackAndCombineImpl (importLIDs, imports, numPacketsPerLID,
7962  constantNumPackets, distor, combineMode);
7963  } catch (std::exception& e) {
7964  lclBad = 1;
7965  *msg << e.what ();
7966  }
7967  int gblBad = 0;
7968  const Teuchos::Comm<int>& comm = * (this->getComm ());
7969  reduceAll<int, int> (comm, Teuchos::REDUCE_MAX,
7970  lclBad, Teuchos::outArg (gblBad));
7971  if (gblBad != 0) {
7972  // mfh 22 Oct 2017: 'prefix' might be null, since it is only
7973  // initialized when verbose output is enabled. Thus, we get the
7974  // process rank again here. This is an error message, so the
7975  // small run-time cost doesn't matter. See #1887.
7976  std::ostringstream os;
7977  os << "(Proc " << comm.getRank () << ") " << msg->str () << endl;
      // Reuse 'msg' as the destination for the gathered output.
7978  msg = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
7979  ::Tpetra::Details::gathervPrint (*msg, os.str (), comm);
7980  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7981  (true, std::logic_error, std::endl << "unpackAndCombineImpl "
7982  "threw an exception on one or more participating processes: "
7983  << endl << msg->str ());
7984  }
7985  }
7986  else {
      // Non-debug path: no local catch and no collective error check.
7987  this->unpackAndCombineImpl (importLIDs, imports, numPacketsPerLID,
7988  constantNumPackets, distor, combineMode);
7989  }
7990 
7991  if (verbose) {
7992  std::ostringstream os;
7993  os << *prefix << "Done!" << endl
7994  << *prefix << " "
7995  << dualViewStatusToString (importLIDs, "importLIDs")
7996  << endl
7997  << *prefix << " "
7998  << dualViewStatusToString (imports, "imports")
7999  << endl
8000  << *prefix << " "
8001  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
8002  << endl;
8003  std::cerr << os.str ();
8004  }
8005  }
8006 
// Dispatch the unpack to the implementation that matches this
// matrix's graph kind.
//
// If isStaticGraph() is true, the work is delegated to
// ::Tpetra::Details::unpackCrsMatrixAndCombineNew, which also receives
// 'atomic'.  Otherwise it is delegated to
// unpackAndCombineImplNonStatic, which does not receive 'atomic'.
// All other arguments are forwarded unchanged.  This function does no
// exception handling of its own.
8007  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8008  void
8009  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
8010  unpackAndCombineImpl (const Kokkos::DualView<const local_ordinal_type*,
8011  buffer_device_type>& importLIDs,
8012  const Kokkos::DualView<const char*,
8013  buffer_device_type>& imports,
8014  const Kokkos::DualView<const size_t*,
8015  buffer_device_type>& numPacketsPerLID,
8016  const size_t constantNumPackets,
8017  Distributor & distor,
8018  const CombineMode combineMode,
8019  const bool atomic)
8020  {
8021  // Exceptions are caught and handled upstream, so we just call the
8022  // implementations directly.
8023  if (this->isStaticGraph ()) {
8024  using ::Tpetra::Details::unpackCrsMatrixAndCombineNew;
      // Note the argument order: imports and numPacketsPerLID come
      // before importLIDs in this call.
8025  unpackCrsMatrixAndCombineNew (*this, imports, numPacketsPerLID,
8026  importLIDs, constantNumPackets,
8027  distor, combineMode, atomic);
8028  }
8029  else {
8030  this->unpackAndCombineImplNonStatic (importLIDs, imports,
8031  numPacketsPerLID,
8032  constantNumPackets,
8033  distor, combineMode);
8034  }
8035  }
8036 
// Host-side unpack-and-combine for a matrix whose graph is not static.
//
// Layout as read by this function: numPacketsPerLID[i] is used as the
// number of bytes that row i occupies in 'imports' (the local variable
// is named numBytes), and rows are stored back to back in the order of
// importLIDs.  A row with a nonzero byte count starts with a packed
// LocalOrdinal that holds the row's entry count.
//
// The function makes two passes over the import rows:
//   Pass 1 reads each row's entry count to find the maximum, which
//          sizes the scratch arrays.
//   Pass 2 unpacks each row with unpackRow into the scratch arrays and
//          passes the result to combineGlobalValuesRaw together with
//          the local row index importLIDs[i] and combineMode.
//
// constantNumPackets and distor are unused (their names are commented
// out in the signature).
//
// Errors raised through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC:
//   std::invalid_argument if importLIDs and numPacketsPerLID differ in
//     extent, or if combineMode is not one of ADD, INSERT, REPLACE,
//     ABSMAX, ZERO.
//   std::logic_error if unpackRow returns a byte count different from
//     numPacketsPerLID[i]; further std::logic_error checks exist only
//     when HAVE_TPETRA_DEBUG is defined.
8037  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8038  void
8039  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
8040  unpackAndCombineImplNonStatic (const Kokkos::DualView<const local_ordinal_type*,
8041  buffer_device_type>& importLIDs,
8042  const Kokkos::DualView<const char*,
8043  buffer_device_type>& imports,
8044  const Kokkos::DualView<const size_t*,
8045  buffer_device_type>& numPacketsPerLID,
8046  const size_t /* constantNumPackets */,
8047  Distributor& /* distor */,
8048  const CombineMode combineMode)
8049  {
8050  using Kokkos::View;
8051  using Kokkos::subview;
8052  using Kokkos::MemoryUnmanaged;
8056  using std::endl;
8057  typedef LocalOrdinal LO;
8058  typedef GlobalOrdinal GO;
8059  typedef impl_scalar_type ST;
8060  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
      // HES: the execution space of the host mirror of a device View.
8061  typedef typename View<int*, device_type>::HostMirror::execution_space HES;
8062  typedef std::pair<typename View<int*, HES>::size_type,
8063  typename View<int*, HES>::size_type> pair_type;
      // Unmanaged View types used for the per-row subviews of the
      // scratch arrays in the second pass.
8064  typedef View<GO*, HES, MemoryUnmanaged> gids_out_type;
8065  typedef View<ST*, HES, MemoryUnmanaged> vals_out_type;
8066  const char tfecfFuncName[] = "unpackAndCombineImplNonStatic: ";
8067 
8068  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
8069  // output to std::cerr on every MPI process. This is unwise for
8070  // runs with large numbers of MPI processes.
8071  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
8072  std::unique_ptr<std::string> prefix;
8073  if (verbose) {
8074  int myRank = 0;
8075  auto map = this->getMap ();
8076  if (! map.is_null ()) {
8077  auto comm = map->getComm ();
8078  if (! comm.is_null ()) {
8079  myRank = comm->getRank ();
8080  }
8081  }
8082  // Restrict pfxStrm to inner scope to reduce high-water memory usage.
8083  prefix = [myRank] () {
8084  std::ostringstream pfxStrm;
8085  pfxStrm << "Proc " << myRank << ": Tpetra::CrsMatrix::"
8086  "unpackAndCombineImplNonStatic: ";
8087  return std::unique_ptr<std::string> (new std::string (pfxStrm.str ()));
8088  } ();
8089 
8090  std::ostringstream os;
8091  os << *prefix << endl; // we've already printed DualViews' statuses
8092  std::cerr << os.str ();
8093  }
8094 
      // Argument validation; both checks run in every build.
8095  const size_type numImportLIDs = importLIDs.extent (0);
8096  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
8097  (numImportLIDs != static_cast<size_type> (numPacketsPerLID.extent (0)),
8098  std::invalid_argument, "importLIDs.size() = " << numImportLIDs
8099  << " != numPacketsPerLID.size() = " << numPacketsPerLID.extent (0)
8100  << ".");
8101  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
8102  (combineMode != ADD && combineMode != INSERT && combineMode != REPLACE &&
8103  combineMode != ABSMAX && combineMode != ZERO, std::invalid_argument,
8104  "Invalid CombineMode value " << combineMode << ". Valid "
8105  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO.");
8106  if (combineMode == ZERO || numImportLIDs == 0) {
8107  return; // nothing to do; no need to combine entries
8108  }
8109 
8110  // We're unpacking on host. This is read-only host access of imports.
      // The sync is done on a non-const alias obtained from
      // castAwayConstDualView, because the parameter is a DualView of
      // const data.
8111  {
8112  auto imports_nc = castAwayConstDualView (imports);
8113  imports_nc.sync_host ();
8114  }
8115  auto imports_h = imports.view_host ();
8116 
8117  // Read-only host access.
8118  {
8119  auto numPacketsPerLID_nc = castAwayConstDualView (numPacketsPerLID);
8120  numPacketsPerLID_nc.sync_host ();
8121  }
8122  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
8123 
      // importLIDs is not synced here; it is asserted to be current on
      // host already.
8124  TEUCHOS_ASSERT( ! importLIDs.need_sync_host () );
8125  auto importLIDs_h = importLIDs.view_host ();
8126 
8127  size_t numBytesPerValue;
8128  {
8129  // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
8130  // with run-time size? We already assume that all entries in both the
8131  // source and target matrices have the same size. If the calling process
8132  // owns at least one entry in either matrix, we can use that entry to set
8133  // the size. However, it is possible that the calling process owns no
8134  // entries. In that case, we're in trouble. One way to fix this would be
8135  // for each row's data to contain the run-time size. This is only
8136  // necessary if the size is not a compile-time constant.
      // NOTE(review): 'val' is never assigned before being passed to
      // packValueCount; presumably only its type matters there --
      // confirm against PackTraits.
8137  Scalar val;
8138  numBytesPerValue = PackTraits<ST, HES>::packValueCount (val);
8139  }
8140 
8141  // Determine the maximum number of entries in any one row
      // (Pass 1.)  'offset' tracks the start of row i's bytes in
      // imports_h and advances by numBytes for every nonempty row.
8142  size_t offset = 0;
8143  size_t maxRowNumEnt = 0;
8144  for (size_type i = 0; i < numImportLIDs; ++i) {
8145  const size_t numBytes = numPacketsPerLID_h[i];
8146  if (numBytes == 0) {
8147  continue; // empty buffer for that row means that the row is empty
8148  }
8149  // We need to unpack a nonzero number of entries for this row.
8150 #ifdef HAVE_TPETRA_DEBUG
8151  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
8152  (offset + numBytes > static_cast<size_t> (imports_h.extent (0)),
8153  std::logic_error, "At local row index importLIDs_h[i=" << i << "]="
8154  << importLIDs_h[i] << ", offset (=" << offset << ") + numBytes (="
8155  << numBytes << ") > imports_h.extent(0)="
8156  << imports_h.extent (0) << ".");
8157 #endif // HAVE_TPETRA_DEBUG
8158 
8159  LO numEntLO = 0;
8160 
8161 #ifdef HAVE_TPETRA_DEBUG
8162  const size_t theNumBytes = PackTraits<LO, HES>::packValueCount (numEntLO);
8163  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
8164  (theNumBytes > numBytes, std::logic_error, "theNumBytes = "
8165  << theNumBytes << " > numBytes = " << numBytes << ".");
8166 #endif // HAVE_TPETRA_DEBUG
8167 
      // The row's entry count is the first value packed in its buffer.
8168  const char* const inBuf = imports_h.data () + offset;
8169  const size_t actualNumBytes =
8170  PackTraits<LO, HES>::unpackValue (numEntLO, inBuf);
8171 
8172 #ifdef HAVE_TPETRA_DEBUG
8173  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
8174  (actualNumBytes > numBytes, std::logic_error, "At i = " << i
8175  << ", actualNumBytes=" << actualNumBytes
8176  << " > numBytes=" << numBytes << ".");
8177  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
8178  (numEntLO == 0, std::logic_error, "At local row index importLIDs_h[i="
8179  << i << "]=" << importLIDs_h[i] << ", the number of entries read "
8180  "from the packed data is numEntLO=" << numEntLO << ", but numBytes="
8181  << numBytes << " != 0.");
8182 #else
      // actualNumBytes is only checked in debug builds; the cast
      // marks it as intentionally unused otherwise.
8183  (void) actualNumBytes;
8184 #endif // HAVE_TPETRA_DEBUG
8185 
8186  maxRowNumEnt = std::max (static_cast<size_t> (numEntLO), maxRowNumEnt);
8187  offset += numBytes;
8188  }
8189 
8190  // Temporary space to cache incoming global column indices and
8191  // values. Column indices come in as global indices, in case the
8192  // source object's column Map differs from the target object's
8193  // (this's) column Map.
8194  View<GO*, HES> gblColInds;
8195  View<LO*, HES> lclColInds;
8196  View<ST*, HES> vals;
8197  {
8198  GO gid = 0;
8199  LO lid = 0;
8200  // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
8201  // with run-time size? We already assume that all entries in both the
8202  // source and target matrices have the same size. If the calling process
8203  // owns at least one entry in either matrix, we can use that entry to set
8204  // the size. However, it is possible that the calling process owns no
8205  // entries. In that case, we're in trouble. One way to fix this would be
8206  // for each row's data to contain the run-time size. This is only
8207  // necessary if the size is not a compile-time constant.
8208  Scalar val;
      // Each scratch array is sized for the longest incoming row, so
      // it can be reused for every row in the second pass.
      // NOTE(review): lclColInds is allocated here but is not
      // referenced again in this function.
8209  gblColInds = PackTraits<GO, HES>::allocateArray (gid, maxRowNumEnt, "gids");
8210  lclColInds = PackTraits<LO, HES>::allocateArray (lid, maxRowNumEnt, "lids");
8211  vals = PackTraits<ST, HES>::allocateArray (val, maxRowNumEnt, "vals");
8212  }
8213 
      // (Pass 2.)  Walk the rows again from the start of the buffer,
      // unpacking and combining one row at a time.
8214  offset = 0;
8215  for (size_type i = 0; i < numImportLIDs; ++i) {
8216  const size_t numBytes = numPacketsPerLID_h[i];
8217  if (numBytes == 0) {
8218  continue; // empty buffer for that row means that the row is empty
8219  }
      // Re-read this row's entry count from the start of its bytes.
8220  LO numEntLO = 0;
8221  const char* const inBuf = imports_h.data () + offset;
8222  const size_t actualNumBytes = PackTraits<LO, HES>::unpackValue (numEntLO, inBuf);
8223  (void) actualNumBytes;
8224 
8225  const size_t numEnt = static_cast<size_t>(numEntLO);;
8226  const LO lclRow = importLIDs_h[i];
8227 
      // Views of the first numEnt slots of the scratch arrays.
8228  gids_out_type gidsOut = subview (gblColInds, pair_type (0, numEnt));
8229  vals_out_type valsOut = subview (vals, pair_type (0, numEnt));
8230 
      // unpackRow is given the base of the whole buffer plus 'offset',
      // not the row pointer inBuf.
8231  const size_t numBytesOut =
8232  unpackRow (gidsOut.data (), valsOut.data (), imports_h.data (),
8233  offset, numBytes, numEnt, numBytesPerValue);
8234  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
8235  (numBytes != numBytesOut, std::logic_error, "At i = " << i << ", "
8236  << "numBytes = " << numBytes << " != numBytesOut = " << numBytesOut
8237  << ".");
8238 
8239  const ST* const valsRaw = const_cast<const ST*> (valsOut.data ());
8240  const GO* const gidsRaw = const_cast<const GO*> (gidsOut.data ());
8241  this->combineGlobalValuesRaw (lclRow, numEnt, valsRaw, gidsRaw, combineMode);
8242 
8243  // Don't update offset until current LID has succeeded.
8244  offset += numBytes;
8245  } // for each import LID i
8246  }
8247 
// Return a multivector distributed by this matrix's column Map, or
// null when none is needed.
//
// X_domainMap: only its number of vectors (columns) is read.
// force:       if true, a multivector is returned even when the
//              graph's Import object is null.
//
// The result is cached in importMV_ and reused on later calls as long
// as the requested number of vectors matches; otherwise a new one is
// allocated and replaces the cache.  A reused multivector is returned
// as is, without being refilled.
//
// Raises std::runtime_error if the matrix has no column Map, or if its
// graph is not fill complete.
8248  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8249  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
8251  getColumnMapMultiVector (const MV& X_domainMap,
8252  const bool force) const
8253  {
8254  using Teuchos::null;
8255  using Teuchos::RCP;
8256  using Teuchos::rcp;
8257 
8258  TEUCHOS_TEST_FOR_EXCEPTION(
8259  ! this->hasColMap (), std::runtime_error, "Tpetra::CrsMatrix::getColumn"
8260  "MapMultiVector: You may only call this method if the matrix has a "
8261  "column Map. If the matrix does not yet have a column Map, you should "
8262  "first call fillComplete (with domain and range Map if necessary).");
8263 
8264  // If the graph is not fill complete, then the Import object (if
8265  // one should exist) hasn't been constructed yet.
8266  TEUCHOS_TEST_FOR_EXCEPTION(
8267  ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
8268  "CrsMatrix::getColumnMapMultiVector: You may only call this method if "
8269  "this matrix's graph is fill complete.");
8270 
8271  const size_t numVecs = X_domainMap.getNumVectors ();
8272  RCP<const import_type> importer = this->getGraph ()->getImporter ();
8273  RCP<const map_type> colMap = this->getColMap ();
8274 
8275  RCP<MV> X_colMap; // null by default
8276 
8277  // If the Import object is trivial (null), then we don't need a
8278  // separate column Map multivector. Just return null in that
8279  // case. The caller is responsible for knowing not to use the
8280  // returned null pointer.
8281  //
8282  // If the Import is nontrivial, then we do need a separate
8283  // column Map multivector for the Import operation. Check in
8284  // that case if we have to (re)create the column Map
8285  // multivector.
8286  if (! importer.is_null () || force) {
      // (Re)allocate when there is no cached multivector or when its
      // number of vectors differs from the request.
8287  if (importMV_.is_null () || importMV_->getNumVectors () != numVecs) {
8288  X_colMap = rcp (new MV (colMap, numVecs));
8289 
8290  // Cache the newly created multivector for later reuse.
8291  importMV_ = X_colMap;
8292  }
8293  else { // Yay, we can reuse the cached multivector!
8294  X_colMap = importMV_;
8295  // mfh 09 Jan 2013: We don't have to fill with zeros first,
8296  // because the Import uses INSERT combine mode, which overwrites
8297  // existing entries.
8298  //
8299  //X_colMap->putScalar (ZERO);
8300  }
8301  }
8302  return X_colMap;
8303  }
8304 
// Return a multivector distributed by this matrix's row Map, or null
// when none is needed (the error text below names this method
// getRowMapMultiVector).
//
// Y_rangeMap: only its number of vectors (columns) is read.
// force:      if true, a multivector is returned even when the
//             graph's Export object is null.
//
// The result is cached in exportMV_ and reused on later calls as long
// as the requested number of vectors matches; otherwise a new one is
// allocated and replaces the cache.
//
// Raises std::runtime_error if the matrix's graph is not fill complete.
8305  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8306  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
8309  const bool force) const
8310  {
8311  using Teuchos::null;
8312  using Teuchos::RCP;
8313  using Teuchos::rcp;
8314 
8315  // If the graph is not fill complete, then the Export object (if
8316  // one should exist) hasn't been constructed yet.
8317  TEUCHOS_TEST_FOR_EXCEPTION(
8318  ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
8319  "CrsMatrix::getRowMapMultiVector: You may only call this method if this "
8320  "matrix's graph is fill complete.");
8321 
8322  const size_t numVecs = Y_rangeMap.getNumVectors ();
8323  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
8324  // Every version of the constructor takes either a row Map, or a
8325  // graph (all of whose constructors take a row Map). Thus, the
8326  // matrix always has a row Map.
8327  RCP<const map_type> rowMap = this->getRowMap ();
8328 
8329  RCP<MV> Y_rowMap; // null by default
8330 
8331  // If the Export object is trivial (null), then we don't need a
8332  // separate row Map multivector. Just return null in that case.
8333  // The caller is responsible for knowing not to use the returned
8334  // null pointer.
8335  //
8336  // If the Export is nontrivial, then we do need a separate row
8337  // Map multivector for the Export operation. Check in that case
8338  // if we have to (re)create the row Map multivector.
8339  if (! exporter.is_null () || force) {
      // (Re)allocate when there is no cached multivector or when its
      // number of vectors differs from the request.
8340  if (exportMV_.is_null () || exportMV_->getNumVectors () != numVecs) {
8341  Y_rowMap = rcp (new MV (rowMap, numVecs));
8342  exportMV_ = Y_rowMap; // Cache the newly created MV for later reuse.
8343  }
8344  else { // Yay, we can reuse the cached multivector!
8345  Y_rowMap = exportMV_;
8346  }
8347  }
8348  return Y_rowMap;
8349  }
8350 
// Forward removeEmptyProcessesInPlace to the matrix's own graph, then
// refresh the members of this matrix that mirror graph state.
//
// newMap: passed unchanged to the graph's removeEmptyProcessesInPlace.
//
// After the graph call, map_ is reset to the current row Map and
// staticGraph_ is reset to a const view of myGraph_.
//
// Raises std::logic_error if myGraph_ is null, i.e. if the matrix does
// not own a modifiable graph.
8351  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8352  void
8354  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
8355  {
8356  TEUCHOS_TEST_FOR_EXCEPTION(
8357  myGraph_.is_null (), std::logic_error, "Tpetra::CrsMatrix::"
8358  "removeEmptyProcessesInPlace: This method does not work when the matrix "
8359  "was created with a constant graph (that is, when it was created using "
8360  "the version of its constructor that takes an RCP<const CrsGraph>). "
8361  "This is because the matrix is not allowed to modify the graph in that "
8362  "case, but removing empty processes requires modifying the graph.");
8363  myGraph_->removeEmptyProcessesInPlace (newMap);
8364  // Even though CrsMatrix's row Map (as returned by getRowMap())
8365  // comes from its CrsGraph, CrsMatrix still implements DistObject,
8366  // so we also have to change the DistObject's Map.
8367  this->map_ = this->getRowMap ();
8368  // In the nonconst graph case, staticGraph_ is just a const
8369  // pointer to myGraph_. This assignment is probably redundant,
8370  // but it doesn't hurt.
8371  staticGraph_ = Teuchos::rcp_const_cast<const Graph> (myGraph_);
8372  }
8373 
// Compute C = alpha*A + beta*B, where B is this matrix, and return C
// as a RowMatrix.
//
// alpha, beta: scalar coefficients.  A term whose coefficient equals
//              zero is skipped entirely: its rows are neither counted
//              nor inserted.
// A:           the other operand.  It is accessed through
//              getDomainMap, getRangeMap, getRowMap,
//              getNumEntriesInLocalRow and getGlobalRowCopy.
// domainMap, rangeMap: Maps for C's fillComplete.  If null, B's Map is
//              used when B has one, otherwise A's; if neither has one,
//              std::invalid_argument is raised.
// params:      optional.  Reads the entry "Call fillComplete" (default
//              true) and the sublists "Constructor parameters" and
//              "fillComplete parameters".
//
// C is built on B's row Map.  If A and B have the same row Map
// (isSameAs), C is created with StaticProfile and a per-row capacity
// equal to the sum of the row lengths of the terms that take part.  If
// the row Maps differ, C is created with zero initial capacity when
// TPETRA_ENABLE_DEPRECATED_CODE is defined, and std::invalid_argument
// is raised otherwise.
//
// The rows of alpha*A and beta*B are inserted into C with
// insertGlobalValues, one global row at a time.  fillComplete is
// called on C unless "Call fillComplete" is false.
8374  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8375  Teuchos::RCP<RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
8377  add (const Scalar& alpha,
8379  const Scalar& beta,
8380  const Teuchos::RCP<const map_type>& domainMap,
8381  const Teuchos::RCP<const map_type>& rangeMap,
8382  const Teuchos::RCP<Teuchos::ParameterList>& params) const
8383  {
8384  using Teuchos::Array;
8385  using Teuchos::ArrayView;
8386  using Teuchos::ParameterList;
8387  using Teuchos::RCP;
8388  using Teuchos::rcp;
8389  using Teuchos::rcp_implicit_cast;
8390  using Teuchos::sublist;
8391  typedef LocalOrdinal LO;
8392  typedef GlobalOrdinal GO;
8395 
8396  const crs_matrix_type& B = *this; // a convenient abbreviation
      // These locals hold the Scalar values zero and one; this ZERO is
      // a Scalar, not a combine mode.
8397  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
8398  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one ();
8399 
8400  // If the user didn't supply a domain or range Map, then try to
8401  // get one from B first (if it has them), then from A (if it has
8402  // them). If we don't have any domain or range Maps, scold the
8403  // user.
8404  RCP<const map_type> A_domainMap = A.getDomainMap ();
8405  RCP<const map_type> A_rangeMap = A.getRangeMap ();
8406  RCP<const map_type> B_domainMap = B.getDomainMap ();
8407  RCP<const map_type> B_rangeMap = B.getRangeMap ();
8408 
8409  RCP<const map_type> theDomainMap = domainMap;
8410  RCP<const map_type> theRangeMap = rangeMap;
8411 
8412  if (domainMap.is_null ()) {
8413  if (B_domainMap.is_null ()) {
8414  TEUCHOS_TEST_FOR_EXCEPTION(
8415  A_domainMap.is_null (), std::invalid_argument,
8416  "Tpetra::CrsMatrix::add: If neither A nor B have a domain Map, "
8417  "then you must supply a nonnull domain Map to this method.");
8418  theDomainMap = A_domainMap;
8419  } else {
8420  theDomainMap = B_domainMap;
8421  }
8422  }
8423  if (rangeMap.is_null ()) {
8424  if (B_rangeMap.is_null ()) {
8425  TEUCHOS_TEST_FOR_EXCEPTION(
8426  A_rangeMap.is_null (), std::invalid_argument,
8427  "Tpetra::CrsMatrix::add: If neither A nor B have a range Map, "
8428  "then you must supply a nonnull range Map to this method.");
8429  theRangeMap = A_rangeMap;
8430  } else {
8431  theRangeMap = B_rangeMap;
8432  }
8433  }
8434 
8435 #ifdef HAVE_TPETRA_DEBUG
8436  // In a debug build, check that A and B have matching domain and
8437  // range Maps, if they have domain and range Maps at all. (If
8438  // they aren't fill complete, then they may not yet have them.)
8439  if (! A_domainMap.is_null () && ! A_rangeMap.is_null ()) {
8440  if (! B_domainMap.is_null () && ! B_rangeMap.is_null ()) {
8441  TEUCHOS_TEST_FOR_EXCEPTION(
8442  ! B_domainMap->isSameAs (*A_domainMap), std::invalid_argument,
8443  "Tpetra::CrsMatrix::add: The input RowMatrix A must have a domain Map "
8444  "which is the same as (isSameAs) this RowMatrix's domain Map.");
8445  TEUCHOS_TEST_FOR_EXCEPTION(
8446  ! B_rangeMap->isSameAs (*A_rangeMap), std::invalid_argument,
8447  "Tpetra::CrsMatrix::add: The input RowMatrix A must have a range Map "
8448  "which is the same as (isSameAs) this RowMatrix's range Map.");
8449  TEUCHOS_TEST_FOR_EXCEPTION(
8450  ! domainMap.is_null () && ! domainMap->isSameAs (*B_domainMap),
8451  std::invalid_argument,
8452  "Tpetra::CrsMatrix::add: The input domain Map must be the same as "
8453  "(isSameAs) this RowMatrix's domain Map.");
8454  TEUCHOS_TEST_FOR_EXCEPTION(
8455  ! rangeMap.is_null () && ! rangeMap->isSameAs (*B_rangeMap),
8456  std::invalid_argument,
8457  "Tpetra::CrsMatrix::add: The input range Map must be the same as "
8458  "(isSameAs) this RowMatrix's range Map.");
8459  }
8460  }
8461  else if (! B_domainMap.is_null () && ! B_rangeMap.is_null ()) {
8462  TEUCHOS_TEST_FOR_EXCEPTION(
8463  ! domainMap.is_null () && ! domainMap->isSameAs (*B_domainMap),
8464  std::invalid_argument,
8465  "Tpetra::CrsMatrix::add: The input domain Map must be the same as "
8466  "(isSameAs) this RowMatrix's domain Map.");
8467  TEUCHOS_TEST_FOR_EXCEPTION(
8468  ! rangeMap.is_null () && ! rangeMap->isSameAs (*B_rangeMap),
8469  std::invalid_argument,
8470  "Tpetra::CrsMatrix::add: The input range Map must be the same as "
8471  "(isSameAs) this RowMatrix's range Map.");
8472  }
8473  else {
8474  TEUCHOS_TEST_FOR_EXCEPTION(
8475  domainMap.is_null () || rangeMap.is_null (), std::invalid_argument,
8476  "Tpetra::CrsMatrix::add: If neither A nor B have a domain and range "
8477  "Map, then you must supply a nonnull domain and range Map to this "
8478  "method.");
8479  }
8480 #endif // HAVE_TPETRA_DEBUG
8481 
8482  // What parameters do we pass to C's constructor? Do we call
8483  // fillComplete on C after filling it? And if so, what parameters
8484  // do we pass to C's fillComplete call?
8485  bool callFillComplete = true;
8486  RCP<ParameterList> constructorSublist;
8487  RCP<ParameterList> fillCompleteSublist;
8488  if (! params.is_null ()) {
8489  callFillComplete = params->get ("Call fillComplete", callFillComplete);
8490  constructorSublist = sublist (params, "Constructor parameters");
8491  fillCompleteSublist = sublist (params, "fillComplete parameters");
8492  }
8493 
8494  RCP<const map_type> A_rowMap = A.getRowMap ();
8495  RCP<const map_type> B_rowMap = B.getRowMap ();
8496  RCP<const map_type> C_rowMap = B_rowMap; // see discussion in documentation
8497  RCP<crs_matrix_type> C; // The result matrix.
8498 
8499  // If A and B's row Maps are the same, we can compute an upper
8500  // bound on the number of entries in each row of C, before
8501  // actually computing the sum. A reasonable upper bound is the
8502  // sum of the two entry counts in each row. If we choose this as
8503  // the actual per-row upper bound, we can use static profile.
8504  if (A_rowMap->isSameAs (*B_rowMap)) {
8505  const LO localNumRows = static_cast<LO> (A_rowMap->getNodeNumElements ());
8506  Array<size_t> C_maxNumEntriesPerRow (localNumRows, 0);
8507 
8508  // Get the number of entries in each row of A.
8509  if (alpha != ZERO) {
8510  for (LO localRow = 0; localRow < localNumRows; ++localRow) {
8511  const size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
8512  C_maxNumEntriesPerRow[localRow] += A_numEntries;
8513  }
8514  }
8515  // Get the number of entries in each row of B.
8516  if (beta != ZERO) {
8517  for (LO localRow = 0; localRow < localNumRows; ++localRow) {
8518  const size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
8519  C_maxNumEntriesPerRow[localRow] += B_numEntries;
8520  }
8521  }
8522  // Construct the result matrix C.
8523  if (constructorSublist.is_null ()) {
8524  C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow (),
8525  StaticProfile));
8526  } else {
8527  C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow (),
8528  StaticProfile, constructorSublist));
8529  }
8530  // Since A and B have the same row Maps, we could add them
8531  // together all at once and merge values before we call
8532  // insertGlobalValues. However, we don't really need to, since
8533  // we've already allocated enough space in each row of C for C
8534  // to do the merge itself.
8535  }
8536  else { // the row Maps of A and B are not the same
8537  // Construct the result matrix C.
8538 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
8539  if (constructorSublist.is_null ()) {
8540  C = rcp (new crs_matrix_type (C_rowMap, 0, ProfileType(StaticProfile+1) /*DynamicProfile*/));
8541  } else {
8542  C = rcp (new crs_matrix_type (C_rowMap, 0, ProfileType(StaticProfile+1) /*DynamicProfile*/,
8543  constructorSublist));
8544  }
8545 #else
8546  // true: !A_rowMap->isSameAs (*B_rowMap)
8547  TEUCHOS_TEST_FOR_EXCEPTION(true,
8548  std::invalid_argument,
8549  "Tpetra::CrsMatrix::add: The row maps must be the same for statically "
8550  "allocated matrices in order to be sure that there is sufficient space "
8551  "to do the addition");
8552 
8553 #endif
8554  }
8555 
8556 #ifdef HAVE_TPETRA_DEBUG
8557  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
8558  "Tpetra::RowMatrix::add: C should not be null at this point. "
8559  "Please report this bug to the Tpetra developers.");
8560 #endif // HAVE_TPETRA_DEBUG
8561  //
8562  // Compute C = alpha*A + beta*B.
8563  //
      // 'ind' and 'val' are scratch buffers shared by both loops
      // below; they only ever grow, and each row uses a view of their
      // leading entries.
8564  Array<GO> ind;
8565  Array<Scalar> val;
8566 
8567  if (alpha != ZERO) {
8568  const LO A_localNumRows = static_cast<LO> (A_rowMap->getNodeNumElements ());
8569  for (LO localRow = 0; localRow < A_localNumRows; ++localRow) {
8570  size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
8571  const GO globalRow = A_rowMap->getGlobalElement (localRow);
8572  if (A_numEntries > static_cast<size_t> (ind.size ())) {
8573  ind.resize (A_numEntries);
8574  val.resize (A_numEntries);
8575  }
8576  ArrayView<GO> indView = ind (0, A_numEntries);
8577  ArrayView<Scalar> valView = val (0, A_numEntries);
8578  A.getGlobalRowCopy (globalRow, indView, valView, A_numEntries);
8579 
      // Skip the scaling loop when alpha is exactly one.
8580  if (alpha != ONE) {
8581  for (size_t k = 0; k < A_numEntries; ++k) {
8582  valView[k] *= alpha;
8583  }
8584  }
8585  C->insertGlobalValues (globalRow, indView, valView);
8586  }
8587  }
8588 
8589  if (beta != ZERO) {
8590  const LO B_localNumRows = static_cast<LO> (B_rowMap->getNodeNumElements ());
8591  for (LO localRow = 0; localRow < B_localNumRows; ++localRow) {
8592  size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
8593  const GO globalRow = B_rowMap->getGlobalElement (localRow);
8594  if (B_numEntries > static_cast<size_t> (ind.size ())) {
8595  ind.resize (B_numEntries);
8596  val.resize (B_numEntries);
8597  }
8598  ArrayView<GO> indView = ind (0, B_numEntries);
8599  ArrayView<Scalar> valView = val (0, B_numEntries);
8600  B.getGlobalRowCopy (globalRow, indView, valView, B_numEntries);
8601 
      // Skip the scaling loop when beta is exactly one.
8602  if (beta != ONE) {
8603  for (size_t k = 0; k < B_numEntries; ++k) {
8604  valView[k] *= beta;
8605  }
8606  }
8607  C->insertGlobalValues (globalRow, indView, valView);
8608  }
8609  }
8610 
8611  if (callFillComplete) {
8612  if (fillCompleteSublist.is_null ()) {
8613  C->fillComplete (theDomainMap, theRangeMap);
8614  } else {
8615  C->fillComplete (theDomainMap, theRangeMap, fillCompleteSublist);
8616  }
8617  }
8618  return rcp_implicit_cast<row_matrix_type> (C);
8619  }
8620 
8621 
8622 
8623  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8624  void
8627  const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& rowTransfer,
8628  const Teuchos::RCP<const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node> > & domainTransfer,
8629  const Teuchos::RCP<const map_type>& domainMap,
8630  const Teuchos::RCP<const map_type>& rangeMap,
8631  const Teuchos::RCP<Teuchos::ParameterList>& params) const
8632  {
8637  using Teuchos::ArrayRCP;
8638  using Teuchos::ArrayView;
8639  using Teuchos::Comm;
8640  using Teuchos::ParameterList;
8641  using Teuchos::RCP;
8642  typedef LocalOrdinal LO;
8643  typedef GlobalOrdinal GO;
8644  typedef node_type NT;
8645  typedef CrsMatrix<Scalar, LO, GO, NT> this_type;
8646  typedef Vector<int, LO, GO, NT> IntVectorType;
8647  using Teuchos::as;
8648 
8649  const bool debug = ::Tpetra::Details::Behavior::debug ();
8650  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
8651  int MyPID = getComm ()->getRank ();
8652 
8653  std::unique_ptr<std::string> verbosePrefix;
8654  if (verbose) {
8655  std::ostringstream os;
8656  os << "Proc " << MyPID << ": transferAndFillComplete: ";
8657  verbosePrefix = std::unique_ptr<std::string> (new std::string (os.str ()));
8658 
8659  os << "start" << std::endl;
8660  std::cerr << os.str ();
8661  }
8662 
8663  //
8664  // Get the caller's parameters
8665  //
8666  bool isMM = false; // optimize for matrix-matrix ops.
8667  bool reverseMode = false; // Are we in reverse mode?
8668  bool restrictComm = false; // Do we need to restrict the communicator?
8669 
8670  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
8671  RCP<ParameterList> matrixparams; // parameters for the destination matrix
8672  bool overrideAllreduce = false;
8673  if (! params.is_null ()) {
8674  matrixparams = sublist (params, "CrsMatrix");
8675  reverseMode = params->get ("Reverse Mode", reverseMode);
8676  restrictComm = params->get ("Restrict Communicator", restrictComm);
8677  auto & slist = params->sublist("matrixmatrix: kernel params",false);
8678  isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
8679  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
8680 
8681  overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
8682  if(getComm()->getSize() < mm_optimization_core_count && isMM) isMM = false;
8683  if(reverseMode) isMM = false;
8684  }
8685 
8686  // Only used in the sparse matrix-matrix multiply (isMM) case.
8687  std::shared_ptr< ::Tpetra::Details::CommRequest> iallreduceRequest;
8688  int mismatch = 0;
8689  int reduced_mismatch = 0;
8690  if (isMM && !overrideAllreduce) {
8691 
8692  // Test for pathological matrix transfer
8693  const bool source_vals = ! getGraph ()->getImporter ().is_null();
8694  const bool target_vals = ! (rowTransfer.getExportLIDs ().size() == 0 ||
8695  rowTransfer.getRemoteLIDs ().size() == 0);
8696  mismatch = (source_vals != target_vals) ? 1 : 0;
8697  iallreduceRequest =
8698  ::Tpetra::Details::iallreduce (mismatch, reduced_mismatch,
8699  Teuchos::REDUCE_MAX, * (getComm ()));
8700  }
8701 
8702 #ifdef HAVE_TPETRA_MMM_TIMINGS
8703  using Teuchos::TimeMonitor;
8704  std::string label;
8705  if(!params.is_null())
8706  label = params->get("Timer Label",label);
8707  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
8708  std::string tlstr;
8709  {
8710  std::ostringstream os;
8711  if(isMM) os<<":MMOpt";
8712  else os<<":MMLegacy";
8713  tlstr = os.str();
8714  }
8715 
8716  Teuchos::TimeMonitor MMall(*TimeMonitor::getNewTimer(prefix + std::string("TAFC All") +tlstr ));
8717 #endif
8718 
8719  // Make sure that the input argument rowTransfer is either an
8720  // Import or an Export. Import and Export are the only two
8721  // subclasses of Transfer that we defined, but users might
8722  // (unwisely, for now at least) decide to implement their own
8723  // subclasses. Exclude this possibility.
8724  const import_type* xferAsImport = dynamic_cast<const import_type*> (&rowTransfer);
8725  const export_type* xferAsExport = dynamic_cast<const export_type*> (&rowTransfer);
8726  TEUCHOS_TEST_FOR_EXCEPTION(
8727  xferAsImport == NULL && xferAsExport == NULL, std::invalid_argument,
8728  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' input "
8729  "argument must be either an Import or an Export, and its template "
8730  "parameters must match the corresponding template parameters of the "
8731  "CrsMatrix.");
8732 
8733  // Make sure that the input argument domainTransfer is either an
8734  // Import or an Export. Import and Export are the only two
8735  // subclasses of Transfer that we defined, but users might
8736  // (unwisely, for now at least) decide to implement their own
8737  // subclasses. Exclude this possibility.
8738  Teuchos::RCP<const import_type> xferDomainAsImport = Teuchos::rcp_dynamic_cast<const import_type> (domainTransfer);
8739  Teuchos::RCP<const export_type> xferDomainAsExport = Teuchos::rcp_dynamic_cast<const export_type> (domainTransfer);
8740 
8741  if(! domainTransfer.is_null()) {
8742  TEUCHOS_TEST_FOR_EXCEPTION(
8743  (xferDomainAsImport.is_null() && xferDomainAsExport.is_null()), std::invalid_argument,
8744  "Tpetra::CrsMatrix::transferAndFillComplete: The 'domainTransfer' input "
8745  "argument must be either an Import or an Export, and its template "
8746  "parameters must match the corresponding template parameters of the "
8747  "CrsMatrix.");
8748 
8749  TEUCHOS_TEST_FOR_EXCEPTION(
8750  ( xferAsImport != NULL || ! xferDomainAsImport.is_null() ) &&
8751  (( xferAsImport != NULL && xferDomainAsImport.is_null() ) ||
8752  ( xferAsImport == NULL && ! xferDomainAsImport.is_null() )), std::invalid_argument,
8753  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
8754  "arguments must be of the same type (either Import or Export).");
8755 
8756  TEUCHOS_TEST_FOR_EXCEPTION(
8757  ( xferAsExport != NULL || ! xferDomainAsExport.is_null() ) &&
8758  (( xferAsExport != NULL && xferDomainAsExport.is_null() ) ||
8759  ( xferAsExport == NULL && ! xferDomainAsExport.is_null() )), std::invalid_argument,
8760  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
8761  "arguments must be of the same type (either Import or Export).");
8762  } // domainTransfer != null
8763 
8764 
8765  // FIXME (mfh 15 May 2014) Wouldn't communication still be needed,
8766  // if the source Map is not distributed but the target Map is?
8767  const bool communication_needed = rowTransfer.getSourceMap ()->isDistributed ();
8768 
8769  // Get the new domain and range Maps. We need some of them for
8770  // error checking, now that we have the reverseMode parameter.
8771  RCP<const map_type> MyRowMap = reverseMode ?
8772  rowTransfer.getSourceMap () : rowTransfer.getTargetMap ();
8773  RCP<const map_type> MyColMap; // create this below
8774  RCP<const map_type> MyDomainMap = ! domainMap.is_null () ?
8775  domainMap : getDomainMap ();
8776  RCP<const map_type> MyRangeMap = ! rangeMap.is_null () ?
8777  rangeMap : getRangeMap ();
8778  RCP<const map_type> BaseRowMap = MyRowMap;
8779  RCP<const map_type> BaseDomainMap = MyDomainMap;
8780 
8781  // If the user gave us a nonnull destMat, then check whether it's
8782  // "pristine." That means that it has no entries.
8783  //
8784  // FIXME (mfh 15 May 2014) If this is not true on all processes,
8785  // then this exception test may hang. It would be better to
8786  // forward an error flag to the next communication phase.
8787  if (! destMat.is_null ()) {
8788  // FIXME (mfh 15 May 2014): The Epetra idiom for checking
8789  // whether a graph or matrix has no entries on the calling
8790  // process, is that it is neither locally nor globally indexed.
8791  // This may change eventually with the Kokkos refactor version
8792  // of Tpetra, so it would be better just to check the quantity
8793  // of interest directly. Note that with the Kokkos refactor
8794  // version of Tpetra, asking for the total number of entries in
8795  // a graph or matrix that is not fill complete might require
8796  // computation (kernel launch), since it is not thread scalable
8797  // to update a count every time an entry is inserted.
8798  const bool NewFlag = ! destMat->getGraph ()->isLocallyIndexed () &&
8799  ! destMat->getGraph ()->isGloballyIndexed ();
8800  TEUCHOS_TEST_FOR_EXCEPTION(
8801  ! NewFlag, std::invalid_argument, "Tpetra::CrsMatrix::"
8802  "transferAndFillComplete: The input argument 'destMat' is only allowed "
8803  "to be nonnull, if its graph is empty (neither locally nor globally "
8804  "indexed).");
8805  // FIXME (mfh 15 May 2014) At some point, we want to change
8806  // graphs and matrices so that their DistObject Map
8807  // (this->getMap()) may differ from their row Map. This will
8808  // make redistribution for 2-D distributions more efficient. I
8809  // hesitate to change this check, because I'm not sure how much
8810  // the code here depends on getMap() and getRowMap() being the
8811  // same.
8812  TEUCHOS_TEST_FOR_EXCEPTION(
8813  ! destMat->getRowMap ()->isSameAs (*MyRowMap), std::invalid_argument,
8814  "Tpetra::CrsMatrix::transferAndFillComplete: The (row) Map of the "
8815  "input argument 'destMat' is not the same as the (row) Map specified "
8816  "by the input argument 'rowTransfer'.");
8817  TEUCHOS_TEST_FOR_EXCEPTION(
8818  ! destMat->checkSizes (*this), std::invalid_argument,
8819  "Tpetra::CrsMatrix::transferAndFillComplete: You provided a nonnull "
8820  "destination matrix, but checkSizes() indicates that it is not a legal "
8821  "legal target for redistribution from the source matrix (*this). This "
8822  "may mean that they do not have the same dimensions.");
8823  }
8824 
8825  // If forward mode (the default), then *this's (row) Map must be
8826  // the same as the source Map of the Transfer. If reverse mode,
8827  // then *this's (row) Map must be the same as the target Map of
8828  // the Transfer.
8829  //
8830  // FIXME (mfh 15 May 2014) At some point, we want to change graphs
8831  // and matrices so that their DistObject Map (this->getMap()) may
8832  // differ from their row Map. This will make redistribution for
8833  // 2-D distributions more efficient. I hesitate to change this
8834  // check, because I'm not sure how much the code here depends on
8835  // getMap() and getRowMap() being the same.
8836  TEUCHOS_TEST_FOR_EXCEPTION(
8837  ! (reverseMode || getRowMap ()->isSameAs (*rowTransfer.getSourceMap ())),
8838  std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
8839  "rowTransfer->getSourceMap() must match this->getRowMap() in forward mode.");
8840  TEUCHOS_TEST_FOR_EXCEPTION(
8841  ! (! reverseMode || getRowMap ()->isSameAs (*rowTransfer.getTargetMap ())),
8842  std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
8843  "rowTransfer->getTargetMap() must match this->getRowMap() in reverse mode.");
8844 
8845  // checks for domainTransfer
8846  TEUCHOS_TEST_FOR_EXCEPTION(
8847  ! xferDomainAsImport.is_null() && ! xferDomainAsImport->getTargetMap()->isSameAs(*domainMap),
8848  std::invalid_argument,
8849  "Tpetra::CrsMatrix::transferAndFillComplete: The target map of the 'domainTransfer' input "
8850  "argument must be the same as the rebalanced domain map 'domainMap'");
8851 
8852  TEUCHOS_TEST_FOR_EXCEPTION(
8853  ! xferDomainAsExport.is_null() && ! xferDomainAsExport->getSourceMap()->isSameAs(*domainMap),
8854  std::invalid_argument,
8855  "Tpetra::CrsMatrix::transferAndFillComplete: The source map of the 'domainTransfer' input "
8856  "argument must be the same as the rebalanced domain map 'domainMap'");
8857 
8858  // The basic algorithm here is:
8859  //
8860  // 1. Call the moral equivalent of "distor.do" to handle the import.
8861  // 2. Copy all the Imported and Copy/Permuted data into the raw
8862  // CrsMatrix / CrsGraphData pointers, still using GIDs.
8863  // 3. Call an optimized version of MakeColMap that avoids the
8864  // Directory lookups (since the importer knows who owns all the
8865  // GIDs) AND reindexes to LIDs.
8866  // 4. Call expertStaticFillComplete()
8867 
8868  // Get information from the Importer
8869  const size_t NumSameIDs = rowTransfer.getNumSameIDs();
8870  ArrayView<const LO> ExportLIDs = reverseMode ?
8871  rowTransfer.getRemoteLIDs () : rowTransfer.getExportLIDs ();
8872  ArrayView<const LO> RemoteLIDs = reverseMode ?
8873  rowTransfer.getExportLIDs () : rowTransfer.getRemoteLIDs ();
8874  ArrayView<const LO> PermuteToLIDs = reverseMode ?
8875  rowTransfer.getPermuteFromLIDs () : rowTransfer.getPermuteToLIDs ();
8876  ArrayView<const LO> PermuteFromLIDs = reverseMode ?
8877  rowTransfer.getPermuteToLIDs () : rowTransfer.getPermuteFromLIDs ();
8878  Distributor& Distor = rowTransfer.getDistributor ();
8879 
8880  // Owning PIDs
8881  Teuchos::Array<int> SourcePids;
8882  Teuchos::Array<int> TargetPids;
8883 
8884  // Temp variables for sub-communicators
8885  RCP<const map_type> ReducedRowMap, ReducedColMap,
8886  ReducedDomainMap, ReducedRangeMap;
8887  RCP<const Comm<int> > ReducedComm;
8888 
8889  // If the user gave us a null destMat, then construct the new
8890  // destination matrix. We will replace its column Map later.
8891  if (destMat.is_null ()) {
8892  destMat = rcp (new this_type (MyRowMap, 0, StaticProfile, matrixparams));
8893  }
8894 
8895  /***************************************************/
8896  /***** 1) First communicator restriction phase ****/
8897  /***************************************************/
8898  if (restrictComm) {
8899  ReducedRowMap = MyRowMap->removeEmptyProcesses ();
8900  ReducedComm = ReducedRowMap.is_null () ?
8901  Teuchos::null :
8902  ReducedRowMap->getComm ();
8903  destMat->removeEmptyProcessesInPlace (ReducedRowMap);
8904 
8905  ReducedDomainMap = MyRowMap.getRawPtr () == MyDomainMap.getRawPtr () ?
8906  ReducedRowMap :
8907  MyDomainMap->replaceCommWithSubset (ReducedComm);
8908  ReducedRangeMap = MyRowMap.getRawPtr () == MyRangeMap.getRawPtr () ?
8909  ReducedRowMap :
8910  MyRangeMap->replaceCommWithSubset (ReducedComm);
8911 
8912  // Reset the "my" maps
8913  MyRowMap = ReducedRowMap;
8914  MyDomainMap = ReducedDomainMap;
8915  MyRangeMap = ReducedRangeMap;
8916 
8917  // Update my PID, if we've restricted the communicator
8918  if (! ReducedComm.is_null ()) {
8919  MyPID = ReducedComm->getRank ();
8920  }
8921  else {
8922  MyPID = -2; // For debugging
8923  }
8924  }
8925  else {
8926  ReducedComm = MyRowMap->getComm ();
8927  }
8928 
8929 
8930 
8931  /***************************************************/
8932  /***** 2) From Tpetra::DistObject::doTransfer() ****/
8933  /***************************************************/
8934  // Get the owning PIDs
8935  RCP<const import_type> MyImporter = getGraph ()->getImporter ();
8936 
8937  // Check whether the domain map of the source matrix and the base domain map are the same.
8938  bool bSameDomainMap = BaseDomainMap->isSameAs (*getDomainMap ());
8939 
8940  if (! restrictComm && ! MyImporter.is_null () && bSameDomainMap ) {
8941  // Same domain map as source matrix
8942  //
8943  // NOTE: This won't work for restrictComm (because the Import
8944  // doesn't know the restricted PIDs), though writing an
8945  // optimized version for that case would be easy (Import an
8946  // IntVector of the new PIDs). Might want to add this later.
8947  Import_Util::getPids (*MyImporter, SourcePids, false);
8948  }
8949  else if (restrictComm && ! MyImporter.is_null () && bSameDomainMap) {
8950  // Same domain map as source matrix (restricted communicator)
8951  // We need one import from the domain to the column map
8952  IntVectorType SourceDomain_pids(getDomainMap (),true);
8953  IntVectorType SourceCol_pids(getColMap());
8954  // SourceDomain_pids contains the restricted pids
8955  SourceDomain_pids.putScalar(MyPID);
8956 
8957  SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8958  SourcePids.resize (getColMap ()->getNodeNumElements ());
8959  SourceCol_pids.get1dCopy (SourcePids ());
8960  }
8961  else if (MyImporter.is_null () && bSameDomainMap) {
8962  // Matrix has no off-process entries
8963  SourcePids.resize (getColMap ()->getNodeNumElements ());
8964  SourcePids.assign (getColMap ()->getNodeNumElements (), MyPID);
8965  }
8966  else if ( ! MyImporter.is_null () &&
8967  ! domainTransfer.is_null () ) {
8968  // general implementation for rectangular matrices with
8969  // domain map different than SourceMatrix domain map.
8970  // User has to provide a DomainTransfer object. We need
8971  // two communications (import/export)
8972 
8973  // TargetDomain_pids lives on the rebalanced new domain map
8974  IntVectorType TargetDomain_pids (domainMap);
8975  TargetDomain_pids.putScalar (MyPID);
8976 
8977  // SourceDomain_pids lives on the non-rebalanced old domain map
8978  IntVectorType SourceDomain_pids (getDomainMap ());
8979 
8980  // SourceCol_pids lives on the non-rebalanced old column map
8981  IntVectorType SourceCol_pids (getColMap ());
8982 
8983  if (! reverseMode && ! xferDomainAsImport.is_null() ) {
8984  SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8985  }
8986  else if (reverseMode && ! xferDomainAsExport.is_null() ) {
8987  SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8988  }
8989  else if (! reverseMode && ! xferDomainAsExport.is_null() ) {
8990  SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8991  }
8992  else if (reverseMode && ! xferDomainAsImport.is_null() ) {
8993  SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8994  }
8995  else {
8996  TEUCHOS_TEST_FOR_EXCEPTION(
8997  true, std::logic_error, "Tpetra::CrsMatrix::"
8998  "transferAndFillComplete: Should never get here! "
8999  "Please report this bug to a Tpetra developer.");
9000  }
9001  SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
9002  SourcePids.resize (getColMap ()->getNodeNumElements ());
9003  SourceCol_pids.get1dCopy (SourcePids ());
9004  }
9005  else if ( ! MyImporter.is_null () &&
9006  BaseDomainMap->isSameAs (*BaseRowMap) &&
9007  getDomainMap ()->isSameAs (*getRowMap ())) {
9008  // We can use the rowTransfer + SourceMatrix's Import to find out who owns what.
9009 
9010  IntVectorType TargetRow_pids (domainMap);
9011  IntVectorType SourceRow_pids (getRowMap ());
9012  IntVectorType SourceCol_pids (getColMap ());
9013 
9014  TargetRow_pids.putScalar (MyPID);
9015  if (! reverseMode && xferAsImport != NULL) {
9016  SourceRow_pids.doExport (TargetRow_pids, *xferAsImport, INSERT);
9017  }
9018  else if (reverseMode && xferAsExport != NULL) {
9019  SourceRow_pids.doExport (TargetRow_pids, *xferAsExport, INSERT);
9020  }
9021  else if (! reverseMode && xferAsExport != NULL) {
9022  SourceRow_pids.doImport (TargetRow_pids, *xferAsExport, INSERT);
9023  }
9024  else if (reverseMode && xferAsImport != NULL) {
9025  SourceRow_pids.doImport (TargetRow_pids, *xferAsImport, INSERT);
9026  }
9027  else {
9028  TEUCHOS_TEST_FOR_EXCEPTION(
9029  true, std::logic_error, "Tpetra::CrsMatrix::"
9030  "transferAndFillComplete: Should never get here! "
9031  "Please report this bug to a Tpetra developer.");
9032  }
9033 
9034  SourceCol_pids.doImport (SourceRow_pids, *MyImporter, INSERT);
9035  SourcePids.resize (getColMap ()->getNodeNumElements ());
9036  SourceCol_pids.get1dCopy (SourcePids ());
9037  }
9038  else {
9039  TEUCHOS_TEST_FOR_EXCEPTION(
9040  true, std::invalid_argument, "Tpetra::CrsMatrix::"
9041  "transferAndFillComplete: This method only allows either domainMap == "
9042  "getDomainMap (), or (domainMap == rowTransfer.getTargetMap () and "
9043  "getDomainMap () == getRowMap ()).");
9044  }
9045 
9046  // Tpetra-specific stuff
9047  size_t constantNumPackets = destMat->constantNumberOfPackets ();
9048  if (constantNumPackets == 0) {
9049  destMat->reallocArraysForNumPacketsPerLid (ExportLIDs.size (),
9050  RemoteLIDs.size ());
9051  }
9052  else {
9053  // There are a constant number of packets per element. We
9054  // already know (from the number of "remote" (incoming)
9055  // elements) how many incoming elements we expect, so we can
9056  // resize the buffer accordingly.
9057  const size_t rbufLen = RemoteLIDs.size() * constantNumPackets;
9058  destMat->reallocImportsIfNeeded (rbufLen, false, nullptr);
9059  }
9060 
9061  // Pack & Prepare w/ owning PIDs
9062  if (debug) {
9063  using Teuchos::outArg;
9064  using Teuchos::REDUCE_MAX;
9065  using Teuchos::reduceAll;
9066  using std::cerr;
9067  using std::endl;
9068  RCP<const Teuchos::Comm<int> > comm = this->getComm ();
9069  const int myRank = comm->getRank ();
9070 
9071  std::ostringstream errStrm;
9072  int lclErr = 0;
9073  int gblErr = 0;
9074 
9075  Teuchos::ArrayView<size_t> numExportPacketsPerLID;
9076  try {
9077  // packAndPrepare* methods modify numExportPacketsPerLID_.
9078  destMat->numExportPacketsPerLID_.modify_host ();
9079  numExportPacketsPerLID =
9080  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
9081  }
9082  catch (std::exception& e) {
9083  errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw: "
9084  << e.what () << std::endl;
9085  lclErr = 1;
9086  }
9087  catch (...) {
9088  errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw "
9089  "an exception not a subclass of std::exception" << std::endl;
9090  lclErr = 1;
9091  }
9092 
9093  if (! comm.is_null ()) {
9094  reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
9095  }
9096  if (gblErr != 0) {
9097  ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
9098  TEUCHOS_TEST_FOR_EXCEPTION(
9099  true, std::runtime_error, "getArrayViewFromDualView threw an "
9100  "exception on at least one process.");
9101  }
9102 
9103  if (verbose) {
9104  std::ostringstream os;
9105  os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
9106  << std::endl;
9107  std::cerr << os.str ();
9108  }
9109  try {
9111  destMat->exports_,
9112  numExportPacketsPerLID,
9113  ExportLIDs,
9114  SourcePids,
9115  constantNumPackets,
9116  Distor);
9117  }
9118  catch (std::exception& e) {
9119  errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw: "
9120  << e.what () << std::endl;
9121  lclErr = 1;
9122  }
9123  catch (...) {
9124  errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw "
9125  "an exception not a subclass of std::exception" << std::endl;
9126  lclErr = 1;
9127  }
9128 
9129  if (verbose) {
9130  std::ostringstream os;
9131  os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
9132  << std::endl;
9133  std::cerr << os.str ();
9134  }
9135 
9136  if (! comm.is_null ()) {
9137  reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
9138  }
9139  if (gblErr != 0) {
9140  ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
9141  TEUCHOS_TEST_FOR_EXCEPTION(
9142  true, std::runtime_error, "packCrsMatrixWithOwningPIDs threw an "
9143  "exception on at least one process.");
9144  }
9145  }
9146  else {
9147  // packAndPrepare* methods modify numExportPacketsPerLID_.
9148  destMat->numExportPacketsPerLID_.modify_host ();
9149  Teuchos::ArrayView<size_t> numExportPacketsPerLID =
9150  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
9151  if (verbose) {
9152  std::ostringstream os;
9153  os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
9154  << std::endl;
9155  std::cerr << os.str ();
9156  }
9158  destMat->exports_,
9159  numExportPacketsPerLID,
9160  ExportLIDs,
9161  SourcePids,
9162  constantNumPackets,
9163  Distor);
9164  if (verbose) {
9165  std::ostringstream os;
9166  os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
9167  << std::endl;
9168  std::cerr << os.str ();
9169  }
9170  }
9171 
9172  // Do the exchange of remote data.
9173  if (! communication_needed) {
9174  if (verbose) {
9175  std::ostringstream os;
9176  os << *verbosePrefix << "Communication not needed" << std::endl;
9177  std::cerr << os.str ();
9178  }
9179  }
9180  else {
9181  if (reverseMode) {
9182  if (constantNumPackets == 0) { // variable number of packets per LID
9183  if (verbose) {
9184  std::ostringstream os;
9185  os << *verbosePrefix << "Reverse mode, variable # packets / LID"
9186  << std::endl;
9187  std::cerr << os.str ();
9188  }
9189  // Make sure that host has the latest version, since we're
9190  // using the version on host. If host has the latest
9191  // version, syncing to host does nothing.
9192  destMat->numExportPacketsPerLID_.sync_host ();
9193  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
9194  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
9195  destMat->numImportPacketsPerLID_.sync_host ();
9196  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
9197  getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
9198 
9199  if (verbose) {
9200  std::ostringstream os;
9201  os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
9202  << std::endl;
9203  std::cerr << os.str ();
9204  }
9205  Distor.doReversePostsAndWaits (numExportPacketsPerLID, 1,
9206  numImportPacketsPerLID);
9207  if (verbose) {
9208  std::ostringstream os;
9209  os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
9210  << std::endl;
9211  std::cerr << os.str ();
9212  }
9213 
9214  size_t totalImportPackets = 0;
9215  for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
9216  totalImportPackets += numImportPacketsPerLID[i];
9217  }
9218 
9219  // Reallocation MUST go before setting the modified flag,
9220  // because it may clear out the flags.
9221  destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
9222  verbosePrefix.get ());
9223  destMat->imports_.modify_host ();
9224  Teuchos::ArrayView<char> hostImports =
9225  getArrayViewFromDualView (destMat->imports_);
9226  // This is a legacy host pack/unpack path, so use the host
9227  // version of exports_.
9228  destMat->exports_.sync_host ();
9229  Teuchos::ArrayView<const char> hostExports =
9230  getArrayViewFromDualView (destMat->exports_);
9231  if (verbose) {
9232  std::ostringstream os;
9233  os << *verbosePrefix << "Calling 4-arg doReversePostsAndWaits"
9234  << std::endl;
9235  std::cerr << os.str ();
9236  }
9237  Distor.doReversePostsAndWaits (hostExports,
9238  numExportPacketsPerLID,
9239  hostImports,
9240  numImportPacketsPerLID);
9241  if (verbose) {
9242  std::ostringstream os;
9243  os << *verbosePrefix << "Finished 4-arg doReversePostsAndWaits"
9244  << std::endl;
9245  std::cerr << os.str ();
9246  }
9247  }
9248  else { // constant number of packets per LID
9249  if (verbose) {
9250  std::ostringstream os;
9251  os << *verbosePrefix << "Reverse mode, constant # packets / LID"
9252  << std::endl;
9253  std::cerr << os.str ();
9254  }
9255  destMat->imports_.modify_host ();
9256  Teuchos::ArrayView<char> hostImports =
9257  getArrayViewFromDualView (destMat->imports_);
9258  // This is a legacy host pack/unpack path, so use the host
9259  // version of exports_.
9260  destMat->exports_.sync_host ();
9261  Teuchos::ArrayView<const char> hostExports =
9262  getArrayViewFromDualView (destMat->exports_);
9263  if (verbose) {
9264  std::ostringstream os;
9265  os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
9266  << std::endl;
9267  std::cerr << os.str ();
9268  }
9269  Distor.doReversePostsAndWaits (hostExports,
9270  constantNumPackets,
9271  hostImports);
9272  if (verbose) {
9273  std::ostringstream os;
9274  os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
9275  << std::endl;
9276  std::cerr << os.str ();
9277  }
9278  }
9279  }
9280  else { // forward mode (the default)
9281  if (constantNumPackets == 0) { // variable number of packets per LID
9282  if (verbose) {
9283  std::ostringstream os;
9284  os << *verbosePrefix << "Forward mode, variable # packets / LID"
9285  << std::endl;
9286  std::cerr << os.str ();
9287  }
9288  // Make sure that host has the latest version, since we're
9289  // using the version on host. If host has the latest
9290  // version, syncing to host does nothing.
9291  destMat->numExportPacketsPerLID_.sync_host ();
9292  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
9293  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
9294  destMat->numImportPacketsPerLID_.sync_host ();
9295  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
9296  getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
9297  if (verbose) {
9298  std::ostringstream os;
9299  os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
9300  << std::endl;
9301  std::cerr << os.str ();
9302  }
9303  Distor.doPostsAndWaits (numExportPacketsPerLID, 1,
9304  numImportPacketsPerLID);
9305  if (verbose) {
9306  std::ostringstream os;
9307  os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
9308  << std::endl;
9309  std::cerr << os.str ();
9310  }
9311 
9312  size_t totalImportPackets = 0;
9313  for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
9314  totalImportPackets += numImportPacketsPerLID[i];
9315  }
9316 
9317  // Reallocation MUST go before setting the modified flag,
9318  // because it may clear out the flags.
9319  destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
9320  verbosePrefix.get ());
9321  destMat->imports_.modify_host ();
9322  Teuchos::ArrayView<char> hostImports =
9323  getArrayViewFromDualView (destMat->imports_);
9324  // This is a legacy host pack/unpack path, so use the host
9325  // version of exports_.
9326  destMat->exports_.sync_host ();
9327  Teuchos::ArrayView<const char> hostExports =
9328  getArrayViewFromDualView (destMat->exports_);
9329  if (verbose) {
9330  std::ostringstream os;
9331  os << *verbosePrefix << "Calling 4-arg doPostsAndWaits"
9332  << std::endl;
9333  std::cerr << os.str ();
9334  }
9335  Distor.doPostsAndWaits (hostExports,
9336  numExportPacketsPerLID,
9337  hostImports,
9338  numImportPacketsPerLID);
9339  if (verbose) {
9340  std::ostringstream os;
9341  os << *verbosePrefix << "Finished 4-arg doPostsAndWaits"
9342  << std::endl;
9343  std::cerr << os.str ();
9344  }
9345  }
9346  else { // constant number of packets per LID
9347  if (verbose) {
9348  std::ostringstream os;
9349  os << *verbosePrefix << "Forward mode, constant # packets / LID"
9350  << std::endl;
9351  std::cerr << os.str ();
9352  }
9353  destMat->imports_.modify_host ();
9354  Teuchos::ArrayView<char> hostImports =
9355  getArrayViewFromDualView (destMat->imports_);
9356  // This is a legacy host pack/unpack path, so use the host
9357  // version of exports_.
9358  destMat->exports_.sync_host ();
9359  Teuchos::ArrayView<const char> hostExports =
9360  getArrayViewFromDualView (destMat->exports_);
9361  if (verbose) {
9362  std::ostringstream os;
9363  os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
9364  << std::endl;
9365  std::cerr << os.str ();
9366  }
9367  Distor.doPostsAndWaits (hostExports,
9368  constantNumPackets,
9369  hostImports);
9370  if (verbose) {
9371  std::ostringstream os;
9372  os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
9373  << std::endl;
9374  std::cerr << os.str ();
9375  }
9376  }
9377  }
9378  }
9379 
9380  /*********************************************************************/
9381  /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/
9382  /*********************************************************************/
9383 
9384  // Backwards compatibility measure. We'll use this again below.
9385  destMat->numImportPacketsPerLID_.sync_host ();
9386  Teuchos::ArrayView<const size_t> numImportPacketsPerLID =
9387  getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
9388  destMat->imports_.sync_host ();
9389  Teuchos::ArrayView<const char> hostImports =
9390  getArrayViewFromDualView (destMat->imports_);
9391 
9392  if (verbose) {
9393  std::ostringstream os;
9394  os << *verbosePrefix << "Calling unpackAndCombineWithOwningPIDsCount"
9395  << std::endl;
9396  std::cerr << os.str ();
9397  }
9398  size_t mynnz =
9400  RemoteLIDs,
9401  hostImports,
9402  numImportPacketsPerLID,
9403  constantNumPackets,
9404  Distor,
9405  INSERT,
9406  NumSameIDs,
9407  PermuteToLIDs,
9408  PermuteFromLIDs);
9409  if (verbose) {
9410  std::ostringstream os;
9411  os << *verbosePrefix << "unpackAndCombineWithOwningPIDsCount returned "
9412  << mynnz << std::endl;
9413  std::cerr << os.str ();
9414  }
9415  size_t N = BaseRowMap->getNodeNumElements ();
9416 
9417  // Allocations
9418  ArrayRCP<size_t> CSR_rowptr(N+1);
9419  ArrayRCP<GO> CSR_colind_GID;
9420  ArrayRCP<LO> CSR_colind_LID;
9421  ArrayRCP<Scalar> CSR_vals;
9422  CSR_colind_GID.resize (mynnz);
9423  CSR_vals.resize (mynnz);
9424 
9425  // If LO and GO are the same, we can reuse memory when
9426  // converting the column indices from global to local indices.
9427  if (typeid (LO) == typeid (GO)) {
9428  CSR_colind_LID = Teuchos::arcp_reinterpret_cast<LO> (CSR_colind_GID);
9429  }
9430  else {
9431  CSR_colind_LID.resize (mynnz);
9432  }
9433 
9434  if (verbose) {
9435  std::ostringstream os;
9436  os << *verbosePrefix << "Calling unpackAndCombineIntoCrsArrays"
9437  << std::endl;
9438  std::cerr << os.str ();
9439  }
9440  // FIXME (mfh 15 May 2014) Why can't we abstract this out as an
9441  // unpackAndCombine method on a "CrsArrays" object? This passing
9442  // in a huge list of arrays is icky. Can't we have a bit of an
9443  // abstraction? Implementing a concrete DistObject subclass only
9444  // takes five methods.
9446  RemoteLIDs,
9447  hostImports,
9448  numImportPacketsPerLID,
9449  constantNumPackets,
9450  Distor,
9451  INSERT,
9452  NumSameIDs,
9453  PermuteToLIDs,
9454  PermuteFromLIDs,
9455  N,
9456  mynnz,
9457  MyPID,
9458  CSR_rowptr (),
9459  CSR_colind_GID (),
9460  Teuchos::av_reinterpret_cast<impl_scalar_type> (CSR_vals ()),
9461  SourcePids (),
9462  TargetPids);
9463 
9464  /**************************************************************/
9465  /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
9466  /**************************************************************/
9467  // Call an optimized version of makeColMap that avoids the
9468  // Directory lookups (since the Import object knows who owns all
9469  // the GIDs).
9470  Teuchos::Array<int> RemotePids;
9471  if (verbose) {
9472  std::ostringstream os;
9473  os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
9474  << std::endl;
9475  std::cerr << os.str ();
9476  }
9477  Import_Util::lowCommunicationMakeColMapAndReindex (CSR_rowptr (),
9478  CSR_colind_LID (),
9479  CSR_colind_GID (),
9480  BaseDomainMap,
9481  TargetPids,
9482  RemotePids,
9483  MyColMap);
9484 
9485  if (verbose) {
9486  std::ostringstream os;
9487  os << *verbosePrefix << "restrictComm="
9488  << (restrictComm ? "true" : "false") << std::endl;
9489  std::cerr << os.str ();
9490  }
9491 
9492  /*******************************************************/
9493  /**** 4) Second communicator restriction phase ****/
9494  /*******************************************************/
9495  if (restrictComm) {
9496  ReducedColMap = (MyRowMap.getRawPtr () == MyColMap.getRawPtr ()) ?
9497  ReducedRowMap :
9498  MyColMap->replaceCommWithSubset (ReducedComm);
9499  MyColMap = ReducedColMap; // Reset the "my" maps
9500  }
9501 
9502  // Replace the col map
9503  if (verbose) {
9504  std::ostringstream os;
9505  os << *verbosePrefix << "Calling replaceColMap" << std::endl;
9506  std::cerr << os.str ();
9507  }
9508  destMat->replaceColMap (MyColMap);
9509 
9510  // Short circuit if the processor is no longer in the communicator
9511  //
9512  // NOTE: Epetra modifies all "removed" processes so they
9513  // have a dummy (serial) Map that doesn't touch the original
9514  // communicator. Duplicating that here might be a good idea.
9515  if (ReducedComm.is_null ()) {
9516  if (verbose) {
9517  std::ostringstream os;
9518  os << *verbosePrefix << "I am no longer in the communicator; "
9519  "returning" << std::endl;
9520  std::cerr << os.str ();
9521  }
9522  return;
9523  }
9524 
9525  /***************************************************/
9526  /**** 5) Sort ****/
9527  /***************************************************/
9528  if ((! reverseMode && xferAsImport != NULL) ||
9529  (reverseMode && xferAsExport != NULL)) {
9530  if (verbose) {
9531  std::ostringstream os;
9532  os << *verbosePrefix << "Calling sortCrsEntries" << std::endl;
9533  std::cerr << os.str ();
9534  }
9535  Import_Util::sortCrsEntries (CSR_rowptr (),
9536  CSR_colind_LID (),
9537  CSR_vals ());
9538  }
9539  else if ((! reverseMode && xferAsExport != NULL) ||
9540  (reverseMode && xferAsImport != NULL)) {
9541  if (verbose) {
9542  std::ostringstream os;
9543  os << *verbosePrefix << "Calling sortAndMergeCrsEntries" << std::endl;
9544  std::cerr << os.str ();
9545  }
9546  Import_Util::sortAndMergeCrsEntries (CSR_rowptr (),
9547  CSR_colind_LID (),
9548  CSR_vals ());
9549  if (CSR_rowptr[N] != mynnz) {
9550  CSR_colind_LID.resize (CSR_rowptr[N]);
9551  CSR_vals.resize (CSR_rowptr[N]);
9552  }
9553  }
9554  else {
9555  TEUCHOS_TEST_FOR_EXCEPTION(
9556  true, std::logic_error, "Tpetra::CrsMatrix::"
9557  "transferAndFillComplete: Should never get here! "
9558  "Please report this bug to a Tpetra developer.");
9559  }
9560  /***************************************************/
9561  /**** 6) Reset the colmap and the arrays ****/
9562  /***************************************************/
9563 
9564  if (verbose) {
9565  std::ostringstream os;
9566  os << *verbosePrefix << "Calling destMat->setAllValues" << std::endl;
9567  std::cerr << os.str ();
9568  }
9569 
9570  // Call constructor for the new matrix (restricted as needed)
9571  //
9572  // NOTE (mfh 15 May 2014) This should work fine for the Kokkos
9573  // refactor version of CrsMatrix, though it reserves the right to
9574  // make a deep copy of the arrays.
9575  destMat->setAllValues (CSR_rowptr, CSR_colind_LID, CSR_vals);
9576 
9577  /***************************************************/
9578  /**** 7) Build Importer & Call ESFC ****/
9579  /***************************************************/
9580  // Pre-build the importer using the existing PIDs
9581  Teuchos::ParameterList esfc_params;
9582 
9583  RCP<import_type> MyImport;
9584 
9585  // Fulfill the non-blocking allreduce on reduced_mismatch.
9586  if (iallreduceRequest.get () != nullptr) {
9587  if (verbose) {
9588  std::ostringstream os;
9589  os << *verbosePrefix << "Calling iallreduceRequest->wait()" << std::endl;
9590  std::cerr << os.str ();
9591  }
9592  iallreduceRequest->wait ();
9593  if (reduced_mismatch != 0) {
9594  isMM = false;
9595  }
9596  }
9597 
9598  if( isMM ) {
9599 #ifdef HAVE_TPETRA_MMM_TIMINGS
9600  Teuchos::TimeMonitor MMisMM (*TimeMonitor::getNewTimer(prefix + std::string("isMM Block")));
9601 #endif
9602  // Combine all type1/2/3 lists, [filter them], then call the expert import constructor.
9603 
9604  if (verbose) {
9605  std::ostringstream os;
9606  os << *verbosePrefix << "Calling getAllValues" << std::endl;
9607  std::cerr << os.str ();
9608  }
9609 
9610  Teuchos::ArrayRCP<LocalOrdinal> type3LIDs;
9611  Teuchos::ArrayRCP<int> type3PIDs;
9612  Teuchos::ArrayRCP<const size_t> rowptr;
9613  Teuchos::ArrayRCP<const LO> colind;
9614  Teuchos::ArrayRCP<const Scalar> vals;
9615  {
9616 #ifdef HAVE_TPETRA_MMM_TIMINGS
9617  TimeMonitor tm_getAllValues (*TimeMonitor::getNewTimer(prefix + std::string("isMMgetAllValues")));
9618 #endif
9619  getAllValues(rowptr,colind,vals);
9620  }
9621 
9622  if (verbose) {
9623  std::ostringstream os;
9624  os << *verbosePrefix << "Calling reverseNeighborDiscovery" << std::endl;
9625  std::cerr << os.str ();
9626  }
9627 
9628  {
9629 #ifdef HAVE_TPETRA_MMM_TIMINGS
9630  TimeMonitor tm_rnd (*TimeMonitor::getNewTimer(prefix + std::string("isMMrevNeighDis")));
9631 #endif
9632  Import_Util::reverseNeighborDiscovery(*this,
9633  rowptr,
9634  colind,
9635  rowTransfer,
9636  MyImporter,
9637  MyDomainMap,
9638  type3PIDs,
9639  type3LIDs,
9640  ReducedComm);
9641  }
9642 
9643  if (verbose) {
9644  std::ostringstream os;
9645  os << *verbosePrefix << "Done with reverseNeighborDiscovery" << std::endl;
9646  std::cerr << os.str ();
9647  }
9648 
9649  Teuchos::ArrayView<const int> EPID1 = MyImporter.is_null() ? Teuchos::ArrayView<const int>() : MyImporter->getExportPIDs();
9650  Teuchos::ArrayView<const LO> ELID1 = MyImporter.is_null() ? Teuchos::ArrayView<const int>() : MyImporter->getExportLIDs();
9651 
9652  Teuchos::ArrayView<const int> TEPID2 = rowTransfer.getExportPIDs(); // row matrix
9653  Teuchos::ArrayView<const LO> TELID2 = rowTransfer.getExportLIDs();
9654 
9655  const int numCols = getGraph()->getColMap()->getNodeNumElements(); // may be dup
9656  // from EpetraExt_MMHelpers.cpp: build_type2_exports
9657  std::vector<bool> IsOwned(numCols,true);
9658  std::vector<int> SentTo(numCols,-1);
9659  if (! MyImporter.is_null ()) {
9660  for (auto && rlid : MyImporter->getRemoteLIDs()) { // the remoteLIDs must be from sourcematrix
9661  IsOwned[rlid]=false;
9662  }
9663  }
9664 
9665  std::vector<std::pair<int,GO> > usrtg;
9666  usrtg.reserve(TEPID2.size());
9667 
9668  {
9669  const auto& colMap = * (this->getColMap ()); // *this is sourcematrix
9670  for (Array_size_type i = 0; i < TEPID2.size (); ++i) {
9671  const LO row = TELID2[i];
9672  const int pid = TEPID2[i];
9673  for (auto j = rowptr[row]; j < rowptr[row+1]; ++j) {
9674  const int col = colind[j];
9675  if (IsOwned[col] && SentTo[col] != pid) {
9676  SentTo[col] = pid;
9677  GO gid = colMap.getGlobalElement (col);
9678  usrtg.push_back (std::pair<int,GO> (pid, gid));
9679  }
9680  }
9681  }
9682  }
9683 
9684 // This sort can _not_ be omitted.
9685  std::sort(usrtg.begin(),usrtg.end()); // default comparator does the right thing, now sorted in gid order
9686  auto eopg = std ::unique(usrtg.begin(),usrtg.end());
9687  // 25 Jul 2018: Could just ignore the entries at and after eopg.
9688  usrtg.erase(eopg,usrtg.end());
9689 
9690  const Array_size_type type2_us_size = usrtg.size();
9691  Teuchos::ArrayRCP<int> EPID2=Teuchos::arcp(new int[type2_us_size],0,type2_us_size,true);
9692  Teuchos::ArrayRCP< LO> ELID2=Teuchos::arcp(new LO[type2_us_size],0,type2_us_size,true);
9693 
9694  int pos=0;
9695  for(auto && p : usrtg) {
9696  EPID2[pos]= p.first;
9697  ELID2[pos]= this->getDomainMap()->getLocalElement(p.second);
9698  pos++;
9699  }
9700 
9701  Teuchos::ArrayView<int> EPID3 = type3PIDs();
9702  Teuchos::ArrayView< LO> ELID3 = type3LIDs();
9703  GO InfGID = std::numeric_limits<GO>::max();
9704  int InfPID = INT_MAX;
9705 #ifdef TPETRA_MIN3
9706 # undef TPETRA_MIN3
9707 #endif // TPETRA_MIN3
9708 #define TPETRA_MIN3(x,y,z) ((x)<(y)?(std::min(x,z)):(std::min(y,z)))
9709  int i1=0, i2=0, i3=0;
9710  int Len1 = EPID1.size();
9711  int Len2 = EPID2.size();
9712  int Len3 = EPID3.size();
9713 
9714  int MyLen=Len1+Len2+Len3;
9715  Teuchos::ArrayRCP<LO> userExportLIDs = Teuchos::arcp(new LO[MyLen],0,MyLen,true);
9716  Teuchos::ArrayRCP<int> userExportPIDs = Teuchos::arcp(new int[MyLen],0,MyLen,true);
9717  int iloc = 0; // will be the size of the userExportLID/PIDs
9718 
9719  while(i1 < Len1 || i2 < Len2 || i3 < Len3){
9720  int PID1 = (i1<Len1)?(EPID1[i1]):InfPID;
9721  int PID2 = (i2<Len2)?(EPID2[i2]):InfPID;
9722  int PID3 = (i3<Len3)?(EPID3[i3]):InfPID;
9723 
9724  GO GID1 = (i1<Len1)?getDomainMap()->getGlobalElement(ELID1[i1]):InfGID;
9725  GO GID2 = (i2<Len2)?getDomainMap()->getGlobalElement(ELID2[i2]):InfGID;
9726  GO GID3 = (i3<Len3)?getDomainMap()->getGlobalElement(ELID3[i3]):InfGID;
9727 
9728  int MIN_PID = TPETRA_MIN3(PID1,PID2,PID3);
9729  GO MIN_GID = TPETRA_MIN3( ((PID1==MIN_PID)?GID1:InfGID), ((PID2==MIN_PID)?GID2:InfGID), ((PID3==MIN_PID)?GID3:InfGID));
9730 #ifdef TPETRA_MIN3
9731 # undef TPETRA_MIN3
9732 #endif // TPETRA_MIN3
9733  bool added_entry=false;
9734 
9735  if(PID1 == MIN_PID && GID1 == MIN_GID){
9736  userExportLIDs[iloc]=ELID1[i1];
9737  userExportPIDs[iloc]=EPID1[i1];
9738  i1++;
9739  added_entry=true;
9740  iloc++;
9741  }
9742  if(PID2 == MIN_PID && GID2 == MIN_GID){
9743  if(!added_entry) {
9744  userExportLIDs[iloc]=ELID2[i2];
9745  userExportPIDs[iloc]=EPID2[i2];
9746  added_entry=true;
9747  iloc++;
9748  }
9749  i2++;
9750  }
9751  if(PID3 == MIN_PID && GID3 == MIN_GID){
9752  if(!added_entry) {
9753  userExportLIDs[iloc]=ELID3[i3];
9754  userExportPIDs[iloc]=EPID3[i3];
9755  iloc++;
9756  }
9757  i3++;
9758  }
9759  }
9760 
9761  if (verbose) {
9762  std::ostringstream os;
9763  os << *verbosePrefix << "Create Import" << std::endl;
9764  std::cerr << os.str ();
9765  }
9766 
9767 #ifdef HAVE_TPETRA_MMM_TIMINGS
9768  auto ismmIctor(*TimeMonitor::getNewTimer(prefix + std::string("isMMIportCtor")));
9769 #endif
9770  Teuchos::RCP<Teuchos::ParameterList> plist = rcp(new Teuchos::ParameterList());
9771  // 25 Jul 2018: Test for equality with the non-isMM path's Import object.
9772  MyImport = rcp ( new import_type (MyDomainMap,
9773  MyColMap,
9774  RemotePids,
9775  userExportLIDs.view(0,iloc).getConst(),
9776  userExportPIDs.view(0,iloc).getConst(),
9777  plist)
9778  );
9779 
9780  if (verbose) {
9781  std::ostringstream os;
9782  os << *verbosePrefix << "Call expertStaticFillComplete" << std::endl;
9783  std::cerr << os.str ();
9784  }
9785 
9786  {
9787 #ifdef HAVE_TPETRA_MMM_TIMINGS
9788  TimeMonitor esfc (*TimeMonitor::getNewTimer(prefix + std::string("isMM::destMat->eSFC")));
9789  esfc_params.set("Timer Label",label+std::string("isMM eSFC"));
9790 #endif
9791  if(!params.is_null())
9792  esfc_params.set("compute global constants",params->get("compute global constants",true));
9793  destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap, MyImport,Teuchos::null,rcp(new Teuchos::ParameterList(esfc_params)));
9794 
9795  }
9796 
9797  } // if(isMM)
9798  else {
9799 #ifdef HAVE_TPETRA_MMM_TIMINGS
9800  TimeMonitor MMnotMMblock (*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMblock")));
9801 #endif
9802  if (verbose) {
9803  std::ostringstream os;
9804  os << *verbosePrefix << "Create Import" << std::endl;
9805  std::cerr << os.str ();
9806  }
9807 
9808 #ifdef HAVE_TPETRA_MMM_TIMINGS
9809  TimeMonitor notMMIcTor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMCreateImporter")));
9810 #endif
9811  Teuchos::RCP<Teuchos::ParameterList> mypars = rcp(new Teuchos::ParameterList);
9812  mypars->set("Timer Label","notMMFrom_tAFC");
9813  MyImport = rcp (new import_type (MyDomainMap, MyColMap, RemotePids, mypars));
9814 
9815  if (verbose) {
9816  std::ostringstream os;
9817  os << *verbosePrefix << "Call expertStaticFillComplete" << std::endl;
9818  std::cerr << os.str ();
9819  }
9820 
9821 #ifdef HAVE_TPETRA_MMM_TIMINGS
9822  TimeMonitor esfcnotmm(*TimeMonitor::getNewTimer(prefix + std::string("notMMdestMat->expertStaticFillComplete")));
9823  esfc_params.set("Timer Label",prefix+std::string("notMM eSFC"));
9824 #else
9825  esfc_params.set("Timer Label",std::string("notMM eSFC"));
9826 #endif
9827 
9828  if (!params.is_null ()) {
9829  esfc_params.set ("compute global constants",
9830  params->get ("compute global constants", true));
9831  }
9832  destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap,
9833  MyImport, Teuchos::null,
9834  rcp (new Teuchos::ParameterList (esfc_params)));
9835  }
9836 
9837  if (verbose) {
9838  std::ostringstream os;
9839  os << *verbosePrefix << "Done!" << std::endl;
9840  std::cerr << os.str ();
9841  }
9842  }
9843 
9844 
// Single-transfer convenience overload (Import flavor).
//
// Forwards destMatrix, the given Import object, the domain and range
// Maps, and the parameter list unchanged to transferAndFillComplete,
// passing Teuchos::null as the third argument.
//
// NOTE(review): the embedded listing numbers jump from 9846 to 9849, so
// the lines carrying the qualified function name and the declaration
// of destMatrix are not visible in this listing.  Presumably this is
// CrsMatrix::importAndFillComplete -- confirm against the real file.
9845  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9846  void
9849  const import_type& importer,
9850  const Teuchos::RCP<const map_type>& domainMap,
9851  const Teuchos::RCP<const map_type>& rangeMap,
9852  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9853  {
// No separate domain transfer is supplied in this overload.
9854  transferAndFillComplete (destMatrix, importer, Teuchos::null, domainMap, rangeMap, params);
9855  }
9856 
// Two-transfer convenience overload (Import flavor).
//
// Forwards destMatrix, rowImporter, the domain and range Maps, and the
// parameter list to transferAndFillComplete.  domainImporter is passed
// as the third argument after being wrapped with Teuchos::rcpFromRef.
//
// NOTE(review): the embedded listing numbers jump from 9858 to 9861, so
// the lines carrying the qualified function name and the declaration
// of destMatrix are not visible in this listing.  Presumably this is
// CrsMatrix::importAndFillComplete -- confirm against the real file.
9857  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9858  void
9861  const import_type& rowImporter,
9862  const import_type& domainImporter,
9863  const Teuchos::RCP<const map_type>& domainMap,
9864  const Teuchos::RCP<const map_type>& rangeMap,
9865  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9866  {
// rcpFromRef turns the reference into an RCP for the callee; see the
// Teuchos documentation for its (non-)ownership semantics.
9867  transferAndFillComplete (destMatrix, rowImporter, Teuchos::rcpFromRef(domainImporter), domainMap, rangeMap, params);
9868  }
9869 
// Single-transfer convenience overload (Export flavor).
//
// Forwards destMatrix, the given Export object, the domain and range
// Maps, and the parameter list unchanged to transferAndFillComplete,
// passing Teuchos::null as the third argument.
//
// NOTE(review): the embedded listing numbers jump from 9871 to 9874, so
// the lines carrying the qualified function name and the declaration
// of destMatrix are not visible in this listing.  Presumably this is
// CrsMatrix::exportAndFillComplete -- confirm against the real file.
9870  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9871  void
9874  const export_type& exporter,
9875  const Teuchos::RCP<const map_type>& domainMap,
9876  const Teuchos::RCP<const map_type>& rangeMap,
9877  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9878  {
// No separate domain transfer is supplied in this overload.
9879  transferAndFillComplete (destMatrix, exporter, Teuchos::null, domainMap, rangeMap, params);
9880  }
9881 
// Two-transfer convenience overload (Export flavor).
//
// Forwards destMatrix, rowExporter, the domain and range Maps, and the
// parameter list to transferAndFillComplete.  domainExporter is passed
// as the third argument after being wrapped with Teuchos::rcpFromRef.
//
// NOTE(review): the embedded listing numbers jump from 9883 to 9886, so
// the lines carrying the qualified function name and the declaration
// of destMatrix are not visible in this listing.  Presumably this is
// CrsMatrix::exportAndFillComplete -- confirm against the real file.
9882  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9883  void
9886  const export_type& rowExporter,
9887  const export_type& domainExporter,
9888  const Teuchos::RCP<const map_type>& domainMap,
9889  const Teuchos::RCP<const map_type>& rangeMap,
9890  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9891  {
// rcpFromRef turns the reference into an RCP for the callee; see the
// Teuchos documentation for its (non-)ownership semantics.
9892  transferAndFillComplete (destMatrix, rowExporter, Teuchos::rcpFromRef(domainExporter), domainMap, rangeMap, params);
9893  }
9894 
9895 
9896 } // namespace Tpetra
9897 
9898 //
9899 // Explicit instantiation macro
9900 //
9901 // Must be expanded from within the Tpetra namespace!
9902 //
9903 
// Explicitly instantiates the CrsMatrix class template for one
// (SCALAR, LO, GO, NODE) combination, plus its member template
// convert<SCALAR> (conversion to the same scalar type), which the
// class-level instantiation alone does not cover because convert is
// itself a template.
9904 #define TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR,LO,GO,NODE) \
9905  \
9906  template class CrsMatrix< SCALAR , LO , GO , NODE >; \
9907  template Teuchos::RCP< CrsMatrix< SCALAR , LO , GO , NODE > > \
9908  CrsMatrix< SCALAR , LO , GO , NODE >::convert< SCALAR > () const;
9909 
// Explicitly instantiates the member template convert<SO> of
// CrsMatrix<SI, LO, GO, NODE>: SI is the scalar type of the matrix
// being converted, SO the scalar type of the returned
// Teuchos::RCP<CrsMatrix<SO, LO, GO, NODE> >.
9910 #define TPETRA_CRSMATRIX_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
9911  \
9912  template Teuchos::RCP< CrsMatrix< SO , LO , GO , NODE > > \
9913  CrsMatrix< SI , LO , GO , NODE >::convert< SO > () const;
9914 
// Emits a declaration of the nonmember function
// importAndFillCompleteCrsMatrix for one (SCALAR, LO, GO, NODE)
// combination, in its single-Import form: parameters are sourceMatrix,
// importer, domainMap, rangeMap, and params.  The Import and Map
// template arguments are spelled via CrsMatrix's nested
// local_ordinal_type / global_ordinal_type / node_type typedefs.
//
// NOTE(review): the expansion starts with `template<>` and ends in `;`
// with no body.  In C++ that is the syntax of an explicit
// specialization declaration, not of an explicit instantiation (which
// uses `template` without `<>`).  Confirm this is intended.
9915 #define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9916  template<> \
9917  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9918  importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9919  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9920  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9921  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& importer, \
9922  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9923  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9924  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9925  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9926  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9927  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9928  const Teuchos::RCP<Teuchos::ParameterList>& params);
9929 
// Emits a declaration of the nonmember function
// importAndFillCompleteCrsMatrix for one (SCALAR, LO, GO, NODE)
// combination, in its two-Import form: parameters are sourceMatrix,
// rowImporter, domainImporter, domainMap, rangeMap, and params.
//
// NOTE(review): the expansion starts with `template<>` and ends in `;`
// with no body.  In C++ that is the syntax of an explicit
// specialization declaration, not of an explicit instantiation (which
// uses `template` without `<>`).  Confirm this is intended.
9930 #define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9931  template<> \
9932  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9933  importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9934  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9935  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9936  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowImporter, \
9937  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9938  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9939  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainImporter, \
9940  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9941  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9942  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9943  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9944  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9945  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9946  const Teuchos::RCP<Teuchos::ParameterList>& params);
9947 
9948 
// Emits a declaration of the nonmember function
// exportAndFillCompleteCrsMatrix for one (SCALAR, LO, GO, NODE)
// combination, in its single-Export form: parameters are sourceMatrix,
// exporter, domainMap, rangeMap, and params.
//
// NOTE(review): the expansion starts with `template<>` and ends in `;`
// with no body.  In C++ that is the syntax of an explicit
// specialization declaration, not of an explicit instantiation (which
// uses `template` without `<>`).  Confirm this is intended.
9949 #define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9950  template<> \
9951  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9952  exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9953  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9954  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9955  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& exporter, \
9956  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9957  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9958  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9959  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9960  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9961  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9962  const Teuchos::RCP<Teuchos::ParameterList>& params);
9963 
// Emits a declaration of the nonmember function
// exportAndFillCompleteCrsMatrix for one (SCALAR, LO, GO, NODE)
// combination, in its two-Export form: parameters are sourceMatrix,
// rowExporter, domainExporter, domainMap, rangeMap, and params.
//
// NOTE(review): the expansion starts with `template<>` and ends in `;`
// with no body.  In C++ that is the syntax of an explicit
// specialization declaration, not of an explicit instantiation (which
// uses `template` without `<>`).  Confirm this is intended.
9964 #define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9965  template<> \
9966  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9967  exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9968  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9969  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9970  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowExporter, \
9971  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9972  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9973  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainExporter, \
9974  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9975  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9976  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9977  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9978  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9979  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9980  const Teuchos::RCP<Teuchos::ParameterList>& params);
9981 
9982 
// Umbrella macro: for one (SCALAR, LO, GO, NODE) combination, expands
// TPETRA_CRSMATRIX_MATRIX_INSTANT followed by the single- and
// two-transfer IMPORT/EXPORT_AND_FILL_COMPLETE macros, in the order
// listed below.  It does NOT expand TPETRA_CRSMATRIX_CONVERT_INSTANT;
// cross-scalar convert<> instantiations must be requested separately.
9983 #define TPETRA_CRSMATRIX_INSTANT(SCALAR, LO, GO ,NODE) \
9984  TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR, LO, GO, NODE) \
9985  TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9986  TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9987  TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9988  TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE)
9989 
9990 #endif // TPETRA_CRSMATRIX_DEF_HPP
Teuchos::ArrayRCP< Teuchos::Array< local_ordinal_type > > lclInds2D_
Local column indices for all rows.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Teuchos::RCP< const map_type > getRowMap() const override
Returns the Map that describes the row distribution in this graph.
dual_view_type::t_host getLocalViewHost() const
A local Kokkos::View of host memory.
bool hasColMap() const override
Whether the matrix has a well-defined column Map.
bool indicesAreSorted_
Whether the graph's indices are sorted in each row, on this process.
static int TAFC_OptimizationCoreCount()
The core count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced nei...
global_size_t getGlobalNumCols() const override
The number of global columns in the matrix.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Functor for the ABSMAX CombineMode of Import and Export operations.
void checkInternalState() const
Check that this object's state is sane; throw if it's not.
Abstract interface for local operators (e.g., matrices and preconditioners).
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
size_t getNodeNumCols() const override
The number of columns connected to the locally owned rows of this matrix.
void copyOffsets(const OutputViewType &dst, const InputViewType &src)
Copy row offsets (in a sparse graph or matrix) from src to dst. The offsets may have different types...
CrsGraph< LocalOrdinal, GlobalOrdinal, Node > crs_graph_type
The CrsGraph specialization suitable for this CrsMatrix specialization.
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
virtual bool supportsRowViews() const override
Return true if getLocalRowView() and getGlobalRowView() are valid for this object.
LocalOrdinal getViewRaw(impl_scalar_type *&vals, LocalOrdinal &numEnt, const RowInfo &rowinfo) const
Nonconst pointer to all entries (including extra space) in the given row.
void replaceDomainMapAndImporter(const Teuchos::RCP< const map_type > &newDomainMap, Teuchos::RCP< const import_type > &newImporter)
Replace the current domain Map and Import with the given objects.
bool isNodeGlobalElement(GlobalOrdinal globalIndex) const
Whether the given global index is owned by this Map on the calling process.
LocalOrdinal replaceLocalValues(const LocalOrdinal localRow, const typename UnmanagedView< LocalIndicesViewType >::type &inputInds, const typename UnmanagedView< ImplScalarViewType >::type &inputVals) const
Replace one or more entries' values, using local row and column indices.
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
std::map< GlobalOrdinal, std::pair< Teuchos::Array< GlobalOrdinal >, Teuchos::Array< Scalar > > > nonlocals_
Nonlocal data added using insertGlobalValues().
void sortAndMergeIndicesAndValues(const bool sorted, const bool merged)
Sort and merge duplicate local column indices in all rows on the calling process, along with their co...
typename device_type::execution_space execution_space
The Kokkos execution space.
void getLocalRowCopy(LocalOrdinal localRow, const Teuchos::ArrayView< LocalOrdinal > &colInds, const Teuchos::ArrayView< Scalar > &vals, size_t &numEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row...
void clear_sync_state()
Clear "modified" flags on both host and device sides.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
size_t getNumVectors() const
Number of columns in the multivector.
size_t getLocalLength() const
Local number of rows on the calling process.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
Traits class for packing / unpacking data of type T, using Kokkos data structures that live in the gi...
One or more distributed dense vectors.
size_t getNodeNumEntries() const override
The local number of entries in this matrix.
virtual size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const =0
The current number of entries on the calling process in the specified local row.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
Declare and define Tpetra::Details::copyOffsets, an implementation detail of Tpetra (in particular...
size_t getNodeNumRows() const override
The number of matrix rows owned by the calling process.
void fillLocalGraphAndMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local graph and matrix.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
void leftScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the left with the given Vector.
bool noRedundancies_
Whether the graph's indices are non-redundant (merged) in each row, on this process.
bool isDistributed() const
Whether this is a globally distributed object.
void reindexColumns(crs_graph_type *const graph, const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets, Distributor &distor)
Pack specified entries of the given local sparse matrix for communication.
void getGlobalRowCopy(GlobalOrdinal GlobalRow, const Teuchos::ArrayView< GlobalOrdinal > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row...
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix's graph, as a CrsGraph.
static bool debug()
Whether Tpetra is in debug mode.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, over all processes in the matrix's communicator...
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRangeMap() const =0
The Map associated with the range of this operator, which must be compatible with Y...
void applyNonTranspose(const MV &X_in, MV &Y_in, Scalar alpha, Scalar beta) const
Special case of apply() for mode == Teuchos::NO_TRANS.
Teuchos::RCP< CrsMatrix< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another CrsMatrix with the same entries, but converted to a different Scalar type T...
Allocation information for a locally owned row in a CrsGraph or CrsMatrix.
LocalOrdinal sumIntoLocalValues(const LocalOrdinal localRow, const typename UnmanagedView< LocalIndicesViewType >::type &inputInds, const typename UnmanagedView< ImplScalarViewType >::type &inputVals, const bool atomic=useAtomicUpdatesByDefault) const
Sum into one or more sparse matrix entries, using local row and column indices.
void swap(CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &matrix)
Swaps the data from *this with the data and maps from crsMatrix.
void fillLocalMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local matrix.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
bool isGloballyIndexed() const override
Whether the graph's column indices are stored as global indices.
Teuchos_Ordinal Array_size_type
Size type for Teuchos Array objects.
void globalAssemble()
Communicate nonlocal contributions to other processes.
void getLocalDiagCopy(Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &diag) const override
Get a copy of the diagonal entries of the matrix.
bool isNodeLocalElement(LocalOrdinal localIndex) const
Whether the given local index is valid for this Map on the calling process.
std::shared_ptr< local_multiply_op_type > lclMatrix_
The local sparse matrix, wrapped in a multiply operator.
void gaussSeidel(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps) const
"Hybrid" Jacobi + (Gauss-Seidel or SOR) on $B = A X$.
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator, in rank order.
bool isFillActive() const
Whether the matrix is not fill complete.
Teuchos::RCP< MV > importMV_
Column Map MultiVector used in apply() and gaussSeidel().
typename Kokkos::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
size_t global_size_t
Global size_t object.
bool hasTransposeApply() const override
Whether apply() allows applying the transpose or conjugate transpose.
void reindexColumns(const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortIndicesInEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow, const Teuchos::ArrayView< GlobalOrdinal > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const =0
Get a copy of the given global row's entries.
void clearGlobalConstants()
Clear matrix properties that require collectives.
void gaussSeidelCopy(MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of gaussSeidel(), with fewer requirements on X.
LocalOrdinal getLocalRowViewRaw(const LocalOrdinal lclRow, LocalOrdinal &numEnt, const LocalOrdinal *&lclColInds, const Scalar *&vals) const override
Get a constant, nonpersisting, locally indexed view of the given row of the matrix, using "raw" pointers instead of Teuchos::ArrayView.
void allocateValues(ELocalGlobal lg, GraphAllocationStatus gas)
Allocate values (and optionally indices) using the Node.
void exportAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const export_type &exporter, const Teuchos::RCP< const map_type > &domainMap=Teuchos::null, const Teuchos::RCP< const map_type > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Export from this to the given destination matrix, and make the result fill complete.
Insert new values that don't currently exist.
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > createOneToOne(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &M)
Creates a one-to-one version of the given Map, in which each global index is owned by only one process.
void importAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const import_type &importer, const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Import from this to the given destination matrix, and make the result fill complete.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
Teuchos::RCP< const map_type > rowMap_
The Map describing the distribution of rows of the graph.
void packNew(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< char *, buffer_device_type > &exports, const Kokkos::DualView< size_t *, buffer_device_type > &numPacketsPerLID, size_t &constantNumPackets, Distributor &dist) const
Pack this object's data for an Import or Export.
num_row_entries_type k_numRowEntries_
The number of local entries in each locally owned row.
void reorderedGaussSeidelCopy(MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &D, const Teuchos::ArrayView< LocalOrdinal > &rowIndices, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of reorderedGaussSeidel(), with fewer requirements on X.
ESweepDirection
Sweep direction for Gauss-Seidel or Successive Over-Relaxation (SOR).
Functions for manipulating CRS arrays.
GlobalOrdinal getIndexBase() const override
The index base for global indices for this matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getDomainMap() const =0
The Map associated with the domain of this operator, which must be compatible with X...
Teuchos::RCP< MV > getColumnMapMultiVector(const MV &X_domainMap, const bool force=false) const
Create a (or fetch a cached) column Map MultiVector.
#define TPETRA_ABUSE_WARNING(throw_exception_test, Exception, msg)
Handle an abuse warning, according to HAVE_TPETRA_THROW_ABUSE_WARNINGS and HAVE_TPETRA_PRINT_ABUSE_WA...
Sets up and executes a communication plan for a Tpetra DistObject.
mag_type frobNorm_
Cached Frobenius norm of the (global) matrix.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const override
Print this object with the given verbosity level to the given output stream.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
GlobalOrdinal getGlobalElement(LocalOrdinal localIndex) const
The global index corresponding to the given local index.
static bool verbose()
Whether Tpetra is in verbose mode.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
bool isFillComplete() const override
Whether fillComplete() has been called and the graph is in compute mode.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
CrsGraphType::global_ordinal_type getGlobalNumDiags(const CrsGraphType &G)
Number of populated diagonal entries in the given sparse graph, over all processes in the graph's (MP...
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
RowInfo getRowInfoFromGlobalRowIndex(const global_ordinal_type gblRow) const
Get information about the locally owned row with global index gblRow.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Utility functions for packing and unpacking sparse matrix entries.
virtual Teuchos::RCP< RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params) const override
Implementation of RowMatrix::add: return alpha*A + beta*this.
bool fillComplete_
Whether the matrix is fill complete.
Replace old value with maximum of magnitudes of old and new values.
Abstract base class for objects that can be the source of an Import or Export operation.
typename Node::device_type device_type
The Kokkos device type.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Number of entries in the sparse matrix in the given local row, on the calling (MPI) process...
static LocalMapType::local_ordinal_type getDiagCopyWithoutOffsets(const DiagType &D, const LocalMapType &rowMap, const LocalMapType &colMap, const CrsMatrixType &A)
Given a locally indexed, local sparse matrix, and corresponding local row and column Maps...
Teuchos::RCP< MV > getRowMapMultiVector(const MV &Y_rangeMap, const bool force=false) const
Create a (or fetch a cached) row Map MultiVector.
std::string description() const override
A one-line description of this object.
void getLocalDiagOffsets(Teuchos::ArrayRCP< size_t > &offsets) const
Get offsets of the diagonal entries in the matrix.
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
Compute a sparse matrix-MultiVector multiply.
LO getLocalDiagCopyWithoutOffsetsNotFillComplete(::Tpetra::Vector< SC, LO, GO, NT > &diag, const ::Tpetra::RowMatrix< SC, LO, GO, NT > &A, const bool debug=false)
Given a locally indexed, global sparse matrix, extract the matrix's diagonal entries into a Tpetra::V...
Replace existing values with new values.
void computeGlobalConstants()
Compute matrix properties that require collectives.
#define TPETRA_EFFICIENCY_WARNING(throw_exception_test, Exception, msg)
Print or throw an efficiency warning.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the range of this graph.
Replace old values with zero.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void modify_host()
Mark data as modified on the host side.
bool isLocallyComplete() const
Is this Export or Import locally complete?
local_matrix_type::values_type getLocalValuesView() const
Get the Kokkos local values.
RowInfo getRowInfo(const local_ordinal_type myRow) const
Get information about the locally owned row with local index myRow.
ProfileType getProfileType() const
Returns the ProfileType of the matrix, which indicates whether the matrix was allocated with static data structures.
void getGlobalRowView(GlobalOrdinal GlobalRow, Teuchos::ArrayView< const GlobalOrdinal > &indices, Teuchos::ArrayView< const Scalar > &values) const override
Get a constant, nonpersisting view of a row of this matrix, using global row and column indices...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap) override
Remove processes owning zero rows from the Maps and their communicator.
LocalOrdinal getLocalElement(GlobalOrdinal globalIndex) const
The local index corresponding to the given global index.
virtual bool checkSizes(const SrcDistObject &source) override
Compare the source and target (this) objects for compatibility.
local_map_type getLocalMap() const
Get the local Map for Kokkos kernels.
void insertLocalValues(const LocalOrdinal localRow, const Teuchos::ArrayView< const LocalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using local column indices.
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2)
Sort the first array, and apply the resulting permutation to the second array.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
A parallel distribution of indices over processes.
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object ("forward mode").
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Teuchos::ArrayView< const impl_scalar_type > getView(RowInfo rowinfo) const
Constant view of all entries (including extra space) in the given row.
Teuchos::ArrayView< typename DualViewType::t_dev::value_type > getArrayViewFromDualView(const DualViewType &x)
Get a Teuchos::ArrayView which views the host Kokkos::View of the input 1-D Kokkos::DualView.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, execution_space, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void setAllValues(const typename local_matrix_type::row_map_type &ptr, const typename local_graph_type::entries_type::non_const_type &ind, const typename local_matrix_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
A read-only, row-oriented interface to a sparse matrix.
local_matrix_type getLocalMatrix() const
The local sparse matrix.
void getLocalRowView(LocalOrdinal LocalRow, Teuchos::ArrayView< const LocalOrdinal > &indices, Teuchos::ArrayView< const Scalar > &values) const override
Get a constant, nonpersisting view of a row of this matrix, using local row and column indices...
void rightScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the right with the given Vector.
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
Scalar operator()(const Scalar &x, const Scalar &y)
Return the maximum of the magnitudes (absolute values) of x and y.
A distributed dense vector.
Declaration of Tpetra::Details::iallreduce.
void reduce()
Sum values of a locally replicated multivector across all processes.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
std::shared_ptr< CommRequest > iallreduce(const InputViewType &sendbuf, const OutputViewType &recvbuf, const ::Teuchos::EReductionType op, const ::Teuchos::Comm< int > &comm)
Nonblocking all-reduce, for either rank-1 or rank-0 Kokkos::View objects.
KOKKOS_INLINE_FUNCTION void merge2(IT1 &indResultOut, IT2 &valResultOut, IT1 indBeg, IT1 indEnd, IT2 valBeg, IT2)
Merge values in place, additively, with the same index.
void applyTranspose(const MV &X_in, MV &Y_in, const Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Special case of apply() for mode != Teuchos::NO_TRANS.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
size_t getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const override
Number of entries in the sparse matrix in the given global row, on the calling (MPI) process...
size_t mergeRowIndicesAndValues(crs_graph_type &graph, const RowInfo &rowInfo)
Merge duplicate row indices in the given row, along with their corresponding values.
dual_view_type::t_dev getLocalViewDevice() const
A local Kokkos::View of device memory.
LocalOrdinal replaceGlobalValues(const GlobalOrdinal globalRow, const typename UnmanagedView< GlobalIndicesViewType >::type &inputInds, const typename UnmanagedView< ImplScalarViewType >::type &inputVals) const
Replace one or more entries' values, using global indices.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
void padCrsArrays(const RowPtr &rowPtrBeg, const RowPtr &rowPtrEnd, Indices &indices, const Padding &padding)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries...
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
void reorderedGaussSeidel(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &D, const Teuchos::ArrayView< LocalOrdinal > &rowIndices, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps) const
Reordered "Hybrid" Jacobi + (Gauss-Seidel or SOR) on $B = A X$.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Teuchos::ArrayView< impl_scalar_type > getViewNonConst(const RowInfo &rowinfo) const
Nonconst view of all entries (including extra space) in the given row.
Teuchos::RCP< const map_type > colMap_
The Map describing the distribution of columns of the graph.
LocalOrdinal local_ordinal_type
The type of each local index in the matrix.
mag_type getFrobeniusNorm() const override
Compute and return the Frobenius norm of the matrix.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRowMap() const =0
The Map that describes the distribution of rows over processes.
::Tpetra::Details::EStorageStatus storageStatus_
Status of the matrix's storage, when not in a fill-complete state.
void localApply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, const Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar &alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar &beta=Teuchos::ScalarTraits< Scalar >::zero()) const
Compute the local part of a sparse matrix-(Multi)Vector multiply.
LocalOrdinal getViewRawConst(const impl_scalar_type *&vals, LocalOrdinal &numEnt, const RowInfo &rowinfo) const
Const pointer to all entries (including extra space) in the given row.
bool isStorageOptimized() const
Returns true if storage has been optimized.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
void sync_device()
Synchronize to Device.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Teuchos::RCP< MV > exportMV_
Row Map MultiVector used in apply().
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
global_size_t getGlobalNumEntries() const override
The global number of entries in this matrix.