Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_CrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_CRSMATRIX_DEF_HPP
41 #define TPETRA_CRSMATRIX_DEF_HPP
42 
50 
51 #include "Tpetra_Import_Util.hpp"
52 #include "Tpetra_Import_Util2.hpp"
53 #include "Tpetra_RowMatrix.hpp"
54 #include "Tpetra_LocalCrsMatrixOperator.hpp"
55 
62 #include "Tpetra_Details_getDiagCopyWithoutOffsets.hpp"
70 #include "Tpetra_Details_packCrsMatrix.hpp"
71 #include "Tpetra_Details_unpackCrsMatrixAndCombine.hpp"
73 #include "Teuchos_FancyOStream.hpp"
74 #include "Teuchos_RCP.hpp"
75 #include "Teuchos_DataAccess.hpp"
76 #include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
77 #include "KokkosBlas1_scal.hpp"
78 #include "KokkosSparse_getDiagCopy.hpp"
79 #include "KokkosSparse_spmv.hpp"
80 
81 #include <memory>
82 #include <sstream>
83 #include <typeinfo>
84 #include <utility>
85 #include <vector>
86 
87 namespace Tpetra {
88 
89 namespace { // (anonymous)
90 
91  template<class T, class BinaryFunction>
92  T atomic_binary_function_update (volatile T* const dest,
93  const T& inputVal,
94  BinaryFunction f)
95  {
96  T oldVal = *dest;
97  T assume;
98 
99  // NOTE (mfh 30 Nov 2015) I do NOT need a fence here for IBM
100  // POWER architectures, because 'newval' depends on 'assume',
101  // which depends on 'oldVal', which depends on '*dest'. This
102  // sets up a chain of read dependencies that should ensure
103  // correct behavior given a sane memory model.
104  do {
105  assume = oldVal;
106  T newVal = f (assume, inputVal);
107  oldVal = Kokkos::atomic_compare_exchange (dest, assume, newVal);
108  } while (assume != oldVal);
109 
110  return oldVal;
111  }
112 } // namespace (anonymous)
113 
114 //
115 // Users must never rely on anything in the Details namespace.
116 //
117 namespace Details {
118 
128 template<class Scalar>
129 struct AbsMax {
131  Scalar operator() (const Scalar& x, const Scalar& y) {
132  typedef Teuchos::ScalarTraits<Scalar> STS;
133  return std::max (STS::magnitude (x), STS::magnitude (y));
134  }
135 };
136 
137 } // namespace Details
138 } // namespace Tpetra
139 
140 namespace Tpetra {
141 
142  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
143  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
144  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
145  size_t maxNumEntriesPerRow,
146  const Teuchos::RCP<Teuchos::ParameterList>& params) :
147  dist_object_type (rowMap)
148  {
149  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, size_t "
150  "[, RCP<ParameterList>]): ";
151  Teuchos::RCP<crs_graph_type> graph;
152  try {
153  graph = Teuchos::rcp (new crs_graph_type (rowMap, maxNumEntriesPerRow,
154  params));
155  }
156  catch (std::exception& e) {
157  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
158  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
159  "size_t [, RCP<ParameterList>]) threw an exception: "
160  << e.what ());
161  }
162  // myGraph_ not null means that the matrix owns the graph. That's
163  // different than the const CrsGraph constructor, where the matrix
164  // does _not_ own the graph.
165  myGraph_ = graph;
166  staticGraph_ = myGraph_;
167  resumeFill (params);
169  }
170 
171  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
173  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
174  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
175  const Teuchos::RCP<Teuchos::ParameterList>& params) :
176  dist_object_type (rowMap)
177  {
178  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
179  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
180  Teuchos::RCP<crs_graph_type> graph;
181  try {
182  using Teuchos::rcp;
183  graph = rcp(new crs_graph_type(rowMap, numEntPerRowToAlloc,
184  params));
185  }
186  catch (std::exception& e) {
187  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
188  (true, std::runtime_error, "CrsGraph constructor "
189  "(RCP<const Map>, ArrayView<const size_t>"
190  "[, RCP<ParameterList>]) threw an exception: "
191  << e.what ());
192  }
193  // myGraph_ not null means that the matrix owns the graph. That's
194  // different than the const CrsGraph constructor, where the matrix
195  // does _not_ own the graph.
196  myGraph_ = graph;
197  staticGraph_ = graph;
198  resumeFill (params);
200  }
201 
202  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
204  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
205  const Teuchos::RCP<const map_type>& colMap,
206  const size_t maxNumEntPerRow,
207  const Teuchos::RCP<Teuchos::ParameterList>& params) :
208  dist_object_type (rowMap)
209  {
210  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
211  "RCP<const Map>, size_t[, RCP<ParameterList>]): ";
212  const char suffix[] =
213  " Please report this bug to the Tpetra developers.";
214 
215  // An artifact of debugging something a while back.
216  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
217  (! staticGraph_.is_null (), std::logic_error,
218  "staticGraph_ is not null at the beginning of the constructor."
219  << suffix);
220  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
221  (! myGraph_.is_null (), std::logic_error,
222  "myGraph_ is not null at the beginning of the constructor."
223  << suffix);
224  Teuchos::RCP<crs_graph_type> graph;
225  try {
226  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
227  maxNumEntPerRow,
228  params));
229  }
230  catch (std::exception& e) {
231  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
232  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
233  "RCP<const Map>, size_t[, RCP<ParameterList>]) threw an "
234  "exception: " << e.what ());
235  }
236  // myGraph_ not null means that the matrix owns the graph. That's
237  // different than the const CrsGraph constructor, where the matrix
238  // does _not_ own the graph.
239  myGraph_ = graph;
240  staticGraph_ = myGraph_;
241  resumeFill (params);
243  }
244 
245  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
247  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
248  const Teuchos::RCP<const map_type>& colMap,
249  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
250  const Teuchos::RCP<Teuchos::ParameterList>& params) :
251  dist_object_type (rowMap)
252  {
253  const char tfecfFuncName[] =
254  "CrsMatrix(RCP<const Map>, RCP<const Map>, "
255  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
256  Teuchos::RCP<crs_graph_type> graph;
257  try {
258  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
259  numEntPerRowToAlloc,
260  params));
261  }
262  catch (std::exception& e) {
263  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
264  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
265  "RCP<const Map>, ArrayView<const size_t>[, "
266  "RCP<ParameterList>]) threw an exception: " << e.what ());
267  }
268  // myGraph_ not null means that the matrix owns the graph. That's
269  // different than the const CrsGraph constructor, where the matrix
270  // does _not_ own the graph.
271  myGraph_ = graph;
272  staticGraph_ = graph;
273  resumeFill (params);
275  }
276 
277 
278  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
280  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
281  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
282  dist_object_type (graph->getRowMap ()),
283  staticGraph_ (graph),
284  storageStatus_ (Details::STORAGE_1D_PACKED)
285  {
286  using std::endl;
287  typedef typename local_matrix_device_type::values_type values_type;
288  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>[, "
289  "RCP<ParameterList>]): ";
290  const bool verbose = Details::Behavior::verbose("CrsMatrix");
291 
292  std::unique_ptr<std::string> prefix;
293  if (verbose) {
294  prefix = this->createPrefix("CrsMatrix", "CrsMatrix(graph,params)");
295  std::ostringstream os;
296  os << *prefix << "Start" << endl;
297  std::cerr << os.str ();
298  }
299 
300  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
301  (graph.is_null (), std::runtime_error, "Input graph is null.");
302  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
303  (! graph->isFillComplete (), std::runtime_error, "Input graph "
304  "is not fill complete. You must call fillComplete on the "
305  "graph before using it to construct a CrsMatrix. Note that "
306  "calling resumeFill on the graph makes it not fill complete, "
307  "even if you had previously called fillComplete. In that "
308  "case, you must call fillComplete on the graph again.");
309 
310  // The graph is fill complete, so it is locally indexed and has a
311  // fixed structure. This means we can allocate the (1-D) array of
312  // values and build the local matrix right now. Note that the
313  // local matrix's number of columns comes from the column Map, not
314  // the domain Map.
315 
316  const size_t numEnt = graph->lclIndsPacked_wdv.extent (0);
317  if (verbose) {
318  std::ostringstream os;
319  os << *prefix << "Allocate values: " << numEnt << endl;
320  std::cerr << os.str ();
321  }
322 
323  values_type val ("Tpetra::CrsMatrix::values", numEnt);
324  valuesPacked_wdv = values_wdv_type(val);
325  valuesUnpacked_wdv = valuesPacked_wdv;
326 
328 
329  if (verbose) {
330  std::ostringstream os;
331  os << *prefix << "Done" << endl;
332  std::cerr << os.str ();
333  }
334  }
335 
336  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
339  const Teuchos::RCP<const crs_graph_type>& graph,
340  const Teuchos::RCP<Teuchos::ParameterList>& params) :
341  dist_object_type (graph->getRowMap ()),
342  staticGraph_ (graph),
343  storageStatus_ (matrix.storageStatus_)
344  {
345  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
346  "local_matrix_device_type::values_type, "
347  "[,RCP<ParameterList>]): ";
348  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
349  (graph.is_null (), std::runtime_error, "Input graph is null.");
350  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
351  (! graph->isFillComplete (), std::runtime_error, "Input graph "
352  "is not fill complete. You must call fillComplete on the "
353  "graph before using it to construct a CrsMatrix. Note that "
354  "calling resumeFill on the graph makes it not fill complete, "
355  "even if you had previously called fillComplete. In that "
356  "case, you must call fillComplete on the graph again.");
357 
358  size_t numValuesPacked = graph->lclIndsPacked_wdv.extent(0);
359  valuesPacked_wdv = values_wdv_type(matrix.valuesPacked_wdv, 0, numValuesPacked);
360 
361  size_t numValuesUnpacked = graph->lclIndsUnpacked_wdv.extent(0);
362  valuesUnpacked_wdv = values_wdv_type(matrix.valuesUnpacked_wdv, 0, numValuesUnpacked);
363 
365  }
366 
367 
368  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
370  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
371  const typename local_matrix_device_type::values_type& values,
372  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
373  dist_object_type (graph->getRowMap ()),
374  staticGraph_ (graph),
375  storageStatus_ (Details::STORAGE_1D_PACKED)
376  {
377  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
378  "local_matrix_device_type::values_type, "
379  "[,RCP<ParameterList>]): ";
380  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
381  (graph.is_null (), std::runtime_error, "Input graph is null.");
382  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
383  (! graph->isFillComplete (), std::runtime_error, "Input graph "
384  "is not fill complete. You must call fillComplete on the "
385  "graph before using it to construct a CrsMatrix. Note that "
386  "calling resumeFill on the graph makes it not fill complete, "
387  "even if you had previously called fillComplete. In that "
388  "case, you must call fillComplete on the graph again.");
389 
390  // The graph is fill complete, so it is locally indexed and has a
391  // fixed structure. This means we can allocate the (1-D) array of
392  // values and build the local matrix right now. Note that the
393  // local matrix's number of columns comes from the column Map, not
394  // the domain Map.
395 
396  valuesPacked_wdv = values_wdv_type(values);
397  valuesUnpacked_wdv = valuesPacked_wdv;
398 
399  // FIXME (22 Jun 2016) I would very much like to get rid of
400  // k_values1D_ at some point. I find it confusing to have all
401  // these extra references lying around.
402  // KDDKDD ALMOST THERE, MARK!
403 // k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
404 
406  }
407 
408  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
410  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
411  const Teuchos::RCP<const map_type>& colMap,
412  const typename local_graph_device_type::row_map_type& rowPointers,
413  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
414  const typename local_matrix_device_type::values_type& values,
415  const Teuchos::RCP<Teuchos::ParameterList>& params) :
416  dist_object_type (rowMap),
417  storageStatus_ (Details::STORAGE_1D_PACKED)
418  {
419  using Details::getEntryOnHost;
420  using Teuchos::RCP;
421  using std::endl;
422  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
423  "RCP<const Map>, ptr, ind, val[, params]): ";
424  const char suffix[] =
425  ". Please report this bug to the Tpetra developers.";
426  const bool debug = Details::Behavior::debug("CrsMatrix");
427  const bool verbose = Details::Behavior::verbose("CrsMatrix");
428 
429  std::unique_ptr<std::string> prefix;
430  if (verbose) {
431  prefix = this->createPrefix(
432  "CrsMatrix", "CrsMatrix(rowMap,colMap,ptr,ind,val[,params])");
433  std::ostringstream os;
434  os << *prefix << "Start" << endl;
435  std::cerr << os.str ();
436  }
437 
438  // Check the user's input. Note that this might throw only on
439  // some processes but not others, causing deadlock. We prefer
440  // deadlock due to exceptions to segfaults, because users can
441  // catch exceptions.
442  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
443  (values.extent(0) != columnIndices.extent(0),
444  std::invalid_argument, "values.extent(0)=" << values.extent(0)
445  << " != columnIndices.extent(0) = " << columnIndices.extent(0)
446  << ".");
447  if (debug && rowPointers.extent(0) != 0) {
448  const size_t numEnt =
449  getEntryOnHost(rowPointers, rowPointers.extent(0) - 1);
450  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
451  (numEnt != size_t(columnIndices.extent(0)) ||
452  numEnt != size_t(values.extent(0)),
453  std::invalid_argument, "Last entry of rowPointers says that "
454  "the matrix has " << numEnt << " entr"
455  << (numEnt != 1 ? "ies" : "y") << ", but the dimensions of "
456  "columnIndices and values don't match this. "
457  "columnIndices.extent(0)=" << columnIndices.extent (0)
458  << " and values.extent(0)=" << values.extent (0) << ".");
459  }
460 
461  RCP<crs_graph_type> graph;
462  try {
463  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, rowPointers,
464  columnIndices, params));
465  }
466  catch (std::exception& e) {
467  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
468  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
469  "RCP<const Map>, ptr, ind[, params]) threw an exception: "
470  << e.what ());
471  }
472  // The newly created CrsGraph _must_ have a local graph at this
473  // point. We don't really care whether CrsGraph's constructor
474  // deep-copies or shallow-copies the input, but the dimensions
475  // have to be right. That's how we tell whether the CrsGraph has
476  // a local graph.
477  auto lclGraph = graph->getLocalGraphDevice ();
478  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
479  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
480  lclGraph.entries.extent (0) != columnIndices.extent (0),
481  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, "
482  "ind[, params]) did not set the local graph correctly." << suffix);
483  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
484  (lclGraph.entries.extent (0) != values.extent (0),
485  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, ind[, "
486  "params]) did not set the local graph correctly. "
487  "lclGraph.entries.extent(0) = " << lclGraph.entries.extent (0)
488  << " != values.extent(0) = " << values.extent (0) << suffix);
489 
490  // myGraph_ not null means that the matrix owns the graph. This
491  // is true because the column indices come in as nonconst,
492  // implying shared ownership.
493  myGraph_ = graph;
494  staticGraph_ = graph;
495 
496  // The graph may not be fill complete yet. However, it is locally
497  // indexed (since we have a column Map) and has a fixed structure
498  // (due to the input arrays). This means we can allocate the
499  // (1-D) array of values and build the local matrix right now.
500  // Note that the local matrix's number of columns comes from the
501  // column Map, not the domain Map.
502 
503  valuesPacked_wdv = values_wdv_type(values);
504  valuesUnpacked_wdv = valuesPacked_wdv;
505 
506  // FIXME (22 Jun 2016) I would very much like to get rid of
507  // k_values1D_ at some point. I find it confusing to have all
508  // these extra references lying around.
509 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
510 
512  if (verbose) {
513  std::ostringstream os;
514  os << *prefix << "Done" << endl;
515  std::cerr << os.str();
516  }
517  }
518 
519  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
521  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
522  const Teuchos::RCP<const map_type>& colMap,
523  const Teuchos::ArrayRCP<size_t>& ptr,
524  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
525  const Teuchos::ArrayRCP<Scalar>& val,
526  const Teuchos::RCP<Teuchos::ParameterList>& params) :
527  dist_object_type (rowMap),
528  storageStatus_ (Details::STORAGE_1D_PACKED)
529  {
530  using Kokkos::Compat::getKokkosViewDeepCopy;
531  using Teuchos::av_reinterpret_cast;
532  using Teuchos::RCP;
533  using values_type = typename local_matrix_device_type::values_type;
534  using IST = impl_scalar_type;
535  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
536  "RCP<const Map>, ptr, ind, val[, params]): ";
537 
538  RCP<crs_graph_type> graph;
539  try {
540  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, ptr,
541  ind, params));
542  }
543  catch (std::exception& e) {
544  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
545  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
546  "RCP<const Map>, ArrayRCP<size_t>, ArrayRCP<LocalOrdinal>[, "
547  "RCP<ParameterList>]) threw an exception: " << e.what ());
548  }
549  // myGraph_ not null means that the matrix owns the graph. This
550  // is true because the column indices come in as nonconst,
551  // implying shared ownership.
552  myGraph_ = graph;
553  staticGraph_ = graph;
554 
555  // The graph may not be fill complete yet. However, it is locally
556  // indexed (since we have a column Map) and has a fixed structure
557  // (due to the input arrays). This means we can allocate the
558  // (1-D) array of values and build the local matrix right now.
559  // Note that the local matrix's number of columns comes from the
560  // column Map, not the domain Map.
561 
562  // The graph _must_ have a local graph at this point. We don't
563  // really care whether CrsGraph's constructor deep-copies or
564  // shallow-copies the input, but the dimensions have to be right.
565  // That's how we tell whether the CrsGraph has a local graph.
566  auto lclGraph = staticGraph_->getLocalGraphDevice ();
567  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
568  (size_t (lclGraph.row_map.extent (0)) != size_t (ptr.size ()) ||
569  size_t (lclGraph.entries.extent (0)) != size_t (ind.size ()),
570  std::logic_error, "CrsGraph's constructor (rowMap, colMap, "
571  "ptr, ind[, params]) did not set the local graph correctly. "
572  "Please report this bug to the Tpetra developers.");
573 
574  values_type valIn =
575  getKokkosViewDeepCopy<device_type> (av_reinterpret_cast<IST> (val ()));
576  valuesPacked_wdv = values_wdv_type(valIn);
577  valuesUnpacked_wdv = valuesPacked_wdv;
578 
579  // FIXME (22 Jun 2016) I would very much like to get rid of
580  // k_values1D_ at some point. I find it confusing to have all
581  // these extra references lying around.
582 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
583 
585  }
586 
587  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
589  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
590  const Teuchos::RCP<const map_type>& colMap,
591  const local_matrix_device_type& lclMatrix,
592  const Teuchos::RCP<Teuchos::ParameterList>& params) :
593  dist_object_type (rowMap),
594  storageStatus_ (Details::STORAGE_1D_PACKED),
595  fillComplete_ (true)
596  {
597  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
598  "RCP<const Map>, local_matrix_device_type[, RCP<ParameterList>]): ";
599  const char suffix[] =
600  " Please report this bug to the Tpetra developers.";
601 
602  Teuchos::RCP<crs_graph_type> graph;
603  try {
604  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
605  lclMatrix.graph, params));
606  }
607  catch (std::exception& e) {
608  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
609  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
610  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) threw an "
611  "exception: " << e.what ());
612  }
613  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
614  (!graph->isFillComplete (), std::logic_error, "CrsGraph constructor (RCP"
615  "<const Map>, RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) "
616  "did not produce a fill-complete graph. Please report this bug to the "
617  "Tpetra developers.");
618  // myGraph_ not null means that the matrix owns the graph. This
619  // is true because the column indices come in as nonconst through
620  // the matrix, implying shared ownership.
621  myGraph_ = graph;
622  staticGraph_ = graph;
623 
624  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
625  valuesUnpacked_wdv = valuesPacked_wdv;
626 
627  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
628  (isFillActive (), std::logic_error,
629  "At the end of a CrsMatrix constructor that should produce "
630  "a fillComplete matrix, isFillActive() is true." << suffix);
631  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
632  (! isFillComplete (), std::logic_error, "At the end of a "
633  "CrsMatrix constructor that should produce a fillComplete "
634  "matrix, isFillComplete() is false." << suffix);
636  }
637 
638  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
641  const Teuchos::RCP<const map_type>& rowMap,
642  const Teuchos::RCP<const map_type>& colMap,
643  const Teuchos::RCP<const map_type>& domainMap,
644  const Teuchos::RCP<const map_type>& rangeMap,
645  const Teuchos::RCP<Teuchos::ParameterList>& params) :
646  dist_object_type (rowMap),
647  storageStatus_ (Details::STORAGE_1D_PACKED),
648  fillComplete_ (true)
649  {
650  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
651  "RCP<const Map>, RCP<const Map>, RCP<const Map>, "
652  "local_matrix_device_type[, RCP<ParameterList>]): ";
653  const char suffix[] =
654  " Please report this bug to the Tpetra developers.";
655 
656  Teuchos::RCP<crs_graph_type> graph;
657  try {
658  graph = Teuchos::rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
659  domainMap, rangeMap, params));
660  }
661  catch (std::exception& e) {
662  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
663  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
664  "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_device_type[, "
665  "RCP<ParameterList>]) threw an exception: " << e.what ());
666  }
667  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
668  (! graph->isFillComplete (), std::logic_error, "CrsGraph "
669  "constructor (RCP<const Map>, RCP<const Map>, RCP<const Map>, "
670  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) did "
671  "not produce a fillComplete graph." << suffix);
672  // myGraph_ not null means that the matrix owns the graph. This
673  // is true because the column indices come in as nonconst through
674  // the matrix, implying shared ownership.
675  myGraph_ = graph;
676  staticGraph_ = graph;
677 
678  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
679  valuesUnpacked_wdv = valuesPacked_wdv;
680 
681  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
682  (isFillActive (), std::logic_error,
683  "At the end of a CrsMatrix constructor that should produce "
684  "a fillComplete matrix, isFillActive() is true." << suffix);
685  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
686  (! isFillComplete (), std::logic_error, "At the end of a "
687  "CrsMatrix constructor that should produce a fillComplete "
688  "matrix, isFillComplete() is false." << suffix);
690  }
691 
692  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
695  const Teuchos::RCP<const map_type>& rowMap,
696  const Teuchos::RCP<const map_type>& colMap,
697  const Teuchos::RCP<const map_type>& domainMap,
698  const Teuchos::RCP<const map_type>& rangeMap,
699  const Teuchos::RCP<const import_type>& importer,
700  const Teuchos::RCP<const export_type>& exporter,
701  const Teuchos::RCP<Teuchos::ParameterList>& params) :
702  dist_object_type (rowMap),
703  storageStatus_ (Details::STORAGE_1D_PACKED),
704  fillComplete_ (true)
705  {
706  using Teuchos::rcp;
707  const char tfecfFuncName[] = "Tpetra::CrsMatrix"
708  "(lclMat,Map,Map,Map,Map,Import,Export,params): ";
709  const char suffix[] =
710  " Please report this bug to the Tpetra developers.";
711 
712  Teuchos::RCP<crs_graph_type> graph;
713  try {
714  graph = rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
715  domainMap, rangeMap, importer,
716  exporter, params));
717  }
718  catch (std::exception& e) {
719  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
720  (true, std::runtime_error, "CrsGraph constructor "
721  "(local_graph_device_type, Map, Map, Map, Map, Import, Export, "
722  "params) threw: " << e.what ());
723  }
724  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
725  (!graph->isFillComplete (), std::logic_error, "CrsGraph "
726  "constructor (local_graph_device_type, Map, Map, Map, Map, Import, "
727  "Export, params) did not produce a fill-complete graph. "
728  "Please report this bug to the Tpetra developers.");
729  // myGraph_ not null means that the matrix owns the graph. This
730  // is true because the column indices come in as nonconst through
731  // the matrix, implying shared ownership.
732  myGraph_ = graph;
733  staticGraph_ = graph;
734 
735  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
736  valuesUnpacked_wdv = valuesPacked_wdv;
737 
738  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
739  (isFillActive (), std::logic_error,
740  "At the end of a CrsMatrix constructor that should produce "
741  "a fillComplete matrix, isFillActive() is true." << suffix);
742  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
743  (! isFillComplete (), std::logic_error, "At the end of a "
744  "CrsMatrix constructor that should produce a fillComplete "
745  "matrix, isFillComplete() is false." << suffix);
747  }
748 
749  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
752  const Teuchos::DataAccess copyOrView):
753  dist_object_type (source.getCrsGraph()->getRowMap ()),
754  staticGraph_ (source.getCrsGraph()),
755  storageStatus_ (source.storageStatus_)
756  {
757  const char tfecfFuncName[] = "Tpetra::CrsMatrix("
758  "const CrsMatrix&, const Teuchos::DataAccess): ";
759  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
760  (! source.isFillComplete (), std::invalid_argument,
761  "Source graph must be fillComplete().");
762 
763  if (copyOrView == Teuchos::Copy) {
764  using values_type = typename local_matrix_device_type::values_type;
765  auto vals = source.getLocalValuesDevice (Access::ReadOnly);
766  using Kokkos::view_alloc;
767  using Kokkos::WithoutInitializing;
768  values_type newvals (view_alloc ("val", WithoutInitializing),
769  vals.extent (0));
770  // DEEP_COPY REVIEW - DEVICE-TO_DEVICE
771  Kokkos::deep_copy (newvals, vals);
772  valuesPacked_wdv = values_wdv_type(newvals);
773  valuesUnpacked_wdv = valuesPacked_wdv;
774  fillComplete (source.getDomainMap (), source.getRangeMap ());
775  }
776  else if (copyOrView == Teuchos::View) {
777  valuesPacked_wdv = values_wdv_type(source.valuesPacked_wdv);
778  valuesUnpacked_wdv = values_wdv_type(source.valuesUnpacked_wdv);
779  fillComplete (source.getDomainMap (), source.getRangeMap ());
780  }
781  else {
782  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
783  (true, std::invalid_argument, "Second argument 'copyOrView' "
784  "has an invalid value " << copyOrView << ". Valid values "
785  "include Teuchos::Copy = " << Teuchos::Copy << " and "
786  "Teuchos::View = " << Teuchos::View << ".");
787  }
789  }
790 
791  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
792  void
795  {
796  std::swap(crs_matrix.importMV_, this->importMV_);
797  std::swap(crs_matrix.exportMV_, this->exportMV_);
798  std::swap(crs_matrix.staticGraph_, this->staticGraph_);
799  std::swap(crs_matrix.myGraph_, this->myGraph_);
800  std::swap(crs_matrix.valuesPacked_wdv, this->valuesPacked_wdv);
801  std::swap(crs_matrix.valuesUnpacked_wdv, this->valuesUnpacked_wdv);
802  std::swap(crs_matrix.storageStatus_, this->storageStatus_);
803  std::swap(crs_matrix.fillComplete_, this->fillComplete_);
804  std::swap(crs_matrix.nonlocals_, this->nonlocals_);
805  }
806 
807  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
808  Teuchos::RCP<const Teuchos::Comm<int> >
810  getComm () const {
811  return getCrsGraphRef ().getComm ();
812  }
813 
814  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
815  bool
817  isFillComplete () const {
818  return fillComplete_;
819  }
820 
821  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
822  bool
824  isFillActive () const {
825  return ! fillComplete_;
826  }
827 
828  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
829  bool
832  return this->getCrsGraphRef ().isStorageOptimized ();
833  }
834 
835  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
836  bool
839  return getCrsGraphRef ().isLocallyIndexed ();
840  }
841 
842  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
843  bool
846  return getCrsGraphRef ().isGloballyIndexed ();
847  }
848 
849  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
850  bool
852  hasColMap () const {
853  return getCrsGraphRef ().hasColMap ();
854  }
855 
856  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
860  return getCrsGraphRef ().getGlobalNumEntries ();
861  }
862 
863  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
864  size_t
867  return getCrsGraphRef ().getLocalNumEntries ();
868  }
869 
870  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
874  return getCrsGraphRef ().getGlobalNumRows ();
875  }
876 
877  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
881  return getCrsGraphRef ().getGlobalNumCols ();
882  }
883 
884  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
885  size_t
888  return getCrsGraphRef ().getLocalNumRows ();
889  }
890 
891 
892  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
893  size_t
896  return getCrsGraphRef ().getLocalNumCols ();
897  }
898 
899 
900  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
901  size_t
903  getNumEntriesInGlobalRow (GlobalOrdinal globalRow) const {
904  return getCrsGraphRef ().getNumEntriesInGlobalRow (globalRow);
905  }
906 
907  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
908  size_t
910  getNumEntriesInLocalRow (LocalOrdinal localRow) const {
911  return getCrsGraphRef ().getNumEntriesInLocalRow (localRow);
912  }
913 
914  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
915  size_t
918  return getCrsGraphRef ().getGlobalMaxNumRowEntries ();
919  }
920 
921  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
922  size_t
925  return getCrsGraphRef ().getLocalMaxNumRowEntries ();
926  }
927 
928  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
929  GlobalOrdinal
931  getIndexBase () const {
932  return getRowMap ()->getIndexBase ();
933  }
934 
935  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
936  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
938  getRowMap () const {
939  return getCrsGraphRef ().getRowMap ();
940  }
941 
942  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
943  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
945  getColMap () const {
946  return getCrsGraphRef ().getColMap ();
947  }
948 
949  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
950  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
952  getDomainMap () const {
953  return getCrsGraphRef ().getDomainMap ();
954  }
955 
956  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
957  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
959  getRangeMap () const {
960  return getCrsGraphRef ().getRangeMap ();
961  }
962 
963  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
964  Teuchos::RCP<const RowGraph<LocalOrdinal, GlobalOrdinal, Node> >
966  getGraph () const {
967  if (staticGraph_ != Teuchos::null) {
968  return staticGraph_;
969  }
970  return myGraph_;
971  }
972 
973  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
974  Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
976  getCrsGraph () const {
977  if (staticGraph_ != Teuchos::null) {
978  return staticGraph_;
979  }
980  return myGraph_;
981  }
982 
983  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
986  getCrsGraphRef () const
987  {
988 #ifdef HAVE_TPETRA_DEBUG
989  constexpr bool debug = true;
990 #else
991  constexpr bool debug = false;
992 #endif // HAVE_TPETRA_DEBUG
993 
994  if (! this->staticGraph_.is_null ()) {
995  return * (this->staticGraph_);
996  }
997  else {
998  if (debug) {
999  const char tfecfFuncName[] = "getCrsGraphRef: ";
1000  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1001  (this->myGraph_.is_null (), std::logic_error,
1002  "Both staticGraph_ and myGraph_ are null. "
1003  "Please report this bug to the Tpetra developers.");
1004  }
1005  return * (this->myGraph_);
1006  }
1007  }
1008 
1009  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1010  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type
1013  {
1014  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
1015  return local_matrix_device_type("Tpetra::CrsMatrix::lclMatrixDevice",
1016  numCols,
1017  valuesPacked_wdv.getDeviceView(Access::ReadWrite),
1018  staticGraph_->getLocalGraphDevice());
1019  }
1020 
1021  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1022  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type
1024  getLocalMatrixHost () const
1025  {
1026  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
1027  return local_matrix_host_type("Tpetra::CrsMatrix::lclMatrixHost", numCols,
1028  valuesPacked_wdv.getHostView(Access::ReadWrite),
1029  staticGraph_->getLocalGraphHost());
1030  }
1031 
1032 #if KOKKOSKERNELS_VERSION < 40299
1033 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1034  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1035  std::shared_ptr<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_multiply_op_type>
1038  {
1039  auto localMatrix = getLocalMatrixDevice();
1040 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) || defined(KOKKOSKERNELS_ENABLE_TPL_MKL)
1041  if(this->getLocalNumEntries() <= size_t(Teuchos::OrdinalTraits<LocalOrdinal>::max()))
1042  {
1043  if(this->ordinalRowptrs.data() == nullptr)
1044  {
1045  auto originalRowptrs = localMatrix.graph.row_map;
1046  //create LocalOrdinal-typed copy of the local graph's rowptrs.
1047  //This enables the LocalCrsMatrixOperator to use cuSPARSE SpMV.
1048  this->ordinalRowptrs = ordinal_rowptrs_type(
1049  Kokkos::ViewAllocateWithoutInitializing("CrsMatrix::ordinalRowptrs"), originalRowptrs.extent(0));
1050  auto ordinalRowptrs_ = this->ordinalRowptrs; //don't want to capture 'this'
1051  Kokkos::parallel_for("CrsMatrix::getLocalMultiplyOperator::convertRowptrs",
1052  Kokkos::RangePolicy<execution_space>(0, originalRowptrs.extent(0)),
1053  KOKKOS_LAMBDA(LocalOrdinal i)
1054  {
1055  ordinalRowptrs_(i) = originalRowptrs(i);
1056  });
1057  }
1058  //return local operator using ordinalRowptrs
1059  return std::make_shared<local_multiply_op_type>(
1060  std::make_shared<local_matrix_device_type>(localMatrix), this->ordinalRowptrs);
1061  }
1062 #endif
1063 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1064  return std::make_shared<local_multiply_op_type>(
1065  std::make_shared<local_matrix_device_type>(localMatrix));
1066  }
1067 #endif
1068 
1069  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1070  bool
1072  isStaticGraph () const {
1073  return myGraph_.is_null ();
1074  }
1075 
1076  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1077  bool
1080  return true;
1081  }
1082 
1083  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1084  bool
1087  return true;
1088  }
1089 
1090  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1091  void
1093  allocateValues (ELocalGlobal lg, GraphAllocationStatus gas,
1094  const bool verbose)
1095  {
1096  using Details::Behavior;
1098  using std::endl;
1099  const char tfecfFuncName[] = "allocateValues: ";
1100  const char suffix[] =
1101  " Please report this bug to the Tpetra developers.";
1102  ProfilingRegion region("Tpetra::CrsMatrix::allocateValues");
1103 
1104  std::unique_ptr<std::string> prefix;
1105  if (verbose) {
1106  prefix = this->createPrefix("CrsMatrix", "allocateValues");
1107  std::ostringstream os;
1108  os << *prefix << "lg: "
1109  << (lg == LocalIndices ? "Local" : "Global") << "Indices"
1110  << ", gas: Graph"
1111  << (gas == GraphAlreadyAllocated ? "Already" : "NotYet")
1112  << "Allocated" << endl;
1113  std::cerr << os.str();
1114  }
1115 
1116  const bool debug = Behavior::debug("CrsMatrix");
1117  if (debug) {
1118  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1119  (this->staticGraph_.is_null (), std::logic_error,
1120  "staticGraph_ is null." << suffix);
1121 
1122  // If the graph indices are already allocated, then gas should be
1123  // GraphAlreadyAllocated. Otherwise, gas should be
1124  // GraphNotYetAllocated.
1125  if ((gas == GraphAlreadyAllocated) !=
1126  staticGraph_->indicesAreAllocated ()) {
1127  const char err1[] = "The caller has asserted that the graph "
1128  "is ";
1129  const char err2[] = "already allocated, but the static graph "
1130  "says that its indices are ";
1131  const char err3[] = "already allocated. ";
1132  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1133  (gas == GraphAlreadyAllocated &&
1134  ! staticGraph_->indicesAreAllocated (), std::logic_error,
1135  err1 << err2 << "not " << err3 << suffix);
1136  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1137  (gas != GraphAlreadyAllocated &&
1138  staticGraph_->indicesAreAllocated (), std::logic_error,
1139  err1 << "not " << err2 << err3 << suffix);
1140  }
1141 
1142  // If the graph is unallocated, then it had better be a
1143  // matrix-owned graph. ("Matrix-owned graph" means that the
1144  // matrix gets to define the graph structure. If the CrsMatrix
1145  // constructor that takes an RCP<const CrsGraph> was used, then
1146  // the matrix does _not_ own the graph.)
1147  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1148  (! this->staticGraph_->indicesAreAllocated () &&
1149  this->myGraph_.is_null (), std::logic_error,
1150  "The static graph says that its indices are not allocated, "
1151  "but the graph is not owned by the matrix." << suffix);
1152  }
1153 
1154  if (gas == GraphNotYetAllocated) {
1155  if (debug) {
1156  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1157  (this->myGraph_.is_null (), std::logic_error,
1158  "gas = GraphNotYetAllocated, but myGraph_ is null." << suffix);
1159  }
1160  try {
1161  this->myGraph_->allocateIndices (lg, verbose);
1162  }
1163  catch (std::exception& e) {
1164  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1165  (true, std::runtime_error, "CrsGraph::allocateIndices "
1166  "threw an exception: " << e.what ());
1167  }
1168  catch (...) {
1169  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1170  (true, std::runtime_error, "CrsGraph::allocateIndices "
1171  "threw an exception not a subclass of std::exception.");
1172  }
1173  }
1174 
1175  // Allocate matrix values.
1176  const size_t lclTotalNumEntries = this->staticGraph_->getLocalAllocationSize();
1177  if (debug) {
1178  const size_t lclNumRows = this->staticGraph_->getLocalNumRows ();
1179  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1180  (this->staticGraph_->getRowPtrsUnpackedHost()(lclNumRows) != lclTotalNumEntries, std::logic_error,
1181  "length of staticGraph's lclIndsUnpacked does not match final entry of rowPtrsUnapcked_host." << suffix);
1182  }
1183 
1184  // Allocate array of (packed???) matrix values.
1185  using values_type = typename local_matrix_device_type::values_type;
1186  if (verbose) {
1187  std::ostringstream os;
1188  os << *prefix << "Allocate values_wdv: Pre "
1189  << valuesUnpacked_wdv.extent(0) << ", post "
1190  << lclTotalNumEntries << endl;
1191  std::cerr << os.str();
1192  }
1193 // this->k_values1D_ =
1194  valuesUnpacked_wdv = values_wdv_type(
1195  values_type("Tpetra::CrsMatrix::values",
1196  lclTotalNumEntries));
1197  }
1198 
1199 
1200  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1201  void
1203  fillLocalGraphAndMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1204  {
1206  using ::Tpetra::Details::getEntryOnHost;
1207  using Teuchos::arcp_const_cast;
1208  using Teuchos::Array;
1209  using Teuchos::ArrayRCP;
1210  using Teuchos::null;
1211  using Teuchos::RCP;
1212  using Teuchos::rcp;
1213  using std::endl;
1214  using row_map_type = typename local_graph_device_type::row_map_type;
1215  using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
1216  using values_type = typename local_matrix_device_type::values_type;
1217  Details::ProfilingRegion regionFLGAM
1218  ("Tpetra::CrsMatrix::fillLocalGraphAndMatrix");
1219 
1220  const char tfecfFuncName[] = "fillLocalGraphAndMatrix (called from "
1221  "fillComplete or expertStaticFillComplete): ";
1222  const char suffix[] =
1223  " Please report this bug to the Tpetra developers.";
1224  const bool debug = Details::Behavior::debug("CrsMatrix");
1225  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1226 
1227  std::unique_ptr<std::string> prefix;
1228  if (verbose) {
1229  prefix = this->createPrefix("CrsMatrix", "fillLocalGraphAndMatrix");
1230  std::ostringstream os;
1231  os << *prefix << endl;
1232  std::cerr << os.str ();
1233  }
1234 
1235  if (debug) {
1236  // fillComplete() only calls fillLocalGraphAndMatrix() if the
1237  // matrix owns the graph, which means myGraph_ is not null.
1238  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1239  (myGraph_.is_null (), std::logic_error, "The nonconst graph "
1240  "(myGraph_) is null. This means that the matrix has a "
1241  "const (a.k.a. \"static\") graph. fillComplete or "
1242  "expertStaticFillComplete should never call "
1243  "fillLocalGraphAndMatrix in that case." << suffix);
1244  }
1245 
1246  const size_t lclNumRows = this->getLocalNumRows ();
1247 
1248  // This method's goal is to fill in the three arrays (compressed
1249  // sparse row format) that define the sparse graph's and matrix's
1250  // structure, and the sparse matrix's values.
1251  //
1252  // Get references to the data in myGraph_, so we can modify them
1253  // as well. Note that we only call fillLocalGraphAndMatrix() if
1254  // the matrix owns the graph, which means myGraph_ is not null.
1255 
1256  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1257  // See: https://github.com/trilinos/Trilinos/issues/12237
1258  //using row_entries_type = decltype (myGraph_->k_numRowEntries_);
1259  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1260 
1261  typename Graph::local_graph_device_type::row_map_type curRowOffsets =
1262  myGraph_->rowPtrsUnpacked_dev_;
1263 
1264  if (debug) {
1265  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1266  (curRowOffsets.extent (0) == 0, std::logic_error,
1267  "curRowOffsets.extent(0) == 0.");
1268  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1269  (curRowOffsets.extent (0) != lclNumRows + 1, std::logic_error,
1270  "curRowOffsets.extent(0) = "
1271  << curRowOffsets.extent (0) << " != lclNumRows + 1 = "
1272  << (lclNumRows + 1) << ".");
1273  const size_t numOffsets = curRowOffsets.extent (0);
1274  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1275  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1276  (numOffsets != 0 &&
1277  myGraph_->lclIndsUnpacked_wdv.extent (0) != valToCheck,
1278  std::logic_error, "numOffsets = " <<
1279  numOffsets << " != 0 and myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1280  << myGraph_->lclIndsUnpacked_wdv.extent (0) << " != curRowOffsets("
1281  << numOffsets << ") = " << valToCheck << ".");
1282  }
1283 
1284  if (myGraph_->getLocalNumEntries() !=
1285  myGraph_->getLocalAllocationSize()) {
1286 
1287  // Use the nonconst version of row_map_type for k_ptrs,
1288  // because row_map_type is const and we need to modify k_ptrs here.
1289  typename row_map_type::non_const_type k_ptrs;
1290  row_map_type k_ptrs_const;
1291  lclinds_1d_type k_inds;
1292  values_type k_vals;
1293 
1294  if (verbose) {
1295  std::ostringstream os;
1296  const auto numEnt = myGraph_->getLocalNumEntries();
1297  const auto allocSize = myGraph_->getLocalAllocationSize();
1298  os << *prefix << "Unpacked 1-D storage: numEnt=" << numEnt
1299  << ", allocSize=" << allocSize << endl;
1300  std::cerr << os.str ();
1301  }
1302  // The matrix's current 1-D storage is "unpacked." This means
1303  // the row offsets may differ from what the final row offsets
1304  // should be. This could happen, for example, if the user
1305  // set an upper
1306  // bound on the number of entries per row, but didn't fill all
1307  // those entries.
1308  if (debug && curRowOffsets.extent (0) != 0) {
1309  const size_t numOffsets =
1310  static_cast<size_t> (curRowOffsets.extent (0));
1311  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1312  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1313  (static_cast<size_t> (valToCheck) !=
1314  static_cast<size_t> (valuesUnpacked_wdv.extent (0)),
1315  std::logic_error, "(unpacked branch) Before "
1316  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1317  << ") = " << valToCheck << " != valuesUnpacked_wdv.extent(0)"
1318  " = " << valuesUnpacked_wdv.extent (0) << ".");
1319  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1320  (static_cast<size_t> (valToCheck) !=
1321  static_cast<size_t> (myGraph_->lclIndsUnpacked_wdv.extent (0)),
1322  std::logic_error, "(unpacked branch) Before "
1323  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1324  << ") = " << valToCheck
1325  << " != myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1326  << myGraph_->lclIndsUnpacked_wdv.extent (0) << ".");
1327  }
1328  // Pack the row offsets into k_ptrs, by doing a sum-scan of
1329  // the array of valid entry counts per row.
1330 
1331  // Total number of entries in the matrix on the calling
1332  // process. We will compute this in the loop below. It's
1333  // cheap to compute and useful as a sanity check.
1334  size_t lclTotalNumEntries = 0;
1335  {
1336  // Allocate the packed row offsets array. We use a nonconst
1337  // temporary (packedRowOffsets) here, because k_ptrs is
1338  // const. We will assign packedRowOffsets to k_ptrs below.
1339  if (verbose) {
1340  std::ostringstream os;
1341  os << *prefix << "Allocate packed row offsets: "
1342  << (lclNumRows+1) << endl;
1343  std::cerr << os.str ();
1344  }
1345  typename row_map_type::non_const_type
1346  packedRowOffsets ("Tpetra::CrsGraph::ptr", lclNumRows + 1);
1347  typename row_entries_type::const_type numRowEnt_h =
1348  myGraph_->k_numRowEntries_;
1349  // We're computing offsets on device. This function can
1350  // handle numRowEnt_h being a host View.
1351  lclTotalNumEntries =
1352  computeOffsetsFromCounts (packedRowOffsets, numRowEnt_h);
1353  // packedRowOffsets is modifiable; k_ptrs isn't, so we have
1354  // to use packedRowOffsets in the loop above and assign here.
1355  k_ptrs = packedRowOffsets;
1356  k_ptrs_const = k_ptrs;
1357  }
1358 
1359  if (debug) {
1360  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1361  (static_cast<size_t> (k_ptrs.extent (0)) != lclNumRows + 1,
1362  std::logic_error,
1363  "(unpacked branch) After packing k_ptrs, "
1364  "k_ptrs.extent(0) = " << k_ptrs.extent (0) << " != "
1365  "lclNumRows+1 = " << (lclNumRows+1) << ".");
1366  const auto valToCheck = getEntryOnHost (k_ptrs, lclNumRows);
1367  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1368  (valToCheck != lclTotalNumEntries, std::logic_error,
1369  "(unpacked branch) After filling k_ptrs, "
1370  "k_ptrs(lclNumRows=" << lclNumRows << ") = " << valToCheck
1371  << " != total number of entries on the calling process = "
1372  << lclTotalNumEntries << ".");
1373  }
1374 
1375  // Allocate the arrays of packed column indices and values.
1376  if (verbose) {
1377  std::ostringstream os;
1378  os << *prefix << "Allocate packed local column indices: "
1379  << lclTotalNumEntries << endl;
1380  std::cerr << os.str ();
1381  }
1382  k_inds = lclinds_1d_type ("Tpetra::CrsGraph::lclInds", lclTotalNumEntries);
1383  if (verbose) {
1384  std::ostringstream os;
1385  os << *prefix << "Allocate packed values: "
1386  << lclTotalNumEntries << endl;
1387  std::cerr << os.str ();
1388  }
1389  k_vals = values_type ("Tpetra::CrsMatrix::values", lclTotalNumEntries);
1390 
1391  // curRowOffsets (myGraph_->rowPtrsUnpacked_) (???), lclIndsUnpacked_wdv,
1392  // and valuesUnpacked_wdv are currently unpacked. Pack them, using
1393  // the packed row offsets array k_ptrs that we created above.
1394  //
1395  // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
1396  // need to keep around the unpacked row offsets, column
1397  // indices, and values arrays.
1398 
1399  // Pack the column indices from unpacked lclIndsUnpacked_wdv into
1400  // packed k_inds. We will replace lclIndsUnpacked_wdv below.
1401  using inds_packer_type = pack_functor<
1402  typename Graph::local_graph_device_type::entries_type::non_const_type,
1403  typename Graph::local_inds_dualv_type::t_dev::const_type,
1404  typename Graph::local_graph_device_type::row_map_type::non_const_type,
1405  typename Graph::local_graph_device_type::row_map_type>;
1406  inds_packer_type indsPacker (
1407  k_inds,
1408  myGraph_->lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
1409  k_ptrs, curRowOffsets);
1410  using exec_space = typename decltype (k_inds)::execution_space;
1411  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1412  Kokkos::parallel_for
1413  ("Tpetra::CrsMatrix pack column indices",
1414  range_type (0, lclNumRows), indsPacker);
1415 
1416  // Pack the values from unpacked valuesUnpacked_wdv into packed
1417  // k_vals. We will replace valuesPacked_wdv below.
1418  using vals_packer_type = pack_functor<
1419  typename values_type::non_const_type,
1420  typename values_type::const_type,
1421  typename row_map_type::non_const_type,
1422  typename row_map_type::const_type>;
1423  vals_packer_type valsPacker (
1424  k_vals,
1425  this->valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1426  k_ptrs, curRowOffsets);
1427  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1428  range_type (0, lclNumRows), valsPacker);
1429 
1430  if (debug) {
1431  const char myPrefix[] = "(\"Optimize Storage\""
1432  "=true branch) After packing, ";
1433  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1434  (k_ptrs.extent (0) == 0, std::logic_error, myPrefix
1435  << "k_ptrs.extent(0) = 0. This probably means that "
1436  "rowPtrsUnpacked_ was never allocated.");
1437  if (k_ptrs.extent (0) != 0) {
1438  const size_t numOffsets (k_ptrs.extent (0));
1439  const auto valToCheck =
1440  getEntryOnHost (k_ptrs, numOffsets - 1);
1441  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1442  (size_t (valToCheck) != k_vals.extent (0),
1443  std::logic_error, myPrefix <<
1444  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1445  " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
1446  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1447  (size_t (valToCheck) != k_inds.extent (0),
1448  std::logic_error, myPrefix <<
1449  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1450  " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
1451  }
1452  }
1453  // Build the local graph.
1454  myGraph_->setRowPtrsPacked(k_ptrs_const);
1455  myGraph_->lclIndsPacked_wdv =
1456  typename crs_graph_type::local_inds_wdv_type(k_inds);
1457  valuesPacked_wdv = values_wdv_type(k_vals);
1458  }
1459  else { // We don't have to pack, so just set the pointers.
1460  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1461  // FIXME? This is already done in the graph fill call - need to avoid the memcpy to host
1462  myGraph_->rowPtrsPacked_dev_ = myGraph_->rowPtrsUnpacked_dev_;
1463  myGraph_->rowPtrsPacked_host_ = myGraph_->rowPtrsUnpacked_host_;
1464  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1465  myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
1466  valuesPacked_wdv = valuesUnpacked_wdv;
1467 
1468  if (verbose) {
1469  std::ostringstream os;
1470  os << *prefix << "Storage already packed: rowPtrsUnpacked_: "
1471  << myGraph_->getRowPtrsUnpackedHost().extent(0) << ", lclIndsUnpacked_wdv: "
1472  << myGraph_->lclIndsUnpacked_wdv.extent(0) << ", valuesUnpacked_wdv: "
1473  << valuesUnpacked_wdv.extent(0) << endl;
1474  std::cerr << os.str();
1475  }
1476 
1477  if (debug) {
1478  const char myPrefix[] =
1479  "(\"Optimize Storage\"=false branch) ";
1480  auto rowPtrsUnpackedHost = myGraph_->getRowPtrsUnpackedHost();
1481  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1482  (myGraph_->rowPtrsUnpacked_dev_.extent (0) == 0, std::logic_error, myPrefix
1483  << "myGraph->rowPtrsUnpacked_dev_.extent(0) = 0. This probably means "
1484  "that rowPtrsUnpacked_ was never allocated.");
1485  if (myGraph_->rowPtrsUnpacked_dev_.extent (0) != 0) {
1486  const size_t numOffsets = rowPtrsUnpackedHost.extent (0);
1487  const auto valToCheck = rowPtrsUnpackedHost(numOffsets - 1);
1488  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1489  (size_t (valToCheck) != valuesPacked_wdv.extent (0),
1490  std::logic_error, myPrefix <<
1491  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1492  << " != valuesPacked_wdv.extent(0) = "
1493  << valuesPacked_wdv.extent (0) << ".");
1494  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1495  (size_t (valToCheck) != myGraph_->lclIndsPacked_wdv.extent (0),
1496  std::logic_error, myPrefix <<
1497  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1498  << " != myGraph_->lclIndsPacked.extent(0) = "
1499  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1500  }
1501  }
1502  }
1503 
1504  if (debug) {
1505  const char myPrefix[] = "After packing, ";
1506  auto rowPtrsPackedHost = myGraph_->getRowPtrsPackedHost();
1507  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1508  (size_t (rowPtrsPackedHost.extent (0)) != size_t (lclNumRows + 1),
1509  std::logic_error, myPrefix << "myGraph_->rowPtrsPacked_host_.extent(0) = "
1510  << rowPtrsPackedHost.extent (0) << " != lclNumRows+1 = " <<
1511  (lclNumRows+1) << ".");
1512  if (rowPtrsPackedHost.extent (0) != 0) {
1513  const size_t numOffsets (rowPtrsPackedHost.extent (0));
1514  const size_t valToCheck = rowPtrsPackedHost(numOffsets-1);
1515  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1516  (valToCheck != size_t (valuesPacked_wdv.extent (0)),
1517  std::logic_error, myPrefix << "k_ptrs_const(" <<
1518  (numOffsets-1) << ") = " << valToCheck
1519  << " != valuesPacked_wdv.extent(0) = "
1520  << valuesPacked_wdv.extent (0) << ".");
1521  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1522  (valToCheck != size_t (myGraph_->lclIndsPacked_wdv.extent (0)),
1523  std::logic_error, myPrefix << "k_ptrs_const(" <<
1524  (numOffsets-1) << ") = " << valToCheck
1525  << " != myGraph_->lclIndsPacked_wdvk_inds.extent(0) = "
1526  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1527  }
1528  }
1529 
1530  // May we ditch the old allocations for the packed (and otherwise
1531  // "optimized") allocations, later in this routine? Optimize
1532  // storage if the graph is not static, or if the graph already has
1533  // optimized storage.
1534  const bool defaultOptStorage =
1535  ! isStaticGraph () || staticGraph_->isStorageOptimized ();
1536  const bool requestOptimizedStorage =
1537  (! params.is_null () &&
1538  params->get ("Optimize Storage", defaultOptStorage)) ||
1539  (params.is_null () && defaultOptStorage);
1540 
1541  // The graph has optimized storage when indices are allocated,
1542  // myGraph_->k_numRowEntries_ is empty, and there are more than
1543  // zero rows on this process.
1544  if (requestOptimizedStorage) {
1545  // Free the old, unpacked, unoptimized allocations.
1546  // Free graph data structures that are only needed for
1547  // unpacked 1-D storage.
1548  if (verbose) {
1549  std::ostringstream os;
1550  os << *prefix << "Optimizing storage: free k_numRowEntries_: "
1551  << myGraph_->k_numRowEntries_.extent(0) << endl;
1552  std::cerr << os.str();
1553  }
1554 
1555  myGraph_->k_numRowEntries_ = row_entries_type ();
1556 
1557  // Keep the new 1-D packed allocations.
1558  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1559  // We directly set the memory spaces to avoid a memcpy from device to host
1560  myGraph_->rowPtrsUnpacked_dev_ = myGraph_->rowPtrsPacked_dev_;
1561  myGraph_->rowPtrsUnpacked_host_ = myGraph_->rowPtrsPacked_host_;
1562  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1563  myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
1564  valuesUnpacked_wdv = valuesPacked_wdv;
1565 
1566  myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
1567  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1568  }
1569  else {
1570  if (verbose) {
1571  std::ostringstream os;
1572  os << *prefix << "User requested NOT to optimize storage"
1573  << endl;
1574  std::cerr << os.str();
1575  }
1576  }
1577  }
1578 
1579  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1580  void
1582  fillLocalMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1583  {
1584  using ::Tpetra::Details::ProfilingRegion;
1585  using Teuchos::ArrayRCP;
1586  using Teuchos::Array;
1587  using Teuchos::null;
1588  using Teuchos::RCP;
1589  using Teuchos::rcp;
1590  using std::endl;
1591  using row_map_type = typename Graph::local_graph_device_type::row_map_type;
1592  using non_const_row_map_type = typename row_map_type::non_const_type;
1593  using values_type = typename local_matrix_device_type::values_type;
1594  ProfilingRegion regionFLM("Tpetra::CrsMatrix::fillLocalMatrix");
1595  const size_t lclNumRows = getLocalNumRows();
1596 
1597  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1598  std::unique_ptr<std::string> prefix;
1599  if (verbose) {
1600  prefix = this->createPrefix("CrsMatrix", "fillLocalMatrix");
1601  std::ostringstream os;
1602  os << *prefix << "lclNumRows: " << lclNumRows << endl;
1603  std::cerr << os.str ();
1604  }
1605 
1606  // The goals of this routine are first, to allocate and fill
1607  // packed 1-D storage (see below for an explanation) in the vals
1608  // array, and second, to give vals to the local matrix and
1609  // finalize the local matrix. We only need k_ptrs, the packed 1-D
1610  // row offsets, within the scope of this routine, since we're only
1611  // filling the local matrix here (use fillLocalGraphAndMatrix() to
1612  // fill both the graph and the matrix at the same time).
1613 
1614  // get data from staticGraph_
1615  size_t nodeNumEntries = staticGraph_->getLocalNumEntries ();
1616  size_t nodeNumAllocated = staticGraph_->getLocalAllocationSize ();
1617  row_map_type k_rowPtrs = staticGraph_->rowPtrsPacked_dev_;
1618 
1619  row_map_type k_ptrs; // "packed" row offsets array
1620  values_type k_vals; // "packed" values array
1621 
1622  // May we ditch the old allocations for the packed (and otherwise
1623  // "optimized") allocations, later in this routine? Request
1624  // optimized storage by default.
1625  bool requestOptimizedStorage = true;
1626  const bool default_OptimizeStorage =
1627  ! isStaticGraph() || staticGraph_->isStorageOptimized();
1628  if (! params.is_null() &&
1629  ! params->get("Optimize Storage", default_OptimizeStorage)) {
1630  requestOptimizedStorage = false;
1631  }
1632  // If we're not allowed to change a static graph, then we can't
1633  // change the storage of the matrix, either. This means that if
1634  // the graph's storage isn't already optimized, we can't optimize
1635  // the matrix's storage either. Check and give warning, as
1636  // appropriate.
1637  if (! staticGraph_->isStorageOptimized () &&
1638  requestOptimizedStorage) {
1640  (true, std::runtime_error, "You requested optimized storage "
1641  "by setting the \"Optimize Storage\" flag to \"true\" in "
1642  "the ParameterList, or by virtue of default behavior. "
1643  "However, the associated CrsGraph was filled separately and "
1644  "requested not to optimize storage. Therefore, the "
1645  "CrsMatrix cannot optimize storage.");
1646  requestOptimizedStorage = false;
1647  }
1648 
1649  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1650  // See: https://github.com/trilinos/Trilinos/issues/12237
1651  //using row_entries_type = decltype (staticGraph_->k_numRowEntries_);
1652  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1653 
1654  // The matrix's values are currently
1655  // stored in a 1-D format. However, this format is "unpacked";
1656  // it doesn't necessarily have the same row offsets as indicated
1657  // by the ptrs array returned by allocRowPtrs. This could
1658  // happen, for example, if the user
1659  // fixed the number of matrix entries in
1660  // each row, but didn't fill all those entries.
1661  //
1662  // As above, we don't need to keep the "packed" row offsets
1663  // array ptrs here, but we do need it here temporarily, so we
1664  // have to allocate it. We'll free ptrs later in this method.
1665  //
1666  // Note that this routine checks whether storage has already
1667  // been packed. This is a common case for solution of nonlinear
1668  // PDEs using the finite element method, as long as the
1669  // structure of the sparse matrix does not change between linear
1670  // solves.
1671  if (nodeNumEntries != nodeNumAllocated) {
1672  if (verbose) {
1673  std::ostringstream os;
1674  os << *prefix << "Unpacked 1-D storage: numEnt="
1675  << nodeNumEntries << ", allocSize=" << nodeNumAllocated
1676  << endl;
1677  std::cerr << os.str();
1678  }
1679  // We have to pack the 1-D storage, since the user didn't fill
1680  // up all requested storage.
1681  if (verbose) {
1682  std::ostringstream os;
1683  os << *prefix << "Allocate packed row offsets: "
1684  << (lclNumRows+1) << endl;
1685  std::cerr << os.str();
1686  }
1687  non_const_row_map_type tmpk_ptrs ("Tpetra::CrsGraph::ptr",
1688  lclNumRows+1);
1689  // Total number of entries in the matrix on the calling
1690  // process. We will compute this in the loop below. It's
1691  // cheap to compute and useful as a sanity check.
1692  size_t lclTotalNumEntries = 0;
1693  k_ptrs = tmpk_ptrs;
1694  {
1695  typename row_entries_type::const_type numRowEnt_h =
1696  staticGraph_->k_numRowEntries_;
1697  // This function can handle the counts being a host View.
1698  lclTotalNumEntries =
1699  Details::computeOffsetsFromCounts (tmpk_ptrs, numRowEnt_h);
1700  }
1701 
1702  // Allocate the "packed" values array.
1703  // It has exactly the right number of entries.
1704  if (verbose) {
1705  std::ostringstream os;
1706  os << *prefix << "Allocate packed values: "
1707  << lclTotalNumEntries << endl;
1708  std::cerr << os.str ();
1709  }
1710  k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1711 
1712  // Pack values_wdv into k_vals. We will replace values_wdv below.
1713  pack_functor<
1714  typename values_type::non_const_type,
1715  typename values_type::const_type,
1716  typename row_map_type::non_const_type,
1717  typename row_map_type::const_type> valsPacker
1718  (k_vals, valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1719  tmpk_ptrs, k_rowPtrs);
1720 
1721  using exec_space = typename decltype (k_vals)::execution_space;
1722  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1723  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1724  range_type (0, lclNumRows), valsPacker);
1725  valuesPacked_wdv = values_wdv_type(k_vals);
1726  }
1727  else { // We don't have to pack, so just set the pointer.
1728  valuesPacked_wdv = valuesUnpacked_wdv;
1729  if (verbose) {
1730  std::ostringstream os;
1731  os << *prefix << "Storage already packed: "
1732  << "valuesUnpacked_wdv: " << valuesUnpacked_wdv.extent(0) << endl;
1733  std::cerr << os.str();
1734  }
1735  }
1736 
1737  // May we ditch the old allocations for the packed one?
1738  if (requestOptimizedStorage) {
1739  // The user requested optimized storage, so we can dump the
1740  // unpacked 1-D storage, and keep the packed storage.
1741  valuesUnpacked_wdv = valuesPacked_wdv;
1742 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
1743  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1744  }
1745  }
1746 
1747  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1748  void
1750  insertIndicesAndValues (crs_graph_type& graph,
1751  RowInfo& rowInfo,
1752  const typename crs_graph_type::SLocalGlobalViews& newInds,
1753  const Teuchos::ArrayView<impl_scalar_type>& oldRowVals,
1754  const Teuchos::ArrayView<const impl_scalar_type>& newRowVals,
1755  const ELocalGlobal lg,
1756  const ELocalGlobal I)
1757  {
1758  const size_t oldNumEnt = rowInfo.numEntries;
1759  const size_t numInserted = graph.insertIndices (rowInfo, newInds, lg, I);
1760 
1761  // Use of memcpy here works around an issue with GCC >= 4.9.0,
1762  // that probably relates to scalar_type vs. impl_scalar_type
1763  // aliasing. See history of Tpetra_CrsGraph_def.hpp for
1764  // details; look for GCC_WORKAROUND macro definition.
1765  if (numInserted > 0) {
1766  const size_t startOffset = oldNumEnt;
1767  memcpy ((void*) &oldRowVals[startOffset], &newRowVals[0],
1768  numInserted * sizeof (impl_scalar_type));
1769  }
1770  }
1771 
1772  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1773  void
1775  insertLocalValues (const LocalOrdinal lclRow,
1776  const Teuchos::ArrayView<const LocalOrdinal>& indices,
1777  const Teuchos::ArrayView<const Scalar>& values,
1778  const CombineMode CM)
1779  {
1780  using std::endl;
1781  const char tfecfFuncName[] = "insertLocalValues: ";
1782 
1783  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1784  (! this->isFillActive (), std::runtime_error,
1785  "Fill is not active. After calling fillComplete, you must call "
1786  "resumeFill before you may insert entries into the matrix again.");
1787  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1788  (this->isStaticGraph (), std::runtime_error,
1789  "Cannot insert indices with static graph; use replaceLocalValues() "
1790  "instead.");
1791  // At this point, we know that myGraph_ is nonnull.
1792  crs_graph_type& graph = * (this->myGraph_);
1793  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1794  (graph.colMap_.is_null (), std::runtime_error,
1795  "Cannot insert local indices without a column map.");
1796  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1797  (graph.isGloballyIndexed (),
1798  std::runtime_error, "Graph indices are global; use "
1799  "insertGlobalValues().");
1800  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1801  (values.size () != indices.size (), std::runtime_error,
1802  "values.size() = " << values.size ()
1803  << " != indices.size() = " << indices.size () << ".");
1804  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1805  ! graph.rowMap_->isNodeLocalElement (lclRow), std::runtime_error,
1806  "Local row index " << lclRow << " does not belong to this process.");
1807 
1808  if (! graph.indicesAreAllocated ()) {
1809  // We only allocate values at most once per process, so it's OK
1810  // to check TPETRA_VERBOSE here.
1811  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1812  this->allocateValues (LocalIndices, GraphNotYetAllocated, verbose);
1813  }
1814 
1815 #ifdef HAVE_TPETRA_DEBUG
1816  const size_t numEntriesToAdd = static_cast<size_t> (indices.size ());
1817  // In a debug build, test whether any of the given column indices
1818  // are not in the column Map. Keep track of the invalid column
1819  // indices so we can tell the user about them.
1820  {
1821  using Teuchos::toString;
1822 
1823  const map_type& colMap = * (graph.colMap_);
1824  Teuchos::Array<LocalOrdinal> badColInds;
1825  bool allInColMap = true;
1826  for (size_t k = 0; k < numEntriesToAdd; ++k) {
1827  if (! colMap.isNodeLocalElement (indices[k])) {
1828  allInColMap = false;
1829  badColInds.push_back (indices[k]);
1830  }
1831  }
1832  if (! allInColMap) {
1833  std::ostringstream os;
1834  os << "You attempted to insert entries in owned row " << lclRow
1835  << ", at the following column indices: " << toString (indices)
1836  << "." << endl;
1837  os << "Of those, the following indices are not in the column Map on "
1838  "this process: " << toString (badColInds) << "." << endl << "Since "
1839  "the matrix has a column Map already, it is invalid to insert "
1840  "entries at those locations.";
1841  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1842  (true, std::invalid_argument, os.str ());
1843  }
1844  }
1845 #endif // HAVE_TPETRA_DEBUG
1846 
1847  RowInfo rowInfo = graph.getRowInfo (lclRow);
1848 
1849  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1850  if (CM == ADD) {
1851  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1852  valsView[offset] += values[k]; };
1853  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1854  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1855  } else if (CM == INSERT) {
1856  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1857  valsView[offset] = values[k]; };
1858  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1859  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1860  } else {
1861  std::ostringstream os;
1862  os << "You attempted to use insertLocalValues with CombineMode " << combineModeToString(CM)
1863  << "but this has not been implemented." << endl;
1864  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1865  (true, std::invalid_argument, os.str ());
1866  }
1867  }
1868 
1869  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1870  void
1872  insertLocalValues (const LocalOrdinal localRow,
1873  const LocalOrdinal numEnt,
1874  const Scalar vals[],
1875  const LocalOrdinal cols[],
1876  const CombineMode CM)
1877  {
1878  Teuchos::ArrayView<const LocalOrdinal> colsT (cols, numEnt);
1879  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
1880  this->insertLocalValues (localRow, colsT, valsT, CM);
1881  }
1882 
1883  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1884  void
1887  RowInfo& rowInfo,
1888  const GlobalOrdinal gblColInds[],
1889  const impl_scalar_type vals[],
1890  const size_t numInputEnt)
1891  {
1892 #ifdef HAVE_TPETRA_DEBUG
1893  const char tfecfFuncName[] = "insertGlobalValuesImpl: ";
1894  const size_t origNumEnt = graph.getNumEntriesInLocalRow (rowInfo.localRow);
1895  const size_t curNumEnt = rowInfo.numEntries;
1896 #endif // HAVE_TPETRA_DEBUG
1897 
1898  if (! graph.indicesAreAllocated ()) {
1899  // We only allocate values at most once per process, so it's OK
1900  // to check TPETRA_VERBOSE here.
1901  using ::Tpetra::Details::Behavior;
1902  const bool verbose = Behavior::verbose("CrsMatrix");
1903  this->allocateValues (GlobalIndices, GraphNotYetAllocated, verbose);
1904  // mfh 23 Jul 2017: allocateValues invalidates existing
1905  // getRowInfo results. Once we get rid of lazy graph
1906  // allocation, we'll be able to move the getRowInfo call outside
1907  // of this method.
1908  rowInfo = graph.getRowInfo (rowInfo.localRow);
1909  }
1910 
1911  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1912  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset){
1913  valsView[offset] += vals[k];
1914  };
1915  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1916 #ifdef HAVE_TPETRA_DEBUG
1917  //numInserted is only used inside the debug code below.
1918  auto numInserted =
1919 #endif
1920  graph.insertGlobalIndicesImpl(rowInfo, gblColInds, numInputEnt, cb);
1921 
1922 #ifdef HAVE_TPETRA_DEBUG
1923  size_t newNumEnt = curNumEnt + numInserted;
1924  const size_t chkNewNumEnt =
1925  graph.getNumEntriesInLocalRow (rowInfo.localRow);
1926  if (chkNewNumEnt != newNumEnt) {
1927  std::ostringstream os;
1928  os << std::endl << "newNumEnt = " << newNumEnt
1929  << " != graph.getNumEntriesInLocalRow(" << rowInfo.localRow
1930  << ") = " << chkNewNumEnt << "." << std::endl
1931  << "\torigNumEnt: " << origNumEnt << std::endl
1932  << "\tnumInputEnt: " << numInputEnt << std::endl
1933  << "\tgblColInds: [";
1934  for (size_t k = 0; k < numInputEnt; ++k) {
1935  os << gblColInds[k];
1936  if (k + size_t (1) < numInputEnt) {
1937  os << ",";
1938  }
1939  }
1940  os << "]" << std::endl
1941  << "\tvals: [";
1942  for (size_t k = 0; k < numInputEnt; ++k) {
1943  os << vals[k];
1944  if (k + size_t (1) < numInputEnt) {
1945  os << ",";
1946  }
1947  }
1948  os << "]" << std::endl;
1949 
1950  if (this->supportsRowViews ()) {
1951  values_host_view_type vals2;
1952  if (this->isGloballyIndexed ()) {
1953  global_inds_host_view_type gblColInds2;
1954  const GlobalOrdinal gblRow =
1955  graph.rowMap_->getGlobalElement (rowInfo.localRow);
1956  if (gblRow ==
1957  Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid ()) {
1958  os << "Local row index " << rowInfo.localRow << " is invalid!"
1959  << std::endl;
1960  }
1961  else {
1962  bool getViewThrew = false;
1963  try {
1964  this->getGlobalRowView (gblRow, gblColInds2, vals2);
1965  }
1966  catch (std::exception& e) {
1967  getViewThrew = true;
1968  os << "getGlobalRowView threw exception:" << std::endl
1969  << e.what () << std::endl;
1970  }
1971  if (! getViewThrew) {
1972  os << "\tNew global column indices: ";
1973  for (size_t jjj = 0; jjj < gblColInds2.extent(0); jjj++)
1974  os << gblColInds2[jjj] << " ";
1975  os << std::endl;
1976  os << "\tNew values: ";
1977  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1978  os << vals2[jjj] << " ";
1979  os << std::endl;
1980  }
1981  }
1982  }
1983  else if (this->isLocallyIndexed ()) {
1984  local_inds_host_view_type lclColInds2;
1985  this->getLocalRowView (rowInfo.localRow, lclColInds2, vals2);
1986  os << "\tNew local column indices: ";
1987  for (size_t jjj = 0; jjj < lclColInds2.extent(0); jjj++)
1988  os << lclColInds2[jjj] << " ";
1989  os << std::endl;
1990  os << "\tNew values: ";
1991  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1992  os << vals2[jjj] << " ";
1993  os << std::endl;
1994  }
1995  }
1996 
1997  os << "Please report this bug to the Tpetra developers.";
1998  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1999  (true, std::logic_error, os.str ());
2000  }
2001 #endif // HAVE_TPETRA_DEBUG
2002  }
2003 
2004  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2005  void
2007  insertGlobalValues (const GlobalOrdinal gblRow,
2008  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2009  const Teuchos::ArrayView<const Scalar>& values)
2010  {
2011  using Teuchos::toString;
2012  using std::endl;
2013  typedef impl_scalar_type IST;
2014  typedef LocalOrdinal LO;
2015  typedef GlobalOrdinal GO;
2016  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2017  typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
2018  const char tfecfFuncName[] = "insertGlobalValues: ";
2019 
2020 #ifdef HAVE_TPETRA_DEBUG
2021  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2022  (values.size () != indices.size (), std::runtime_error,
2023  "values.size() = " << values.size () << " != indices.size() = "
2024  << indices.size () << ".");
2025 #endif // HAVE_TPETRA_DEBUG
2026 
2027  // getRowMap() is not thread safe, because it increments RCP's
2028  // reference count. getCrsGraphRef() is thread safe.
2029  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2030  const LO lclRow = rowMap.getLocalElement (gblRow);
2031 
2032  if (lclRow == OTLO::invalid ()) {
2033  // Input row is _not_ owned by the calling process.
2034  //
2035  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2036  // is not in the row Map, it doesn't matter whether or not the
2037  // graph is static; the data just get stashed for later use by
2038  // globalAssemble().
2039  this->insertNonownedGlobalValues (gblRow, indices, values);
2040  }
2041  else { // Input row _is_ owned by the calling process
2042  if (this->isStaticGraph ()) {
2043  // Uh oh! Not allowed to insert into owned rows in that case.
2044  const int myRank = rowMap.getComm ()->getRank ();
2045  const int numProcs = rowMap.getComm ()->getSize ();
2046  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2047  (true, std::runtime_error,
2048  "The matrix was constructed with a constant (\"static\") graph, "
2049  "yet the given global row index " << gblRow << " is in the row "
2050  "Map on the calling process (with rank " << myRank << ", of " <<
2051  numProcs << " process(es)). In this case, you may not insert "
2052  "new entries into rows owned by the calling process.");
2053  }
2054 
2055  crs_graph_type& graph = * (this->myGraph_);
2056  const IST* const inputVals =
2057  reinterpret_cast<const IST*> (values.getRawPtr ());
2058  const GO* const inputGblColInds = indices.getRawPtr ();
2059  const size_t numInputEnt = indices.size ();
2060  RowInfo rowInfo = graph.getRowInfo (lclRow);
2061 
2062  // If the matrix has a column Map, check at this point whether
2063  // the column indices belong to the column Map.
2064  //
2065  // FIXME (mfh 16 May 2013) We may want to consider deferring the
2066  // test to the CrsGraph method, since it may have to do this
2067  // anyway.
2068  if (! graph.colMap_.is_null ()) {
2069  const map_type& colMap = * (graph.colMap_);
2070  // In a debug build, keep track of the nonowned ("bad") column
2071  // indices, so that we can display them in the exception
2072  // message. In a release build, just ditch the loop early if
2073  // we encounter a nonowned column index.
2074 #ifdef HAVE_TPETRA_DEBUG
2075  Teuchos::Array<GO> badColInds;
2076 #endif // HAVE_TPETRA_DEBUG
2077  const size_type numEntriesToInsert = indices.size ();
2078  bool allInColMap = true;
2079  for (size_type k = 0; k < numEntriesToInsert; ++k) {
2080  if (! colMap.isNodeGlobalElement (indices[k])) {
2081  allInColMap = false;
2082 #ifdef HAVE_TPETRA_DEBUG
2083  badColInds.push_back (indices[k]);
2084 #else
2085  break;
2086 #endif // HAVE_TPETRA_DEBUG
2087  }
2088  }
2089  if (! allInColMap) {
2090  std::ostringstream os;
2091  os << "You attempted to insert entries in owned row " << gblRow
2092  << ", at the following column indices: " << toString (indices)
2093  << "." << endl;
2094 #ifdef HAVE_TPETRA_DEBUG
2095  os << "Of those, the following indices are not in the column Map "
2096  "on this process: " << toString (badColInds) << "." << endl
2097  << "Since the matrix has a column Map already, it is invalid "
2098  "to insert entries at those locations.";
2099 #else
2100  os << "At least one of those indices is not in the column Map "
2101  "on this process." << endl << "It is invalid to insert into "
2102  "columns not in the column Map on the process that owns the "
2103  "row.";
2104 #endif // HAVE_TPETRA_DEBUG
2105  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2106  (true, std::invalid_argument, os.str ());
2107  }
2108  }
2109 
2110  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2111  inputVals, numInputEnt);
2112  }
2113  }
2114 
2115 
2116  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2117  void
2119  insertGlobalValues (const GlobalOrdinal globalRow,
2120  const LocalOrdinal numEnt,
2121  const Scalar vals[],
2122  const GlobalOrdinal inds[])
2123  {
2124  Teuchos::ArrayView<const GlobalOrdinal> indsT (inds, numEnt);
2125  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
2126  this->insertGlobalValues (globalRow, indsT, valsT);
2127  }
2128 
2129 
2130  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2131  void
2134  const GlobalOrdinal gblRow,
2135  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2136  const Teuchos::ArrayView<const Scalar>& values,
2137  const bool debug)
2138  {
2139  typedef impl_scalar_type IST;
2140  typedef LocalOrdinal LO;
2141  typedef GlobalOrdinal GO;
2142  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2143  const char tfecfFuncName[] = "insertGlobalValuesFiltered: ";
2144 
2145  if (debug) {
2146  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2147  (values.size () != indices.size (), std::runtime_error,
2148  "values.size() = " << values.size () << " != indices.size() = "
2149  << indices.size () << ".");
2150  }
2151 
2152  // getRowMap() is not thread safe, because it increments RCP's
2153  // reference count. getCrsGraphRef() is thread safe.
2154  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2155  const LO lclRow = rowMap.getLocalElement (gblRow);
2156  if (lclRow == OTLO::invalid ()) {
2157  // Input row is _not_ owned by the calling process.
2158  //
2159  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2160  // is not in the row Map, it doesn't matter whether or not the
2161  // graph is static; the data just get stashed for later use by
2162  // globalAssemble().
2163  this->insertNonownedGlobalValues (gblRow, indices, values);
2164  }
2165  else { // Input row _is_ owned by the calling process
2166  if (this->isStaticGraph ()) {
2167  // Uh oh! Not allowed to insert into owned rows in that case.
2168  const int myRank = rowMap.getComm ()->getRank ();
2169  const int numProcs = rowMap.getComm ()->getSize ();
2170  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2171  (true, std::runtime_error,
2172  "The matrix was constructed with a constant (\"static\") graph, "
2173  "yet the given global row index " << gblRow << " is in the row "
2174  "Map on the calling process (with rank " << myRank << ", of " <<
2175  numProcs << " process(es)). In this case, you may not insert "
2176  "new entries into rows owned by the calling process.");
2177  }
2178 
2179  crs_graph_type& graph = * (this->myGraph_);
2180  const IST* const inputVals =
2181  reinterpret_cast<const IST*> (values.getRawPtr ());
2182  const GO* const inputGblColInds = indices.getRawPtr ();
2183  const size_t numInputEnt = indices.size ();
2184  RowInfo rowInfo = graph.getRowInfo (lclRow);
2185 
2186  if (!graph.colMap_.is_null() && graph.isLocallyIndexed()) {
2187  // This branch is similar in function to the following branch, but for
2188  // the special case that the target graph is locally indexed.
2189  // In this case, we cannot simply filter
2190  // out global indices that don't exist on the receiving process and
2191  // insert the remaining (global) indices, but we must convert them (the
2192  // remaining global indices) to local and call `insertLocalValues`.
2193  const map_type& colMap = * (graph.colMap_);
2194  size_t curOffset = 0;
2195  while (curOffset < numInputEnt) {
2196  // Find a sequence of input indices that are in the column Map on the
2197  // calling process. Doing a sequence at a time, instead of one at a
2198  // time, amortizes some overhead.
2199  Teuchos::Array<LO> lclIndices;
2200  size_t endOffset = curOffset;
2201  for ( ; endOffset < numInputEnt; ++endOffset) {
2202  auto lclIndex = colMap.getLocalElement(inputGblColInds[endOffset]);
2203  if (lclIndex != OTLO::invalid())
2204  lclIndices.push_back(lclIndex);
2205  else
2206  break;
2207  }
2208  // curOffset, endOffset: half-exclusive range of indices in the column
2209  // Map on the calling process. If endOffset == curOffset, the range is
2210  // empty.
2211  const LO numIndInSeq = (endOffset - curOffset);
2212  if (numIndInSeq != 0) {
2213  this->insertLocalValues(lclRow, lclIndices(), values(curOffset, numIndInSeq));
2214  }
2215  // Invariant before the increment line: Either endOffset ==
2216  // numInputEnt, or inputGblColInds[endOffset] is not in the column Map
2217  // on the calling process.
2218  if (debug) {
2219  const bool invariant = endOffset == numInputEnt ||
2220  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2221  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2222  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2223  }
2224  curOffset = endOffset + 1;
2225  }
2226  }
2227  else if (! graph.colMap_.is_null ()) { // We have a column Map.
2228  const map_type& colMap = * (graph.colMap_);
2229  size_t curOffset = 0;
2230  while (curOffset < numInputEnt) {
2231  // Find a sequence of input indices that are in the column
2232  // Map on the calling process. Doing a sequence at a time,
2233  // instead of one at a time, amortizes some overhead.
2234  size_t endOffset = curOffset;
2235  for ( ; endOffset < numInputEnt &&
2236  colMap.getLocalElement (inputGblColInds[endOffset]) != OTLO::invalid ();
2237  ++endOffset)
2238  {}
2239  // curOffset, endOffset: half-exclusive range of indices in
2240  // the column Map on the calling process. If endOffset ==
2241  // curOffset, the range is empty.
2242  const LO numIndInSeq = (endOffset - curOffset);
2243  if (numIndInSeq != 0) {
2244  rowInfo = graph.getRowInfo(lclRow); // KDD 5/19 Need fresh RowInfo in each loop iteration
2245  this->insertGlobalValuesImpl (graph, rowInfo,
2246  inputGblColInds + curOffset,
2247  inputVals + curOffset,
2248  numIndInSeq);
2249  }
2250  // Invariant before the increment line: Either endOffset ==
2251  // numInputEnt, or inputGblColInds[endOffset] is not in the
2252  // column Map on the calling process.
2253  if (debug) {
2254  const bool invariant = endOffset == numInputEnt ||
2255  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2256  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2257  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2258  }
2259  curOffset = endOffset + 1;
2260  }
2261  }
2262  else { // we don't have a column Map.
2263  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2264  inputVals, numInputEnt);
2265  }
2266  }
2267  }
2268 
2269  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2270  void
2271  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2272  insertGlobalValuesFilteredChecked(
2273  const GlobalOrdinal gblRow,
2274  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2275  const Teuchos::ArrayView<const Scalar>& values,
2276  const char* const prefix,
2277  const bool debug,
2278  const bool verbose)
2279  {
2281  using std::endl;
2282 
2283  try {
2284  insertGlobalValuesFiltered(gblRow, indices, values, debug);
2285  }
2286  catch(std::exception& e) {
2287  std::ostringstream os;
2288  if (verbose) {
2289  const size_t maxNumToPrint =
2291  os << *prefix << ": insertGlobalValuesFiltered threw an "
2292  "exception: " << e.what() << endl
2293  << "Global row index: " << gblRow << endl;
2294  verbosePrintArray(os, indices, "Global column indices",
2295  maxNumToPrint);
2296  os << endl;
2297  verbosePrintArray(os, values, "Values", maxNumToPrint);
2298  os << endl;
2299  }
2300  else {
2301  os << ": insertGlobalValuesFiltered threw an exception: "
2302  << e.what();
2303  }
2304  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, os.str());
2305  }
2306  }
2307 
2308  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2309  LocalOrdinal
2312  const crs_graph_type& graph,
2313  const RowInfo& rowInfo,
2314  const LocalOrdinal inds[],
2315  const impl_scalar_type newVals[],
2316  const LocalOrdinal numElts)
2317  {
2318  typedef LocalOrdinal LO;
2319  typedef GlobalOrdinal GO;
2320  const bool sorted = graph.isSorted ();
2321 
2322  size_t hint = 0; // Guess for the current index k into rowVals
2323  LO numValid = 0; // number of valid local column indices
2324 
2325  if (graph.isLocallyIndexed ()) {
2326  // Get a view of the column indices in the row. This amortizes
2327  // the cost of getting the view over all the entries of inds.
2328  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2329 
2330  for (LO j = 0; j < numElts; ++j) {
2331  const LO lclColInd = inds[j];
2332  const size_t offset =
2333  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2334  lclColInd, hint, sorted);
2335  if (offset != rowInfo.numEntries) {
2336  rowVals[offset] = newVals[j];
2337  hint = offset + 1;
2338  ++numValid;
2339  }
2340  }
2341  }
2342  else if (graph.isGloballyIndexed ()) {
2343  if (graph.colMap_.is_null ()) {
2344  return Teuchos::OrdinalTraits<LO>::invalid ();
2345  }
2346  const map_type colMap = * (graph.colMap_);
2347 
2348  // Get a view of the column indices in the row. This amortizes
2349  // the cost of getting the view over all the entries of inds.
2350  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2351 
2352  for (LO j = 0; j < numElts; ++j) {
2353  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2354  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
2355  const size_t offset =
2356  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2357  gblColInd, hint, sorted);
2358  if (offset != rowInfo.numEntries) {
2359  rowVals[offset] = newVals[j];
2360  hint = offset + 1;
2361  ++numValid;
2362  }
2363  }
2364  }
2365  }
2366  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2367  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2368  // to be neither locally nor globally indexed on a process.
2369  // This means that the graph or matrix has no entries on that
2370  // process. Epetra also works like this. It's related to lazy
2371  // allocation (on first insertion, not at graph / matrix
2372  // construction). Lazy allocation will go away because it is
2373  // not thread scalable.
2374 
2375  return numValid;
2376  }
2377 
2378  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2379  LocalOrdinal
2381  replaceLocalValues (const LocalOrdinal localRow,
2382  const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
2383  const Teuchos::ArrayView<const Scalar>& vals)
2384  {
2385  typedef LocalOrdinal LO;
2386 
2387  const LO numInputEnt = static_cast<LO> (lclCols.size ());
2388  if (static_cast<LO> (vals.size ()) != numInputEnt) {
2389  return Teuchos::OrdinalTraits<LO>::invalid ();
2390  }
2391  const LO* const inputInds = lclCols.getRawPtr ();
2392  const Scalar* const inputVals = vals.getRawPtr ();
2393  return this->replaceLocalValues (localRow, numInputEnt,
2394  inputVals, inputInds);
2395  }
2396 
2397  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2399  local_ordinal_type
2402  const local_ordinal_type localRow,
2403  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2404  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2405  {
2406  using LO = local_ordinal_type;
2407  const LO numInputEnt = inputInds.extent(0);
2408  if (numInputEnt != static_cast<LO>(inputVals.extent(0))) {
2409  return Teuchos::OrdinalTraits<LO>::invalid();
2410  }
2411  const Scalar* const inVals =
2412  reinterpret_cast<const Scalar*>(inputVals.data());
2413  return this->replaceLocalValues(localRow, numInputEnt,
2414  inVals, inputInds.data());
2415  }
2416 
2417  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2418  LocalOrdinal
2420  replaceLocalValues (const LocalOrdinal localRow,
2421  const LocalOrdinal numEnt,
2422  const Scalar inputVals[],
2423  const LocalOrdinal inputCols[])
2424  {
2425  typedef impl_scalar_type IST;
2426  typedef LocalOrdinal LO;
2427 
2428  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2429  // Fill must be active and the "nonconst" graph must exist.
2430  return Teuchos::OrdinalTraits<LO>::invalid ();
2431  }
2432  const crs_graph_type& graph = * (this->staticGraph_);
2433  const RowInfo rowInfo = graph.getRowInfo (localRow);
2434 
2435  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2436  // The calling process does not own this row, so it is not
2437  // allowed to modify its values.
2438  return static_cast<LO> (0);
2439  }
2440  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2441  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2442  return this->replaceLocalValuesImpl (curRowVals.data (), graph, rowInfo,
2443  inputCols, inVals, numEnt);
2444  }
2445 
2446  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2447  LocalOrdinal
2450  const crs_graph_type& graph,
2451  const RowInfo& rowInfo,
2452  const GlobalOrdinal inds[],
2453  const impl_scalar_type newVals[],
2454  const LocalOrdinal numElts)
2455  {
2456  Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
2457  auto fun =
2458  [&](size_t const k, size_t const /*start*/, size_t const offset) {
2459  rowVals[offset] = newVals[k];
2460  };
2461  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2462  return graph.findGlobalIndices(rowInfo, indsT, cb);
2463  }
2464 
2465  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2466  LocalOrdinal
2468  replaceGlobalValues (const GlobalOrdinal globalRow,
2469  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2470  const Teuchos::ArrayView<const Scalar>& inputVals)
2471  {
2472  typedef LocalOrdinal LO;
2473 
2474  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2475  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2476  return Teuchos::OrdinalTraits<LO>::invalid ();
2477  }
2478  return this->replaceGlobalValues (globalRow, numInputEnt,
2479  inputVals.getRawPtr (),
2480  inputGblColInds.getRawPtr ());
2481  }
2482 
2483  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2484  LocalOrdinal
2486  replaceGlobalValues (const GlobalOrdinal globalRow,
2487  const LocalOrdinal numEnt,
2488  const Scalar inputVals[],
2489  const GlobalOrdinal inputGblColInds[])
2490  {
2491  typedef impl_scalar_type IST;
2492  typedef LocalOrdinal LO;
2493 
2494  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2495  // Fill must be active and the "nonconst" graph must exist.
2496  return Teuchos::OrdinalTraits<LO>::invalid ();
2497  }
2498  const crs_graph_type& graph = * (this->staticGraph_);
2499 
2500  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow);
2501  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2502  // The input local row is invalid on the calling process,
2503  // which means that the calling process summed 0 entries.
2504  return static_cast<LO> (0);
2505  }
2506 
2507  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2508  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2509  return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2510  inputGblColInds, inVals, numEnt);
2511  }
2512 
2513  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2515  local_ordinal_type
2518  const global_ordinal_type globalRow,
2519  const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2520  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2521  {
2522  // We use static_assert here to check the template parameters,
2523  // rather than std::enable_if (e.g., on the return value, to
2524  // enable compilation only if the template parameters match the
2525  // desired attributes). This turns obscure link errors into
2526  // clear compilation errors. It also makes the return value a
2527  // lot easier to see.
2528  using LO = local_ordinal_type;
2529  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2530  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2531  return Teuchos::OrdinalTraits<LO>::invalid();
2532  }
2533  const Scalar* const inVals =
2534  reinterpret_cast<const Scalar*>(inputVals.data());
2535  return this->replaceGlobalValues(globalRow, numInputEnt, inVals,
2536  inputInds.data());
2537  }
2538 
2539  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2540  LocalOrdinal
2543  const crs_graph_type& graph,
2544  const RowInfo& rowInfo,
2545  const GlobalOrdinal inds[],
2546  const impl_scalar_type newVals[],
2547  const LocalOrdinal numElts,
2548  const bool atomic)
2549  {
2550  typedef LocalOrdinal LO;
2551  typedef GlobalOrdinal GO;
2552 
2553  const bool sorted = graph.isSorted ();
2554 
2555  size_t hint = 0; // guess at the index's relative offset in the row
2556  LO numValid = 0; // number of valid input column indices
2557 
2558  if (graph.isLocallyIndexed ()) {
2559  // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2560  // pointer does NOT change its reference count. Thus, this
2561  // code is still thread safe.
2562  if (graph.colMap_.is_null ()) {
2563  // NO input column indices are valid in this case, since if
2564  // the column Map is null on the calling process, then the
2565  // calling process owns no graph entries.
2566  return numValid;
2567  }
2568  const map_type& colMap = * (graph.colMap_);
2569 
2570  // Get a view of the column indices in the row. This amortizes
2571  // the cost of getting the view over all the entries of inds.
2572  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2573  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2574 
2575  for (LO j = 0; j < numElts; ++j) {
2576  const LO lclColInd = colMap.getLocalElement (inds[j]);
2577  if (lclColInd != LINV) {
2578  const size_t offset =
2579  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2580  lclColInd, hint, sorted);
2581  if (offset != rowInfo.numEntries) {
2582  if (atomic) {
2583  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2584  }
2585  else {
2586  rowVals[offset] += newVals[j];
2587  }
2588  hint = offset + 1;
2589  numValid++;
2590  }
2591  }
2592  }
2593  }
2594  else if (graph.isGloballyIndexed ()) {
2595  // Get a view of the column indices in the row. This amortizes
2596  // the cost of getting the view over all the entries of inds.
2597  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2598 
2599  for (LO j = 0; j < numElts; ++j) {
2600  const GO gblColInd = inds[j];
2601  const size_t offset =
2602  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2603  gblColInd, hint, sorted);
2604  if (offset != rowInfo.numEntries) {
2605  if (atomic) {
2606  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2607  }
2608  else {
2609  rowVals[offset] += newVals[j];
2610  }
2611  hint = offset + 1;
2612  numValid++;
2613  }
2614  }
2615  }
2616  // If the graph is neither locally nor globally indexed on the
2617  // calling process, that means the calling process has no graph
2618  // entries. Thus, none of the input column indices are valid.
2619 
2620  return numValid;
2621  }
2622 
2623  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2624  LocalOrdinal
2626  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2627  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2628  const Teuchos::ArrayView<const Scalar>& inputVals,
2629  const bool atomic)
2630  {
2631  typedef LocalOrdinal LO;
2632 
2633  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2634  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2635  return Teuchos::OrdinalTraits<LO>::invalid ();
2636  }
2637  return this->sumIntoGlobalValues (gblRow, numInputEnt,
2638  inputVals.getRawPtr (),
2639  inputGblColInds.getRawPtr (),
2640  atomic);
2641  }
2642 
2643  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2644  LocalOrdinal
2646  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2647  const LocalOrdinal numInputEnt,
2648  const Scalar inputVals[],
2649  const GlobalOrdinal inputGblColInds[],
2650  const bool atomic)
2651  {
2652  typedef impl_scalar_type IST;
2653  typedef LocalOrdinal LO;
2654  typedef GlobalOrdinal GO;
2655 
2656  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2657  // Fill must be active and the "nonconst" graph must exist.
2658  return Teuchos::OrdinalTraits<LO>::invalid ();
2659  }
2660  const crs_graph_type& graph = * (this->staticGraph_);
2661 
2662  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2663  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2664  // mfh 23 Mar 2017, 26 Jul 2017: This branch may not be not
2665  // thread safe in a debug build, in part because it uses
2666  // Teuchos::ArrayView, and in part because of the data structure
2667  // used to stash outgoing entries.
2668  using Teuchos::ArrayView;
2669  ArrayView<const GO> inputGblColInds_av(
2670  numInputEnt == 0 ? nullptr : inputGblColInds,
2671  numInputEnt);
2672  ArrayView<const Scalar> inputVals_av(
2673  numInputEnt == 0 ? nullptr :
2674  inputVals, numInputEnt);
2675  // gblRow is not in the row Map on the calling process, so stash
2676  // the given entries away in a separate data structure.
2677  // globalAssemble() (called during fillComplete()) will exchange
2678  // that data and sum it in using sumIntoGlobalValues().
2679  this->insertNonownedGlobalValues (gblRow, inputGblColInds_av,
2680  inputVals_av);
2681  // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
2682  // since we won't know whether the given indices were valid
2683  // until globalAssemble (called in fillComplete) is called.
2684  // That's why insertNonownedGlobalValues doesn't return
2685  // anything. Just for consistency, I'll return the number of
2686  // entries that the user gave us.
2687  return numInputEnt;
2688  }
2689  else { // input row is in the row Map on the calling process
2690  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2691  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2692  return this->sumIntoGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2693  inputGblColInds, inVals,
2694  numInputEnt, atomic);
2695  }
2696  }
2697 
2698  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2699  LocalOrdinal
2701  transformLocalValues (const LocalOrdinal lclRow,
2702  const LocalOrdinal numInputEnt,
2703  const impl_scalar_type inputVals[],
2704  const LocalOrdinal inputCols[],
2705  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2706  const bool atomic)
2707  {
2708  using Tpetra::Details::OrdinalTraits;
2709  typedef LocalOrdinal LO;
2710 
2711  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2712  // Fill must be active and the "nonconst" graph must exist.
2713  return Teuchos::OrdinalTraits<LO>::invalid ();
2714  }
2715  const crs_graph_type& graph = * (this->staticGraph_);
2716  const RowInfo rowInfo = graph.getRowInfo (lclRow);
2717 
2718  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2719  // The calling process does not own this row, so it is not
2720  // allowed to modify its values.
2721  return static_cast<LO> (0);
2722  }
2723  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2724  return this->transformLocalValues (curRowVals.data (), graph,
2725  rowInfo, inputCols, inputVals,
2726  numInputEnt, f, atomic);
2727  }
2728 
2729  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2730  LocalOrdinal
2731  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2732  transformGlobalValues (const GlobalOrdinal gblRow,
2733  const LocalOrdinal numInputEnt,
2734  const impl_scalar_type inputVals[],
2735  const GlobalOrdinal inputCols[],
2736  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2737  const bool atomic)
2738  {
2739  using Tpetra::Details::OrdinalTraits;
2740  typedef LocalOrdinal LO;
2741 
2742  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2743  // Fill must be active and the "nonconst" graph must exist.
2744  return OrdinalTraits<LO>::invalid ();
2745  }
2746  const crs_graph_type& graph = * (this->staticGraph_);
2747  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2748 
2749  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2750  // The calling process does not own this row, so it is not
2751  // allowed to modify its values.
2752  return static_cast<LO> (0);
2753  }
2754  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2755  return this->transformGlobalValues (curRowVals.data (), graph,
2756  rowInfo, inputCols, inputVals,
2757  numInputEnt, f, atomic);
2758  }
2759 
2760  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2761  LocalOrdinal
2762  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2763  transformLocalValues (impl_scalar_type rowVals[],
2764  const crs_graph_type& graph,
2765  const RowInfo& rowInfo,
2766  const LocalOrdinal inds[],
2767  const impl_scalar_type newVals[],
2768  const LocalOrdinal numElts,
2769  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2770  const bool atomic)
2771  {
2772  typedef impl_scalar_type ST;
2773  typedef LocalOrdinal LO;
2774  typedef GlobalOrdinal GO;
2775 
2776  //if (newVals.extent (0) != inds.extent (0)) {
2777  // The sizes of the input arrays must match.
2778  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2779  //}
2780  //const LO numElts = static_cast<LO> (inds.extent (0));
2781  const bool sorted = graph.isSorted ();
2782 
2783  LO numValid = 0; // number of valid input column indices
2784  size_t hint = 0; // Guess for the current index k into rowVals
2785 
2786  if (graph.isLocallyIndexed ()) {
2787  // Get a view of the column indices in the row. This amortizes
2788  // the cost of getting the view over all the entries of inds.
2789  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2790 
2791  for (LO j = 0; j < numElts; ++j) {
2792  const LO lclColInd = inds[j];
2793  const size_t offset =
2794  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2795  lclColInd, hint, sorted);
2796  if (offset != rowInfo.numEntries) {
2797  if (atomic) {
2798  // NOTE (mfh 30 Nov 2015) The commented-out code is
2799  // wrong because another thread may have changed
2800  // rowVals[offset] between those two lines of code.
2801  //
2802  //const ST newVal = f (rowVals[offset], newVals[j]);
2803  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2804 
2805  volatile ST* const dest = &rowVals[offset];
2806  (void) atomic_binary_function_update (dest, newVals[j], f);
2807  }
2808  else {
2809  // use binary function f
2810  rowVals[offset] = f (rowVals[offset], newVals[j]);
2811  }
2812  hint = offset + 1;
2813  ++numValid;
2814  }
2815  }
2816  }
2817  else if (graph.isGloballyIndexed ()) {
2818  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2819  // pointer does NOT change its reference count. Thus, this
2820  // code is still thread safe.
2821  if (graph.colMap_.is_null ()) {
2822  // NO input column indices are valid in this case. Either
2823  // the column Map hasn't been set yet (so local indices
2824  // don't exist yet), or the calling process owns no graph
2825  // entries.
2826  return numValid;
2827  }
2828  const map_type& colMap = * (graph.colMap_);
2829  // Get a view of the column indices in the row. This amortizes
2830  // the cost of getting the view over all the entries of inds.
2831  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2832 
2833  const GO GINV = Teuchos::OrdinalTraits<GO>::invalid ();
2834  for (LO j = 0; j < numElts; ++j) {
2835  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2836  if (gblColInd != GINV) {
2837  const size_t offset =
2838  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2839  gblColInd, hint, sorted);
2840  if (offset != rowInfo.numEntries) {
2841  if (atomic) {
2842  // NOTE (mfh 30 Nov 2015) The commented-out code is
2843  // wrong because another thread may have changed
2844  // rowVals[offset] between those two lines of code.
2845  //
2846  //const ST newVal = f (rowVals[offset], newVals[j]);
2847  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2848 
2849  volatile ST* const dest = &rowVals[offset];
2850  (void) atomic_binary_function_update (dest, newVals[j], f);
2851  }
2852  else {
2853  // use binary function f
2854  rowVals[offset] = f (rowVals[offset], newVals[j]);
2855  }
2856  hint = offset + 1;
2857  numValid++;
2858  }
2859  }
2860  }
2861  }
2862  // If the graph is neither locally nor globally indexed on the
2863  // calling process, that means the calling process has no graph
2864  // entries. Thus, none of the input column indices are valid.
2865 
2866  return numValid;
2867  }
2868 
2869  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2870  LocalOrdinal
2871  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2872  transformGlobalValues (impl_scalar_type rowVals[],
2873  const crs_graph_type& graph,
2874  const RowInfo& rowInfo,
2875  const GlobalOrdinal inds[],
2876  const impl_scalar_type newVals[],
2877  const LocalOrdinal numElts,
2878  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2879  const bool atomic)
2880  {
2881  typedef impl_scalar_type ST;
2882  typedef LocalOrdinal LO;
2883  typedef GlobalOrdinal GO;
2884 
2885  //if (newVals.extent (0) != inds.extent (0)) {
2886  // The sizes of the input arrays must match.
2887  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2888  //}
2889  //const LO numElts = static_cast<LO> (inds.extent (0));
2890  const bool sorted = graph.isSorted ();
2891 
2892  LO numValid = 0; // number of valid input column indices
2893  size_t hint = 0; // Guess for the current index k into rowVals
2894 
2895  if (graph.isGloballyIndexed ()) {
2896  // Get a view of the column indices in the row. This amortizes
2897  // the cost of getting the view over all the entries of inds.
2898  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2899 
2900  for (LO j = 0; j < numElts; ++j) {
2901  const GO gblColInd = inds[j];
2902  const size_t offset =
2903  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2904  gblColInd, hint, sorted);
2905  if (offset != rowInfo.numEntries) {
2906  if (atomic) {
2907  // NOTE (mfh 30 Nov 2015) The commented-out code is
2908  // wrong because another thread may have changed
2909  // rowVals[offset] between those two lines of code.
2910  //
2911  //const ST newVal = f (rowVals[offset], newVals[j]);
2912  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2913 
2914  volatile ST* const dest = &rowVals[offset];
2915  (void) atomic_binary_function_update (dest, newVals[j], f);
2916  }
2917  else {
2918  // use binary function f
2919  rowVals[offset] = f (rowVals[offset], newVals[j]);
2920  }
2921  hint = offset + 1;
2922  ++numValid;
2923  }
2924  }
2925  }
2926  else if (graph.isLocallyIndexed ()) {
2927  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2928  // pointer does NOT change its reference count. Thus, this
2929  // code is still thread safe.
2930  if (graph.colMap_.is_null ()) {
2931  // NO input column indices are valid in this case. Either the
2932  // column Map hasn't been set yet (so local indices don't
2933  // exist yet), or the calling process owns no graph entries.
2934  return numValid;
2935  }
2936  const map_type& colMap = * (graph.colMap_);
2937  // Get a view of the column indices in the row. This amortizes
2938  // the cost of getting the view over all the entries of inds.
2939  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2940 
2941  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2942  for (LO j = 0; j < numElts; ++j) {
2943  const LO lclColInd = colMap.getLocalElement (inds[j]);
2944  if (lclColInd != LINV) {
2945  const size_t offset =
2946  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2947  lclColInd, hint, sorted);
2948  if (offset != rowInfo.numEntries) {
2949  if (atomic) {
2950  // NOTE (mfh 30 Nov 2015) The commented-out code is
2951  // wrong because another thread may have changed
2952  // rowVals[offset] between those two lines of code.
2953  //
2954  //const ST newVal = f (rowVals[offset], newVals[j]);
2955  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2956 
2957  volatile ST* const dest = &rowVals[offset];
2958  (void) atomic_binary_function_update (dest, newVals[j], f);
2959  }
2960  else {
2961  // use binary function f
2962  rowVals[offset] = f (rowVals[offset], newVals[j]);
2963  }
2964  hint = offset + 1;
2965  numValid++;
2966  }
2967  }
2968  }
2969  }
2970  // If the graph is neither locally nor globally indexed on the
2971  // calling process, that means the calling process has no graph
2972  // entries. Thus, none of the input column indices are valid.
2973 
2974  return numValid;
2975  }
2976 
2977  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2978  LocalOrdinal
2981  const crs_graph_type& graph,
2982  const RowInfo& rowInfo,
2983  const LocalOrdinal inds[],
2984  const impl_scalar_type newVals[],
2985  const LocalOrdinal numElts,
2986  const bool atomic)
2987  {
2988  typedef LocalOrdinal LO;
2989  typedef GlobalOrdinal GO;
2990 
2991  const bool sorted = graph.isSorted ();
2992 
2993  size_t hint = 0; // Guess for the current index k into rowVals
2994  LO numValid = 0; // number of valid local column indices
2995 
2996  if (graph.isLocallyIndexed ()) {
2997  // Get a view of the column indices in the row. This amortizes
2998  // the cost of getting the view over all the entries of inds.
2999  auto colInds = graph.getLocalIndsViewHost (rowInfo);
3000 
3001  for (LO j = 0; j < numElts; ++j) {
3002  const LO lclColInd = inds[j];
3003  const size_t offset =
3004  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3005  lclColInd, hint, sorted);
3006  if (offset != rowInfo.numEntries) {
3007  if (atomic) {
3008  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3009  }
3010  else {
3011  rowVals[offset] += newVals[j];
3012  }
3013  hint = offset + 1;
3014  ++numValid;
3015  }
3016  }
3017  }
3018  else if (graph.isGloballyIndexed ()) {
3019  if (graph.colMap_.is_null ()) {
3020  return Teuchos::OrdinalTraits<LO>::invalid ();
3021  }
3022  const map_type colMap = * (graph.colMap_);
3023 
3024  // Get a view of the column indices in the row. This amortizes
3025  // the cost of getting the view over all the entries of inds.
3026  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
3027 
3028  for (LO j = 0; j < numElts; ++j) {
3029  const GO gblColInd = colMap.getGlobalElement (inds[j]);
3030  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
3031  const size_t offset =
3032  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3033  gblColInd, hint, sorted);
3034  if (offset != rowInfo.numEntries) {
3035  if (atomic) {
3036  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3037  }
3038  else {
3039  rowVals[offset] += newVals[j];
3040  }
3041  hint = offset + 1;
3042  ++numValid;
3043  }
3044  }
3045  }
3046  }
3047  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
3048  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
3049  // to be neither locally nor globally indexed on a process.
3050  // This means that the graph or matrix has no entries on that
3051  // process. Epetra also works like this. It's related to lazy
3052  // allocation (on first insertion, not at graph / matrix
3053  // construction). Lazy allocation will go away because it is
3054  // not thread scalable.
3055 
3056  return numValid;
3057  }
3058 
3059  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3060  LocalOrdinal
3062  sumIntoLocalValues (const LocalOrdinal localRow,
3063  const Teuchos::ArrayView<const LocalOrdinal>& indices,
3064  const Teuchos::ArrayView<const Scalar>& values,
3065  const bool atomic)
3066  {
3067  using LO = local_ordinal_type;
3068  const LO numInputEnt = static_cast<LO>(indices.size());
3069  if (static_cast<LO>(values.size()) != numInputEnt) {
3070  return Teuchos::OrdinalTraits<LO>::invalid();
3071  }
3072  const LO* const inputInds = indices.getRawPtr();
3073  const scalar_type* const inputVals = values.getRawPtr();
3074  return this->sumIntoLocalValues(localRow, numInputEnt,
3075  inputVals, inputInds, atomic);
3076  }
3077 
3078  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3080  local_ordinal_type
3083  const local_ordinal_type localRow,
3084  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
3085  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
3086  const bool atomic)
3087  {
3088  using LO = local_ordinal_type;
3089  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
3090  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
3091  return Teuchos::OrdinalTraits<LO>::invalid();
3092  }
3093  const scalar_type* inVals =
3094  reinterpret_cast<const scalar_type*>(inputVals.data());
3095  return this->sumIntoLocalValues(localRow, numInputEnt, inVals,
3096  inputInds.data(), atomic);
3097  }
3098 
3099  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3100  LocalOrdinal
3102  sumIntoLocalValues (const LocalOrdinal localRow,
3103  const LocalOrdinal numEnt,
3104  const Scalar vals[],
3105  const LocalOrdinal cols[],
3106  const bool atomic)
3107  {
3108  typedef impl_scalar_type IST;
3109  typedef LocalOrdinal LO;
3110 
3111  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
3112  // Fill must be active and the "nonconst" graph must exist.
3113  return Teuchos::OrdinalTraits<LO>::invalid ();
3114  }
3115  const crs_graph_type& graph = * (this->staticGraph_);
3116  const RowInfo rowInfo = graph.getRowInfo (localRow);
3117 
3118  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
3119  // The calling process does not own this row, so it is not
3120  // allowed to modify its values.
3121  return static_cast<LO> (0);
3122  }
3123  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
3124  const IST* const inputVals = reinterpret_cast<const IST*> (vals);
3125  return this->sumIntoLocalValuesImpl (curRowVals.data (), graph, rowInfo,
3126  cols, inputVals, numEnt, atomic);
3127  }
3128 
3129  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3131  values_dualv_type::t_host::const_type
3133  getValuesViewHost (const RowInfo& rowinfo) const
3134  {
3135  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3136  return typename values_dualv_type::t_host::const_type ();
3137  else
3138  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3139  rowinfo.allocSize,
3140  Access::ReadOnly);
3141  }
3142 
3143  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3145  values_dualv_type::t_host
3148  {
3149  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3150  return typename values_dualv_type::t_host ();
3151  else
3152  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3153  rowinfo.allocSize,
3154  Access::ReadWrite);
3155  }
3156 
3157  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3159  values_dualv_type::t_dev::const_type
3161  getValuesViewDevice (const RowInfo& rowinfo) const
3162  {
3163  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3164  return typename values_dualv_type::t_dev::const_type ();
3165  else
3166  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3167  rowinfo.allocSize,
3168  Access::ReadOnly);
3169  }
3170 
3171  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3173  values_dualv_type::t_dev
3176  {
3177  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3178  return typename values_dualv_type::t_dev ();
3179  else
3180  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3181  rowinfo.allocSize,
3182  Access::ReadWrite);
3183  }
3184 
3185 
3186  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3187  void
3190  nonconst_local_inds_host_view_type &indices,
3191  nonconst_values_host_view_type &values,
3192  size_t& numEntries) const
3193  {
3194  using Teuchos::ArrayView;
3195  using Teuchos::av_reinterpret_cast;
3196  const char tfecfFuncName[] = "getLocalRowCopy: ";
3197 
3198  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3199  (! this->hasColMap (), std::runtime_error,
3200  "The matrix does not have a column Map yet. This means we don't have "
3201  "local indices for columns yet, so it doesn't make sense to call this "
3202  "method. If the matrix doesn't have a column Map yet, you should call "
3203  "fillComplete on it first.");
3204 
3205  const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3206  const size_t theNumEntries = rowinfo.numEntries;
3207  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3208  (static_cast<size_t> (indices.size ()) < theNumEntries ||
3209  static_cast<size_t> (values.size ()) < theNumEntries,
3210  std::runtime_error, "Row with local index " << localRow << " has " <<
3211  theNumEntries << " entry/ies, but indices.size() = " <<
3212  indices.size () << " and values.size() = " << values.size () << ".");
3213  numEntries = theNumEntries; // first side effect
3214 
3215  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3216  if (staticGraph_->isLocallyIndexed ()) {
3217  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3218  auto curVals = getValuesViewHost(rowinfo);
3219 
3220  for (size_t j = 0; j < theNumEntries; ++j) {
3221  values[j] = curVals[j];
3222  indices[j] = curLclInds(j);
3223  }
3224  }
3225  else if (staticGraph_->isGloballyIndexed ()) {
3226  // Don't call getColMap(), because it touches RCP's reference count.
3227  const map_type& colMap = * (staticGraph_->colMap_);
3228  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3229  auto curVals = getValuesViewHost(rowinfo);
3230 
3231  for (size_t j = 0; j < theNumEntries; ++j) {
3232  values[j] = curVals[j];
3233  indices[j] = colMap.getLocalElement (curGblInds(j));
3234  }
3235  }
3236  }
3237  }
3238 
3239 
3240 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3241 void
3244  nonconst_global_inds_host_view_type &indices,
3245  nonconst_values_host_view_type &values,
3246  size_t& numEntries) const
3247  {
3248  using Teuchos::ArrayView;
3249  using Teuchos::av_reinterpret_cast;
3250  const char tfecfFuncName[] = "getGlobalRowCopy: ";
3251 
3252  const RowInfo rowinfo =
3253  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3254  const size_t theNumEntries = rowinfo.numEntries;
3255  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3256  static_cast<size_t> (indices.size ()) < theNumEntries ||
3257  static_cast<size_t> (values.size ()) < theNumEntries,
3258  std::runtime_error, "Row with global index " << globalRow << " has "
3259  << theNumEntries << " entry/ies, but indices.size() = " <<
3260  indices.size () << " and values.size() = " << values.size () << ".");
3261  numEntries = theNumEntries; // first side effect
3262 
3263  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3264  if (staticGraph_->isLocallyIndexed ()) {
3265  const map_type& colMap = * (staticGraph_->colMap_);
3266  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3267  auto curVals = getValuesViewHost(rowinfo);
3268 
3269  for (size_t j = 0; j < theNumEntries; ++j) {
3270  values[j] = curVals[j];
3271  indices[j] = colMap.getGlobalElement (curLclInds(j));
3272  }
3273  }
3274  else if (staticGraph_->isGloballyIndexed ()) {
3275  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3276  auto curVals = getValuesViewHost(rowinfo);
3277 
3278  for (size_t j = 0; j < theNumEntries; ++j) {
3279  values[j] = curVals[j];
3280  indices[j] = curGblInds(j);
3281  }
3282  }
3283  }
3284  }
3285 
3286 
3287  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3288  void
3290  getLocalRowView(LocalOrdinal localRow,
3291  local_inds_host_view_type &indices,
3292  values_host_view_type &values) const
3293  {
3294  const char tfecfFuncName[] = "getLocalRowView: ";
3295 
3296  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3297  isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
3298  "its indices as global indices, so you cannot get a view with local "
3299  "column indices. If the matrix has a column Map, you may call "
3300  "getLocalRowCopy() to get local column indices; otherwise, you may get "
3301  "a view with global column indices by calling getGlobalRowCopy().");
3302 
3303  const RowInfo rowInfo = staticGraph_->getRowInfo (localRow);
3304  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3305  rowInfo.numEntries > 0) {
3306  indices = staticGraph_->lclIndsUnpacked_wdv.getHostSubview(
3307  rowInfo.offset1D,
3308  rowInfo.numEntries,
3309  Access::ReadOnly);
3310  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3311  rowInfo.numEntries,
3312  Access::ReadOnly);
3313  }
3314  else {
3315  // This does the right thing (reports an empty row) if the input
3316  // row is invalid.
3317  indices = local_inds_host_view_type();
3318  values = values_host_view_type();
3319  }
3320 
3321 #ifdef HAVE_TPETRA_DEBUG
3322  const char suffix[] = ". This should never happen. Please report this "
3323  "bug to the Tpetra developers.";
3324  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3325  (static_cast<size_t> (indices.size ()) !=
3326  static_cast<size_t> (values.size ()), std::logic_error,
3327  "At the end of this method, for local row " << localRow << ", "
3328  "indices.size() = " << indices.size () << " != values.size () = "
3329  << values.size () << suffix);
3330  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3331  (static_cast<size_t> (indices.size ()) !=
3332  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3333  "At the end of this method, for local row " << localRow << ", "
3334  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3335  << rowInfo.numEntries << suffix);
3336  const size_t expectedNumEntries = getNumEntriesInLocalRow (localRow);
3337  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3338  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3339  "of this method, for local row " << localRow << ", rowInfo.numEntries = "
3340  << rowInfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " <<
3341  expectedNumEntries << suffix);
3342 #endif // HAVE_TPETRA_DEBUG
3343  }
3344 
3345 
3346  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3347  void
3349  getGlobalRowView (GlobalOrdinal globalRow,
3350  global_inds_host_view_type &indices,
3351  values_host_view_type &values) const
3352  {
3353  const char tfecfFuncName[] = "getGlobalRowView: ";
3354 
3355  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3356  isLocallyIndexed (), std::runtime_error,
3357  "The matrix is locally indexed, so we cannot return a view of the row "
3358  "with global column indices. Use getGlobalRowCopy() instead.");
3359 
3360  // This does the right thing (reports an empty row) if the input
3361  // row is invalid.
3362  const RowInfo rowInfo =
3363  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3364  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3365  rowInfo.numEntries > 0) {
3366  indices = staticGraph_->gblInds_wdv.getHostSubview(rowInfo.offset1D,
3367  rowInfo.numEntries,
3368  Access::ReadOnly);
3369  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3370  rowInfo.numEntries,
3371  Access::ReadOnly);
3372  }
3373  else {
3374  indices = global_inds_host_view_type();
3375  values = values_host_view_type();
3376  }
3377 
3378 #ifdef HAVE_TPETRA_DEBUG
3379  const char suffix[] = ". This should never happen. Please report this "
3380  "bug to the Tpetra developers.";
3381  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3382  (static_cast<size_t> (indices.size ()) !=
3383  static_cast<size_t> (values.size ()), std::logic_error,
3384  "At the end of this method, for global row " << globalRow << ", "
3385  "indices.size() = " << indices.size () << " != values.size () = "
3386  << values.size () << suffix);
3387  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3388  (static_cast<size_t> (indices.size ()) !=
3389  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3390  "At the end of this method, for global row " << globalRow << ", "
3391  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3392  << rowInfo.numEntries << suffix);
3393  const size_t expectedNumEntries = getNumEntriesInGlobalRow (globalRow);
3394  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3395  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3396  "of this method, for global row " << globalRow << ", rowInfo.numEntries "
3397  "= " << rowInfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
3398  " " << expectedNumEntries << suffix);
3399 #endif // HAVE_TPETRA_DEBUG
3400  }
3401 
3402 
3403  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3404  void
3406  scale (const Scalar& alpha)
3407  {
3408  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3409 
3410  const size_t nlrs = staticGraph_->getLocalNumRows ();
3411  const size_t numEntries = staticGraph_->getLocalNumEntries ();
3412  if (! staticGraph_->indicesAreAllocated () ||
3413  nlrs == 0 || numEntries == 0) {
3414  // do nothing
3415  }
3416  else {
3417 
3418  auto vals = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
3419  KokkosBlas::scal(vals, theAlpha, vals);
3420 
3421  }
3422  }
3423 
3424  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3425  void
3427  setAllToScalar (const Scalar& alpha)
3428  {
3429  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3430 
3431  // replace all values in the matrix
3432  // it is easiest to replace all allocated values, instead of replacing only the ones with valid entries
3433  // however, if there are no valid entries, we can short-circuit
3434  // furthermore, if the values aren't allocated, we can short-circuit (no entry have been inserted so far)
3435  const size_t numEntries = staticGraph_->getLocalNumEntries();
3436  if (! staticGraph_->indicesAreAllocated () || numEntries == 0) {
3437  // do nothing
3438  }
3439  else {
3440  // DEEP_COPY REVIEW - VALUE-TO-DEVICE
3441  Kokkos::deep_copy (execution_space(), valuesUnpacked_wdv.getDeviceView(Access::OverwriteAll),
3442  theAlpha);
3443  // CAG: This fence was found to be required on Cuda with UVM=on.
3444  Kokkos::fence("CrsMatrix::setAllToScalar");
3445  }
3446  }
3447 
3448  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3449  void
3451  setAllValues (const typename local_graph_device_type::row_map_type& rowPointers,
3452  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
3453  const typename local_matrix_device_type::values_type& values)
3454  {
3455  using ProfilingRegion=Details::ProfilingRegion;
3456  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues");
3457  const char tfecfFuncName[] = "setAllValues: ";
3458  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3459  (columnIndices.size () != values.size (), std::invalid_argument,
3460  "columnIndices.size() = " << columnIndices.size () << " != values.size()"
3461  " = " << values.size () << ".");
3462  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3463  (myGraph_.is_null (), std::runtime_error, "myGraph_ must not be null.");
3464 
3465  try {
3466  myGraph_->setAllIndices (rowPointers, columnIndices);
3467  }
3468  catch (std::exception &e) {
3469  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3470  (true, std::runtime_error, "myGraph_->setAllIndices() threw an "
3471  "exception: " << e.what ());
3472  }
3473 
3474  // Make sure that myGraph_ now has a local graph. It may not be
3475  // fillComplete yet, so it's important to check. We don't care
3476  // whether setAllIndices() did a shallow copy or a deep copy, so a
3477  // good way to check is to compare dimensions.
3478  auto lclGraph = myGraph_->getLocalGraphDevice ();
3479  const size_t numEnt = lclGraph.entries.extent (0);
3480  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3481  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
3482  numEnt != static_cast<size_t> (columnIndices.extent (0)),
3483  std::logic_error, "myGraph_->setAllIndices() did not correctly create "
3484  "local graph. Please report this bug to the Tpetra developers.");
3485 
3486  valuesPacked_wdv = values_wdv_type(values);
3487  valuesUnpacked_wdv = valuesPacked_wdv;
3488 
3489  // Storage MUST be packed, since the interface doesn't give any
3490  // way to indicate any extra space at the end of each row.
3491  this->storageStatus_ = Details::STORAGE_1D_PACKED;
3492 
3493  checkInternalState ();
3494  }
3495 
3496  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3497  void
3499  setAllValues ( const local_matrix_device_type& localDeviceMatrix)
3500  {
3501  using ProfilingRegion=Details::ProfilingRegion;
3502  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix");
3503 
3504  auto graph = localDeviceMatrix.graph;
3505  //FIXME how to check whether graph is allocated
3506 
3507  auto rows = graph.row_map;
3508  auto columns = graph.entries;
3509  auto values = localDeviceMatrix.values;
3510 
3511  setAllValues(rows,columns,values);
3512  }
3513 
3514  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3515  void
3517  setAllValues (const Teuchos::ArrayRCP<size_t>& ptr,
3518  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
3519  const Teuchos::ArrayRCP<Scalar>& val)
3520  {
3521  using Kokkos::Compat::getKokkosViewDeepCopy;
3522  using Teuchos::ArrayRCP;
3523  using Teuchos::av_reinterpret_cast;
3524  typedef device_type DT;
3525  typedef impl_scalar_type IST;
3526  typedef typename local_graph_device_type::row_map_type row_map_type;
3527  //typedef typename row_map_type::non_const_value_type row_offset_type;
3528  const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
3529 
3530  // The row offset type may depend on the execution space. It may
3531  // not necessarily be size_t. If it's not, we need to make a deep
3532  // copy. We need to make a deep copy anyway so that Kokkos can
3533  // own the memory. Regardless, ptrIn gets the copy.
3534  typename row_map_type::non_const_type ptrNative ("ptr", ptr.size ());
3535  Kokkos::View<const size_t*,
3536  typename row_map_type::array_layout,
3537  Kokkos::HostSpace,
3538  Kokkos::MemoryUnmanaged> ptrSizeT (ptr.getRawPtr (), ptr.size ());
3539  ::Tpetra::Details::copyOffsets (ptrNative, ptrSizeT);
3540 
3541  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3542  (ptrNative.extent (0) != ptrSizeT.extent (0),
3543  std::logic_error, "ptrNative.extent(0) = " <<
3544  ptrNative.extent (0) << " != ptrSizeT.extent(0) = "
3545  << ptrSizeT.extent (0) << ". Please report this bug to the "
3546  "Tpetra developers.");
3547 
3548  auto indIn = getKokkosViewDeepCopy<DT> (ind ());
3549  auto valIn = getKokkosViewDeepCopy<DT> (av_reinterpret_cast<IST> (val ()));
3550  this->setAllValues (ptrNative, indIn, valIn);
3551  }
3552 
3553  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3554  void
3556  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
3557  {
3558  const char tfecfFuncName[] = "getLocalDiagOffsets: ";
3559  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3560  (staticGraph_.is_null (), std::runtime_error, "The matrix has no graph.");
3561 
3562  // mfh 11 May 2016: We plan to deprecate the ArrayRCP version of
3563  // this method in CrsGraph too, so don't call it (otherwise build
3564  // warnings will show up and annoy users). Instead, copy results
3565  // in and out, if the memory space requires it.
3566 
3567  const size_t lclNumRows = staticGraph_->getLocalNumRows ();
3568  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
3569  offsets.resize (lclNumRows);
3570  }
3571 
3572  // The input ArrayRCP must always be a host pointer. Thus, if
3573  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
3574  // to write to that allocation directly as a Kokkos::View.
3575  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
3576  // It is always syntactically correct to assign a raw host
3577  // pointer to a device View, so this code will compile correctly
3578  // even if this branch never runs.
3579  typedef Kokkos::View<size_t*, device_type,
3580  Kokkos::MemoryUnmanaged> output_type;
3581  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3582  staticGraph_->getLocalDiagOffsets (offsetsOut);
3583  }
3584  else {
3585  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
3586  staticGraph_->getLocalDiagOffsets (offsetsTmp);
3587  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
3588  Kokkos::MemoryUnmanaged> output_type;
3589  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3590  // DEEP_COPY REVIEW - DEVICE-TO-HOST
3591  Kokkos::deep_copy (execution_space(), offsetsOut, offsetsTmp);
3592  }
3593  }
3594 
3595  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3596  void
3599  {
3600  using Teuchos::ArrayRCP;
3601  using Teuchos::ArrayView;
3602  using Teuchos::av_reinterpret_cast;
3603  const char tfecfFuncName[] = "getLocalDiagCopy (1-arg): ";
3604  typedef local_ordinal_type LO;
3605 
3606 
3607  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3608  staticGraph_.is_null (), std::runtime_error,
3609  "This method requires that the matrix have a graph.");
3610  auto rowMapPtr = this->getRowMap ();
3611  if (rowMapPtr.is_null () || rowMapPtr->getComm ().is_null ()) {
3612  // Processes on which the row Map or its communicator is null
3613  // don't participate. Users shouldn't even call this method on
3614  // those processes.
3615  return;
3616  }
3617  auto colMapPtr = this->getColMap ();
3618  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3619  (! this->hasColMap () || colMapPtr.is_null (), std::runtime_error,
3620  "This method requires that the matrix have a column Map.");
3621  const map_type& rowMap = * rowMapPtr;
3622  const map_type& colMap = * colMapPtr;
3623  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3624 
3625 #ifdef HAVE_TPETRA_DEBUG
3626  // isCompatible() requires an all-reduce, and thus this check
3627  // should only be done in debug mode.
3628  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3629  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3630  "The input Vector's Map must be compatible with the CrsMatrix's row "
3631  "Map. You may check this by using Map's isCompatible method: "
3632  "diag.getMap ()->isCompatible (A.getRowMap ());");
3633 #endif // HAVE_TPETRA_DEBUG
3634 
3635  if (this->isFillComplete ()) {
3636  const auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
3637  // 1-D subview of the first (and only) column of D_lcl.
3638  const auto D_lcl_1d =
3639  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3640 
3641  const auto lclRowMap = rowMap.getLocalMap ();
3642  const auto lclColMap = colMap.getLocalMap ();
3644  (void) getDiagCopyWithoutOffsets (D_lcl_1d, lclRowMap,
3645  lclColMap,
3646  getLocalMatrixDevice ());
3647  }
3648  else {
3650  (void) getLocalDiagCopyWithoutOffsetsNotFillComplete (diag, *this);
3651  }
3652  }
3653 
3654  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3655  void
3658  const Kokkos::View<const size_t*, device_type,
3659  Kokkos::MemoryUnmanaged>& offsets) const
3660  {
3661  typedef LocalOrdinal LO;
3662 
3663 #ifdef HAVE_TPETRA_DEBUG
3664  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3665  const map_type& rowMap = * (this->getRowMap ());
3666  // isCompatible() requires an all-reduce, and thus this check
3667  // should only be done in debug mode.
3668  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3669  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3670  "The input Vector's Map must be compatible with (in the sense of Map::"
3671  "isCompatible) the CrsMatrix's row Map.");
3672 #endif // HAVE_TPETRA_DEBUG
3673 
3674  // For now, we fill the Vector on the host and sync to device.
3675  // Later, we may write a parallel kernel that works entirely on
3676  // device.
3677  //
3678  // NOTE (mfh 21 Jan 2016): The host kernel here assumes UVM. Once
3679  // we write a device kernel, it will not need to assume UVM.
3680 
3681  auto D_lcl = diag.getLocalViewDevice (Access::OverwriteAll);
3682  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3683  // Get 1-D subview of the first (and only) column of D_lcl.
3684  auto D_lcl_1d =
3685  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3686 
3687  KokkosSparse::getDiagCopy (D_lcl_1d, offsets,
3688  getLocalMatrixDevice ());
3689  }
3690 
3691  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3692  void
3695  const Teuchos::ArrayView<const size_t>& offsets) const
3696  {
3697  using LO = LocalOrdinal;
3698  using host_execution_space = Kokkos::DefaultHostExecutionSpace;
3699  using IST = impl_scalar_type;
3700 
3701 #ifdef HAVE_TPETRA_DEBUG
3702  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3703  const map_type& rowMap = * (this->getRowMap ());
3704  // isCompatible() requires an all-reduce, and thus this check
3705  // should only be done in debug mode.
3706  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3707  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3708  "The input Vector's Map must be compatible with (in the sense of Map::"
3709  "isCompatible) the CrsMatrix's row Map.");
3710 #endif // HAVE_TPETRA_DEBUG
3711 
3712  // See #1510. In case diag has already been marked modified on
3713  // device, we need to clear that flag, since the code below works
3714  // on host.
3715  //diag.clear_sync_state ();
3716 
3717  // For now, we fill the Vector on the host and sync to device.
3718  // Later, we may write a parallel kernel that works entirely on
3719  // device.
3720  auto lclVecHost = diag.getLocalViewHost(Access::OverwriteAll);
3721  // 1-D subview of the first (and only) column of lclVecHost.
3722  auto lclVecHost1d = Kokkos::subview (lclVecHost, Kokkos::ALL (), 0);
3723 
3724  using host_offsets_view_type =
3725  Kokkos::View<const size_t*, Kokkos::HostSpace,
3726  Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
3727  host_offsets_view_type h_offsets (offsets.getRawPtr (), offsets.size ());
3728  // Find the diagonal entries and put them in lclVecHost1d.
3729  using range_type = Kokkos::RangePolicy<host_execution_space, LO>;
3730  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3731  const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid ();
3732 
3733  auto rowPtrsPackedHost = staticGraph_->getRowPtrsPackedHost();
3734  auto valuesPackedHost = valuesPacked_wdv.getHostView(Access::ReadOnly);
3735  Kokkos::parallel_for
3736  ("Tpetra::CrsMatrix::getLocalDiagCopy",
3737  range_type (0, myNumRows),
3738  [&, INV, h_offsets] (const LO lclRow) { // Value capture is a workaround for cuda + gcc-7.2 compiler bug w/c++14
3739  lclVecHost1d(lclRow) = STS::zero (); // default value if no diag entry
3740  if (h_offsets[lclRow] != INV) {
3741  auto curRowOffset = rowPtrsPackedHost (lclRow);
3742  lclVecHost1d(lclRow) =
3743  static_cast<IST> (valuesPackedHost(curRowOffset+h_offsets[lclRow]));
3744  }
3745  });
3746  //diag.sync_device ();
3747  }
3748 
3749 
3750  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3751  void
3754  {
3755  using ::Tpetra::Details::ProfilingRegion;
3756  using Teuchos::ArrayRCP;
3757  using Teuchos::ArrayView;
3758  using Teuchos::null;
3759  using Teuchos::RCP;
3760  using Teuchos::rcp;
3761  using Teuchos::rcpFromRef;
3763  const char tfecfFuncName[] = "leftScale: ";
3764 
3765  ProfilingRegion region ("Tpetra::CrsMatrix::leftScale");
3766 
3767  RCP<const vec_type> xp;
3768  if (this->getRangeMap ()->isSameAs (* (x.getMap ()))) {
3769  // Take from Epetra: If we have a non-trivial exporter, we must
3770  // import elements that are permuted or are on other processors.
3771  auto exporter = this->getCrsGraphRef ().getExporter ();
3772  if (exporter.get () != nullptr) {
3773  RCP<vec_type> tempVec (new vec_type (this->getRowMap ()));
3774  tempVec->doImport (x, *exporter, REPLACE); // reverse mode
3775  xp = tempVec;
3776  }
3777  else {
3778  xp = rcpFromRef (x);
3779  }
3780  }
3781  else if (this->getRowMap ()->isSameAs (* (x.getMap ()))) {
3782  xp = rcpFromRef (x);
3783  }
3784  else {
3785  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3786  (true, std::invalid_argument, "x's Map must be the same as "
3787  "either the row Map or the range Map of the CrsMatrix.");
3788  }
3789 
3790  if (this->isFillComplete()) {
3791  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3792  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3794  leftScaleLocalCrsMatrix (getLocalMatrixDevice (),
3795  x_lcl_1d, false, false);
3796  }
3797  else {
3798  // 6/2020 Disallow leftScale of non-fillComplete matrices #7446
3799  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3800  (true, std::runtime_error, "CrsMatrix::leftScale requires matrix to be"
3801  " fillComplete");
3802  }
3803  }
3804 
3805  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3806  void
3809  {
3810  using ::Tpetra::Details::ProfilingRegion;
3811  using Teuchos::ArrayRCP;
3812  using Teuchos::ArrayView;
3813  using Teuchos::null;
3814  using Teuchos::RCP;
3815  using Teuchos::rcp;
3816  using Teuchos::rcpFromRef;
3818  const char tfecfFuncName[] = "rightScale: ";
3819 
3820  ProfilingRegion region ("Tpetra::CrsMatrix::rightScale");
3821 
3822  RCP<const vec_type> xp;
3823  if (this->getDomainMap ()->isSameAs (* (x.getMap ()))) {
3824  // Take from Epetra: If we have a non-trivial exporter, we must
3825  // import elements that are permuted or are on other processors.
3826  auto importer = this->getCrsGraphRef ().getImporter ();
3827  if (importer.get () != nullptr) {
3828  RCP<vec_type> tempVec (new vec_type (this->getColMap ()));
3829  tempVec->doImport (x, *importer, REPLACE);
3830  xp = tempVec;
3831  }
3832  else {
3833  xp = rcpFromRef (x);
3834  }
3835  }
3836  else if (this->getColMap ()->isSameAs (* (x.getMap ()))) {
3837  xp = rcpFromRef (x);
3838  } else {
3839  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3840  (true, std::runtime_error, "x's Map must be the same as "
3841  "either the domain Map or the column Map of the CrsMatrix.");
3842  }
3843 
3844  if (this->isFillComplete()) {
3845  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3846  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3848  rightScaleLocalCrsMatrix (getLocalMatrixDevice (),
3849  x_lcl_1d, false, false);
3850  }
3851  else {
3852  // 6/2020 Disallow rightScale of non-fillComplete matrices #7446
3853  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3854  (true, std::runtime_error, "CrsMatrix::rightScale requires matrix to be"
3855  " fillComplete");
3856  }
3857  }
3858 
3859  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3863  {
3864  using Teuchos::ArrayView;
3865  using Teuchos::outArg;
3866  using Teuchos::REDUCE_SUM;
3867  using Teuchos::reduceAll;
3868 
3869  // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
3870  // local part of this computation. It could make sense to put
3871  // this operation in the Kokkos::CrsMatrix.
3872 
3873  // check the cache first
3874  mag_type mySum = STM::zero ();
3875  if (getLocalNumEntries() > 0) {
3876  if (isStorageOptimized ()) {
3877  // "Optimized" storage is packed storage. That means we can
3878  // iterate in one pass through the 1-D values array.
3879  const size_t numEntries = getLocalNumEntries ();
3880  auto values = valuesPacked_wdv.getHostView(Access::ReadOnly);
3881  for (size_t k = 0; k < numEntries; ++k) {
3882  auto val = values[k];
3883  // Note (etp 06 Jan 2015) We need abs() here for composite types
3884  // (in general, if mag_type is on the left-hand-side, we need
3885  // abs() on the right-hand-side)
3886  const mag_type val_abs = STS::abs (val);
3887  mySum += val_abs * val_abs;
3888  }
3889  }
3890  else {
3891  const LocalOrdinal numRows =
3892  static_cast<LocalOrdinal> (this->getLocalNumRows ());
3893  for (LocalOrdinal r = 0; r < numRows; ++r) {
3894  const RowInfo rowInfo = myGraph_->getRowInfo (r);
3895  const size_t numEntries = rowInfo.numEntries;
3896  auto A_r = this->getValuesViewHost(rowInfo);
3897  for (size_t k = 0; k < numEntries; ++k) {
3898  const impl_scalar_type val = A_r[k];
3899  const mag_type val_abs = STS::abs (val);
3900  mySum += val_abs * val_abs;
3901  }
3902  }
3903  }
3904  }
3905  mag_type totalSum = STM::zero ();
3906  reduceAll<int, mag_type> (* (getComm ()), REDUCE_SUM,
3907  mySum, outArg (totalSum));
3908  return STM::sqrt (totalSum);
3909  }
3910 
3911  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3912  void
3914  replaceColMap (const Teuchos::RCP<const map_type>& newColMap)
3915  {
3916  const char tfecfFuncName[] = "replaceColMap: ";
3917  // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
3918  // Then replacing the column Map might mean that we need to
3919  // reindex the column indices.
3920  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3921  myGraph_.is_null (), std::runtime_error,
3922  "This method does not work if the matrix has a const graph. The whole "
3923  "idea of a const graph is that you are not allowed to change it, but "
3924  "this method necessarily must modify the graph, since the graph owns "
3925  "the matrix's column Map.");
3926  myGraph_->replaceColMap (newColMap);
3927  }
3928 
3929  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3930  void
3933  const Teuchos::RCP<const map_type>& newColMap,
3934  const Teuchos::RCP<const import_type>& newImport,
3935  const bool sortEachRow)
3936  {
3937  const char tfecfFuncName[] = "reindexColumns: ";
3938  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3939  graph == nullptr && myGraph_.is_null (), std::invalid_argument,
3940  "The input graph is null, but the matrix does not own its graph.");
3941 
3942  crs_graph_type& theGraph = (graph == nullptr) ? *myGraph_ : *graph;
3943  const bool sortGraph = false; // we'll sort graph & matrix together below
3944 
3945  theGraph.reindexColumns (newColMap, newImport, sortGraph);
3946 
3947  if (sortEachRow && theGraph.isLocallyIndexed () && ! theGraph.isSorted ()) {
3948  const LocalOrdinal lclNumRows =
3949  static_cast<LocalOrdinal> (theGraph.getLocalNumRows ());
3950 
3951  for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
3952 
3953  const RowInfo rowInfo = theGraph.getRowInfo (row);
3954  auto lclColInds = theGraph.getLocalIndsViewHostNonConst (rowInfo);
3955  auto vals = this->getValuesViewHostNonConst (rowInfo);
3956 
3957  sort2 (lclColInds.data (),
3958  lclColInds.data () + rowInfo.numEntries,
3959  vals.data ());
3960  }
3961  theGraph.indicesAreSorted_ = true;
3962  }
3963  }
3964 
3965  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3966  void
3968  replaceDomainMap (const Teuchos::RCP<const map_type>& newDomainMap)
3969  {
3970  const char tfecfFuncName[] = "replaceDomainMap: ";
3971  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3972  myGraph_.is_null (), std::runtime_error,
3973  "This method does not work if the matrix has a const graph. The whole "
3974  "idea of a const graph is that you are not allowed to change it, but this"
3975  " method necessarily must modify the graph, since the graph owns the "
3976  "matrix's domain Map and Import objects.");
3977  myGraph_->replaceDomainMap (newDomainMap);
3978  }
3979 
3980  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3981  void
3983  replaceDomainMapAndImporter (const Teuchos::RCP<const map_type>& newDomainMap,
3984  Teuchos::RCP<const import_type>& newImporter)
3985  {
3986  const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
3987  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3988  myGraph_.is_null (), std::runtime_error,
3989  "This method does not work if the matrix has a const graph. The whole "
3990  "idea of a const graph is that you are not allowed to change it, but this"
3991  " method necessarily must modify the graph, since the graph owns the "
3992  "matrix's domain Map and Import objects.");
3993  myGraph_->replaceDomainMapAndImporter (newDomainMap, newImporter);
3994  }
3995 
3996  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3997  void
3999  replaceRangeMap (const Teuchos::RCP<const map_type>& newRangeMap)
4000  {
4001  const char tfecfFuncName[] = "replaceRangeMap: ";
4002  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4003  myGraph_.is_null (), std::runtime_error,
4004  "This method does not work if the matrix has a const graph. The whole "
4005  "idea of a const graph is that you are not allowed to change it, but this"
4006  " method necessarily must modify the graph, since the graph owns the "
4007  "matrix's domain Map and Import objects.");
4008  myGraph_->replaceRangeMap (newRangeMap);
4009  }
4010 
4011  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4012  void
4014  replaceRangeMapAndExporter (const Teuchos::RCP<const map_type>& newRangeMap,
4015  Teuchos::RCP<const export_type>& newExporter)
4016  {
4017  const char tfecfFuncName[] = "replaceRangeMapAndExporter: ";
4018  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4019  myGraph_.is_null (), std::runtime_error,
4020  "This method does not work if the matrix has a const graph. The whole "
4021  "idea of a const graph is that you are not allowed to change it, but this"
4022  " method necessarily must modify the graph, since the graph owns the "
4023  "matrix's domain Map and Import objects.");
4024  myGraph_->replaceRangeMapAndExporter (newRangeMap, newExporter);
4025  }
4026 
4027  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4028  void
4030  insertNonownedGlobalValues (const GlobalOrdinal globalRow,
4031  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
4032  const Teuchos::ArrayView<const Scalar>& values)
4033  {
4034  using Teuchos::Array;
4035  typedef GlobalOrdinal GO;
4036  typedef typename Array<GO>::size_type size_type;
4037 
4038  const size_type numToInsert = indices.size ();
4039  // Add the new data to the list of nonlocals.
4040  // This creates the arrays if they don't exist yet.
4041  std::pair<Array<GO>, Array<Scalar> >& curRow = nonlocals_[globalRow];
4042  Array<GO>& curRowInds = curRow.first;
4043  Array<Scalar>& curRowVals = curRow.second;
4044  const size_type newCapacity = curRowInds.size () + numToInsert;
4045  curRowInds.reserve (newCapacity);
4046  curRowVals.reserve (newCapacity);
4047  for (size_type k = 0; k < numToInsert; ++k) {
4048  curRowInds.push_back (indices[k]);
4049  curRowVals.push_back (values[k]);
4050  }
4051  }
4052 
4053  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4054  void
4057  {
4058  using Details::Behavior;
4060  using Teuchos::Comm;
4061  using Teuchos::outArg;
4062  using Teuchos::RCP;
4063  using Teuchos::rcp;
4064  using Teuchos::REDUCE_MAX;
4065  using Teuchos::REDUCE_MIN;
4066  using Teuchos::reduceAll;
4067  using std::endl;
4069  //typedef LocalOrdinal LO;
4070  typedef GlobalOrdinal GO;
4071  typedef typename Teuchos::Array<GO>::size_type size_type;
4072  const char tfecfFuncName[] = "globalAssemble: "; // for exception macro
4073  ProfilingRegion regionGlobalAssemble ("Tpetra::CrsMatrix::globalAssemble");
4074 
4075  const bool verbose = Behavior::verbose("CrsMatrix");
4076  std::unique_ptr<std::string> prefix;
4077  if (verbose) {
4078  prefix = this->createPrefix("CrsMatrix", "globalAssemble");
4079  std::ostringstream os;
4080  os << *prefix << "nonlocals_.size()=" << nonlocals_.size()
4081  << endl;
4082  std::cerr << os.str();
4083  }
4084  RCP<const Comm<int> > comm = getComm ();
4085 
4086  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4087  (! isFillActive (), std::runtime_error, "Fill must be active before "
4088  "you may call this method.");
4089 
4090  const size_t myNumNonlocalRows = nonlocals_.size ();
4091 
4092  // If no processes have nonlocal rows, then we don't have to do
4093  // anything. Checking this is probably cheaper than constructing
4094  // the Map of nonlocal rows (see below) and noticing that it has
4095  // zero global entries.
4096  {
4097  const int iHaveNonlocalRows = (myNumNonlocalRows == 0) ? 0 : 1;
4098  int someoneHasNonlocalRows = 0;
4099  reduceAll<int, int> (*comm, REDUCE_MAX, iHaveNonlocalRows,
4100  outArg (someoneHasNonlocalRows));
4101  if (someoneHasNonlocalRows == 0) {
4102  return; // no process has nonlocal rows, so nothing to do
4103  }
4104  }
4105 
4106  // 1. Create a list of the "nonlocal" rows on each process. this
4107  // requires iterating over nonlocals_, so while we do this,
4108  // deduplicate the entries and get a count for each nonlocal
4109  // row on this process.
4110  // 2. Construct a new row Map corresponding to those rows. This
4111  // Map is likely overlapping. We know that the Map is not
4112  // empty on all processes, because the above all-reduce and
4113  // return exclude that case.
4114 
4115  RCP<const map_type> nonlocalRowMap;
4116  Teuchos::Array<size_t> numEntPerNonlocalRow (myNumNonlocalRows);
4117  {
4118  Teuchos::Array<GO> myNonlocalGblRows (myNumNonlocalRows);
4119  size_type curPos = 0;
4120  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4121  ++mapIter, ++curPos) {
4122  myNonlocalGblRows[curPos] = mapIter->first;
4123  // Get the values and column indices by reference, since we
4124  // intend to change them in place (that's what "erase" does).
4125  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4126  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4127 
4128  // Sort both arrays jointly, using the column indices as keys,
4129  // then merge them jointly. "Merge" here adds values
4130  // corresponding to the same column indices. The first 2 args
4131  // of merge2 are output arguments that work just like the
4132  // return value of std::unique.
4133  sort2 (gblCols.begin (), gblCols.end (), vals.begin ());
4134  typename Teuchos::Array<GO>::iterator gblCols_newEnd;
4135  typename Teuchos::Array<Scalar>::iterator vals_newEnd;
4136  merge2 (gblCols_newEnd, vals_newEnd,
4137  gblCols.begin (), gblCols.end (),
4138  vals.begin (), vals.end ());
4139  gblCols.erase (gblCols_newEnd, gblCols.end ());
4140  vals.erase (vals_newEnd, vals.end ());
4141  numEntPerNonlocalRow[curPos] = gblCols.size ();
4142  }
4143 
4144  // Currently, Map requires that its indexBase be the global min
4145  // of all its global indices. Map won't compute this for us, so
4146  // we must do it. If our process has no nonlocal rows, set the
4147  // "min" to the max possible GO value. This ensures that if
4148  // some process has at least one nonlocal row, then it will pick
4149  // that up as the min. We know that at least one process has a
4150  // nonlocal row, since the all-reduce and return at the top of
4151  // this method excluded that case.
4152  GO myMinNonlocalGblRow = std::numeric_limits<GO>::max ();
4153  {
4154  auto iter = std::min_element (myNonlocalGblRows.begin (),
4155  myNonlocalGblRows.end ());
4156  if (iter != myNonlocalGblRows.end ()) {
4157  myMinNonlocalGblRow = *iter;
4158  }
4159  }
4160  GO gblMinNonlocalGblRow = 0;
4161  reduceAll<int, GO> (*comm, REDUCE_MIN, myMinNonlocalGblRow,
4162  outArg (gblMinNonlocalGblRow));
4163  const GO indexBase = gblMinNonlocalGblRow;
4164  const global_size_t INV = Teuchos::OrdinalTraits<global_size_t>::invalid ();
4165  nonlocalRowMap = rcp (new map_type (INV, myNonlocalGblRows (), indexBase, comm));
4166  }
4167 
4168  // 3. Use the values and column indices for each nonlocal row, as
4169  // stored in nonlocals_, to construct a CrsMatrix corresponding
4170  // to nonlocal rows. We have
4171  // exact counts of the number of entries in each nonlocal row.
4172 
4173  if (verbose) {
4174  std::ostringstream os;
4175  os << *prefix << "Create nonlocal matrix" << endl;
4176  std::cerr << os.str();
4177  }
4178  RCP<crs_matrix_type> nonlocalMatrix =
4179  rcp (new crs_matrix_type (nonlocalRowMap, numEntPerNonlocalRow ()));
4180  {
4181  size_type curPos = 0;
4182  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4183  ++mapIter, ++curPos) {
4184  const GO gblRow = mapIter->first;
4185  // Get values & column indices by ref, just to avoid copy.
4186  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4187  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4188  //const LO numEnt = static_cast<LO> (numEntPerNonlocalRow[curPos]);
4189  nonlocalMatrix->insertGlobalValues (gblRow, gblCols (), vals ());
4190  }
4191  }
4192  // There's no need to fill-complete the nonlocals matrix.
4193  // We just use it as a temporary container for the Export.
4194 
4195  // 4. If the original row Map is one to one, then we can Export
4196  // directly from nonlocalMatrix into this. Otherwise, we have
4197  // to create a temporary matrix with a one-to-one row Map,
4198  // Export into that, then Import from the temporary matrix into
4199  // *this.
4200 
4201  auto origRowMap = this->getRowMap ();
4202  const bool origRowMapIsOneToOne = origRowMap->isOneToOne ();
4203 
4204  int isLocallyComplete = 1; // true by default
4205 
4206  if (origRowMapIsOneToOne) {
4207  if (verbose) {
4208  std::ostringstream os;
4209  os << *prefix << "Original row Map is 1-to-1" << endl;
4210  std::cerr << os.str();
4211  }
4212  export_type exportToOrig (nonlocalRowMap, origRowMap);
4213  if (! exportToOrig.isLocallyComplete ()) {
4214  isLocallyComplete = 0;
4215  }
4216  if (verbose) {
4217  std::ostringstream os;
4218  os << *prefix << "doExport from nonlocalMatrix" << endl;
4219  std::cerr << os.str();
4220  }
4221  this->doExport (*nonlocalMatrix, exportToOrig, Tpetra::ADD);
4222  // We're done at this point!
4223  }
4224  else {
4225  if (verbose) {
4226  std::ostringstream os;
4227  os << *prefix << "Original row Map is NOT 1-to-1" << endl;
4228  std::cerr << os.str();
4229  }
4230  // If you ask a Map whether it is one to one, it does some
4231  // communication and stashes intermediate results for later use
4232  // by createOneToOne. Thus, calling createOneToOne doesn't cost
4233  // much more then the original cost of calling isOneToOne.
4234  auto oneToOneRowMap = Tpetra::createOneToOne (origRowMap);
4235  export_type exportToOneToOne (nonlocalRowMap, oneToOneRowMap);
4236  if (! exportToOneToOne.isLocallyComplete ()) {
4237  isLocallyComplete = 0;
4238  }
4239 
4240  // Create a temporary matrix with the one-to-one row Map.
4241  //
4242  // TODO (mfh 09 Sep 2016, 12 Sep 2016) Estimate # entries in
4243  // each row, to avoid reallocation during the Export operation.
4244  if (verbose) {
4245  std::ostringstream os;
4246  os << *prefix << "Create & doExport into 1-to-1 matrix"
4247  << endl;
4248  std::cerr << os.str();
4249  }
4250  crs_matrix_type oneToOneMatrix (oneToOneRowMap, 0);
4251  // Export from matrix of nonlocals into the temp one-to-one matrix.
4252  oneToOneMatrix.doExport(*nonlocalMatrix, exportToOneToOne,
4253  Tpetra::ADD);
4254 
4255  // We don't need the matrix of nonlocals anymore, so get rid of
4256  // it, to keep the memory high-water mark down.
4257  if (verbose) {
4258  std::ostringstream os;
4259  os << *prefix << "Free nonlocalMatrix" << endl;
4260  std::cerr << os.str();
4261  }
4262  nonlocalMatrix = Teuchos::null;
4263 
4264  // Import from the one-to-one matrix to the original matrix.
4265  if (verbose) {
4266  std::ostringstream os;
4267  os << *prefix << "doImport from 1-to-1 matrix" << endl;
4268  std::cerr << os.str();
4269  }
4270  import_type importToOrig (oneToOneRowMap, origRowMap);
4271  this->doImport (oneToOneMatrix, importToOrig, Tpetra::ADD);
4272  }
4273 
4274  // It's safe now to clear out nonlocals_, since we've already
4275  // committed side effects to *this. The standard idiom for
4276  // clearing a Container like std::map, is to swap it with an empty
4277  // Container and let the swapped Container fall out of scope.
4278  if (verbose) {
4279  std::ostringstream os;
4280  os << *prefix << "Free nonlocals_ (std::map)" << endl;
4281  std::cerr << os.str();
4282  }
4283  decltype (nonlocals_) newNonlocals;
4284  std::swap (nonlocals_, newNonlocals);
4285 
4286  // FIXME (mfh 12 Sep 2016) I don't like this all-reduce, and I
4287  // don't like throwing an exception here. A local return value
4288  // would likely be more useful to users. However, if users find
4289  // themselves exercising nonlocal inserts often, then they are
4290  // probably novice users who need the help. See Gibhub Issues
4291  // #603 and #601 (esp. the latter) for discussion.
4292 
4293  int isGloballyComplete = 0; // output argument of reduceAll
4294  reduceAll<int, int> (*comm, REDUCE_MIN, isLocallyComplete,
4295  outArg (isGloballyComplete));
4296  TEUCHOS_TEST_FOR_EXCEPTION
4297  (isGloballyComplete != 1, std::runtime_error, "On at least one process, "
4298  "you called insertGlobalValues with a global row index which is not in "
4299  "the matrix's row Map on any process in its communicator.");
4300  }
4301 
4302  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4303  void
4305  resumeFill (const Teuchos::RCP<Teuchos::ParameterList>& params)
4306  {
4307  if (! isStaticGraph ()) { // Don't resume fill of a nonowned graph.
4308  myGraph_->resumeFill (params);
4309  }
4310 #if KOKKOSKERNELS_VERSION >= 40299
4311  // Delete the apply helper (if it exists)
4312  applyHelper.reset();
4313 #endif
4314  fillComplete_ = false;
4315  }
4316 
4317  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4318  bool
4321  return getCrsGraphRef ().haveGlobalConstants ();
4322  }
4323 
4324  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4325  void
4327  fillComplete (const Teuchos::RCP<Teuchos::ParameterList>& params)
4328  {
4329  const char tfecfFuncName[] = "fillComplete(params): ";
4330 
4331  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4332  (this->getCrsGraph ().is_null (), std::logic_error,
4333  "getCrsGraph() returns null. This should not happen at this point. "
4334  "Please report this bug to the Tpetra developers.");
4335 
4336  const crs_graph_type& graph = this->getCrsGraphRef ();
4337  if (this->isStaticGraph () && graph.isFillComplete ()) {
4338  // If this matrix's graph is fill complete and the user did not
4339  // supply a domain or range Map, use the graph's domain and
4340  // range Maps.
4341  this->fillComplete (graph.getDomainMap (), graph.getRangeMap (), params);
4342  }
4343  else { // assume that user's row Map is the domain and range Map
4344  Teuchos::RCP<const map_type> rangeMap = graph.getRowMap ();
4345  Teuchos::RCP<const map_type> domainMap = rangeMap;
4346  this->fillComplete (domainMap, rangeMap, params);
4347  }
4348  }
4349 
4350  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4351  void
4353  fillComplete (const Teuchos::RCP<const map_type>& domainMap,
4354  const Teuchos::RCP<const map_type>& rangeMap,
4355  const Teuchos::RCP<Teuchos::ParameterList>& params)
4356  {
4357  using Details::Behavior;
4359  using Teuchos::ArrayRCP;
4360  using Teuchos::RCP;
4361  using Teuchos::rcp;
4362  using std::endl;
4363  const char tfecfFuncName[] = "fillComplete: ";
4364  ProfilingRegion regionFillComplete
4365  ("Tpetra::CrsMatrix::fillComplete");
4366  const bool verbose = Behavior::verbose("CrsMatrix");
4367  std::unique_ptr<std::string> prefix;
4368  if (verbose) {
4369  prefix = this->createPrefix("CrsMatrix", "fillComplete(dom,ran,p)");
4370  std::ostringstream os;
4371  os << *prefix << endl;
4372  std::cerr << os.str ();
4373  }
4374  Details::ProfilingRegion region(
4375  "Tpetra::CrsMatrix::fillCompete",
4376  "fillCompete");
4377 
4378  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4379  (! this->isFillActive () || this->isFillComplete (), std::runtime_error,
4380  "Matrix fill state must be active (isFillActive() "
4381  "must be true) before you may call fillComplete().");
4382  const int numProcs = this->getComm ()->getSize ();
4383 
4384  //
4385  // Read parameters from the input ParameterList.
4386  //
4387  {
4388  Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");
4389 
4390  // If true, the caller promises that no process did nonlocal
4391  // changes since the last call to fillComplete.
4392  bool assertNoNonlocalInserts = false;
4393  // If true, makeColMap sorts remote GIDs (within each remote
4394  // process' group).
4395  bool sortGhosts = true;
4396 
4397  if (! params.is_null ()) {
4398  assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
4399  assertNoNonlocalInserts);
4400  if (params->isParameter ("sort column map ghost gids")) {
4401  sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
4402  }
4403  else if (params->isParameter ("Sort column Map ghost GIDs")) {
4404  sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
4405  }
4406  }
4407  // We also don't need to do global assembly if there is only one
4408  // process in the communicator.
4409  const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
4410  // This parameter only matters if this matrix owns its graph.
4411  if (! this->myGraph_.is_null ()) {
4412  this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
4413  }
4414 
4415  if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
4416  if (this->hasColMap ()) { // use local indices
4417  allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
4418  }
4419  else { // no column Map, so use global indices
4420  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
4421  }
4422  }
4423  // Global assemble, if we need to. This call only costs a single
4424  // all-reduce if we didn't need global assembly after all.
4425  if (needGlobalAssemble) {
4426  this->globalAssemble ();
4427  }
4428  else {
4429  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4430  (numProcs == 1 && nonlocals_.size() > 0,
4431  std::runtime_error, "Cannot have nonlocal entries on a serial run. "
4432  "An invalid entry (i.e., with row index not in the row Map) must have "
4433  "been submitted to the CrsMatrix.");
4434  }
4435  }
4436  if (this->isStaticGraph ()) {
4437  Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
4438  // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
4439  // checks below only in debug mode. It would be nicer to do a
4440  // local check, then propagate the error state in a deferred
4441  // way, whenever communication happens. That would reduce the
4442  // cost of checking, to the point where it may make sense to
4443  // enable it even in release mode.
4444 #ifdef HAVE_TPETRA_DEBUG
4445  // FIXME (mfh 18 Jun 2014) This check for correctness of the
4446  // input Maps incurs a penalty of two all-reduces for the
4447  // otherwise optimal const graph case.
4448  //
4449  // We could turn these (max) 2 all-reduces into (max) 1, by
4450  // fusing them. We could do this by adding a "locallySameAs"
4451  // method to Map, which would return one of four states:
4452  //
4453  // a. Certainly globally the same
4454  // b. Certainly globally not the same
4455  // c. Locally the same
4456  // d. Locally not the same
4457  //
4458  // The first two states don't require further communication.
4459  // The latter two states require an all-reduce to communicate
4460  // globally, but we only need one all-reduce, since we only need
4461  // to check whether at least one of the Maps is wrong.
4462  const bool domainMapsMatch =
4463  this->staticGraph_->getDomainMap ()->isSameAs (*domainMap);
4464  const bool rangeMapsMatch =
4465  this->staticGraph_->getRangeMap ()->isSameAs (*rangeMap);
4466 
4467  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4468  (! domainMapsMatch, std::runtime_error,
4469  "The CrsMatrix's domain Map does not match the graph's domain Map. "
4470  "The graph cannot be changed because it was given to the CrsMatrix "
4471  "constructor as const. You can fix this by passing in the graph's "
4472  "domain Map and range Map to the matrix's fillComplete call.");
4473 
4474  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4475  (! rangeMapsMatch, std::runtime_error,
4476  "The CrsMatrix's range Map does not match the graph's range Map. "
4477  "The graph cannot be changed because it was given to the CrsMatrix "
4478  "constructor as const. You can fix this by passing in the graph's "
4479  "domain Map and range Map to the matrix's fillComplete call.");
4480 #endif // HAVE_TPETRA_DEBUG
4481 
4482  // The matrix does _not_ own the graph, and the graph's
4483  // structure is already fixed, so just fill the local matrix.
4484  this->fillLocalMatrix (params);
4485  }
4486  else {
4487  Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
4488  // Set the graph's domain and range Maps. This will clear the
4489  // Import if the domain Map has changed (is a different
4490  // pointer), and the Export if the range Map has changed (is a
4491  // different pointer).
4492  this->myGraph_->setDomainRangeMaps (domainMap, rangeMap);
4493 
4494  // Make the graph's column Map, if necessary.
4495  Teuchos::Array<int> remotePIDs (0);
4496  const bool mustBuildColMap = ! this->hasColMap ();
4497  if (mustBuildColMap) {
4498  this->myGraph_->makeColMap (remotePIDs);
4499  }
4500 
4501  // Make indices local, if necessary. The method won't do
4502  // anything if the graph is already locally indexed.
4503  const std::pair<size_t, std::string> makeIndicesLocalResult =
4504  this->myGraph_->makeIndicesLocal(verbose);
4505  // TODO (mfh 20 Jul 2017) Instead of throwing here, pass along
4506  // the error state to makeImportExport
4507  // which may do all-reduces and thus may
4508  // have the opportunity to communicate that error state.
4509  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4510  (makeIndicesLocalResult.first != 0, std::runtime_error,
4511  makeIndicesLocalResult.second);
4512 
4513  const bool sorted = this->myGraph_->isSorted ();
4514  const bool merged = this->myGraph_->isMerged ();
4515  this->sortAndMergeIndicesAndValues (sorted, merged);
4516 
4517  // Make Import and Export objects, if they haven't been made
4518  // already. If we made a column Map above, reuse information
4519  // from that process to avoid communiation in the Import setup.
4520  this->myGraph_->makeImportExport (remotePIDs, mustBuildColMap);
4521 
4522  // The matrix _does_ own the graph, so fill the local graph at
4523  // the same time as the local matrix.
4524  this->fillLocalGraphAndMatrix (params);
4525 
4526  const bool callGraphComputeGlobalConstants = params.get () == nullptr ||
4527  params->get ("compute global constants", true);
4528  if (callGraphComputeGlobalConstants) {
4529  this->myGraph_->computeGlobalConstants ();
4530  }
4531  else {
4532  this->myGraph_->computeLocalConstants ();
4533  }
4534  this->myGraph_->fillComplete_ = true;
4535  this->myGraph_->checkInternalState ();
4536  }
4537 
4538  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4539 
4540  this->fillComplete_ = true; // Now we're fill complete!
4541  {
4542  Details::ProfilingRegion region_cis(
4543  "Tpetra::CrsMatrix::fillCompete", "checkInternalState"
4544  );
4545  this->checkInternalState ();
4546  }
4547  } //fillComplete(domainMap, rangeMap, params)
4548 
4549  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4550  void
4552  expertStaticFillComplete (const Teuchos::RCP<const map_type> & domainMap,
4553  const Teuchos::RCP<const map_type> & rangeMap,
4554  const Teuchos::RCP<const import_type>& importer,
4555  const Teuchos::RCP<const export_type>& exporter,
4556  const Teuchos::RCP<Teuchos::ParameterList> &params)
4557  {
4558 #ifdef HAVE_TPETRA_MMM_TIMINGS
4559  std::string label;
4560  if(!params.is_null())
4561  label = params->get("Timer Label",label);
4562  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
4563  using Teuchos::TimeMonitor;
4564 
4565  Teuchos::TimeMonitor all(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-all")));
4566 #endif
4567 
4568  const char tfecfFuncName[] = "expertStaticFillComplete: ";
4569  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( ! isFillActive() || isFillComplete(),
4570  std::runtime_error, "Matrix fill state must be active (isFillActive() "
4571  "must be true) before calling fillComplete().");
4572  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4573  myGraph_.is_null (), std::logic_error, "myGraph_ is null. This is not allowed.");
4574 
4575  {
4576 #ifdef HAVE_TPETRA_MMM_TIMINGS
4577  Teuchos::TimeMonitor graph(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-Graph")));
4578 #endif
4579  // We will presume globalAssemble is not needed, so we do the ESFC on the graph
4580  myGraph_->expertStaticFillComplete (domainMap, rangeMap, importer, exporter,params);
4581  }
4582 
4583  {
4584 #ifdef HAVE_TPETRA_MMM_TIMINGS
4585  TimeMonitor fLGAM(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-fLGAM")));
4586 #endif
4587  // Fill the local graph and matrix
4588  fillLocalGraphAndMatrix (params);
4589  }
4590  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4591 
4592  // Now we're fill complete!
4593  fillComplete_ = true;
4594 
4595  // Sanity checks at the end.
4596 #ifdef HAVE_TPETRA_DEBUG
4597  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
4598  ": We're at the end of fillComplete(), but isFillActive() is true. "
4599  "Please report this bug to the Tpetra developers.");
4600  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete(), std::logic_error,
4601  ": We're at the end of fillComplete(), but isFillActive() is true. "
4602  "Please report this bug to the Tpetra developers.");
4603 #endif // HAVE_TPETRA_DEBUG
4604  {
4605 #ifdef HAVE_TPETRA_MMM_TIMINGS
4606  Teuchos::TimeMonitor cIS(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-M-cIS")));
4607 #endif
4608 
4609  checkInternalState();
4610  }
4611  }
4612 
4613  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4615  mergeRowIndicesAndValues (size_t rowLen, LocalOrdinal* cols, impl_scalar_type* vals)
4616  {
4617  impl_scalar_type* rowValueIter = vals;
4618  // beg,end define a half-exclusive interval over which to iterate.
4619  LocalOrdinal* beg = cols;
4620  LocalOrdinal* end = cols + rowLen;
4621  LocalOrdinal* newend = beg;
4622  if (beg != end) {
4623  LocalOrdinal* cur = beg + 1;
4624  impl_scalar_type* vcur = rowValueIter + 1;
4625  impl_scalar_type* vend = rowValueIter;
4626  cur = beg+1;
4627  while (cur != end) {
4628  if (*cur != *newend) {
4629  // new entry; save it
4630  ++newend;
4631  ++vend;
4632  (*newend) = (*cur);
4633  (*vend) = (*vcur);
4634  }
4635  else {
4636  // old entry; merge it
4637  //(*vend) = f (*vend, *vcur);
4638  (*vend) += *vcur;
4639  }
4640  ++cur;
4641  ++vcur;
4642  }
4643  ++newend; // one past the last entry, per typical [beg,end) semantics
4644  }
4645  return newend - beg;
4646  }
4647 
4648  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4649  void
4651  sortAndMergeIndicesAndValues (const bool sorted, const bool merged)
4652  {
4653  using ::Tpetra::Details::ProfilingRegion;
4654  typedef LocalOrdinal LO;
4655  typedef typename Kokkos::View<LO*, device_type>::HostMirror::execution_space
4656  host_execution_space;
4657  typedef Kokkos::RangePolicy<host_execution_space, LO> range_type;
4658  const char tfecfFuncName[] = "sortAndMergeIndicesAndValues: ";
4659  ProfilingRegion regionSAM ("Tpetra::CrsMatrix::sortAndMergeIndicesAndValues");
4660 
4661  if (! sorted || ! merged) {
4662  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4663  (this->isStaticGraph (), std::runtime_error, "Cannot sort or merge with "
4664  "\"static\" (const) graph, since the matrix does not own the graph.");
4665  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4666  (this->myGraph_.is_null (), std::logic_error, "myGraph_ is null, but "
4667  "this matrix claims ! isStaticGraph(). "
4668  "Please report this bug to the Tpetra developers.");
4669  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4670  (this->isStorageOptimized (), std::logic_error, "It is invalid to call "
4671  "this method if the graph's storage has already been optimized. "
4672  "Please report this bug to the Tpetra developers.");
4673 
4674  crs_graph_type& graph = * (this->myGraph_);
4675  const LO lclNumRows = static_cast<LO> (this->getLocalNumRows ());
4676  size_t totalNumDups = 0;
4677  {
4678  //Accessing host unpacked (4-array CRS) local matrix.
4679  auto rowBegins_ = graph.getRowPtrsUnpackedHost();
4680  auto rowLengths_ = graph.k_numRowEntries_;
4681  auto vals_ = this->valuesUnpacked_wdv.getHostView(Access::ReadWrite);
4682  auto cols_ = graph.lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
4683  Kokkos::parallel_reduce ("sortAndMergeIndicesAndValues", range_type (0, lclNumRows),
4684  [=] (const LO lclRow, size_t& numDups) {
4685  size_t rowBegin = rowBegins_(lclRow);
4686  size_t rowLen = rowLengths_(lclRow);
4687  LO* cols = cols_.data() + rowBegin;
4688  impl_scalar_type* vals = vals_.data() + rowBegin;
4689  if (! sorted) {
4690  sort2 (cols, cols + rowLen, vals);
4691  }
4692  if (! merged) {
4693  size_t newRowLength = mergeRowIndicesAndValues (rowLen, cols, vals);
4694  rowLengths_(lclRow) = newRowLength;
4695  numDups += rowLen - newRowLength;
4696  }
4697  }, totalNumDups);
4698  }
4699  if (! sorted) {
4700  graph.indicesAreSorted_ = true; // we just sorted every row
4701  }
4702  if (! merged) {
4703  graph.noRedundancies_ = true; // we just merged every row
4704  }
4705  }
4706  }
4707 
4708  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4709  void
4713  Scalar alpha,
4714  Scalar beta) const
4715  {
4717  using Teuchos::RCP;
4718  using Teuchos::rcp;
4719  using Teuchos::rcp_const_cast;
4720  using Teuchos::rcpFromRef;
4721  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4722  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one ();
4723 
4724  // mfh 05 Jun 2014: Special case for alpha == 0. I added this to
4725  // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
4726  // failing only for the Kokkos refactor version of Tpetra. It's a
4727  // good idea regardless to have the bypass.
4728  if (alpha == ZERO) {
4729  if (beta == ZERO) {
4730  Y_in.putScalar (ZERO);
4731  } else if (beta != ONE) {
4732  Y_in.scale (beta);
4733  }
4734  return;
4735  }
4736 
4737  // It's possible that X is a view of Y or vice versa. We don't
4738  // allow this (apply() requires that X and Y not alias one
4739  // another), but it's helpful to detect and work around this case.
4740  // We don't try to to detect the more subtle cases (e.g., one is a
4741  // subview of the other, but their initial pointers differ). We
4742  // only need to do this if this matrix's Import is trivial;
4743  // otherwise, we don't actually apply the operator from X into Y.
4744 
4745  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4746  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4747 
4748  // If beta == 0, then the output MV will be overwritten; none of
4749  // its entries should be read. (Sparse BLAS semantics say that we
4750  // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
4751  // This matters if we need to do an Export operation; see below.
4752  const bool Y_is_overwritten = (beta == ZERO);
4753 
4754  // We treat the case of a replicated MV output specially.
4755  const bool Y_is_replicated =
4756  (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4757 
4758  // This is part of the special case for replicated MV output.
4759  // We'll let each process do its thing, but do an all-reduce at
4760  // the end to sum up the results. Setting beta=0 on all processes
4761  // but Proc 0 makes the math work out for the all-reduce. (This
4762  // assumes that the replicated data is correctly replicated, so
4763  // that the data are the same on all processes.)
4764  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4765  beta = ZERO;
4766  }
4767 
4768  // Temporary MV for Import operation. After the block of code
4769  // below, this will be an (Imported if necessary) column Map MV
4770  // ready to give to localApply(...).
4771  RCP<const MV> X_colMap;
4772  if (importer.is_null ()) {
4773  if (! X_in.isConstantStride ()) {
4774  // Not all sparse mat-vec kernels can handle an input MV with
4775  // nonconstant stride correctly, so we have to copy it in that
4776  // case into a constant stride MV. To make a constant stride
4777  // copy of X_in, we force creation of the column (== domain)
4778  // Map MV (if it hasn't already been created, else fetch the
4779  // cached copy). This avoids creating a new MV each time.
4780  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
4781  Tpetra::deep_copy (*X_colMapNonConst, X_in);
4782  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4783  }
4784  else {
4785  // The domain and column Maps are the same, so do the local
4786  // multiply using the domain Map input MV X_in.
4787  X_colMap = rcpFromRef (X_in);
4788  }
4789  }
4790  else { // need to Import source (multi)vector
4791  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply: Import");
4792 
4793  // We're doing an Import anyway, which will copy the relevant
4794  // elements of the domain Map MV X_in into a separate column Map
4795  // MV. Thus, we don't have to worry whether X_in is constant
4796  // stride.
4797  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
4798 
4799  // Import from the domain Map MV to the column Map MV.
4800  X_colMapNonConst->doImport (X_in, *importer, INSERT);
4801  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4802  }
4803 
4804  // Temporary MV for doExport (if needed), or for copying a
4805  // nonconstant stride output MV into a constant stride MV. This
4806  // is null if we don't need the temporary MV, that is, if the
4807  // Export is trivial (null).
4808  RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
4809 
4810  // If we have a nontrivial Export object, we must perform an
4811  // Export. In that case, the local multiply result will go into
4812  // the row Map multivector. We don't have to make a
4813  // constant-stride version of Y_in in this case, because we had to
4814  // make a constant stride Y_rowMap MV and do an Export anyway.
4815  if (! exporter.is_null ()) {
4816  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, ZERO);
4817  {
4818  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply: Export");
4819 
4820  // If we're overwriting the output MV Y_in completely (beta ==
4821  // 0), then make sure that it is filled with zeros before we
4822  // do the Export. Otherwise, the ADD combine mode will use
4823  // data in Y_in, which is supposed to be zero.
4824  if (Y_is_overwritten) {
4825  Y_in.putScalar (ZERO);
4826  }
4827  else {
4828  // Scale output MV by beta, so that doExport sums in the
4829  // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
4830  Y_in.scale (beta);
4831  }
4832  // Do the Export operation.
4833  Y_in.doExport (*Y_rowMap, *exporter, ADD_ASSIGN);
4834  }
4835  }
4836  else { // Don't do an Export: row Map and range Map are the same.
4837  //
4838  // If Y_in does not have constant stride, or if the column Map
4839  // MV aliases Y_in, then we can't let the kernel write directly
4840  // to Y_in. Instead, we have to use the cached row (== range)
4841  // Map MV as temporary storage.
4842  //
4843  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4844  // the user passed in the same MultiVector for both X and Y. It
4845  // won't detect whether one MultiVector views the other. We
4846  // should also check the MultiVectors' raw data pointers.
4847  if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
4848  // Force creating the MV if it hasn't been created already.
4849  // This will reuse a previously created cached MV.
4850  Y_rowMap = getRowMapMultiVector (Y_in, true);
4851 
4852  // If beta == 0, we don't need to copy Y_in into Y_rowMap,
4853  // since we're overwriting it anyway.
4854  if (beta != ZERO) {
4855  Tpetra::deep_copy (*Y_rowMap, Y_in);
4856  }
4857  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, beta);
4858  Tpetra::deep_copy (Y_in, *Y_rowMap);
4859  }
4860  else {
4861  this->localApply (*X_colMap, Y_in, Teuchos::NO_TRANS, alpha, beta);
4862  }
4863  }
4864 
4865  // If the range Map is a locally replicated Map, sum up
4866  // contributions from each process. We set beta = 0 on all
4867  // processes but Proc 0 initially, so this will handle the scaling
4868  // factor beta correctly.
4869  if (Y_is_replicated) {
4870  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply: Reduce Y");
4871  Y_in.reduce ();
4872  }
4873  }
4874 
4875  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4876  void
4880  const Teuchos::ETransp mode,
4881  Scalar alpha,
4882  Scalar beta) const
4883  {
4885  using Teuchos::null;
4886  using Teuchos::RCP;
4887  using Teuchos::rcp;
4888  using Teuchos::rcp_const_cast;
4889  using Teuchos::rcpFromRef;
4890  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4891 
4892  // Take shortcuts for alpha == 0.
4893  if (alpha == ZERO) {
4894  // Follow the Sparse BLAS convention by ignoring both the matrix
4895  // and X_in, in this case.
4896  if (beta == ZERO) {
4897  // Follow the Sparse BLAS convention by overwriting any Inf or
4898  // NaN values in Y_in, in this case.
4899  Y_in.putScalar (ZERO);
4900  }
4901  else {
4902  Y_in.scale (beta);
4903  }
4904  return;
4905  }
4906  else if (beta == ZERO) {
4907  //Thyra was implicitly assuming that Y gets set to zero / or is overwritten
4908  //when bets==0. This was not the case with transpose in a multithreaded
4909  //environment where a multiplication with subsequent atomic_adds is used
4910  //since 0 is effectively not special cased. Doing the explicit set to zero here
4911  //This catches cases where Y is nan or inf.
4912  Y_in.putScalar (ZERO);
4913  }
4914 
4915  const size_t numVectors = X_in.getNumVectors ();
4916 
4917  // We don't allow X_in and Y_in to alias one another. It's hard
4918  // to check this, because advanced users could create views from
4919  // raw pointers. However, if X_in and Y_in reference the same
4920  // object, we will do the user a favor by copying X into new
4921  // storage (with a warning). We only need to do this if we have
4922  // trivial importers; otherwise, we don't actually apply the
4923  // operator from X into Y.
4924  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4925  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4926  // access X indirectly, in case we need to create temporary storage
4927  RCP<const MV> X;
4928 
4929  // some parameters for below
4930  const bool Y_is_replicated = (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4931  const bool Y_is_overwritten = (beta == ZERO);
4932  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4933  beta = ZERO;
4934  }
4935 
4936  // The kernels do not allow input or output with nonconstant stride.
4937  if (! X_in.isConstantStride () && importer.is_null ()) {
4938  X = rcp (new MV (X_in, Teuchos::Copy)); // Constant-stride copy of X_in
4939  } else {
4940  X = rcpFromRef (X_in); // Reference to X_in
4941  }
4942 
4943  // Set up temporary multivectors for Import and/or Export.
4944  if (importer != Teuchos::null) {
4945  if (importMV_ != Teuchos::null && importMV_->getNumVectors() != numVectors) {
4946  importMV_ = null;
4947  }
4948  if (importMV_ == null) {
4949  importMV_ = rcp (new MV (this->getColMap (), numVectors));
4950  }
4951  }
4952  if (exporter != Teuchos::null) {
4953  if (exportMV_ != Teuchos::null && exportMV_->getNumVectors() != numVectors) {
4954  exportMV_ = null;
4955  }
4956  if (exportMV_ == null) {
4957  exportMV_ = rcp (new MV (this->getRowMap (), numVectors));
4958  }
4959  }
4960 
4961  // If we have a non-trivial exporter, we must import elements that
4962  // are permuted or are on other processors.
4963  if (! exporter.is_null ()) {
4964  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply (transpose): Import");
4965  exportMV_->doImport (X_in, *exporter, INSERT);
4966  X = exportMV_; // multiply out of exportMV_
4967  }
4968 
4969  // If we have a non-trivial importer, we must export elements that
4970  // are permuted or belong to other processors. We will compute
4971  // solution into the to-be-exported MV; get a view.
4972  if (importer != Teuchos::null) {
4973  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply (transpose): Export");
4974 
4975  // FIXME (mfh 18 Apr 2015) Temporary fix suggested by Clark
4976  // Dohrmann on Fri 17 Apr 2015. At some point, we need to go
4977  // back and figure out why this helps. importMV_ SHOULD be
4978  // completely overwritten in the localApply(...) call
4979  // below, because beta == ZERO there.
4980  importMV_->putScalar (ZERO);
4981  // Do the local computation.
4982  this->localApply (*X, *importMV_, mode, alpha, ZERO);
4983 
4984  if (Y_is_overwritten) {
4985  Y_in.putScalar (ZERO);
4986  } else {
4987  Y_in.scale (beta);
4988  }
4989  Y_in.doExport (*importMV_, *importer, ADD_ASSIGN);
4990  }
4991  // otherwise, multiply into Y
4992  else {
4993  // can't multiply in-situ; can't multiply into non-strided multivector
4994  //
4995  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4996  // the user passed in the same MultiVector for both X and Y. It
4997  // won't detect whether one MultiVector views the other. We
4998  // should also check the MultiVectors' raw data pointers.
4999  if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
5000  // Make a deep copy of Y_in, into which to write the multiply result.
5001  MV Y (Y_in, Teuchos::Copy);
5002  this->localApply (*X, Y, mode, alpha, beta);
5003  Tpetra::deep_copy (Y_in, Y);
5004  } else {
5005  this->localApply (*X, Y_in, mode, alpha, beta);
5006  }
5007  }
5008 
5009  // If the range Map is a locally replicated map, sum the
5010  // contributions from each process. (That's why we set beta=0
5011  // above for all processes but Proc 0.)
5012  if (Y_is_replicated) {
5013  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply (transpose): Reduce Y");
5014  Y_in.reduce ();
5015  }
5016  }
5017 
5018  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5019  void
5023  const Teuchos::ETransp mode,
5024  const Scalar& alpha,
5025  const Scalar& beta) const
5026  {
5028  using Teuchos::NO_TRANS;
5029  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localApply");
5030 
5031  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
5032  auto Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
5033 
5034  const bool debug = ::Tpetra::Details::Behavior::debug ();
5035  if (debug) {
5036  const char tfecfFuncName[] = "localApply: ";
5037  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5038  (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
5039  "X.getNumVectors() = " << X.getNumVectors () << " != "
5040  "Y.getNumVectors() = " << Y.getNumVectors () << ".");
5041  const bool transpose = (mode != Teuchos::NO_TRANS);
5042  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5043  (! transpose && X.getLocalLength () !=
5044  getColMap ()->getLocalNumElements (), std::runtime_error,
5045  "NO_TRANS case: X has the wrong number of local rows. "
5046  "X.getLocalLength() = " << X.getLocalLength () << " != "
5047  "getColMap()->getLocalNumElements() = " <<
5048  getColMap ()->getLocalNumElements () << ".");
5049  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5050  (! transpose && Y.getLocalLength () !=
5051  getRowMap ()->getLocalNumElements (), std::runtime_error,
5052  "NO_TRANS case: Y has the wrong number of local rows. "
5053  "Y.getLocalLength() = " << Y.getLocalLength () << " != "
5054  "getRowMap()->getLocalNumElements() = " <<
5055  getRowMap ()->getLocalNumElements () << ".");
5056  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5057  (transpose && X.getLocalLength () !=
5058  getRowMap ()->getLocalNumElements (), std::runtime_error,
5059  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5060  "rows. X.getLocalLength() = " << X.getLocalLength ()
5061  << " != getRowMap()->getLocalNumElements() = "
5062  << getRowMap ()->getLocalNumElements () << ".");
5063  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5064  (transpose && Y.getLocalLength () !=
5065  getColMap ()->getLocalNumElements (), std::runtime_error,
5066  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5067  "rows. Y.getLocalLength() = " << Y.getLocalLength ()
5068  << " != getColMap()->getLocalNumElements() = "
5069  << getColMap ()->getLocalNumElements () << ".");
5070  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5071  (! isFillComplete (), std::runtime_error, "The matrix is not "
5072  "fill complete. You must call fillComplete() (possibly with "
5073  "domain and range Map arguments) without an intervening "
5074  "resumeFill() call before you may call this method.");
5075  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5076  (! X.isConstantStride () || ! Y.isConstantStride (),
5077  std::runtime_error, "X and Y must be constant stride.");
5078  // If the two pointers are null, then they don't alias one
5079  // another, even though they are equal.
5080  // Kokkos does not guarantee that zero row-extent vectors
5081  // point to different places, so we have to check that too.
5082  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5083  (X_lcl.data () == Y_lcl.data () && X_lcl.data () != nullptr
5084  && X_lcl.extent(0) != 0,
5085  std::runtime_error, "X and Y may not alias one another.");
5086  }
5087 
5088 #if KOKKOSKERNELS_VERSION >= 40299
5089  auto A_lcl = getLocalMatrixDevice();
5090 
5091  if(!applyHelper.get()) {
5092  // The apply helper does not exist, so create it.
5093  // Decide now whether to use the imbalanced row path, or the default.
5094  bool useMergePath = false;
5095 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
5096  //TODO: when https://github.com/kokkos/kokkos-kernels/issues/2166 is fixed and,
5097  //we can use SPMV_MERGE_PATH for the native spmv as well.
5098  //Take out this ifdef to enable that.
5099  //
5100  //Until then, only use SPMV_MERGE_PATH when calling cuSPARSE.
5101  if constexpr(std::is_same_v<execution_space, Kokkos::Cuda>) {
5102  LocalOrdinal nrows = getLocalNumRows();
5103  LocalOrdinal maxRowImbalance = 0;
5104  if(nrows != 0)
5105  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5106 
5107  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5108  useMergePath = true;
5109  }
5110 #endif
5111  applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map,
5112  useMergePath ? KokkosSparse::SPMV_MERGE_PATH : KokkosSparse::SPMV_DEFAULT);
5113  }
5114 
5115  // Translate mode (Teuchos enum) to KokkosKernels (1-character string)
5116  const char* modeKK = nullptr;
5117  switch(mode)
5118  {
5119  case Teuchos::NO_TRANS:
5120  modeKK = KokkosSparse::NoTranspose; break;
5121  case Teuchos::TRANS:
5122  modeKK = KokkosSparse::Transpose; break;
5123  case Teuchos::CONJ_TRANS:
5124  modeKK = KokkosSparse::ConjugateTranspose; break;
5125  default:
5126  throw std::invalid_argument("Tpetra::CrsMatrix::localApply: invalid mode");
5127  }
5128 
5129  if(applyHelper->shouldUseIntRowptrs())
5130  {
5131  auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
5132  KokkosSparse::spmv(
5133  &applyHelper->handle_int, modeKK,
5134  impl_scalar_type(alpha), A_lcl_int_rowptrs, X_lcl, impl_scalar_type(beta), Y_lcl);
5135  }
5136  else
5137  {
5138  KokkosSparse::spmv(
5139  &applyHelper->handle, modeKK,
5140  impl_scalar_type(alpha), A_lcl, X_lcl, impl_scalar_type(beta), Y_lcl);
5141  }
5142 #else
5143  LocalOrdinal nrows = getLocalNumRows();
5144  LocalOrdinal maxRowImbalance = 0;
5145  if(nrows != 0)
5146  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5147 
5148  auto matrix_lcl = getLocalMultiplyOperator();
5149  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5150  matrix_lcl->applyImbalancedRows (X_lcl, Y_lcl, mode, alpha, beta);
5151  else
5152  matrix_lcl->apply (X_lcl, Y_lcl, mode, alpha, beta);
5153 #endif
5154  }
5155 
5156  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5157  void
5161  Teuchos::ETransp mode,
5162  Scalar alpha,
5163  Scalar beta) const
5164  {
5166  const char fnName[] = "Tpetra::CrsMatrix::apply";
5167 
5168  TEUCHOS_TEST_FOR_EXCEPTION
5169  (! isFillComplete (), std::runtime_error,
5170  fnName << ": Cannot call apply() until fillComplete() "
5171  "has been called.");
5172 
5173  if (mode == Teuchos::NO_TRANS) {
5174  ProfilingRegion regionNonTranspose (fnName);
5175  this->applyNonTranspose (X, Y, alpha, beta);
5176  }
5177  else {
5178  ProfilingRegion regionTranspose ("Tpetra::CrsMatrix::apply (transpose)");
5179  this->applyTranspose (X, Y, mode, alpha, beta);
5180  }
5181  }
5182 
5183 
5184  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5185  template<class T>
5186  Teuchos::RCP<CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> >
5188  convert () const
5189  {
5190  using Teuchos::RCP;
5191  typedef CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> output_matrix_type;
5192  const char tfecfFuncName[] = "convert: ";
5193 
5194  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5195  (! this->isFillComplete (), std::runtime_error, "This matrix (the source "
5196  "of the conversion) is not fill complete. You must first call "
5197  "fillComplete() (possibly with the domain and range Map) without an "
5198  "intervening call to resumeFill(), before you may call this method.");
5199 
5200  RCP<output_matrix_type> newMatrix
5201  (new output_matrix_type (this->getCrsGraph ()));
5202  // Copy old values into new values. impl_scalar_type and T may
5203  // differ, so we can't use Kokkos::deep_copy.
5205  copyConvert (newMatrix->getLocalMatrixDevice ().values,
5206  this->getLocalMatrixDevice ().values);
5207  // Since newmat has a static (const) graph, the graph already has
5208  // a column Map, and Import and Export objects already exist (if
5209  // applicable). Thus, calling fillComplete is cheap.
5210  newMatrix->fillComplete (this->getDomainMap (), this->getRangeMap ());
5211 
5212  return newMatrix;
5213  }
5214 
5215 
5216  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5217  void
5220  {
5221  const bool debug = ::Tpetra::Details::Behavior::debug ("CrsGraph");
5222  if (debug) {
5223  const char tfecfFuncName[] = "checkInternalState: ";
5224  const char err[] = "Internal state is not consistent. "
5225  "Please report this bug to the Tpetra developers.";
5226 
5227  // This version of the graph (RCP<const crs_graph_type>) must
5228  // always be nonnull.
5229  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5230  (staticGraph_.is_null (), std::logic_error, err);
5231  // myGraph == null means that the matrix has a const ("static")
5232  // graph. Otherwise, the matrix has a dynamic graph (it owns its
5233  // graph).
5234  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5235  (! myGraph_.is_null () && myGraph_ != staticGraph_,
5236  std::logic_error, err);
5237  // if matrix is fill complete, then graph must be fill complete
5238  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5239  (isFillComplete () && ! staticGraph_->isFillComplete (),
5240  std::logic_error, err << " Specifically, the matrix is fill complete, "
5241  "but its graph is NOT fill complete.");
5242  // if values are allocated and they are non-zero in number, then
5243  // one of the allocations should be present
5244  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5245  (staticGraph_->indicesAreAllocated () &&
5246  staticGraph_->getLocalAllocationSize() > 0 &&
5247  staticGraph_->getLocalNumRows() > 0 &&
5248  valuesUnpacked_wdv.extent (0) == 0,
5249  std::logic_error, err);
5250  }
5251  }
5252 
5253  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5254  std::string
5257  {
5258  std::ostringstream os;
5259 
5260  os << "Tpetra::CrsMatrix (Kokkos refactor): {";
5261  if (this->getObjectLabel () != "") {
5262  os << "Label: \"" << this->getObjectLabel () << "\", ";
5263  }
5264  if (isFillComplete ()) {
5265  os << "isFillComplete: true"
5266  << ", global dimensions: [" << getGlobalNumRows () << ", "
5267  << getGlobalNumCols () << "]"
5268  << ", global number of entries: " << getGlobalNumEntries ()
5269  << "}";
5270  }
5271  else {
5272  os << "isFillComplete: false"
5273  << ", global dimensions: [" << getGlobalNumRows () << ", "
5274  << getGlobalNumCols () << "]}";
5275  }
5276  return os.str ();
5277  }
5278 
5279  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5280  void
5282  describe (Teuchos::FancyOStream &out,
5283  const Teuchos::EVerbosityLevel verbLevel) const
5284  {
5285  using std::endl;
5286  using std::setw;
5287  using Teuchos::ArrayView;
5288  using Teuchos::Comm;
5289  using Teuchos::RCP;
5290  using Teuchos::TypeNameTraits;
5291  using Teuchos::VERB_DEFAULT;
5292  using Teuchos::VERB_NONE;
5293  using Teuchos::VERB_LOW;
5294  using Teuchos::VERB_MEDIUM;
5295  using Teuchos::VERB_HIGH;
5296  using Teuchos::VERB_EXTREME;
5297 
5298  const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
5299 
5300  if (vl == VERB_NONE) {
5301  return; // Don't print anything at all
5302  }
5303 
5304  // By convention, describe() always begins with a tab.
5305  Teuchos::OSTab tab0 (out);
5306 
5307  RCP<const Comm<int> > comm = this->getComm();
5308  const int myRank = comm->getRank();
5309  const int numProcs = comm->getSize();
5310  size_t width = 1;
5311  for (size_t dec=10; dec<getGlobalNumRows(); dec *= 10) {
5312  ++width;
5313  }
5314  width = std::max<size_t> (width, static_cast<size_t> (11)) + 2;
5315 
5316  // none: print nothing
5317  // low: print O(1) info from node 0
5318  // medium: print O(P) info, num entries per process
5319  // high: print O(N) info, num entries per row
5320  // extreme: print O(NNZ) info: print indices and values
5321  //
5322  // for medium and higher, print constituent objects at specified verbLevel
5323  if (myRank == 0) {
5324  out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
5325  }
5326  Teuchos::OSTab tab1 (out);
5327 
5328  if (myRank == 0) {
5329  if (this->getObjectLabel () != "") {
5330  out << "Label: \"" << this->getObjectLabel () << "\", ";
5331  }
5332  {
5333  out << "Template parameters:" << endl;
5334  Teuchos::OSTab tab2 (out);
5335  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
5336  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
5337  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
5338  << "Node: " << TypeNameTraits<Node>::name () << endl;
5339  }
5340  if (isFillComplete()) {
5341  out << "isFillComplete: true" << endl
5342  << "Global dimensions: [" << getGlobalNumRows () << ", "
5343  << getGlobalNumCols () << "]" << endl
5344  << "Global number of entries: " << getGlobalNumEntries () << endl
5345  << endl << "Global max number of entries in a row: "
5346  << getGlobalMaxNumRowEntries () << endl;
5347  }
5348  else {
5349  out << "isFillComplete: false" << endl
5350  << "Global dimensions: [" << getGlobalNumRows () << ", "
5351  << getGlobalNumCols () << "]" << endl;
5352  }
5353  }
5354 
5355  if (vl < VERB_MEDIUM) {
5356  return; // all done!
5357  }
5358 
5359  // Describe the row Map.
5360  if (myRank == 0) {
5361  out << endl << "Row Map:" << endl;
5362  }
5363  if (getRowMap ().is_null ()) {
5364  if (myRank == 0) {
5365  out << "null" << endl;
5366  }
5367  }
5368  else {
5369  if (myRank == 0) {
5370  out << endl;
5371  }
5372  getRowMap ()->describe (out, vl);
5373  }
5374 
5375  // Describe the column Map.
5376  if (myRank == 0) {
5377  out << "Column Map: ";
5378  }
5379  if (getColMap ().is_null ()) {
5380  if (myRank == 0) {
5381  out << "null" << endl;
5382  }
5383  } else if (getColMap () == getRowMap ()) {
5384  if (myRank == 0) {
5385  out << "same as row Map" << endl;
5386  }
5387  } else {
5388  if (myRank == 0) {
5389  out << endl;
5390  }
5391  getColMap ()->describe (out, vl);
5392  }
5393 
5394  // Describe the domain Map.
5395  if (myRank == 0) {
5396  out << "Domain Map: ";
5397  }
5398  if (getDomainMap ().is_null ()) {
5399  if (myRank == 0) {
5400  out << "null" << endl;
5401  }
5402  } else if (getDomainMap () == getRowMap ()) {
5403  if (myRank == 0) {
5404  out << "same as row Map" << endl;
5405  }
5406  } else if (getDomainMap () == getColMap ()) {
5407  if (myRank == 0) {
5408  out << "same as column Map" << endl;
5409  }
5410  } else {
5411  if (myRank == 0) {
5412  out << endl;
5413  }
5414  getDomainMap ()->describe (out, vl);
5415  }
5416 
5417  // Describe the range Map.
5418  if (myRank == 0) {
5419  out << "Range Map: ";
5420  }
5421  if (getRangeMap ().is_null ()) {
5422  if (myRank == 0) {
5423  out << "null" << endl;
5424  }
5425  } else if (getRangeMap () == getDomainMap ()) {
5426  if (myRank == 0) {
5427  out << "same as domain Map" << endl;
5428  }
5429  } else if (getRangeMap () == getRowMap ()) {
5430  if (myRank == 0) {
5431  out << "same as row Map" << endl;
5432  }
5433  } else {
5434  if (myRank == 0) {
5435  out << endl;
5436  }
5437  getRangeMap ()->describe (out, vl);
5438  }
5439 
5440  // O(P) data
5441  for (int curRank = 0; curRank < numProcs; ++curRank) {
5442  if (myRank == curRank) {
5443  out << "Process rank: " << curRank << endl;
5444  Teuchos::OSTab tab2 (out);
5445  if (! staticGraph_->indicesAreAllocated ()) {
5446  out << "Graph indices not allocated" << endl;
5447  }
5448  else {
5449  out << "Number of allocated entries: "
5450  << staticGraph_->getLocalAllocationSize () << endl;
5451  }
5452  out << "Number of entries: " << getLocalNumEntries () << endl
5453  << "Max number of entries per row: " << getLocalMaxNumRowEntries ()
5454  << endl;
5455  }
5456  // Give output time to complete by executing some barriers.
5457  comm->barrier ();
5458  comm->barrier ();
5459  comm->barrier ();
5460  }
5461 
5462  if (vl < VERB_HIGH) {
5463  return; // all done!
5464  }
5465 
5466  // O(N) and O(NNZ) data
5467  for (int curRank = 0; curRank < numProcs; ++curRank) {
5468  if (myRank == curRank) {
5469  out << std::setw(width) << "Proc Rank"
5470  << std::setw(width) << "Global Row"
5471  << std::setw(width) << "Num Entries";
5472  if (vl == VERB_EXTREME) {
5473  out << std::setw(width) << "(Index,Value)";
5474  }
5475  out << endl;
5476  for (size_t r = 0; r < getLocalNumRows (); ++r) {
5477  const size_t nE = getNumEntriesInLocalRow(r);
5478  GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
5479  out << std::setw(width) << myRank
5480  << std::setw(width) << gid
5481  << std::setw(width) << nE;
5482  if (vl == VERB_EXTREME) {
5483  if (isGloballyIndexed()) {
5484  global_inds_host_view_type rowinds;
5485  values_host_view_type rowvals;
5486  getGlobalRowView (gid, rowinds, rowvals);
5487  for (size_t j = 0; j < nE; ++j) {
5488  out << " (" << rowinds[j]
5489  << ", " << rowvals[j]
5490  << ") ";
5491  }
5492  }
5493  else if (isLocallyIndexed()) {
5494  local_inds_host_view_type rowinds;
5495  values_host_view_type rowvals;
5496  getLocalRowView (r, rowinds, rowvals);
5497  for (size_t j=0; j < nE; ++j) {
5498  out << " (" << getColMap()->getGlobalElement(rowinds[j])
5499  << ", " << rowvals[j]
5500  << ") ";
5501  }
5502  } // globally or locally indexed
5503  } // vl == VERB_EXTREME
5504  out << endl;
5505  } // for each row r on this process
5506  } // if (myRank == curRank)
5507 
5508  // Give output time to complete
5509  comm->barrier ();
5510  comm->barrier ();
5511  comm->barrier ();
5512  } // for each process p
5513  }
5514 
5515  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5516  bool
5519  {
5520  // It's not clear what kind of compatibility checks on sizes can
5521  // be performed here. Epetra_CrsGraph doesn't check any sizes for
5522  // compatibility.
5523 
5524  // Currently, the source object must be a RowMatrix with the same
5525  // four template parameters as the target CrsMatrix. We might
5526  // relax this requirement later.
5527  const row_matrix_type* srcRowMat =
5528  dynamic_cast<const row_matrix_type*> (&source);
5529  return (srcRowMat != nullptr);
5530  }
5531 
5532  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5533  void
5536  const typename crs_graph_type::padding_type& padding,
5537  const bool verbose)
5538  {
5540  using Details::padCrsArrays;
5541  using std::endl;
5542  using LO = local_ordinal_type;
5543  using row_ptrs_type =
5544  typename local_graph_device_type::row_map_type::non_const_type;
5545  using range_policy =
5546  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
5547  const char tfecfFuncName[] = "applyCrsPadding";
5548  const char suffix[] =
5549  ". Please report this bug to the Tpetra developers.";
5550  ProfilingRegion regionCAP("Tpetra::CrsMatrix::applyCrsPadding");
5551 
5552  std::unique_ptr<std::string> prefix;
5553  if (verbose) {
5554  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
5555  std::ostringstream os;
5556  os << *prefix << "padding: ";
5557  padding.print(os);
5558  os << endl;
5559  std::cerr << os.str();
5560  }
5561  const int myRank = ! verbose ? -1 : [&] () {
5562  auto map = this->getMap();
5563  if (map.is_null()) {
5564  return -1;
5565  }
5566  auto comm = map->getComm();
5567  if (comm.is_null()) {
5568  return -1;
5569  }
5570  return comm->getRank();
5571  } ();
5572 
5573  // NOTE (mfh 29 Jan 2020) This allocates the values array.
5574  if (! myGraph_->indicesAreAllocated()) {
5575  if (verbose) {
5576  std::ostringstream os;
5577  os << *prefix << "Call allocateIndices" << endl;
5578  std::cerr << os.str();
5579  }
5580  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
5581  }
5582 
5583  // FIXME (mfh 10 Feb 2020) We shouldn't actually reallocate
5584  // row_ptrs_beg or allocate row_ptrs_end unless the allocation
5585  // size needs to increase. That should be the job of
5586  // padCrsArrays.
5587 
5588  // Making copies here because rowPtrsUnpacked_ has a const type. Otherwise, we
5589  // would use it directly.
5590 
5591  if (verbose) {
5592  std::ostringstream os;
5593  os << *prefix << "Allocate row_ptrs_beg: "
5594  << myGraph_->getRowPtrsUnpackedHost().extent(0) << endl;
5595  std::cerr << os.str();
5596  }
5597  using Kokkos::view_alloc;
5598  using Kokkos::WithoutInitializing;
5599  row_ptrs_type row_ptr_beg(view_alloc("row_ptr_beg", WithoutInitializing),
5600  myGraph_->rowPtrsUnpacked_dev_.extent(0));
5601  // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
5602  Kokkos::deep_copy(execution_space(),row_ptr_beg, myGraph_->rowPtrsUnpacked_dev_);
5603 
5604  const size_t N = row_ptr_beg.extent(0) == 0 ? size_t(0) :
5605  size_t(row_ptr_beg.extent(0) - 1);
5606  if (verbose) {
5607  std::ostringstream os;
5608  os << *prefix << "Allocate row_ptrs_end: " << N << endl;
5609  std::cerr << os.str();
5610  }
5611  row_ptrs_type row_ptr_end(
5612  view_alloc("row_ptr_end", WithoutInitializing), N);
5613 
5614  row_ptrs_type num_row_entries_d;
5615 
5616  const bool refill_num_row_entries =
5617  myGraph_->k_numRowEntries_.extent(0) != 0;
5618 
5619  if (refill_num_row_entries) { // unpacked storage
5620  // We can't assume correct *this capture until C++17, and it's
5621  // likely more efficient just to capture what we need anyway.
5622  num_row_entries_d = create_mirror_view_and_copy(memory_space(),
5623  myGraph_->k_numRowEntries_);
5624  Kokkos::parallel_for
5625  ("Fill end row pointers", range_policy(0, N),
5626  KOKKOS_LAMBDA (const size_t i) {
5627  row_ptr_end(i) = row_ptr_beg(i) + num_row_entries_d(i);
5628  });
5629  }
5630  else {
5631  // FIXME (mfh 04 Feb 2020) Fix padCrsArrays so that if packed
5632  // storage, we don't need row_ptr_end to be separate allocation;
5633  // could just have it alias row_ptr_beg+1.
5634  Kokkos::parallel_for
5635  ("Fill end row pointers", range_policy(0, N),
5636  KOKKOS_LAMBDA (const size_t i) {
5637  row_ptr_end(i) = row_ptr_beg(i+1);
5638  });
5639  }
5640 
5641  if (myGraph_->isGloballyIndexed()) {
5642  padCrsArrays(row_ptr_beg, row_ptr_end,
5643  myGraph_->gblInds_wdv,
5644  valuesUnpacked_wdv, padding, myRank, verbose);
5645  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5646  const auto newColIndsLen = myGraph_->gblInds_wdv.extent(0);
5647  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5648  (newValuesLen != newColIndsLen, std::logic_error,
5649  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5650  << " != myGraph_->gblInds_wdv.extent(0)=" << newColIndsLen
5651  << suffix);
5652  }
5653  else {
5654  padCrsArrays(row_ptr_beg, row_ptr_end,
5655  myGraph_->lclIndsUnpacked_wdv,
5656  valuesUnpacked_wdv, padding, myRank, verbose);
5657  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5658  const auto newColIndsLen = myGraph_->lclIndsUnpacked_wdv.extent(0);
5659  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5660  (newValuesLen != newColIndsLen, std::logic_error,
5661  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5662  << " != myGraph_->lclIndsUnpacked_wdv.extent(0)=" << newColIndsLen
5663  << suffix);
5664  }
5665 
5666  if (refill_num_row_entries) {
5667  Kokkos::parallel_for
5668  ("Fill num entries", range_policy(0, N),
5669  KOKKOS_LAMBDA (const size_t i) {
5670  num_row_entries_d(i) = row_ptr_end(i) - row_ptr_beg(i);
5671  });
5672  Kokkos::deep_copy(myGraph_->k_numRowEntries_, num_row_entries_d);
5673  }
5674 
5675  if (verbose) {
5676  std::ostringstream os;
5677  os << *prefix << "Assign myGraph_->rowPtrsUnpacked_; "
5678  << "old size: " << myGraph_->rowPtrsUnpacked_host_.extent(0)
5679  << ", new size: " << row_ptr_beg.extent(0) << endl;
5680  std::cerr << os.str();
5681  TEUCHOS_ASSERT( myGraph_->getRowPtrsUnpackedHost().extent(0) ==
5682  row_ptr_beg.extent(0) );
5683  }
5684  myGraph_->setRowPtrsUnpacked(row_ptr_beg);
5685  }
5686 
5687  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5688  void
5689  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5690  copyAndPermuteStaticGraph(
5691  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5692  const size_t numSameIDs,
5693  const LocalOrdinal permuteToLIDs[],
5694  const LocalOrdinal permuteFromLIDs[],
5695  const size_t numPermutes)
5696  {
5697  using Details::ProfilingRegion;
5698  using Teuchos::Array;
5699  using Teuchos::ArrayView;
5700  using std::endl;
5701  using LO = LocalOrdinal;
5702  using GO = GlobalOrdinal;
5703  const char tfecfFuncName[] = "copyAndPermuteStaticGraph";
5704  const char suffix[] =
5705  " Please report this bug to the Tpetra developers.";
5706  ProfilingRegion regionCAP
5707  ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph");
5708 
5709  const bool debug = Details::Behavior::debug("CrsGraph");
5710  const bool verbose = Details::Behavior::verbose("CrsGraph");
5711  std::unique_ptr<std::string> prefix;
5712  if (verbose) {
5713  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5714  std::ostringstream os;
5715  os << *prefix << "Start" << endl;
5716  }
5717  const char* const prefix_raw =
5718  verbose ? prefix.get()->c_str() : nullptr;
5719 
5720  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5721  //
5722  // Copy the first numSame row from source to target (this matrix).
5723  // This involves copying rows corresponding to LIDs [0, numSame-1].
5724  //
5725  const map_type& srcRowMap = * (srcMat.getRowMap ());
5726  nonconst_global_inds_host_view_type rowInds;
5727  nonconst_values_host_view_type rowVals;
5728  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5729  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5730  // Global ID for the current row index in the source matrix.
5731  // The first numSameIDs GIDs in the two input lists are the
5732  // same, so sourceGID == targetGID in this case.
5733  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5734  const GO targetGID = sourceGID;
5735 
5736  ArrayView<const GO>rowIndsConstView;
5737  ArrayView<const Scalar> rowValsConstView;
5738 
5739  if (sourceIsLocallyIndexed) {
5740  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5741  if (rowLength > static_cast<size_t> (rowInds.size())) {
5742  Kokkos::resize(rowInds,rowLength);
5743  Kokkos::resize(rowVals,rowLength);
5744  }
5745  // Resizing invalidates an Array's views, so we must make new
5746  // ones, even if rowLength hasn't changed.
5747  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5748  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5749 
5750  // The source matrix is locally indexed, so we have to get a
5751  // copy. Really it's the GIDs that have to be copied (because
5752  // they have to be converted from LIDs).
5753  size_t checkRowLength = 0;
5754  srcMat.getGlobalRowCopy (sourceGID, rowIndsView,
5755  rowValsView, checkRowLength);
5756  if (debug) {
5757  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5758  (rowLength != checkRowLength, std::logic_error, "For "
5759  "global row index " << sourceGID << ", the source "
5760  "matrix's getNumEntriesInGlobalRow returns a row length "
5761  "of " << rowLength << ", but getGlobalRowCopy reports "
5762  "a row length of " << checkRowLength << "." << suffix);
5763  }
5764 
5765  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5766  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5767  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5768  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5769  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5770  rowIndsView.data(), rowIndsView.extent(0),
5771  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5772  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5773  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5774  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5775  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5776  // KDDKDD UVM TEMPORARY: KokkosView interface
5777  }
5778  else { // source matrix is globally indexed.
5779  global_inds_host_view_type rowIndsView;
5780  values_host_view_type rowValsView;
5781  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5782  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5783  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5784  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5785  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5786  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5787  rowIndsView.data(), rowIndsView.extent(0),
5788  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5789  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5790  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5791  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5792  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5793  // KDDKDD UVM TEMPORARY: KokkosView interface
5794 
5795  }
5796 
5797  // Applying a permutation to a matrix with a static graph
5798  // means REPLACE-ing entries.
5799  combineGlobalValues(targetGID, rowIndsConstView,
5800  rowValsConstView, REPLACE,
5801  prefix_raw, debug, verbose);
5802  }
5803 
5804  if (verbose) {
5805  std::ostringstream os;
5806  os << *prefix << "Do permutes" << endl;
5807  }
5808 
5809  const map_type& tgtRowMap = * (this->getRowMap ());
5810  for (size_t p = 0; p < numPermutes; ++p) {
5811  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5812  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5813 
5814  ArrayView<const GO> rowIndsConstView;
5815  ArrayView<const Scalar> rowValsConstView;
5816 
5817  if (sourceIsLocallyIndexed) {
5818  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5819  if (rowLength > static_cast<size_t> (rowInds.size ())) {
5820  Kokkos::resize(rowInds,rowLength);
5821  Kokkos::resize(rowVals,rowLength);
5822  }
5823  // Resizing invalidates an Array's views, so we must make new
5824  // ones, even if rowLength hasn't changed.
5825  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5826  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5827 
5828  // The source matrix is locally indexed, so we have to get a
5829  // copy. Really it's the GIDs that have to be copied (because
5830  // they have to be converted from LIDs).
5831  size_t checkRowLength = 0;
5832  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5833  rowValsView, checkRowLength);
5834  if (debug) {
5835  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5836  (rowLength != checkRowLength, std::logic_error, "For "
5837  "source matrix global row index " << sourceGID << ", "
5838  "getNumEntriesInGlobalRow returns a row length of " <<
5839  rowLength << ", but getGlobalRowCopy a row length of "
5840  << checkRowLength << "." << suffix);
5841  }
5842 
5843  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5844  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5845  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5846  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5847  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5848  rowIndsView.data(), rowIndsView.extent(0),
5849  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5850  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5851  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5852  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5853  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5854  // KDDKDD UVM TEMPORARY: KokkosView interface
5855  }
5856  else {
5857  global_inds_host_view_type rowIndsView;
5858  values_host_view_type rowValsView;
5859  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5860  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5861  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5862  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5863  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5864  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5865  rowIndsView.data(), rowIndsView.extent(0),
5866  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5867  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5868  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5869  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5870  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5871  // KDDKDD UVM TEMPORARY: KokkosView interface
5872  }
5873 
5874  combineGlobalValues(targetGID, rowIndsConstView,
5875  rowValsConstView, REPLACE,
5876  prefix_raw, debug, verbose);
5877  }
5878 
5879  if (verbose) {
5880  std::ostringstream os;
5881  os << *prefix << "Done" << endl;
5882  }
5883  }
5884 
5885  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5886  void
5887  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5888  copyAndPermuteNonStaticGraph(
5889  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5890  const size_t numSameIDs,
5891  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs_dv,
5892  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs_dv,
5893  const size_t numPermutes)
5894  {
5895  using Details::ProfilingRegion;
5896  using Teuchos::Array;
5897  using Teuchos::ArrayView;
5898  using std::endl;
5899  using LO = LocalOrdinal;
5900  using GO = GlobalOrdinal;
5901  const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph";
5902  const char suffix[] =
5903  " Please report this bug to the Tpetra developers.";
5904  ProfilingRegion regionCAP
5905  ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph");
5906 
5907  const bool debug = Details::Behavior::debug("CrsGraph");
5908  const bool verbose = Details::Behavior::verbose("CrsGraph");
5909  std::unique_ptr<std::string> prefix;
5910  if (verbose) {
5911  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5912  std::ostringstream os;
5913  os << *prefix << "Start" << endl;
5914  }
5915  const char* const prefix_raw =
5916  verbose ? prefix.get()->c_str() : nullptr;
5917 
5918  {
5919  using row_graph_type = RowGraph<LO, GO, Node>;
5920  const row_graph_type& srcGraph = *(srcMat.getGraph());
5921  auto padding =
5922  myGraph_->computeCrsPadding(srcGraph, numSameIDs,
5923  permuteToLIDs_dv, permuteFromLIDs_dv, verbose);
5924  applyCrsPadding(*padding, verbose);
5925  }
5926  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5927  //
5928  // Copy the first numSame row from source to target (this matrix).
5929  // This involves copying rows corresponding to LIDs [0, numSame-1].
5930  //
5931  const map_type& srcRowMap = * (srcMat.getRowMap ());
5932  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5933  using gids_type = nonconst_global_inds_host_view_type;
5934  using vals_type = nonconst_values_host_view_type;
5935  gids_type rowInds;
5936  vals_type rowVals;
5937  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5938  // Global ID for the current row index in the source matrix.
5939  // The first numSameIDs GIDs in the two input lists are the
5940  // same, so sourceGID == targetGID in this case.
5941  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5942  const GO targetGID = sourceGID;
5943 
5944  ArrayView<const GO> rowIndsConstView;
5945  ArrayView<const Scalar> rowValsConstView;
5946 
5947  if (sourceIsLocallyIndexed) {
5948 
5949  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5950  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5951  Kokkos::resize(rowInds,rowLength);
5952  Kokkos::resize(rowVals,rowLength);
5953  }
5954  // Resizing invalidates an Array's views, so we must make new
5955  // ones, even if rowLength hasn't changed.
5956  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5957  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5958 
5959  // The source matrix is locally indexed, so we have to get a
5960  // copy. Really it's the GIDs that have to be copied (because
5961  // they have to be converted from LIDs).
5962  size_t checkRowLength = 0;
5963  srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView,
5964  checkRowLength);
5965  if (debug) {
5966  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5967  (rowLength != checkRowLength, std::logic_error, ": For "
5968  "global row index " << sourceGID << ", the source "
5969  "matrix's getNumEntriesInGlobalRow returns a row length "
5970  "of " << rowLength << ", but getGlobalRowCopy reports "
5971  "a row length of " << checkRowLength << "." << suffix);
5972  }
5973  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5974  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
5975  }
5976  else { // source matrix is globally indexed.
5977  global_inds_host_view_type rowIndsView;
5978  values_host_view_type rowValsView;
5979  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5980 
5981  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5982  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5983  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5984  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5985  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5986  rowIndsView.data(), rowIndsView.extent(0),
5987  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5988  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5989  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5990  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5991  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5992  // KDDKDD UVM TEMPORARY: KokkosView interface
5993  }
5994 
5995  // Combine the data into the target matrix.
5996  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5997  rowValsConstView, prefix_raw, debug, verbose);
5998  }
5999 
6000  if (verbose) {
6001  std::ostringstream os;
6002  os << *prefix << "Do permutes" << endl;
6003  }
6004  const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data();
6005  const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data();
6006 
6007  const map_type& tgtRowMap = * (this->getRowMap ());
6008  for (size_t p = 0; p < numPermutes; ++p) {
6009  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
6010  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
6011 
6012  ArrayView<const GO> rowIndsConstView;
6013  ArrayView<const Scalar> rowValsConstView;
6014 
6015  if (sourceIsLocallyIndexed) {
6016  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
6017  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
6018  Kokkos::resize(rowInds,rowLength);
6019  Kokkos::resize(rowVals,rowLength);
6020  }
6021  // Resizing invalidates an Array's views, so we must make new
6022  // ones, even if rowLength hasn't changed.
6023  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
6024  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
6025 
6026  // The source matrix is locally indexed, so we have to get a
6027  // copy. Really it's the GIDs that have to be copied (because
6028  // they have to be converted from LIDs).
6029  size_t checkRowLength = 0;
6030  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
6031  rowValsView, checkRowLength);
6032  if (debug) {
6033  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6034  (rowLength != checkRowLength, std::logic_error, "For "
6035  "source matrix global row index " << sourceGID << ", "
6036  "getNumEntriesInGlobalRow returns a row length of " <<
6037  rowLength << ", but getGlobalRowCopy a row length of "
6038  << checkRowLength << "." << suffix);
6039  }
6040  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
6041  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
6042  }
6043  else {
6044  global_inds_host_view_type rowIndsView;
6045  values_host_view_type rowValsView;
6046  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
6047 
6048  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
6049  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
6050  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
6051  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
6052  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
6053  rowIndsView.data(), rowIndsView.extent(0),
6054  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6055  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
6056  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
6057  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6058  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6059  // KDDKDD UVM TEMPORARY: KokkosView interface
6060  }
6061 
6062  // Combine the data into the target matrix.
6063  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
6064  rowValsConstView, prefix_raw, debug, verbose);
6065  }
6066 
6067  if (verbose) {
6068  std::ostringstream os;
6069  os << *prefix << "Done" << endl;
6070  }
6071  }
6072 
6073  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6074  void
6077  const SrcDistObject& srcObj,
6078  const size_t numSameIDs,
6079  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
6080  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
6081  const CombineMode /*CM*/)
6082  {
6083  using Details::Behavior;
6086  using std::endl;
6087 
6088  // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
6089  const char tfecfFuncName[] = "copyAndPermute: ";
6090  ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermute");
6091 
6092  const bool verbose = Behavior::verbose("CrsMatrix");
6093  std::unique_ptr<std::string> prefix;
6094  if (verbose) {
6095  prefix = this->createPrefix("CrsMatrix", "copyAndPermute");
6096  std::ostringstream os;
6097  os << *prefix << endl
6098  << *prefix << " numSameIDs: " << numSameIDs << endl
6099  << *prefix << " numPermute: " << permuteToLIDs.extent(0)
6100  << endl
6101  << *prefix << " "
6102  << dualViewStatusToString (permuteToLIDs, "permuteToLIDs")
6103  << endl
6104  << *prefix << " "
6105  << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs")
6106  << endl
6107  << *prefix << " "
6108  << "isStaticGraph: " << (isStaticGraph() ? "true" : "false")
6109  << endl;
6110  std::cerr << os.str ();
6111  }
6112 
6113  const auto numPermute = permuteToLIDs.extent (0);
6114  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6115  (numPermute != permuteFromLIDs.extent (0),
6116  std::invalid_argument, "permuteToLIDs.extent(0) = "
6117  << numPermute << "!= permuteFromLIDs.extent(0) = "
6118  << permuteFromLIDs.extent (0) << ".");
6119 
6120  // This dynamic cast should succeed, because we've already tested
6121  // it in checkSizes().
6123  const RMT& srcMat = dynamic_cast<const RMT&> (srcObj);
6124  if (isStaticGraph ()) {
6125  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
6126  auto permuteToLIDs_h = permuteToLIDs.view_host ();
6127  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
6128  auto permuteFromLIDs_h = permuteFromLIDs.view_host ();
6129 
6130  copyAndPermuteStaticGraph(srcMat, numSameIDs,
6131  permuteToLIDs_h.data(),
6132  permuteFromLIDs_h.data(),
6133  numPermute);
6134  }
6135  else {
6136  copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs,
6137  permuteFromLIDs, numPermute);
6138  }
6139 
6140  if (verbose) {
6141  std::ostringstream os;
6142  os << *prefix << "Done" << endl;
6143  std::cerr << os.str();
6144  }
6145  }
6146 
6147  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6148  void
6151  (const SrcDistObject& source,
6152  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6153  Kokkos::DualView<char*, buffer_device_type>& exports,
6154  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6155  size_t& constantNumPackets)
6156  {
6157  using Details::Behavior;
6160  using Teuchos::outArg;
6161  using Teuchos::REDUCE_MAX;
6162  using Teuchos::reduceAll;
6163  using std::endl;
6164  typedef LocalOrdinal LO;
6165  typedef GlobalOrdinal GO;
6166  const char tfecfFuncName[] = "packAndPrepare: ";
6167  ProfilingRegion regionPAP ("Tpetra::CrsMatrix::packAndPrepare");
6168 
6169  const bool debug = Behavior::debug("CrsMatrix");
6170  const bool verbose = Behavior::verbose("CrsMatrix");
6171 
6172  // Processes on which the communicator is null should not participate.
6173  Teuchos::RCP<const Teuchos::Comm<int> > pComm = this->getComm ();
6174  if (pComm.is_null ()) {
6175  return;
6176  }
6177  const Teuchos::Comm<int>& comm = *pComm;
6178  const int myRank = comm.getSize ();
6179 
6180  std::unique_ptr<std::string> prefix;
6181  if (verbose) {
6182  prefix = this->createPrefix("CrsMatrix", "packAndPrepare");
6183  std::ostringstream os;
6184  os << *prefix << "Start" << endl
6185  << *prefix << " "
6186  << dualViewStatusToString (exportLIDs, "exportLIDs")
6187  << endl
6188  << *prefix << " "
6189  << dualViewStatusToString (exports, "exports")
6190  << endl
6191  << *prefix << " "
6192  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6193  << endl;
6194  std::cerr << os.str ();
6195  }
6196 
6197  // Attempt to cast the source object to CrsMatrix. If successful,
6198  // use the source object's packNew() method to pack its data for
6199  // communication. Otherwise, attempt to cast to RowMatrix; if
6200  // successful, use the source object's pack() method. Otherwise,
6201  // the source object doesn't have the right type.
6202  //
6203  // FIXME (mfh 30 Jun 2013, 11 Sep 2017) We don't even need the
6204  // RowMatrix to have the same Node type. Unfortunately, we don't
6205  // have a way to ask if the RowMatrix is "a RowMatrix with any
6206  // Node type," since RowMatrix doesn't have a base class. A
6207  // hypothetical RowMatrixBase<Scalar, LO, GO> class, which does
6208  // not currently exist, would satisfy this requirement.
6209  //
6210  // Why RowMatrixBase<Scalar, LO, GO>? The source object's Scalar
6211  // type doesn't technically need to match the target object's
6212  // Scalar type, so we could just have RowMatrixBase<LO, GO>. LO
6213  // and GO need not be the same, as long as there is no overflow of
6214  // the indices. However, checking for index overflow is global
6215  // and therefore undesirable.
6216 
6217  std::ostringstream msg; // for collecting error messages
6218  int lclBad = 0; // to be set below
6219 
6220  using crs_matrix_type = CrsMatrix<Scalar, LO, GO, Node>;
6221  const crs_matrix_type* srcCrsMat =
6222  dynamic_cast<const crs_matrix_type*> (&source);
6223  if (srcCrsMat != nullptr) {
6224  if (verbose) {
6225  std::ostringstream os;
6226  os << *prefix << "Source matrix same (CrsMatrix) type as target; "
6227  "calling packNew" << endl;
6228  std::cerr << os.str ();
6229  }
6230  try {
6231  srcCrsMat->packNew (exportLIDs, exports, numPacketsPerLID,
6232  constantNumPackets);
6233  }
6234  catch (std::exception& e) {
6235  lclBad = 1;
6236  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6237  }
6238  }
6239  else {
6240  using Kokkos::HostSpace;
6241  using Kokkos::subview;
6242  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6243  using range_type = Kokkos::pair<size_t, size_t>;
6244 
6245  if (verbose) {
6246  std::ostringstream os;
6247  os << *prefix << "Source matrix NOT same (CrsMatrix) type as target"
6248  << endl;
6249  std::cerr << os.str ();
6250  }
6251 
6252  const row_matrix_type* srcRowMat =
6253  dynamic_cast<const row_matrix_type*> (&source);
6254  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6255  (srcRowMat == nullptr, std::invalid_argument,
6256  "The source object of the Import or Export operation is neither a "
6257  "CrsMatrix (with the same template parameters as the target object), "
6258  "nor a RowMatrix (with the same first four template parameters as the "
6259  "target object).");
6260 
6261  // For the RowMatrix case, we need to convert from
6262  // Kokkos::DualView to Teuchos::Array*. This doesn't need to be
6263  // so terribly efficient, since packing a non-CrsMatrix
6264  // RowMatrix for Import/Export into a CrsMatrix is not a
6265  // critical case. Thus, we may allocate Teuchos::Array objects
6266  // here and copy to and from Kokkos::*View.
6267 
6268  // View exportLIDs's host data as a Teuchos::ArrayView.
6269  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6270  auto exportLIDs_h = exportLIDs.view_host ();
6271  Teuchos::ArrayView<const LO> exportLIDs_av (exportLIDs_h.data (),
6272  exportLIDs_h.size ());
6273 
6274  // pack() will allocate exports_a as needed. We'll copy back
6275  // into exports (after (re)allocating exports if needed) below.
6276  Teuchos::Array<char> exports_a;
6277 
6278  // View exportLIDs' host data as a Teuchos::ArrayView. We don't
6279  // need to sync, since we're doing write-only access, but we do
6280  // need to mark the DualView as modified on host.
6281 
6282  numPacketsPerLID.clear_sync_state (); // write-only access
6283  numPacketsPerLID.modify_host ();
6284  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6285  Teuchos::ArrayView<size_t> numPacketsPerLID_av (numPacketsPerLID_h.data (),
6286  numPacketsPerLID_h.size ());
6287 
6288  // Invoke RowMatrix's legacy pack() interface, using above
6289  // Teuchos::Array* objects.
6290  try {
6291  srcRowMat->pack (exportLIDs_av, exports_a, numPacketsPerLID_av,
6292  constantNumPackets);
6293  }
6294  catch (std::exception& e) {
6295  lclBad = 1;
6296  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6297  }
6298 
6299  // Allocate 'exports', and copy exports_a back into it.
6300  const size_t newAllocSize = static_cast<size_t> (exports_a.size ());
6301  if (static_cast<size_t> (exports.extent (0)) < newAllocSize) {
6302  const std::string oldLabel = exports.d_view.label ();
6303  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6304  exports = exports_type (newLabel, newAllocSize);
6305  }
6306  // It's safe to assume that we're working on host anyway, so
6307  // just keep exports sync'd to host.
6308  // ignore current device contents
6309  exports.modify_host();
6310 
6311  auto exports_h = exports.view_host ();
6312  auto exports_h_sub = subview (exports_h, range_type (0, newAllocSize));
6313 
6314  // Kokkos::deep_copy needs a Kokkos::View input, so turn
6315  // exports_a into a nonowning Kokkos::View first before copying.
6316  typedef typename exports_type::t_host::execution_space HES;
6317  typedef Kokkos::Device<HES, HostSpace> host_device_type;
6318  Kokkos::View<const char*, host_device_type>
6319  exports_a_kv (exports_a.getRawPtr (), newAllocSize);
6320  // DEEP_COPY REVIEW - NOT TESTED
6321  Kokkos::deep_copy (exports_h_sub, exports_a_kv);
6322  }
6323 
6324  if (debug) {
6325  int gblBad = 0; // output argument; to be set below
6326  reduceAll<int, int> (comm, REDUCE_MAX, lclBad, outArg (gblBad));
6327  if (gblBad != 0) {
6328  Tpetra::Details::gathervPrint (std::cerr, msg.str (), comm);
6329  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6330  (true, std::logic_error, "packNew() or pack() threw an exception on "
6331  &qu