Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_CrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_CRSMATRIX_DEF_HPP
41 #define TPETRA_CRSMATRIX_DEF_HPP
42 
50 
51 #include "Tpetra_Import_Util.hpp"
52 #include "Tpetra_Import_Util2.hpp"
53 #include "Tpetra_RowMatrix.hpp"
54 #include "Tpetra_LocalCrsMatrixOperator.hpp"
55 
62 #include "Tpetra_Details_getDiagCopyWithoutOffsets.hpp"
67 #include "KokkosSparse_getDiagCopy.hpp"
71 #include "Tpetra_Details_packCrsMatrix.hpp"
72 #include "Tpetra_Details_unpackCrsMatrixAndCombine.hpp"
74 #include "Teuchos_FancyOStream.hpp"
75 #include "Teuchos_RCP.hpp"
76 #include "Teuchos_DataAccess.hpp"
77 #include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
78 #include "KokkosBlas.hpp"
79 
80 #include <memory>
81 #include <sstream>
82 #include <typeinfo>
83 #include <utility>
84 #include <vector>
85 
86 namespace Tpetra {
87 
88 namespace { // (anonymous)
89 
90  template<class T, class BinaryFunction>
91  T atomic_binary_function_update (volatile T* const dest,
92  const T& inputVal,
93  BinaryFunction f)
94  {
95  T oldVal = *dest;
96  T assume;
97 
98  // NOTE (mfh 30 Nov 2015) I do NOT need a fence here for IBM
99  // POWER architectures, because 'newval' depends on 'assume',
100  // which depends on 'oldVal', which depends on '*dest'. This
101  // sets up a chain of read dependencies that should ensure
102  // correct behavior given a sane memory model.
103  do {
104  assume = oldVal;
105  T newVal = f (assume, inputVal);
106  oldVal = Kokkos::atomic_compare_exchange (dest, assume, newVal);
107  } while (assume != oldVal);
108 
109  return oldVal;
110  }
111 } // namespace (anonymous)
112 
113 //
114 // Users must never rely on anything in the Details namespace.
115 //
116 namespace Details {
117 
127 template<class Scalar>
128 struct AbsMax {
130  Scalar operator() (const Scalar& x, const Scalar& y) {
131  typedef Teuchos::ScalarTraits<Scalar> STS;
132  return std::max (STS::magnitude (x), STS::magnitude (y));
133  }
134 };
135 
136 } // namespace Details
137 } // namespace Tpetra
138 
139 namespace Tpetra {
140 
141  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
142  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
143  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
144  size_t maxNumEntriesPerRow,
145  const Teuchos::RCP<Teuchos::ParameterList>& params) :
146  dist_object_type (rowMap)
147  {
148  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, size_t "
149  "[, RCP<ParameterList>]): ";
150  Teuchos::RCP<crs_graph_type> graph;
151  try {
152  graph = Teuchos::rcp (new crs_graph_type (rowMap, maxNumEntriesPerRow,
153  params));
154  }
155  catch (std::exception& e) {
156  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
157  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
158  "size_t [, RCP<ParameterList>]) threw an exception: "
159  << e.what ());
160  }
161  // myGraph_ not null means that the matrix owns the graph. That's
162  // different than the const CrsGraph constructor, where the matrix
163  // does _not_ own the graph.
164  myGraph_ = graph;
165  staticGraph_ = myGraph_;
166  resumeFill (params);
168  }
169 
170  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
172  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
173  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
174  const Teuchos::RCP<Teuchos::ParameterList>& params) :
175  dist_object_type (rowMap)
176  {
177  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
178  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
179  Teuchos::RCP<crs_graph_type> graph;
180  try {
181  using Teuchos::rcp;
182  graph = rcp(new crs_graph_type(rowMap, numEntPerRowToAlloc,
183  params));
184  }
185  catch (std::exception& e) {
186  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
187  (true, std::runtime_error, "CrsGraph constructor "
188  "(RCP<const Map>, ArrayView<const size_t>"
189  "[, RCP<ParameterList>]) threw an exception: "
190  << e.what ());
191  }
192  // myGraph_ not null means that the matrix owns the graph. That's
193  // different than the const CrsGraph constructor, where the matrix
194  // does _not_ own the graph.
195  myGraph_ = graph;
196  staticGraph_ = graph;
197  resumeFill (params);
199  }
200 
201  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
203  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
204  const Teuchos::RCP<const map_type>& colMap,
205  const size_t maxNumEntPerRow,
206  const Teuchos::RCP<Teuchos::ParameterList>& params) :
207  dist_object_type (rowMap)
208  {
209  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
210  "RCP<const Map>, size_t[, RCP<ParameterList>]): ";
211  const char suffix[] =
212  " Please report this bug to the Tpetra developers.";
213 
214  // An artifact of debugging something a while back.
215  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
216  (! staticGraph_.is_null (), std::logic_error,
217  "staticGraph_ is not null at the beginning of the constructor."
218  << suffix);
219  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
220  (! myGraph_.is_null (), std::logic_error,
221  "myGraph_ is not null at the beginning of the constructor."
222  << suffix);
223  Teuchos::RCP<crs_graph_type> graph;
224  try {
225  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
226  maxNumEntPerRow,
227  params));
228  }
229  catch (std::exception& e) {
230  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
231  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
232  "RCP<const Map>, size_t[, RCP<ParameterList>]) threw an "
233  "exception: " << e.what ());
234  }
235  // myGraph_ not null means that the matrix owns the graph. That's
236  // different than the const CrsGraph constructor, where the matrix
237  // does _not_ own the graph.
238  myGraph_ = graph;
239  staticGraph_ = myGraph_;
240  resumeFill (params);
242  }
243 
244  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
246  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
247  const Teuchos::RCP<const map_type>& colMap,
248  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
249  const Teuchos::RCP<Teuchos::ParameterList>& params) :
250  dist_object_type (rowMap)
251  {
252  const char tfecfFuncName[] =
253  "CrsMatrix(RCP<const Map>, RCP<const Map>, "
254  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
255  Teuchos::RCP<crs_graph_type> graph;
256  try {
257  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
258  numEntPerRowToAlloc,
259  params));
260  }
261  catch (std::exception& e) {
262  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
263  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
264  "RCP<const Map>, ArrayView<const size_t>[, "
265  "RCP<ParameterList>]) threw an exception: " << e.what ());
266  }
267  // myGraph_ not null means that the matrix owns the graph. That's
268  // different than the const CrsGraph constructor, where the matrix
269  // does _not_ own the graph.
270  myGraph_ = graph;
271  staticGraph_ = graph;
272  resumeFill (params);
274  }
275 
276 
277  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
279  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
280  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
281  dist_object_type (graph->getRowMap ()),
282  staticGraph_ (graph),
283  storageStatus_ (Details::STORAGE_1D_PACKED)
284  {
285  using std::endl;
286  typedef typename local_matrix_device_type::values_type values_type;
287  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>[, "
288  "RCP<ParameterList>]): ";
289  const bool verbose = Details::Behavior::verbose("CrsMatrix");
290 
291  std::unique_ptr<std::string> prefix;
292  if (verbose) {
293  prefix = this->createPrefix("CrsMatrix", "CrsMatrix(graph,params)");
294  std::ostringstream os;
295  os << *prefix << "Start" << endl;
296  std::cerr << os.str ();
297  }
298 
299  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
300  (graph.is_null (), std::runtime_error, "Input graph is null.");
301  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
302  (! graph->isFillComplete (), std::runtime_error, "Input graph "
303  "is not fill complete. You must call fillComplete on the "
304  "graph before using it to construct a CrsMatrix. Note that "
305  "calling resumeFill on the graph makes it not fill complete, "
306  "even if you had previously called fillComplete. In that "
307  "case, you must call fillComplete on the graph again.");
308 
309  // The graph is fill complete, so it is locally indexed and has a
310  // fixed structure. This means we can allocate the (1-D) array of
311  // values and build the local matrix right now. Note that the
312  // local matrix's number of columns comes from the column Map, not
313  // the domain Map.
314 
315  const size_t numEnt = graph->lclIndsPacked_wdv.extent (0);
316  if (verbose) {
317  std::ostringstream os;
318  os << *prefix << "Allocate values: " << numEnt << endl;
319  std::cerr << os.str ();
320  }
321 
322  values_type val ("Tpetra::CrsMatrix::values", numEnt);
323  valuesPacked_wdv = values_wdv_type(val);
324  valuesUnpacked_wdv = valuesPacked_wdv;
325 
327 
328  if (verbose) {
329  std::ostringstream os;
330  os << *prefix << "Done" << endl;
331  std::cerr << os.str ();
332  }
333  }
334 
335  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
338  const Teuchos::RCP<const crs_graph_type>& graph,
339  const Teuchos::RCP<Teuchos::ParameterList>& params) :
340  dist_object_type (graph->getRowMap ()),
341  staticGraph_ (graph),
342  storageStatus_ (matrix.storageStatus_)
343  {
344  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
345  "local_matrix_device_type::values_type, "
346  "[,RCP<ParameterList>]): ";
347  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
348  (graph.is_null (), std::runtime_error, "Input graph is null.");
349  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
350  (! graph->isFillComplete (), std::runtime_error, "Input graph "
351  "is not fill complete. You must call fillComplete on the "
352  "graph before using it to construct a CrsMatrix. Note that "
353  "calling resumeFill on the graph makes it not fill complete, "
354  "even if you had previously called fillComplete. In that "
355  "case, you must call fillComplete on the graph again.");
356 
357  size_t numValuesPacked = graph->lclIndsPacked_wdv.extent(0);
358  valuesPacked_wdv = values_wdv_type(matrix.valuesPacked_wdv, 0, numValuesPacked);
359 
360  size_t numValuesUnpacked = graph->lclIndsUnpacked_wdv.extent(0);
361  valuesUnpacked_wdv = values_wdv_type(matrix.valuesUnpacked_wdv, 0, numValuesUnpacked);
362 
364  }
365 
366 
367  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
369  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
370  const typename local_matrix_device_type::values_type& values,
371  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
372  dist_object_type (graph->getRowMap ()),
373  staticGraph_ (graph),
374  storageStatus_ (Details::STORAGE_1D_PACKED)
375  {
376  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
377  "local_matrix_device_type::values_type, "
378  "[,RCP<ParameterList>]): ";
379  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
380  (graph.is_null (), std::runtime_error, "Input graph is null.");
381  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
382  (! graph->isFillComplete (), std::runtime_error, "Input graph "
383  "is not fill complete. You must call fillComplete on the "
384  "graph before using it to construct a CrsMatrix. Note that "
385  "calling resumeFill on the graph makes it not fill complete, "
386  "even if you had previously called fillComplete. In that "
387  "case, you must call fillComplete on the graph again.");
388 
389  // The graph is fill complete, so it is locally indexed and has a
390  // fixed structure. This means we can allocate the (1-D) array of
391  // values and build the local matrix right now. Note that the
392  // local matrix's number of columns comes from the column Map, not
393  // the domain Map.
394 
395  valuesPacked_wdv = values_wdv_type(values);
396  valuesUnpacked_wdv = valuesPacked_wdv;
397 
398  // FIXME (22 Jun 2016) I would very much like to get rid of
399  // k_values1D_ at some point. I find it confusing to have all
400  // these extra references lying around.
401  // KDDKDD ALMOST THERE, MARK!
402 // k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
403 
405  }
406 
407  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
409  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
410  const Teuchos::RCP<const map_type>& colMap,
411  const typename local_graph_device_type::row_map_type& rowPointers,
412  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
413  const typename local_matrix_device_type::values_type& values,
414  const Teuchos::RCP<Teuchos::ParameterList>& params) :
415  dist_object_type (rowMap),
416  storageStatus_ (Details::STORAGE_1D_PACKED)
417  {
418  using Details::getEntryOnHost;
419  using Teuchos::RCP;
420  using std::endl;
421  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
422  "RCP<const Map>, ptr, ind, val[, params]): ";
423  const char suffix[] =
424  ". Please report this bug to the Tpetra developers.";
425  const bool debug = Details::Behavior::debug("CrsMatrix");
426  const bool verbose = Details::Behavior::verbose("CrsMatrix");
427 
428  std::unique_ptr<std::string> prefix;
429  if (verbose) {
430  prefix = this->createPrefix(
431  "CrsMatrix", "CrsMatrix(rowMap,colMap,ptr,ind,val[,params])");
432  std::ostringstream os;
433  os << *prefix << "Start" << endl;
434  std::cerr << os.str ();
435  }
436 
437  // Check the user's input. Note that this might throw only on
438  // some processes but not others, causing deadlock. We prefer
439  // deadlock due to exceptions to segfaults, because users can
440  // catch exceptions.
441  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
442  (values.extent(0) != columnIndices.extent(0),
443  std::invalid_argument, "values.extent(0)=" << values.extent(0)
444  << " != columnIndices.extent(0) = " << columnIndices.extent(0)
445  << ".");
446  if (debug && rowPointers.extent(0) != 0) {
447  const size_t numEnt =
448  getEntryOnHost(rowPointers, rowPointers.extent(0) - 1);
449  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
450  (numEnt != size_t(columnIndices.extent(0)) ||
451  numEnt != size_t(values.extent(0)),
452  std::invalid_argument, "Last entry of rowPointers says that "
453  "the matrix has " << numEnt << " entr"
454  << (numEnt != 1 ? "ies" : "y") << ", but the dimensions of "
455  "columnIndices and values don't match this. "
456  "columnIndices.extent(0)=" << columnIndices.extent (0)
457  << " and values.extent(0)=" << values.extent (0) << ".");
458  }
459 
460  RCP<crs_graph_type> graph;
461  try {
462  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, rowPointers,
463  columnIndices, params));
464  }
465  catch (std::exception& e) {
466  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
467  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
468  "RCP<const Map>, ptr, ind[, params]) threw an exception: "
469  << e.what ());
470  }
471  // The newly created CrsGraph _must_ have a local graph at this
472  // point. We don't really care whether CrsGraph's constructor
473  // deep-copies or shallow-copies the input, but the dimensions
474  // have to be right. That's how we tell whether the CrsGraph has
475  // a local graph.
476  auto lclGraph = graph->getLocalGraphDevice ();
477  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
478  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
479  lclGraph.entries.extent (0) != columnIndices.extent (0),
480  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, "
481  "ind[, params]) did not set the local graph correctly." << suffix);
482  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
483  (lclGraph.entries.extent (0) != values.extent (0),
484  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, ind[, "
485  "params]) did not set the local graph correctly. "
486  "lclGraph.entries.extent(0) = " << lclGraph.entries.extent (0)
487  << " != values.extent(0) = " << values.extent (0) << suffix);
488 
489  // myGraph_ not null means that the matrix owns the graph. This
490  // is true because the column indices come in as nonconst,
491  // implying shared ownership.
492  myGraph_ = graph;
493  staticGraph_ = graph;
494 
495  // The graph may not be fill complete yet. However, it is locally
496  // indexed (since we have a column Map) and has a fixed structure
497  // (due to the input arrays). This means we can allocate the
498  // (1-D) array of values and build the local matrix right now.
499  // Note that the local matrix's number of columns comes from the
500  // column Map, not the domain Map.
501 
502  valuesPacked_wdv = values_wdv_type(values);
503  valuesUnpacked_wdv = valuesPacked_wdv;
504 
505  // FIXME (22 Jun 2016) I would very much like to get rid of
506  // k_values1D_ at some point. I find it confusing to have all
507  // these extra references lying around.
508 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
509 
511  if (verbose) {
512  std::ostringstream os;
513  os << *prefix << "Done" << endl;
514  std::cerr << os.str();
515  }
516  }
517 
518  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
520  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
521  const Teuchos::RCP<const map_type>& colMap,
522  const Teuchos::ArrayRCP<size_t>& ptr,
523  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
524  const Teuchos::ArrayRCP<Scalar>& val,
525  const Teuchos::RCP<Teuchos::ParameterList>& params) :
526  dist_object_type (rowMap),
527  storageStatus_ (Details::STORAGE_1D_PACKED)
528  {
529  using Kokkos::Compat::getKokkosViewDeepCopy;
530  using Teuchos::av_reinterpret_cast;
531  using Teuchos::RCP;
532  using values_type = typename local_matrix_device_type::values_type;
533  using IST = impl_scalar_type;
534  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
535  "RCP<const Map>, ptr, ind, val[, params]): ";
536 
537  RCP<crs_graph_type> graph;
538  try {
539  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, ptr,
540  ind, params));
541  }
542  catch (std::exception& e) {
543  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
544  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
545  "RCP<const Map>, ArrayRCP<size_t>, ArrayRCP<LocalOrdinal>[, "
546  "RCP<ParameterList>]) threw an exception: " << e.what ());
547  }
548  // myGraph_ not null means that the matrix owns the graph. This
549  // is true because the column indices come in as nonconst,
550  // implying shared ownership.
551  myGraph_ = graph;
552  staticGraph_ = graph;
553 
554  // The graph may not be fill complete yet. However, it is locally
555  // indexed (since we have a column Map) and has a fixed structure
556  // (due to the input arrays). This means we can allocate the
557  // (1-D) array of values and build the local matrix right now.
558  // Note that the local matrix's number of columns comes from the
559  // column Map, not the domain Map.
560 
561  // The graph _must_ have a local graph at this point. We don't
562  // really care whether CrsGraph's constructor deep-copies or
563  // shallow-copies the input, but the dimensions have to be right.
564  // That's how we tell whether the CrsGraph has a local graph.
565  auto lclGraph = staticGraph_->getLocalGraphDevice ();
566  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
567  (size_t (lclGraph.row_map.extent (0)) != size_t (ptr.size ()) ||
568  size_t (lclGraph.entries.extent (0)) != size_t (ind.size ()),
569  std::logic_error, "CrsGraph's constructor (rowMap, colMap, "
570  "ptr, ind[, params]) did not set the local graph correctly. "
571  "Please report this bug to the Tpetra developers.");
572 
573  values_type valIn =
574  getKokkosViewDeepCopy<device_type> (av_reinterpret_cast<IST> (val ()));
575  valuesPacked_wdv = values_wdv_type(valIn);
576  valuesUnpacked_wdv = valuesPacked_wdv;
577 
578  // FIXME (22 Jun 2016) I would very much like to get rid of
579  // k_values1D_ at some point. I find it confusing to have all
580  // these extra references lying around.
581 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
582 
584  }
585 
586  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
588  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
589  const Teuchos::RCP<const map_type>& colMap,
590  const local_matrix_device_type& lclMatrix,
591  const Teuchos::RCP<Teuchos::ParameterList>& params) :
592  dist_object_type (rowMap),
593  storageStatus_ (Details::STORAGE_1D_PACKED),
594  fillComplete_ (true)
595  {
596  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
597  "RCP<const Map>, local_matrix_device_type[, RCP<ParameterList>]): ";
598  const char suffix[] =
599  " Please report this bug to the Tpetra developers.";
600 
601  Teuchos::RCP<crs_graph_type> graph;
602  try {
603  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
604  lclMatrix.graph, params));
605  }
606  catch (std::exception& e) {
607  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
608  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
609  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) threw an "
610  "exception: " << e.what ());
611  }
612  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
613  (!graph->isFillComplete (), std::logic_error, "CrsGraph constructor (RCP"
614  "<const Map>, RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) "
615  "did not produce a fill-complete graph. Please report this bug to the "
616  "Tpetra developers.");
617  // myGraph_ not null means that the matrix owns the graph. This
618  // is true because the column indices come in as nonconst through
619  // the matrix, implying shared ownership.
620  myGraph_ = graph;
621  staticGraph_ = graph;
622 
623  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
624  valuesUnpacked_wdv = valuesPacked_wdv;
625 
626  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
627  (isFillActive (), std::logic_error,
628  "At the end of a CrsMatrix constructor that should produce "
629  "a fillComplete matrix, isFillActive() is true." << suffix);
630  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
631  (! isFillComplete (), std::logic_error, "At the end of a "
632  "CrsMatrix constructor that should produce a fillComplete "
633  "matrix, isFillComplete() is false." << suffix);
635  }
636 
637  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
640  const Teuchos::RCP<const map_type>& rowMap,
641  const Teuchos::RCP<const map_type>& colMap,
642  const Teuchos::RCP<const map_type>& domainMap,
643  const Teuchos::RCP<const map_type>& rangeMap,
644  const Teuchos::RCP<Teuchos::ParameterList>& params) :
645  dist_object_type (rowMap),
646  storageStatus_ (Details::STORAGE_1D_PACKED),
647  fillComplete_ (true)
648  {
649  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
650  "RCP<const Map>, RCP<const Map>, RCP<const Map>, "
651  "local_matrix_device_type[, RCP<ParameterList>]): ";
652  const char suffix[] =
653  " Please report this bug to the Tpetra developers.";
654 
655  Teuchos::RCP<crs_graph_type> graph;
656  try {
657  graph = Teuchos::rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
658  domainMap, rangeMap, params));
659  }
660  catch (std::exception& e) {
661  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
662  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
663  "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_device_type[, "
664  "RCP<ParameterList>]) threw an exception: " << e.what ());
665  }
666  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
667  (! graph->isFillComplete (), std::logic_error, "CrsGraph "
668  "constructor (RCP<const Map>, RCP<const Map>, RCP<const Map>, "
669  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) did "
670  "not produce a fillComplete graph." << suffix);
671  // myGraph_ not null means that the matrix owns the graph. This
672  // is true because the column indices come in as nonconst through
673  // the matrix, implying shared ownership.
674  myGraph_ = graph;
675  staticGraph_ = graph;
676 
677  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
678  valuesUnpacked_wdv = valuesPacked_wdv;
679 
680  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
681  (isFillActive (), std::logic_error,
682  "At the end of a CrsMatrix constructor that should produce "
683  "a fillComplete matrix, isFillActive() is true." << suffix);
684  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
685  (! isFillComplete (), std::logic_error, "At the end of a "
686  "CrsMatrix constructor that should produce a fillComplete "
687  "matrix, isFillComplete() is false." << suffix);
689  }
690 
691  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
694  const Teuchos::RCP<const map_type>& rowMap,
695  const Teuchos::RCP<const map_type>& colMap,
696  const Teuchos::RCP<const map_type>& domainMap,
697  const Teuchos::RCP<const map_type>& rangeMap,
698  const Teuchos::RCP<const import_type>& importer,
699  const Teuchos::RCP<const export_type>& exporter,
700  const Teuchos::RCP<Teuchos::ParameterList>& params) :
701  dist_object_type (rowMap),
702  storageStatus_ (Details::STORAGE_1D_PACKED),
703  fillComplete_ (true)
704  {
705  using Teuchos::rcp;
706  const char tfecfFuncName[] = "Tpetra::CrsMatrix"
707  "(lclMat,Map,Map,Map,Map,Import,Export,params): ";
708  const char suffix[] =
709  " Please report this bug to the Tpetra developers.";
710 
711  Teuchos::RCP<crs_graph_type> graph;
712  try {
713  graph = rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
714  domainMap, rangeMap, importer,
715  exporter, params));
716  }
717  catch (std::exception& e) {
718  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
719  (true, std::runtime_error, "CrsGraph constructor "
720  "(local_graph_device_type, Map, Map, Map, Map, Import, Export, "
721  "params) threw: " << e.what ());
722  }
723  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
724  (!graph->isFillComplete (), std::logic_error, "CrsGraph "
725  "constructor (local_graph_device_type, Map, Map, Map, Map, Import, "
726  "Export, params) did not produce a fill-complete graph. "
727  "Please report this bug to the Tpetra developers.");
728  // myGraph_ not null means that the matrix owns the graph. This
729  // is true because the column indices come in as nonconst through
730  // the matrix, implying shared ownership.
731  myGraph_ = graph;
732  staticGraph_ = graph;
733 
734  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
735  valuesUnpacked_wdv = valuesPacked_wdv;
736 
737  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
738  (isFillActive (), std::logic_error,
739  "At the end of a CrsMatrix constructor that should produce "
740  "a fillComplete matrix, isFillActive() is true." << suffix);
741  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
742  (! isFillComplete (), std::logic_error, "At the end of a "
743  "CrsMatrix constructor that should produce a fillComplete "
744  "matrix, isFillComplete() is false." << suffix);
746  }
747 
748  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
751  const Teuchos::DataAccess copyOrView):
752  dist_object_type (source.getCrsGraph()->getRowMap ()),
753  staticGraph_ (source.getCrsGraph()),
754  storageStatus_ (source.storageStatus_)
755  {
756  const char tfecfFuncName[] = "Tpetra::CrsMatrix("
757  "const CrsMatrix&, const Teuchos::DataAccess): ";
758  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
759  (! source.isFillComplete (), std::invalid_argument,
760  "Source graph must be fillComplete().");
761 
762  if (copyOrView == Teuchos::Copy) {
763  using values_type = typename local_matrix_device_type::values_type;
764  auto vals = source.getLocalValuesDevice (Access::ReadOnly);
765  using Kokkos::view_alloc;
766  using Kokkos::WithoutInitializing;
767  values_type newvals (view_alloc ("val", WithoutInitializing),
768  vals.extent (0));
769  // DEEP_COPY REVIEW - DEVICE-TO_DEVICE
770  Kokkos::deep_copy (newvals, vals);
771  valuesPacked_wdv = values_wdv_type(newvals);
772  valuesUnpacked_wdv = valuesPacked_wdv;
773  fillComplete (source.getDomainMap (), source.getRangeMap ());
774  }
775  else if (copyOrView == Teuchos::View) {
776  valuesPacked_wdv = values_wdv_type(source.valuesPacked_wdv);
777  valuesUnpacked_wdv = values_wdv_type(source.valuesUnpacked_wdv);
778  fillComplete (source.getDomainMap (), source.getRangeMap ());
779  }
780  else {
781  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
782  (true, std::invalid_argument, "Second argument 'copyOrView' "
783  "has an invalid value " << copyOrView << ". Valid values "
784  "include Teuchos::Copy = " << Teuchos::Copy << " and "
785  "Teuchos::View = " << Teuchos::View << ".");
786  }
788  }
789 
790  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
791  void
794  {
795  std::swap(crs_matrix.importMV_, this->importMV_);
796  std::swap(crs_matrix.exportMV_, this->exportMV_);
797  std::swap(crs_matrix.staticGraph_, this->staticGraph_);
798  std::swap(crs_matrix.myGraph_, this->myGraph_);
799  std::swap(crs_matrix.valuesPacked_wdv, this->valuesPacked_wdv);
800  std::swap(crs_matrix.valuesUnpacked_wdv, this->valuesUnpacked_wdv);
801  std::swap(crs_matrix.storageStatus_, this->storageStatus_);
802  std::swap(crs_matrix.fillComplete_, this->fillComplete_);
803  std::swap(crs_matrix.nonlocals_, this->nonlocals_);
804  }
805 
806  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
807  Teuchos::RCP<const Teuchos::Comm<int> >
809  getComm () const {
810  return getCrsGraphRef ().getComm ();
811  }
812 
813  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
814  bool
816  isFillComplete () const {
817  return fillComplete_;
818  }
819 
820  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
821  bool
823  isFillActive () const {
824  return ! fillComplete_;
825  }
826 
827  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
828  bool
831  return this->getCrsGraphRef ().isStorageOptimized ();
832  }
833 
834  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
835  bool
838  return getCrsGraphRef ().isLocallyIndexed ();
839  }
840 
841  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
842  bool
845  return getCrsGraphRef ().isGloballyIndexed ();
846  }
847 
848  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
849  bool
851  hasColMap () const {
852  return getCrsGraphRef ().hasColMap ();
853  }
854 
855  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
859  return getCrsGraphRef ().getGlobalNumEntries ();
860  }
861 
862  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
863  size_t
866  return getCrsGraphRef ().getLocalNumEntries ();
867  }
868 
869  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
873  return getCrsGraphRef ().getGlobalNumRows ();
874  }
875 
876  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
880  return getCrsGraphRef ().getGlobalNumCols ();
881  }
882 
883  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
884  size_t
887  return getCrsGraphRef ().getLocalNumRows ();
888  }
889 
890 
891  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
892  size_t
895  return getCrsGraphRef ().getLocalNumCols ();
896  }
897 
898 
899  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
900  size_t
902  getNumEntriesInGlobalRow (GlobalOrdinal globalRow) const {
903  return getCrsGraphRef ().getNumEntriesInGlobalRow (globalRow);
904  }
905 
906  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
907  size_t
909  getNumEntriesInLocalRow (LocalOrdinal localRow) const {
910  return getCrsGraphRef ().getNumEntriesInLocalRow (localRow);
911  }
912 
913  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
914  size_t
917  return getCrsGraphRef ().getGlobalMaxNumRowEntries ();
918  }
919 
920  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
921  size_t
924  return getCrsGraphRef ().getLocalMaxNumRowEntries ();
925  }
926 
927  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
928  GlobalOrdinal
930  getIndexBase () const {
931  return getRowMap ()->getIndexBase ();
932  }
933 
934  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
935  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
937  getRowMap () const {
938  return getCrsGraphRef ().getRowMap ();
939  }
940 
941  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
942  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
944  getColMap () const {
945  return getCrsGraphRef ().getColMap ();
946  }
947 
948  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
949  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
951  getDomainMap () const {
952  return getCrsGraphRef ().getDomainMap ();
953  }
954 
955  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
956  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
958  getRangeMap () const {
959  return getCrsGraphRef ().getRangeMap ();
960  }
961 
962  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
963  Teuchos::RCP<const RowGraph<LocalOrdinal, GlobalOrdinal, Node> >
965  getGraph () const {
966  if (staticGraph_ != Teuchos::null) {
967  return staticGraph_;
968  }
969  return myGraph_;
970  }
971 
972  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
973  Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
975  getCrsGraph () const {
976  if (staticGraph_ != Teuchos::null) {
977  return staticGraph_;
978  }
979  return myGraph_;
980  }
981 
982  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
985  getCrsGraphRef () const
986  {
987 #ifdef HAVE_TPETRA_DEBUG
988  constexpr bool debug = true;
989 #else
990  constexpr bool debug = false;
991 #endif // HAVE_TPETRA_DEBUG
992 
993  if (! this->staticGraph_.is_null ()) {
994  return * (this->staticGraph_);
995  }
996  else {
997  if (debug) {
998  const char tfecfFuncName[] = "getCrsGraphRef: ";
999  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1000  (this->myGraph_.is_null (), std::logic_error,
1001  "Both staticGraph_ and myGraph_ are null. "
1002  "Please report this bug to the Tpetra developers.");
1003  }
1004  return * (this->myGraph_);
1005  }
1006  }
1007 
1008  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1009  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type
1012  {
1013  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
1014  return local_matrix_device_type("Tpetra::CrsMatrix::lclMatrixDevice",
1015  numCols,
1016  valuesPacked_wdv.getDeviceView(Access::ReadWrite),
1017  staticGraph_->getLocalGraphDevice());
1018  }
1019 
1020  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1021  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type
1023  getLocalMatrixHost () const
1024  {
1025  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
1026  return local_matrix_host_type("Tpetra::CrsMatrix::lclMatrixHost", numCols,
1027  valuesPacked_wdv.getHostView(Access::ReadWrite),
1028  staticGraph_->getLocalGraphHost());
1029  }
1030 
1031 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1032  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1033  std::shared_ptr<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_multiply_op_type>
1036  {
1037  auto localMatrix = getLocalMatrixDevice();
1038 #ifdef HAVE_TPETRACORE_CUDA
1039 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
1040  if(this->getLocalNumEntries() <= size_t(Teuchos::OrdinalTraits<LocalOrdinal>::max()) &&
1041  std::is_same<Node, Tpetra::KokkosCompat::KokkosCudaWrapperNode>::value)
1042  {
1043  if(this->ordinalRowptrs.data() == nullptr)
1044  {
1045  auto originalRowptrs = localMatrix.graph.row_map;
1046  //create LocalOrdinal-typed copy of the local graph's rowptrs.
1047  //This enables the LocalCrsMatrixOperator to use cuSPARSE SpMV.
1048  this->ordinalRowptrs = ordinal_rowptrs_type(
1049  Kokkos::ViewAllocateWithoutInitializing("CrsMatrix::ordinalRowptrs"), originalRowptrs.extent(0));
1050  auto ordinalRowptrs_ = this->ordinalRowptrs; //don't want to capture 'this'
1051  Kokkos::parallel_for("CrsMatrix::getLocalMultiplyOperator::convertRowptrs",
1052  Kokkos::RangePolicy<execution_space>(0, originalRowptrs.extent(0)),
1053  KOKKOS_LAMBDA(LocalOrdinal i)
1054  {
1055  ordinalRowptrs_(i) = originalRowptrs(i);
1056  });
1057  }
1058  //return local operator using ordinalRowptrs
1059  return std::make_shared<local_multiply_op_type>(
1060  std::make_shared<local_matrix_device_type>(localMatrix), this->ordinalRowptrs);
1061  }
1062 #endif
1063 #endif
1064 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1065  return std::make_shared<local_multiply_op_type>(
1066  std::make_shared<local_matrix_device_type>(localMatrix));
1067  }
1068 
1069  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1070  bool
1072  isStaticGraph () const {
1073  return myGraph_.is_null ();
1074  }
1075 
1076  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1077  bool
1080  return true;
1081  }
1082 
1083  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1084  bool
1087  return true;
1088  }
1089 
1090  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1091  void
1093  allocateValues (ELocalGlobal lg, GraphAllocationStatus gas,
1094  const bool verbose)
1095  {
1096  using Details::Behavior;
1098  using std::endl;
1099  const char tfecfFuncName[] = "allocateValues: ";
1100  const char suffix[] =
1101  " Please report this bug to the Tpetra developers.";
1102  ProfilingRegion region("Tpetra::CrsMatrix::allocateValues");
1103 
1104  std::unique_ptr<std::string> prefix;
1105  if (verbose) {
1106  prefix = this->createPrefix("CrsMatrix", "allocateValues");
1107  std::ostringstream os;
1108  os << *prefix << "lg: "
1109  << (lg == LocalIndices ? "Local" : "Global") << "Indices"
1110  << ", gas: Graph"
1111  << (gas == GraphAlreadyAllocated ? "Already" : "NotYet")
1112  << "Allocated" << endl;
1113  std::cerr << os.str();
1114  }
1115 
1116  const bool debug = Behavior::debug("CrsMatrix");
1117  if (debug) {
1118  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1119  (this->staticGraph_.is_null (), std::logic_error,
1120  "staticGraph_ is null." << suffix);
1121 
1122  // If the graph indices are already allocated, then gas should be
1123  // GraphAlreadyAllocated. Otherwise, gas should be
1124  // GraphNotYetAllocated.
1125  if ((gas == GraphAlreadyAllocated) !=
1126  staticGraph_->indicesAreAllocated ()) {
1127  const char err1[] = "The caller has asserted that the graph "
1128  "is ";
1129  const char err2[] = "already allocated, but the static graph "
1130  "says that its indices are ";
1131  const char err3[] = "already allocated. ";
1132  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1133  (gas == GraphAlreadyAllocated &&
1134  ! staticGraph_->indicesAreAllocated (), std::logic_error,
1135  err1 << err2 << "not " << err3 << suffix);
1136  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1137  (gas != GraphAlreadyAllocated &&
1138  staticGraph_->indicesAreAllocated (), std::logic_error,
1139  err1 << "not " << err2 << err3 << suffix);
1140  }
1141 
1142  // If the graph is unallocated, then it had better be a
1143  // matrix-owned graph. ("Matrix-owned graph" means that the
1144  // matrix gets to define the graph structure. If the CrsMatrix
1145  // constructor that takes an RCP<const CrsGraph> was used, then
1146  // the matrix does _not_ own the graph.)
1147  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1148  (! this->staticGraph_->indicesAreAllocated () &&
1149  this->myGraph_.is_null (), std::logic_error,
1150  "The static graph says that its indices are not allocated, "
1151  "but the graph is not owned by the matrix." << suffix);
1152  }
1153 
1154  if (gas == GraphNotYetAllocated) {
1155  if (debug) {
1156  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1157  (this->myGraph_.is_null (), std::logic_error,
1158  "gas = GraphNotYetAllocated, but myGraph_ is null." << suffix);
1159  }
1160  try {
1161  this->myGraph_->allocateIndices (lg, verbose);
1162  }
1163  catch (std::exception& e) {
1164  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1165  (true, std::runtime_error, "CrsGraph::allocateIndices "
1166  "threw an exception: " << e.what ());
1167  }
1168  catch (...) {
1169  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1170  (true, std::runtime_error, "CrsGraph::allocateIndices "
1171  "threw an exception not a subclass of std::exception.");
1172  }
1173  }
1174 
1175  // Allocate matrix values.
1176  const size_t lclTotalNumEntries = this->staticGraph_->getLocalAllocationSize();
1177  if (debug) {
1178  const size_t lclNumRows = this->staticGraph_->getLocalNumRows ();
1179  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1180  (this->staticGraph_->getRowPtrsUnpackedHost()(lclNumRows) != lclTotalNumEntries, std::logic_error,
1181  "length of staticGraph's lclIndsUnpacked does not match final entry of rowPtrsUnapcked_host." << suffix);
1182  }
1183 
1184  // Allocate array of (packed???) matrix values.
1185  using values_type = typename local_matrix_device_type::values_type;
1186  if (verbose) {
1187  std::ostringstream os;
1188  os << *prefix << "Allocate values_wdv: Pre "
1189  << valuesUnpacked_wdv.extent(0) << ", post "
1190  << lclTotalNumEntries << endl;
1191  std::cerr << os.str();
1192  }
1193 // this->k_values1D_ =
1194  valuesUnpacked_wdv = values_wdv_type(
1195  values_type("Tpetra::CrsMatrix::values",
1196  lclTotalNumEntries));
1197  }
1198 
1199 
1200  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1201  void
1203  fillLocalGraphAndMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1204  {
1206  using ::Tpetra::Details::getEntryOnHost;
1207  using Teuchos::arcp_const_cast;
1208  using Teuchos::Array;
1209  using Teuchos::ArrayRCP;
1210  using Teuchos::null;
1211  using Teuchos::RCP;
1212  using Teuchos::rcp;
1213  using std::endl;
1214  using row_map_type = typename local_graph_device_type::row_map_type;
1215  using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
1216  using values_type = typename local_matrix_device_type::values_type;
1217  Details::ProfilingRegion regionFLGAM
1218  ("Tpetra::CrsMatrix::fillLocalGraphAndMatrix");
1219 
1220  const char tfecfFuncName[] = "fillLocalGraphAndMatrix (called from "
1221  "fillComplete or expertStaticFillComplete): ";
1222  const char suffix[] =
1223  " Please report this bug to the Tpetra developers.";
1224  const bool debug = Details::Behavior::debug("CrsMatrix");
1225  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1226 
1227  std::unique_ptr<std::string> prefix;
1228  if (verbose) {
1229  prefix = this->createPrefix("CrsMatrix", "fillLocalGraphAndMatrix");
1230  std::ostringstream os;
1231  os << *prefix << endl;
1232  std::cerr << os.str ();
1233  }
1234 
1235  if (debug) {
1236  // fillComplete() only calls fillLocalGraphAndMatrix() if the
1237  // matrix owns the graph, which means myGraph_ is not null.
1238  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1239  (myGraph_.is_null (), std::logic_error, "The nonconst graph "
1240  "(myGraph_) is null. This means that the matrix has a "
1241  "const (a.k.a. \"static\") graph. fillComplete or "
1242  "expertStaticFillComplete should never call "
1243  "fillLocalGraphAndMatrix in that case." << suffix);
1244  }
1245 
1246  const size_t lclNumRows = this->getLocalNumRows ();
1247 
1248  // This method's goal is to fill in the three arrays (compressed
1249  // sparse row format) that define the sparse graph's and matrix's
1250  // structure, and the sparse matrix's values.
1251  //
1252  // Get references to the data in myGraph_, so we can modify them
1253  // as well. Note that we only call fillLocalGraphAndMatrix() if
1254  // the matrix owns the graph, which means myGraph_ is not null.
1255 
1256  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1257  // See: https://github.com/trilinos/Trilinos/issues/12237
1258  //using row_entries_type = decltype (myGraph_->k_numRowEntries_);
1259  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1260 
1261  typename Graph::local_graph_device_type::row_map_type curRowOffsets =
1262  myGraph_->rowPtrsUnpacked_dev_;
1263 
1264  if (debug) {
1265  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1266  (curRowOffsets.extent (0) == 0, std::logic_error,
1267  "curRowOffsets.extent(0) == 0.");
1268  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1269  (curRowOffsets.extent (0) != lclNumRows + 1, std::logic_error,
1270  "curRowOffsets.extent(0) = "
1271  << curRowOffsets.extent (0) << " != lclNumRows + 1 = "
1272  << (lclNumRows + 1) << ".");
1273  const size_t numOffsets = curRowOffsets.extent (0);
1274  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1275  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1276  (numOffsets != 0 &&
1277  myGraph_->lclIndsUnpacked_wdv.extent (0) != valToCheck,
1278  std::logic_error, "numOffsets = " <<
1279  numOffsets << " != 0 and myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1280  << myGraph_->lclIndsUnpacked_wdv.extent (0) << " != curRowOffsets("
1281  << numOffsets << ") = " << valToCheck << ".");
1282  }
1283 
1284  if (myGraph_->getLocalNumEntries() !=
1285  myGraph_->getLocalAllocationSize()) {
1286 
1287  // Use the nonconst version of row_map_type for k_ptrs,
1288  // because row_map_type is const and we need to modify k_ptrs here.
1289  typename row_map_type::non_const_type k_ptrs;
1290  row_map_type k_ptrs_const;
1291  lclinds_1d_type k_inds;
1292  values_type k_vals;
1293 
1294  if (verbose) {
1295  std::ostringstream os;
1296  const auto numEnt = myGraph_->getLocalNumEntries();
1297  const auto allocSize = myGraph_->getLocalAllocationSize();
1298  os << *prefix << "Unpacked 1-D storage: numEnt=" << numEnt
1299  << ", allocSize=" << allocSize << endl;
1300  std::cerr << os.str ();
1301  }
1302  // The matrix's current 1-D storage is "unpacked." This means
1303  // the row offsets may differ from what the final row offsets
1304  // should be. This could happen, for example, if the user
1305  // set an upper
1306  // bound on the number of entries per row, but didn't fill all
1307  // those entries.
1308  if (debug && curRowOffsets.extent (0) != 0) {
1309  const size_t numOffsets =
1310  static_cast<size_t> (curRowOffsets.extent (0));
1311  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1312  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1313  (static_cast<size_t> (valToCheck) !=
1314  static_cast<size_t> (valuesUnpacked_wdv.extent (0)),
1315  std::logic_error, "(unpacked branch) Before "
1316  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1317  << ") = " << valToCheck << " != valuesUnpacked_wdv.extent(0)"
1318  " = " << valuesUnpacked_wdv.extent (0) << ".");
1319  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1320  (static_cast<size_t> (valToCheck) !=
1321  static_cast<size_t> (myGraph_->lclIndsUnpacked_wdv.extent (0)),
1322  std::logic_error, "(unpacked branch) Before "
1323  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1324  << ") = " << valToCheck
1325  << " != myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1326  << myGraph_->lclIndsUnpacked_wdv.extent (0) << ".");
1327  }
1328  // Pack the row offsets into k_ptrs, by doing a sum-scan of
1329  // the array of valid entry counts per row.
1330 
1331  // Total number of entries in the matrix on the calling
1332  // process. We will compute this in the loop below. It's
1333  // cheap to compute and useful as a sanity check.
1334  size_t lclTotalNumEntries = 0;
1335  {
1336  // Allocate the packed row offsets array. We use a nonconst
1337  // temporary (packedRowOffsets) here, because k_ptrs is
1338  // const. We will assign packedRowOffsets to k_ptrs below.
1339  if (verbose) {
1340  std::ostringstream os;
1341  os << *prefix << "Allocate packed row offsets: "
1342  << (lclNumRows+1) << endl;
1343  std::cerr << os.str ();
1344  }
1345  typename row_map_type::non_const_type
1346  packedRowOffsets ("Tpetra::CrsGraph::ptr", lclNumRows + 1);
1347  typename row_entries_type::const_type numRowEnt_h =
1348  myGraph_->k_numRowEntries_;
1349  // We're computing offsets on device. This function can
1350  // handle numRowEnt_h being a host View.
1351  lclTotalNumEntries =
1352  computeOffsetsFromCounts (packedRowOffsets, numRowEnt_h);
1353  // packedRowOffsets is modifiable; k_ptrs isn't, so we have
1354  // to use packedRowOffsets in the loop above and assign here.
1355  k_ptrs = packedRowOffsets;
1356  k_ptrs_const = k_ptrs;
1357  }
1358 
1359  if (debug) {
1360  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1361  (static_cast<size_t> (k_ptrs.extent (0)) != lclNumRows + 1,
1362  std::logic_error,
1363  "(unpacked branch) After packing k_ptrs, "
1364  "k_ptrs.extent(0) = " << k_ptrs.extent (0) << " != "
1365  "lclNumRows+1 = " << (lclNumRows+1) << ".");
1366  const auto valToCheck = getEntryOnHost (k_ptrs, lclNumRows);
1367  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1368  (valToCheck != lclTotalNumEntries, std::logic_error,
1369  "(unpacked branch) After filling k_ptrs, "
1370  "k_ptrs(lclNumRows=" << lclNumRows << ") = " << valToCheck
1371  << " != total number of entries on the calling process = "
1372  << lclTotalNumEntries << ".");
1373  }
1374 
1375  // Allocate the arrays of packed column indices and values.
1376  if (verbose) {
1377  std::ostringstream os;
1378  os << *prefix << "Allocate packed local column indices: "
1379  << lclTotalNumEntries << endl;
1380  std::cerr << os.str ();
1381  }
1382  k_inds = lclinds_1d_type ("Tpetra::CrsGraph::lclInds", lclTotalNumEntries);
1383  if (verbose) {
1384  std::ostringstream os;
1385  os << *prefix << "Allocate packed values: "
1386  << lclTotalNumEntries << endl;
1387  std::cerr << os.str ();
1388  }
1389  k_vals = values_type ("Tpetra::CrsMatrix::values", lclTotalNumEntries);
1390 
1391  // curRowOffsets (myGraph_->rowPtrsUnpacked_) (???), lclIndsUnpacked_wdv,
1392  // and valuesUnpacked_wdv are currently unpacked. Pack them, using
1393  // the packed row offsets array k_ptrs that we created above.
1394  //
1395  // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
1396  // need to keep around the unpacked row offsets, column
1397  // indices, and values arrays.
1398 
1399  // Pack the column indices from unpacked lclIndsUnpacked_wdv into
1400  // packed k_inds. We will replace lclIndsUnpacked_wdv below.
1401  using inds_packer_type = pack_functor<
1402  typename Graph::local_graph_device_type::entries_type::non_const_type,
1403  typename Graph::local_inds_dualv_type::t_dev::const_type,
1404  typename Graph::local_graph_device_type::row_map_type::non_const_type,
1405  typename Graph::local_graph_device_type::row_map_type>;
1406  inds_packer_type indsPacker (
1407  k_inds,
1408  myGraph_->lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
1409  k_ptrs, curRowOffsets);
1410  using exec_space = typename decltype (k_inds)::execution_space;
1411  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1412  Kokkos::parallel_for
1413  ("Tpetra::CrsMatrix pack column indices",
1414  range_type (0, lclNumRows), indsPacker);
1415 
1416  // Pack the values from unpacked valuesUnpacked_wdv into packed
1417  // k_vals. We will replace valuesPacked_wdv below.
1418  using vals_packer_type = pack_functor<
1419  typename values_type::non_const_type,
1420  typename values_type::const_type,
1421  typename row_map_type::non_const_type,
1422  typename row_map_type::const_type>;
1423  vals_packer_type valsPacker (
1424  k_vals,
1425  this->valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1426  k_ptrs, curRowOffsets);
1427  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1428  range_type (0, lclNumRows), valsPacker);
1429 
1430  if (debug) {
1431  const char myPrefix[] = "(\"Optimize Storage\""
1432  "=true branch) After packing, ";
1433  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1434  (k_ptrs.extent (0) == 0, std::logic_error, myPrefix
1435  << "k_ptrs.extent(0) = 0. This probably means that "
1436  "rowPtrsUnpacked_ was never allocated.");
1437  if (k_ptrs.extent (0) != 0) {
1438  const size_t numOffsets (k_ptrs.extent (0));
1439  const auto valToCheck =
1440  getEntryOnHost (k_ptrs, numOffsets - 1);
1441  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1442  (size_t (valToCheck) != k_vals.extent (0),
1443  std::logic_error, myPrefix <<
1444  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1445  " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
1446  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1447  (size_t (valToCheck) != k_inds.extent (0),
1448  std::logic_error, myPrefix <<
1449  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1450  " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
1451  }
1452  }
1453  // Build the local graph.
1454  myGraph_->setRowPtrsPacked(k_ptrs_const);
1455  myGraph_->lclIndsPacked_wdv =
1456  typename crs_graph_type::local_inds_wdv_type(k_inds);
1457  valuesPacked_wdv = values_wdv_type(k_vals);
1458  }
1459  else { // We don't have to pack, so just set the pointers.
1460  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1461  // FIXME? This is already done in the graph fill call - need to avoid the memcpy to host
1462  myGraph_->rowPtrsPacked_dev_ = myGraph_->rowPtrsUnpacked_dev_;
1463  myGraph_->rowPtrsPacked_host_ = myGraph_->rowPtrsUnpacked_host_;
1464  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1465  myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
1466  valuesPacked_wdv = valuesUnpacked_wdv;
1467 
1468  if (verbose) {
1469  std::ostringstream os;
1470  os << *prefix << "Storage already packed: rowPtrsUnpacked_: "
1471  << myGraph_->getRowPtrsUnpackedHost().extent(0) << ", lclIndsUnpacked_wdv: "
1472  << myGraph_->lclIndsUnpacked_wdv.extent(0) << ", valuesUnpacked_wdv: "
1473  << valuesUnpacked_wdv.extent(0) << endl;
1474  std::cerr << os.str();
1475  }
1476 
1477  if (debug) {
1478  const char myPrefix[] =
1479  "(\"Optimize Storage\"=false branch) ";
1480  auto rowPtrsUnpackedHost = myGraph_->getRowPtrsUnpackedHost();
1481  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1482  (myGraph_->rowPtrsUnpacked_dev_.extent (0) == 0, std::logic_error, myPrefix
1483  << "myGraph->rowPtrsUnpacked_dev_.extent(0) = 0. This probably means "
1484  "that rowPtrsUnpacked_ was never allocated.");
1485  if (myGraph_->rowPtrsUnpacked_dev_.extent (0) != 0) {
1486  const size_t numOffsets = rowPtrsUnpackedHost.extent (0);
1487  const auto valToCheck = rowPtrsUnpackedHost(numOffsets - 1);
1488  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1489  (size_t (valToCheck) != valuesPacked_wdv.extent (0),
1490  std::logic_error, myPrefix <<
1491  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1492  << " != valuesPacked_wdv.extent(0) = "
1493  << valuesPacked_wdv.extent (0) << ".");
1494  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1495  (size_t (valToCheck) != myGraph_->lclIndsPacked_wdv.extent (0),
1496  std::logic_error, myPrefix <<
1497  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1498  << " != myGraph_->lclIndsPacked.extent(0) = "
1499  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1500  }
1501  }
1502  }
1503 
1504  if (debug) {
1505  const char myPrefix[] = "After packing, ";
1506  auto rowPtrsPackedHost = myGraph_->getRowPtrsPackedHost();
1507  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1508  (size_t (rowPtrsPackedHost.extent (0)) != size_t (lclNumRows + 1),
1509  std::logic_error, myPrefix << "myGraph_->rowPtrsPacked_host_.extent(0) = "
1510  << rowPtrsPackedHost.extent (0) << " != lclNumRows+1 = " <<
1511  (lclNumRows+1) << ".");
1512  if (rowPtrsPackedHost.extent (0) != 0) {
1513  const size_t numOffsets (rowPtrsPackedHost.extent (0));
1514  const size_t valToCheck = rowPtrsPackedHost(numOffsets-1);
1515  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1516  (valToCheck != size_t (valuesPacked_wdv.extent (0)),
1517  std::logic_error, myPrefix << "k_ptrs_const(" <<
1518  (numOffsets-1) << ") = " << valToCheck
1519  << " != valuesPacked_wdv.extent(0) = "
1520  << valuesPacked_wdv.extent (0) << ".");
1521  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1522  (valToCheck != size_t (myGraph_->lclIndsPacked_wdv.extent (0)),
1523  std::logic_error, myPrefix << "k_ptrs_const(" <<
1524  (numOffsets-1) << ") = " << valToCheck
1525  << " != myGraph_->lclIndsPacked_wdvk_inds.extent(0) = "
1526  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1527  }
1528  }
1529 
1530  // May we ditch the old allocations for the packed (and otherwise
1531  // "optimized") allocations, later in this routine? Optimize
1532  // storage if the graph is not static, or if the graph already has
1533  // optimized storage.
1534  const bool defaultOptStorage =
1535  ! isStaticGraph () || staticGraph_->isStorageOptimized ();
1536  const bool requestOptimizedStorage =
1537  (! params.is_null () &&
1538  params->get ("Optimize Storage", defaultOptStorage)) ||
1539  (params.is_null () && defaultOptStorage);
1540 
1541  // The graph has optimized storage when indices are allocated,
1542  // myGraph_->k_numRowEntries_ is empty, and there are more than
1543  // zero rows on this process.
1544  if (requestOptimizedStorage) {
1545  // Free the old, unpacked, unoptimized allocations.
1546  // Free graph data structures that are only needed for
1547  // unpacked 1-D storage.
1548  if (verbose) {
1549  std::ostringstream os;
1550  os << *prefix << "Optimizing storage: free k_numRowEntries_: "
1551  << myGraph_->k_numRowEntries_.extent(0) << endl;
1552  std::cerr << os.str();
1553  }
1554 
1555  myGraph_->k_numRowEntries_ = row_entries_type ();
1556 
1557  // Keep the new 1-D packed allocations.
1558  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1559  // We directly set the memory spaces to avoid a memcpy from device to host
1560  myGraph_->rowPtrsUnpacked_dev_ = myGraph_->rowPtrsPacked_dev_;
1561  myGraph_->rowPtrsUnpacked_host_ = myGraph_->rowPtrsPacked_host_;
1562  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1563  myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
1564  valuesUnpacked_wdv = valuesPacked_wdv;
1565 
1566  myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
1567  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1568  }
1569  else {
1570  if (verbose) {
1571  std::ostringstream os;
1572  os << *prefix << "User requested NOT to optimize storage"
1573  << endl;
1574  std::cerr << os.str();
1575  }
1576  }
1577  }
1578 
1579  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1580  void
1582  fillLocalMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1583  {
1584  using ::Tpetra::Details::ProfilingRegion;
1585  using Teuchos::ArrayRCP;
1586  using Teuchos::Array;
1587  using Teuchos::null;
1588  using Teuchos::RCP;
1589  using Teuchos::rcp;
1590  using std::endl;
1591  using row_map_type = typename Graph::local_graph_device_type::row_map_type;
1592  using non_const_row_map_type = typename row_map_type::non_const_type;
1593  using values_type = typename local_matrix_device_type::values_type;
1594  ProfilingRegion regionFLM("Tpetra::CrsMatrix::fillLocalMatrix");
1595  const size_t lclNumRows = getLocalNumRows();
1596 
1597  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1598  std::unique_ptr<std::string> prefix;
1599  if (verbose) {
1600  prefix = this->createPrefix("CrsMatrix", "fillLocalMatrix");
1601  std::ostringstream os;
1602  os << *prefix << "lclNumRows: " << lclNumRows << endl;
1603  std::cerr << os.str ();
1604  }
1605 
1606  // The goals of this routine are first, to allocate and fill
1607  // packed 1-D storage (see below for an explanation) in the vals
1608  // array, and second, to give vals to the local matrix and
1609  // finalize the local matrix. We only need k_ptrs, the packed 1-D
1610  // row offsets, within the scope of this routine, since we're only
1611  // filling the local matrix here (use fillLocalGraphAndMatrix() to
1612  // fill both the graph and the matrix at the same time).
1613 
1614  // get data from staticGraph_
1615  size_t nodeNumEntries = staticGraph_->getLocalNumEntries ();
1616  size_t nodeNumAllocated = staticGraph_->getLocalAllocationSize ();
1617  row_map_type k_rowPtrs = staticGraph_->rowPtrsPacked_dev_;
1618 
1619  row_map_type k_ptrs; // "packed" row offsets array
1620  values_type k_vals; // "packed" values array
1621 
1622  // May we ditch the old allocations for the packed (and otherwise
1623  // "optimized") allocations, later in this routine? Request
1624  // optimized storage by default.
1625  bool requestOptimizedStorage = true;
1626  const bool default_OptimizeStorage =
1627  ! isStaticGraph() || staticGraph_->isStorageOptimized();
1628  if (! params.is_null() &&
1629  ! params->get("Optimize Storage", default_OptimizeStorage)) {
1630  requestOptimizedStorage = false;
1631  }
1632  // If we're not allowed to change a static graph, then we can't
1633  // change the storage of the matrix, either. This means that if
1634  // the graph's storage isn't already optimized, we can't optimize
1635  // the matrix's storage either. Check and give warning, as
1636  // appropriate.
1637  if (! staticGraph_->isStorageOptimized () &&
1638  requestOptimizedStorage) {
1640  (true, std::runtime_error, "You requested optimized storage "
1641  "by setting the \"Optimize Storage\" flag to \"true\" in "
1642  "the ParameterList, or by virtue of default behavior. "
1643  "However, the associated CrsGraph was filled separately and "
1644  "requested not to optimize storage. Therefore, the "
1645  "CrsMatrix cannot optimize storage.");
1646  requestOptimizedStorage = false;
1647  }
1648 
1649  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1650  // See: https://github.com/trilinos/Trilinos/issues/12237
1651  //using row_entries_type = decltype (staticGraph_->k_numRowEntries_);
1652  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1653 
1654  // The matrix's values are currently
1655  // stored in a 1-D format. However, this format is "unpacked";
1656  // it doesn't necessarily have the same row offsets as indicated
1657  // by the ptrs array returned by allocRowPtrs. This could
1658  // happen, for example, if the user
1659  // fixed the number of matrix entries in
1660  // each row, but didn't fill all those entries.
1661  //
1662  // As above, we don't need to keep the "packed" row offsets
1663  // array ptrs here, but we do need it here temporarily, so we
1664  // have to allocate it. We'll free ptrs later in this method.
1665  //
1666  // Note that this routine checks whether storage has already
1667  // been packed. This is a common case for solution of nonlinear
1668  // PDEs using the finite element method, as long as the
1669  // structure of the sparse matrix does not change between linear
1670  // solves.
1671  if (nodeNumEntries != nodeNumAllocated) {
1672  if (verbose) {
1673  std::ostringstream os;
1674  os << *prefix << "Unpacked 1-D storage: numEnt="
1675  << nodeNumEntries << ", allocSize=" << nodeNumAllocated
1676  << endl;
1677  std::cerr << os.str();
1678  }
1679  // We have to pack the 1-D storage, since the user didn't fill
1680  // up all requested storage.
1681  if (verbose) {
1682  std::ostringstream os;
1683  os << *prefix << "Allocate packed row offsets: "
1684  << (lclNumRows+1) << endl;
1685  std::cerr << os.str();
1686  }
1687  non_const_row_map_type tmpk_ptrs ("Tpetra::CrsGraph::ptr",
1688  lclNumRows+1);
1689  // Total number of entries in the matrix on the calling
1690  // process. We will compute this in the loop below. It's
1691  // cheap to compute and useful as a sanity check.
1692  size_t lclTotalNumEntries = 0;
1693  k_ptrs = tmpk_ptrs;
1694  {
1695  typename row_entries_type::const_type numRowEnt_h =
1696  staticGraph_->k_numRowEntries_;
1697  // This function can handle the counts being a host View.
1698  lclTotalNumEntries =
1699  Details::computeOffsetsFromCounts (tmpk_ptrs, numRowEnt_h);
1700  }
1701 
1702  // Allocate the "packed" values array.
1703  // It has exactly the right number of entries.
1704  if (verbose) {
1705  std::ostringstream os;
1706  os << *prefix << "Allocate packed values: "
1707  << lclTotalNumEntries << endl;
1708  std::cerr << os.str ();
1709  }
1710  k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1711 
1712  // Pack values_wdv into k_vals. We will replace values_wdv below.
1713  pack_functor<
1714  typename values_type::non_const_type,
1715  typename values_type::const_type,
1716  typename row_map_type::non_const_type,
1717  typename row_map_type::const_type> valsPacker
1718  (k_vals, valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1719  tmpk_ptrs, k_rowPtrs);
1720 
1721  using exec_space = typename decltype (k_vals)::execution_space;
1722  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1723  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1724  range_type (0, lclNumRows), valsPacker);
1725  valuesPacked_wdv = values_wdv_type(k_vals);
1726  }
1727  else { // We don't have to pack, so just set the pointer.
1728  valuesPacked_wdv = valuesUnpacked_wdv;
1729  if (verbose) {
1730  std::ostringstream os;
1731  os << *prefix << "Storage already packed: "
1732  << "valuesUnpacked_wdv: " << valuesUnpacked_wdv.extent(0) << endl;
1733  std::cerr << os.str();
1734  }
1735  }
1736 
1737  // May we ditch the old allocations for the packed one?
1738  if (requestOptimizedStorage) {
1739  // The user requested optimized storage, so we can dump the
1740  // unpacked 1-D storage, and keep the packed storage.
1741  valuesUnpacked_wdv = valuesPacked_wdv;
1742 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
1743  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1744  }
1745  }
1746 
1747  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1748  void
1750  insertIndicesAndValues (crs_graph_type& graph,
1751  RowInfo& rowInfo,
1752  const typename crs_graph_type::SLocalGlobalViews& newInds,
1753  const Teuchos::ArrayView<impl_scalar_type>& oldRowVals,
1754  const Teuchos::ArrayView<const impl_scalar_type>& newRowVals,
1755  const ELocalGlobal lg,
1756  const ELocalGlobal I)
1757  {
1758  const size_t oldNumEnt = rowInfo.numEntries;
1759  const size_t numInserted = graph.insertIndices (rowInfo, newInds, lg, I);
1760 
1761  // Use of memcpy here works around an issue with GCC >= 4.9.0,
1762  // that probably relates to scalar_type vs. impl_scalar_type
1763  // aliasing. See history of Tpetra_CrsGraph_def.hpp for
1764  // details; look for GCC_WORKAROUND macro definition.
1765  if (numInserted > 0) {
1766  const size_t startOffset = oldNumEnt;
1767  memcpy (&oldRowVals[startOffset], &newRowVals[0],
1768  numInserted * sizeof (impl_scalar_type));
1769  }
1770  }
1771 
1772  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1773  void
1775  insertLocalValues (const LocalOrdinal lclRow,
1776  const Teuchos::ArrayView<const LocalOrdinal>& indices,
1777  const Teuchos::ArrayView<const Scalar>& values,
1778  const CombineMode CM)
1779  {
1780  using std::endl;
1781  const char tfecfFuncName[] = "insertLocalValues: ";
1782 
1783  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1784  (! this->isFillActive (), std::runtime_error,
1785  "Fill is not active. After calling fillComplete, you must call "
1786  "resumeFill before you may insert entries into the matrix again.");
1787  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1788  (this->isStaticGraph (), std::runtime_error,
1789  "Cannot insert indices with static graph; use replaceLocalValues() "
1790  "instead.");
1791  // At this point, we know that myGraph_ is nonnull.
1792  crs_graph_type& graph = * (this->myGraph_);
1793  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1794  (graph.colMap_.is_null (), std::runtime_error,
1795  "Cannot insert local indices without a column map.");
1796  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1797  (graph.isGloballyIndexed (),
1798  std::runtime_error, "Graph indices are global; use "
1799  "insertGlobalValues().");
1800  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1801  (values.size () != indices.size (), std::runtime_error,
1802  "values.size() = " << values.size ()
1803  << " != indices.size() = " << indices.size () << ".");
1804  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1805  ! graph.rowMap_->isNodeLocalElement (lclRow), std::runtime_error,
1806  "Local row index " << lclRow << " does not belong to this process.");
1807 
1808  if (! graph.indicesAreAllocated ()) {
1809  // We only allocate values at most once per process, so it's OK
1810  // to check TPETRA_VERBOSE here.
1811  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1812  this->allocateValues (LocalIndices, GraphNotYetAllocated, verbose);
1813  }
1814 
1815 #ifdef HAVE_TPETRA_DEBUG
1816  const size_t numEntriesToAdd = static_cast<size_t> (indices.size ());
1817  // In a debug build, test whether any of the given column indices
1818  // are not in the column Map. Keep track of the invalid column
1819  // indices so we can tell the user about them.
1820  {
1821  using Teuchos::toString;
1822 
1823  const map_type& colMap = * (graph.colMap_);
1824  Teuchos::Array<LocalOrdinal> badColInds;
1825  bool allInColMap = true;
1826  for (size_t k = 0; k < numEntriesToAdd; ++k) {
1827  if (! colMap.isNodeLocalElement (indices[k])) {
1828  allInColMap = false;
1829  badColInds.push_back (indices[k]);
1830  }
1831  }
1832  if (! allInColMap) {
1833  std::ostringstream os;
1834  os << "You attempted to insert entries in owned row " << lclRow
1835  << ", at the following column indices: " << toString (indices)
1836  << "." << endl;
1837  os << "Of those, the following indices are not in the column Map on "
1838  "this process: " << toString (badColInds) << "." << endl << "Since "
1839  "the matrix has a column Map already, it is invalid to insert "
1840  "entries at those locations.";
1841  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1842  (true, std::invalid_argument, os.str ());
1843  }
1844  }
1845 #endif // HAVE_TPETRA_DEBUG
1846 
1847  RowInfo rowInfo = graph.getRowInfo (lclRow);
1848 
1849  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1850  if (CM == ADD) {
1851  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1852  valsView[offset] += values[k]; };
1853  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1854  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1855  } else if (CM == INSERT) {
1856  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1857  valsView[offset] = values[k]; };
1858  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1859  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1860  } else {
1861  std::ostringstream os;
1862  os << "You attempted to use insertLocalValues with CombineMode " << combineModeToString(CM)
1863  << "but this has not been implemented." << endl;
1864  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1865  (true, std::invalid_argument, os.str ());
1866  }
1867  }
1868 
1869  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1870  void
1872  insertLocalValues (const LocalOrdinal localRow,
1873  const LocalOrdinal numEnt,
1874  const Scalar vals[],
1875  const LocalOrdinal cols[],
1876  const CombineMode CM)
1877  {
1878  Teuchos::ArrayView<const LocalOrdinal> colsT (cols, numEnt);
1879  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
1880  this->insertLocalValues (localRow, colsT, valsT, CM);
1881  }
1882 
1883  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1884  void
1887  RowInfo& rowInfo,
1888  const GlobalOrdinal gblColInds[],
1889  const impl_scalar_type vals[],
1890  const size_t numInputEnt)
1891  {
1892 #ifdef HAVE_TPETRA_DEBUG
1893  const char tfecfFuncName[] = "insertGlobalValuesImpl: ";
1894  const size_t origNumEnt = graph.getNumEntriesInLocalRow (rowInfo.localRow);
1895  const size_t curNumEnt = rowInfo.numEntries;
1896 #endif // HAVE_TPETRA_DEBUG
1897 
1898  if (! graph.indicesAreAllocated ()) {
1899  // We only allocate values at most once per process, so it's OK
1900  // to check TPETRA_VERBOSE here.
1901  using ::Tpetra::Details::Behavior;
1902  const bool verbose = Behavior::verbose("CrsMatrix");
1903  this->allocateValues (GlobalIndices, GraphNotYetAllocated, verbose);
1904  // mfh 23 Jul 2017: allocateValues invalidates existing
1905  // getRowInfo results. Once we get rid of lazy graph
1906  // allocation, we'll be able to move the getRowInfo call outside
1907  // of this method.
1908  rowInfo = graph.getRowInfo (rowInfo.localRow);
1909  }
1910 
1911  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1912  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset){
1913  valsView[offset] += vals[k];
1914  };
1915  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1916 #ifdef HAVE_TPETRA_DEBUG
1917  //numInserted is only used inside the debug code below.
1918  auto numInserted =
1919 #endif
1920  graph.insertGlobalIndicesImpl(rowInfo, gblColInds, numInputEnt, cb);
1921 
1922 #ifdef HAVE_TPETRA_DEBUG
1923  size_t newNumEnt = curNumEnt + numInserted;
1924  const size_t chkNewNumEnt =
1925  graph.getNumEntriesInLocalRow (rowInfo.localRow);
1926  if (chkNewNumEnt != newNumEnt) {
1927  std::ostringstream os;
1928  os << std::endl << "newNumEnt = " << newNumEnt
1929  << " != graph.getNumEntriesInLocalRow(" << rowInfo.localRow
1930  << ") = " << chkNewNumEnt << "." << std::endl
1931  << "\torigNumEnt: " << origNumEnt << std::endl
1932  << "\tnumInputEnt: " << numInputEnt << std::endl
1933  << "\tgblColInds: [";
1934  for (size_t k = 0; k < numInputEnt; ++k) {
1935  os << gblColInds[k];
1936  if (k + size_t (1) < numInputEnt) {
1937  os << ",";
1938  }
1939  }
1940  os << "]" << std::endl
1941  << "\tvals: [";
1942  for (size_t k = 0; k < numInputEnt; ++k) {
1943  os << vals[k];
1944  if (k + size_t (1) < numInputEnt) {
1945  os << ",";
1946  }
1947  }
1948  os << "]" << std::endl;
1949 
1950  if (this->supportsRowViews ()) {
1951  values_host_view_type vals2;
1952  if (this->isGloballyIndexed ()) {
1953  global_inds_host_view_type gblColInds2;
1954  const GlobalOrdinal gblRow =
1955  graph.rowMap_->getGlobalElement (rowInfo.localRow);
1956  if (gblRow ==
1957  Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid ()) {
1958  os << "Local row index " << rowInfo.localRow << " is invalid!"
1959  << std::endl;
1960  }
1961  else {
1962  bool getViewThrew = false;
1963  try {
1964  this->getGlobalRowView (gblRow, gblColInds2, vals2);
1965  }
1966  catch (std::exception& e) {
1967  getViewThrew = true;
1968  os << "getGlobalRowView threw exception:" << std::endl
1969  << e.what () << std::endl;
1970  }
1971  if (! getViewThrew) {
1972  os << "\tNew global column indices: ";
1973  for (size_t jjj = 0; jjj < gblColInds2.extent(0); jjj++)
1974  os << gblColInds2[jjj] << " ";
1975  os << std::endl;
1976  os << "\tNew values: ";
1977  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1978  os << vals2[jjj] << " ";
1979  os << std::endl;
1980  }
1981  }
1982  }
1983  else if (this->isLocallyIndexed ()) {
1984  local_inds_host_view_type lclColInds2;
1985  this->getLocalRowView (rowInfo.localRow, lclColInds2, vals2);
1986  os << "\tNew local column indices: ";
1987  for (size_t jjj = 0; jjj < lclColInds2.extent(0); jjj++)
1988  os << lclColInds2[jjj] << " ";
1989  os << std::endl;
1990  os << "\tNew values: ";
1991  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1992  os << vals2[jjj] << " ";
1993  os << std::endl;
1994  }
1995  }
1996 
1997  os << "Please report this bug to the Tpetra developers.";
1998  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1999  (true, std::logic_error, os.str ());
2000  }
2001 #endif // HAVE_TPETRA_DEBUG
2002  }
2003 
2004  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2005  void
2007  insertGlobalValues (const GlobalOrdinal gblRow,
2008  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2009  const Teuchos::ArrayView<const Scalar>& values)
2010  {
2011  using Teuchos::toString;
2012  using std::endl;
2013  typedef impl_scalar_type IST;
2014  typedef LocalOrdinal LO;
2015  typedef GlobalOrdinal GO;
2016  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2017  typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
2018  const char tfecfFuncName[] = "insertGlobalValues: ";
2019 
2020 #ifdef HAVE_TPETRA_DEBUG
2021  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2022  (values.size () != indices.size (), std::runtime_error,
2023  "values.size() = " << values.size () << " != indices.size() = "
2024  << indices.size () << ".");
2025 #endif // HAVE_TPETRA_DEBUG
2026 
2027  // getRowMap() is not thread safe, because it increments RCP's
2028  // reference count. getCrsGraphRef() is thread safe.
2029  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2030  const LO lclRow = rowMap.getLocalElement (gblRow);
2031 
2032  if (lclRow == OTLO::invalid ()) {
2033  // Input row is _not_ owned by the calling process.
2034  //
2035  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2036  // is not in the row Map, it doesn't matter whether or not the
2037  // graph is static; the data just get stashed for later use by
2038  // globalAssemble().
2039  this->insertNonownedGlobalValues (gblRow, indices, values);
2040  }
2041  else { // Input row _is_ owned by the calling process
2042  if (this->isStaticGraph ()) {
2043  // Uh oh! Not allowed to insert into owned rows in that case.
2044  const int myRank = rowMap.getComm ()->getRank ();
2045  const int numProcs = rowMap.getComm ()->getSize ();
2046  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2047  (true, std::runtime_error,
2048  "The matrix was constructed with a constant (\"static\") graph, "
2049  "yet the given global row index " << gblRow << " is in the row "
2050  "Map on the calling process (with rank " << myRank << ", of " <<
2051  numProcs << " process(es)). In this case, you may not insert "
2052  "new entries into rows owned by the calling process.");
2053  }
2054 
2055  crs_graph_type& graph = * (this->myGraph_);
2056  const IST* const inputVals =
2057  reinterpret_cast<const IST*> (values.getRawPtr ());
2058  const GO* const inputGblColInds = indices.getRawPtr ();
2059  const size_t numInputEnt = indices.size ();
2060  RowInfo rowInfo = graph.getRowInfo (lclRow);
2061 
2062  // If the matrix has a column Map, check at this point whether
2063  // the column indices belong to the column Map.
2064  //
2065  // FIXME (mfh 16 May 2013) We may want to consider deferring the
2066  // test to the CrsGraph method, since it may have to do this
2067  // anyway.
2068  if (! graph.colMap_.is_null ()) {
2069  const map_type& colMap = * (graph.colMap_);
2070  // In a debug build, keep track of the nonowned ("bad") column
2071  // indices, so that we can display them in the exception
2072  // message. In a release build, just ditch the loop early if
2073  // we encounter a nonowned column index.
2074 #ifdef HAVE_TPETRA_DEBUG
2075  Teuchos::Array<GO> badColInds;
2076 #endif // HAVE_TPETRA_DEBUG
2077  const size_type numEntriesToInsert = indices.size ();
2078  bool allInColMap = true;
2079  for (size_type k = 0; k < numEntriesToInsert; ++k) {
2080  if (! colMap.isNodeGlobalElement (indices[k])) {
2081  allInColMap = false;
2082 #ifdef HAVE_TPETRA_DEBUG
2083  badColInds.push_back (indices[k]);
2084 #else
2085  break;
2086 #endif // HAVE_TPETRA_DEBUG
2087  }
2088  }
2089  if (! allInColMap) {
2090  std::ostringstream os;
2091  os << "You attempted to insert entries in owned row " << gblRow
2092  << ", at the following column indices: " << toString (indices)
2093  << "." << endl;
2094 #ifdef HAVE_TPETRA_DEBUG
2095  os << "Of those, the following indices are not in the column Map "
2096  "on this process: " << toString (badColInds) << "." << endl
2097  << "Since the matrix has a column Map already, it is invalid "
2098  "to insert entries at those locations.";
2099 #else
2100  os << "At least one of those indices is not in the column Map "
2101  "on this process." << endl << "It is invalid to insert into "
2102  "columns not in the column Map on the process that owns the "
2103  "row.";
2104 #endif // HAVE_TPETRA_DEBUG
2105  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2106  (true, std::invalid_argument, os.str ());
2107  }
2108  }
2109 
2110  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2111  inputVals, numInputEnt);
2112  }
2113  }
2114 
2115 
2116  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2117  void
2119  insertGlobalValues (const GlobalOrdinal globalRow,
2120  const LocalOrdinal numEnt,
2121  const Scalar vals[],
2122  const GlobalOrdinal inds[])
2123  {
2124  Teuchos::ArrayView<const GlobalOrdinal> indsT (inds, numEnt);
2125  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
2126  this->insertGlobalValues (globalRow, indsT, valsT);
2127  }
2128 
2129 
2130  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2131  void
2134  const GlobalOrdinal gblRow,
2135  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2136  const Teuchos::ArrayView<const Scalar>& values,
2137  const bool debug)
2138  {
2139  typedef impl_scalar_type IST;
2140  typedef LocalOrdinal LO;
2141  typedef GlobalOrdinal GO;
2142  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2143  const char tfecfFuncName[] = "insertGlobalValuesFiltered: ";
2144 
2145  if (debug) {
2146  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2147  (values.size () != indices.size (), std::runtime_error,
2148  "values.size() = " << values.size () << " != indices.size() = "
2149  << indices.size () << ".");
2150  }
2151 
2152  // getRowMap() is not thread safe, because it increments RCP's
2153  // reference count. getCrsGraphRef() is thread safe.
2154  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2155  const LO lclRow = rowMap.getLocalElement (gblRow);
2156  if (lclRow == OTLO::invalid ()) {
2157  // Input row is _not_ owned by the calling process.
2158  //
2159  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2160  // is not in the row Map, it doesn't matter whether or not the
2161  // graph is static; the data just get stashed for later use by
2162  // globalAssemble().
2163  this->insertNonownedGlobalValues (gblRow, indices, values);
2164  }
2165  else { // Input row _is_ owned by the calling process
2166  if (this->isStaticGraph ()) {
2167  // Uh oh! Not allowed to insert into owned rows in that case.
2168  const int myRank = rowMap.getComm ()->getRank ();
2169  const int numProcs = rowMap.getComm ()->getSize ();
2170  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2171  (true, std::runtime_error,
2172  "The matrix was constructed with a constant (\"static\") graph, "
2173  "yet the given global row index " << gblRow << " is in the row "
2174  "Map on the calling process (with rank " << myRank << ", of " <<
2175  numProcs << " process(es)). In this case, you may not insert "
2176  "new entries into rows owned by the calling process.");
2177  }
2178 
2179  crs_graph_type& graph = * (this->myGraph_);
2180  const IST* const inputVals =
2181  reinterpret_cast<const IST*> (values.getRawPtr ());
2182  const GO* const inputGblColInds = indices.getRawPtr ();
2183  const size_t numInputEnt = indices.size ();
2184  RowInfo rowInfo = graph.getRowInfo (lclRow);
2185 
2186  if (!graph.colMap_.is_null() && graph.isLocallyIndexed()) {
2187  // This branch is similar in function to the following branch, but for
2188  // the special case that the target graph is locally indexed.
2189  // In this case, we cannot simply filter
2190  // out global indices that don't exist on the receiving process and
2191  // insert the remaining (global) indices, but we must convert them (the
2192  // remaining global indices) to local and call `insertLocalValues`.
2193  const map_type& colMap = * (graph.colMap_);
2194  size_t curOffset = 0;
2195  while (curOffset < numInputEnt) {
2196  // Find a sequence of input indices that are in the column Map on the
2197  // calling process. Doing a sequence at a time, instead of one at a
2198  // time, amortizes some overhead.
2199  Teuchos::Array<LO> lclIndices;
2200  size_t endOffset = curOffset;
2201  for ( ; endOffset < numInputEnt; ++endOffset) {
2202  auto lclIndex = colMap.getLocalElement(inputGblColInds[endOffset]);
2203  if (lclIndex != OTLO::invalid())
2204  lclIndices.push_back(lclIndex);
2205  else
2206  break;
2207  }
2208  // curOffset, endOffset: half-exclusive range of indices in the column
2209  // Map on the calling process. If endOffset == curOffset, the range is
2210  // empty.
2211  const LO numIndInSeq = (endOffset - curOffset);
2212  if (numIndInSeq != 0) {
2213  this->insertLocalValues(lclRow, lclIndices(), values(curOffset, numIndInSeq));
2214  }
2215  // Invariant before the increment line: Either endOffset ==
2216  // numInputEnt, or inputGblColInds[endOffset] is not in the column Map
2217  // on the calling process.
2218  if (debug) {
2219  const bool invariant = endOffset == numInputEnt ||
2220  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2221  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2222  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2223  }
2224  curOffset = endOffset + 1;
2225  }
2226  }
2227  else if (! graph.colMap_.is_null ()) { // We have a column Map.
2228  const map_type& colMap = * (graph.colMap_);
2229  size_t curOffset = 0;
2230  while (curOffset < numInputEnt) {
2231  // Find a sequence of input indices that are in the column
2232  // Map on the calling process. Doing a sequence at a time,
2233  // instead of one at a time, amortizes some overhead.
2234  size_t endOffset = curOffset;
2235  for ( ; endOffset < numInputEnt &&
2236  colMap.getLocalElement (inputGblColInds[endOffset]) != OTLO::invalid ();
2237  ++endOffset)
2238  {}
2239  // curOffset, endOffset: half-exclusive range of indices in
2240  // the column Map on the calling process. If endOffset ==
2241  // curOffset, the range is empty.
2242  const LO numIndInSeq = (endOffset - curOffset);
2243  if (numIndInSeq != 0) {
2244  rowInfo = graph.getRowInfo(lclRow); // KDD 5/19 Need fresh RowInfo in each loop iteration
2245  this->insertGlobalValuesImpl (graph, rowInfo,
2246  inputGblColInds + curOffset,
2247  inputVals + curOffset,
2248  numIndInSeq);
2249  }
2250  // Invariant before the increment line: Either endOffset ==
2251  // numInputEnt, or inputGblColInds[endOffset] is not in the
2252  // column Map on the calling process.
2253  if (debug) {
2254  const bool invariant = endOffset == numInputEnt ||
2255  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2256  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2257  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2258  }
2259  curOffset = endOffset + 1;
2260  }
2261  }
2262  else { // we don't have a column Map.
2263  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2264  inputVals, numInputEnt);
2265  }
2266  }
2267  }
2268 
2269  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2270  void
2271  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2272  insertGlobalValuesFilteredChecked(
2273  const GlobalOrdinal gblRow,
2274  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2275  const Teuchos::ArrayView<const Scalar>& values,
2276  const char* const prefix,
2277  const bool debug,
2278  const bool verbose)
2279  {
2281  using std::endl;
2282 
2283  try {
2284  insertGlobalValuesFiltered(gblRow, indices, values, debug);
2285  }
2286  catch(std::exception& e) {
2287  std::ostringstream os;
2288  if (verbose) {
2289  const size_t maxNumToPrint =
2291  os << *prefix << ": insertGlobalValuesFiltered threw an "
2292  "exception: " << e.what() << endl
2293  << "Global row index: " << gblRow << endl;
2294  verbosePrintArray(os, indices, "Global column indices",
2295  maxNumToPrint);
2296  os << endl;
2297  verbosePrintArray(os, values, "Values", maxNumToPrint);
2298  os << endl;
2299  }
2300  else {
2301  os << ": insertGlobalValuesFiltered threw an exception: "
2302  << e.what();
2303  }
2304  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, os.str());
2305  }
2306  }
2307 
2308  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2309  LocalOrdinal
2312  const crs_graph_type& graph,
2313  const RowInfo& rowInfo,
2314  const LocalOrdinal inds[],
2315  const impl_scalar_type newVals[],
2316  const LocalOrdinal numElts)
2317  {
2318  typedef LocalOrdinal LO;
2319  typedef GlobalOrdinal GO;
2320  const bool sorted = graph.isSorted ();
2321 
2322  size_t hint = 0; // Guess for the current index k into rowVals
2323  LO numValid = 0; // number of valid local column indices
2324 
2325  if (graph.isLocallyIndexed ()) {
2326  // Get a view of the column indices in the row. This amortizes
2327  // the cost of getting the view over all the entries of inds.
2328  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2329 
2330  for (LO j = 0; j < numElts; ++j) {
2331  const LO lclColInd = inds[j];
2332  const size_t offset =
2333  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2334  lclColInd, hint, sorted);
2335  if (offset != rowInfo.numEntries) {
2336  rowVals[offset] = newVals[j];
2337  hint = offset + 1;
2338  ++numValid;
2339  }
2340  }
2341  }
2342  else if (graph.isGloballyIndexed ()) {
2343  if (graph.colMap_.is_null ()) {
2344  return Teuchos::OrdinalTraits<LO>::invalid ();
2345  }
2346  const map_type colMap = * (graph.colMap_);
2347 
2348  // Get a view of the column indices in the row. This amortizes
2349  // the cost of getting the view over all the entries of inds.
2350  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2351 
2352  for (LO j = 0; j < numElts; ++j) {
2353  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2354  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
2355  const size_t offset =
2356  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2357  gblColInd, hint, sorted);
2358  if (offset != rowInfo.numEntries) {
2359  rowVals[offset] = newVals[j];
2360  hint = offset + 1;
2361  ++numValid;
2362  }
2363  }
2364  }
2365  }
2366  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2367  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2368  // to be neither locally nor globally indexed on a process.
2369  // This means that the graph or matrix has no entries on that
2370  // process. Epetra also works like this. It's related to lazy
2371  // allocation (on first insertion, not at graph / matrix
2372  // construction). Lazy allocation will go away because it is
2373  // not thread scalable.
2374 
2375  return numValid;
2376  }
2377 
2378  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2379  LocalOrdinal
2381  replaceLocalValues (const LocalOrdinal localRow,
2382  const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
2383  const Teuchos::ArrayView<const Scalar>& vals)
2384  {
2385  typedef LocalOrdinal LO;
2386 
2387  const LO numInputEnt = static_cast<LO> (lclCols.size ());
2388  if (static_cast<LO> (vals.size ()) != numInputEnt) {
2389  return Teuchos::OrdinalTraits<LO>::invalid ();
2390  }
2391  const LO* const inputInds = lclCols.getRawPtr ();
2392  const Scalar* const inputVals = vals.getRawPtr ();
2393  return this->replaceLocalValues (localRow, numInputEnt,
2394  inputVals, inputInds);
2395  }
2396 
2397  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2399  local_ordinal_type
2402  const local_ordinal_type localRow,
2403  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2404  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2405  {
2406  using LO = local_ordinal_type;
2407  const LO numInputEnt = inputInds.extent(0);
2408  if (numInputEnt != static_cast<LO>(inputVals.extent(0))) {
2409  return Teuchos::OrdinalTraits<LO>::invalid();
2410  }
2411  const Scalar* const inVals =
2412  reinterpret_cast<const Scalar*>(inputVals.data());
2413  return this->replaceLocalValues(localRow, numInputEnt,
2414  inVals, inputInds.data());
2415  }
2416 
2417  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2418  LocalOrdinal
2420  replaceLocalValues (const LocalOrdinal localRow,
2421  const LocalOrdinal numEnt,
2422  const Scalar inputVals[],
2423  const LocalOrdinal inputCols[])
2424  {
2425  typedef impl_scalar_type IST;
2426  typedef LocalOrdinal LO;
2427 
2428  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2429  // Fill must be active and the "nonconst" graph must exist.
2430  return Teuchos::OrdinalTraits<LO>::invalid ();
2431  }
2432  const crs_graph_type& graph = * (this->staticGraph_);
2433  const RowInfo rowInfo = graph.getRowInfo (localRow);
2434 
2435  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2436  // The calling process does not own this row, so it is not
2437  // allowed to modify its values.
2438  return static_cast<LO> (0);
2439  }
2440  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2441  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2442  return this->replaceLocalValuesImpl (curRowVals.data (), graph, rowInfo,
2443  inputCols, inVals, numEnt);
2444  }
2445 
2446  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2447  LocalOrdinal
2450  const crs_graph_type& graph,
2451  const RowInfo& rowInfo,
2452  const GlobalOrdinal inds[],
2453  const impl_scalar_type newVals[],
2454  const LocalOrdinal numElts)
2455  {
2456  Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
2457  auto fun =
2458  [&](size_t const k, size_t const /*start*/, size_t const offset) {
2459  rowVals[offset] = newVals[k];
2460  };
2461  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2462  return graph.findGlobalIndices(rowInfo, indsT, cb);
2463  }
2464 
2465  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2466  LocalOrdinal
2468  replaceGlobalValues (const GlobalOrdinal globalRow,
2469  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2470  const Teuchos::ArrayView<const Scalar>& inputVals)
2471  {
2472  typedef LocalOrdinal LO;
2473 
2474  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2475  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2476  return Teuchos::OrdinalTraits<LO>::invalid ();
2477  }
2478  return this->replaceGlobalValues (globalRow, numInputEnt,
2479  inputVals.getRawPtr (),
2480  inputGblColInds.getRawPtr ());
2481  }
2482 
2483  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2484  LocalOrdinal
2486  replaceGlobalValues (const GlobalOrdinal globalRow,
2487  const LocalOrdinal numEnt,
2488  const Scalar inputVals[],
2489  const GlobalOrdinal inputGblColInds[])
2490  {
2491  typedef impl_scalar_type IST;
2492  typedef LocalOrdinal LO;
2493 
2494  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2495  // Fill must be active and the "nonconst" graph must exist.
2496  return Teuchos::OrdinalTraits<LO>::invalid ();
2497  }
2498  const crs_graph_type& graph = * (this->staticGraph_);
2499 
2500  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow);
2501  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2502  // The input local row is invalid on the calling process,
2503  // which means that the calling process summed 0 entries.
2504  return static_cast<LO> (0);
2505  }
2506 
2507  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2508  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2509  return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2510  inputGblColInds, inVals, numEnt);
2511  }
2512 
2513  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2515  local_ordinal_type
2518  const global_ordinal_type globalRow,
2519  const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2520  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2521  {
2522  // We use static_assert here to check the template parameters,
2523  // rather than std::enable_if (e.g., on the return value, to
2524  // enable compilation only if the template parameters match the
2525  // desired attributes). This turns obscure link errors into
2526  // clear compilation errors. It also makes the return value a
2527  // lot easier to see.
2528  using LO = local_ordinal_type;
2529  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2530  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2531  return Teuchos::OrdinalTraits<LO>::invalid();
2532  }
2533  const Scalar* const inVals =
2534  reinterpret_cast<const Scalar*>(inputVals.data());
2535  return this->replaceGlobalValues(globalRow, numInputEnt, inVals,
2536  inputInds.data());
2537  }
2538 
2539  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2540  LocalOrdinal
2543  const crs_graph_type& graph,
2544  const RowInfo& rowInfo,
2545  const GlobalOrdinal inds[],
2546  const impl_scalar_type newVals[],
2547  const LocalOrdinal numElts,
2548  const bool atomic)
2549  {
2550  typedef LocalOrdinal LO;
2551  typedef GlobalOrdinal GO;
2552 
2553  const bool sorted = graph.isSorted ();
2554 
2555  size_t hint = 0; // guess at the index's relative offset in the row
2556  LO numValid = 0; // number of valid input column indices
2557 
2558  if (graph.isLocallyIndexed ()) {
2559  // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2560  // pointer does NOT change its reference count. Thus, this
2561  // code is still thread safe.
2562  if (graph.colMap_.is_null ()) {
2563  // NO input column indices are valid in this case, since if
2564  // the column Map is null on the calling process, then the
2565  // calling process owns no graph entries.
2566  return numValid;
2567  }
2568  const map_type& colMap = * (graph.colMap_);
2569 
2570  // Get a view of the column indices in the row. This amortizes
2571  // the cost of getting the view over all the entries of inds.
2572  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2573  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2574 
2575  for (LO j = 0; j < numElts; ++j) {
2576  const LO lclColInd = colMap.getLocalElement (inds[j]);
2577  if (lclColInd != LINV) {
2578  const size_t offset =
2579  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2580  lclColInd, hint, sorted);
2581  if (offset != rowInfo.numEntries) {
2582  if (atomic) {
2583  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2584  }
2585  else {
2586  rowVals[offset] += newVals[j];
2587  }
2588  hint = offset + 1;
2589  numValid++;
2590  }
2591  }
2592  }
2593  }
2594  else if (graph.isGloballyIndexed ()) {
2595  // Get a view of the column indices in the row. This amortizes
2596  // the cost of getting the view over all the entries of inds.
2597  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2598 
2599  for (LO j = 0; j < numElts; ++j) {
2600  const GO gblColInd = inds[j];
2601  const size_t offset =
2602  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2603  gblColInd, hint, sorted);
2604  if (offset != rowInfo.numEntries) {
2605  if (atomic) {
2606  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2607  }
2608  else {
2609  rowVals[offset] += newVals[j];
2610  }
2611  hint = offset + 1;
2612  numValid++;
2613  }
2614  }
2615  }
2616  // If the graph is neither locally nor globally indexed on the
2617  // calling process, that means the calling process has no graph
2618  // entries. Thus, none of the input column indices are valid.
2619 
2620  return numValid;
2621  }
2622 
2623  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2624  LocalOrdinal
2626  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2627  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2628  const Teuchos::ArrayView<const Scalar>& inputVals,
2629  const bool atomic)
2630  {
2631  typedef LocalOrdinal LO;
2632 
2633  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2634  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2635  return Teuchos::OrdinalTraits<LO>::invalid ();
2636  }
2637  return this->sumIntoGlobalValues (gblRow, numInputEnt,
2638  inputVals.getRawPtr (),
2639  inputGblColInds.getRawPtr (),
2640  atomic);
2641  }
2642 
2643  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2644  LocalOrdinal
2646  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2647  const LocalOrdinal numInputEnt,
2648  const Scalar inputVals[],
2649  const GlobalOrdinal inputGblColInds[],
2650  const bool atomic)
2651  {
2652  typedef impl_scalar_type IST;
2653  typedef LocalOrdinal LO;
2654  typedef GlobalOrdinal GO;
2655 
2656  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2657  // Fill must be active and the "nonconst" graph must exist.
2658  return Teuchos::OrdinalTraits<LO>::invalid ();
2659  }
2660  const crs_graph_type& graph = * (this->staticGraph_);
2661 
2662  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2663  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2664  // mfh 23 Mar 2017, 26 Jul 2017: This branch may not be not
2665  // thread safe in a debug build, in part because it uses
2666  // Teuchos::ArrayView, and in part because of the data structure
2667  // used to stash outgoing entries.
2668  using Teuchos::ArrayView;
2669  ArrayView<const GO> inputGblColInds_av(
2670  numInputEnt == 0 ? nullptr : inputGblColInds,
2671  numInputEnt);
2672  ArrayView<const Scalar> inputVals_av(
2673  numInputEnt == 0 ? nullptr :
2674  inputVals, numInputEnt);
2675  // gblRow is not in the row Map on the calling process, so stash
2676  // the given entries away in a separate data structure.
2677  // globalAssemble() (called during fillComplete()) will exchange
2678  // that data and sum it in using sumIntoGlobalValues().
2679  this->insertNonownedGlobalValues (gblRow, inputGblColInds_av,
2680  inputVals_av);
2681  // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
2682  // since we won't know whether the given indices were valid
2683  // until globalAssemble (called in fillComplete) is called.
2684  // That's why insertNonownedGlobalValues doesn't return
2685  // anything. Just for consistency, I'll return the number of
2686  // entries that the user gave us.
2687  return numInputEnt;
2688  }
2689  else { // input row is in the row Map on the calling process
2690  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2691  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2692  return this->sumIntoGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2693  inputGblColInds, inVals,
2694  numInputEnt, atomic);
2695  }
2696  }
2697 
2698  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2699  LocalOrdinal
2701  transformLocalValues (const LocalOrdinal lclRow,
2702  const LocalOrdinal numInputEnt,
2703  const impl_scalar_type inputVals[],
2704  const LocalOrdinal inputCols[],
2705  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2706  const bool atomic)
2707  {
2708  using Tpetra::Details::OrdinalTraits;
2709  typedef LocalOrdinal LO;
2710 
2711  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2712  // Fill must be active and the "nonconst" graph must exist.
2713  return Teuchos::OrdinalTraits<LO>::invalid ();
2714  }
2715  const crs_graph_type& graph = * (this->staticGraph_);
2716  const RowInfo rowInfo = graph.getRowInfo (lclRow);
2717 
2718  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2719  // The calling process does not own this row, so it is not
2720  // allowed to modify its values.
2721  return static_cast<LO> (0);
2722  }
2723  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2724  return this->transformLocalValues (curRowVals.data (), graph,
2725  rowInfo, inputCols, inputVals,
2726  numInputEnt, f, atomic);
2727  }
2728 
2729  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2730  LocalOrdinal
2731  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2732  transformGlobalValues (const GlobalOrdinal gblRow,
2733  const LocalOrdinal numInputEnt,
2734  const impl_scalar_type inputVals[],
2735  const GlobalOrdinal inputCols[],
2736  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2737  const bool atomic)
2738  {
2739  using Tpetra::Details::OrdinalTraits;
2740  typedef LocalOrdinal LO;
2741 
2742  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2743  // Fill must be active and the "nonconst" graph must exist.
2744  return OrdinalTraits<LO>::invalid ();
2745  }
2746  const crs_graph_type& graph = * (this->staticGraph_);
2747  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2748 
2749  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2750  // The calling process does not own this row, so it is not
2751  // allowed to modify its values.
2752  return static_cast<LO> (0);
2753  }
2754  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2755  return this->transformGlobalValues (curRowVals.data (), graph,
2756  rowInfo, inputCols, inputVals,
2757  numInputEnt, f, atomic);
2758  }
2759 
2760  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2761  LocalOrdinal
2762  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2763  transformLocalValues (impl_scalar_type rowVals[],
2764  const crs_graph_type& graph,
2765  const RowInfo& rowInfo,
2766  const LocalOrdinal inds[],
2767  const impl_scalar_type newVals[],
2768  const LocalOrdinal numElts,
2769  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2770  const bool atomic)
2771  {
2772  typedef impl_scalar_type ST;
2773  typedef LocalOrdinal LO;
2774  typedef GlobalOrdinal GO;
2775 
2776  //if (newVals.extent (0) != inds.extent (0)) {
2777  // The sizes of the input arrays must match.
2778  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2779  //}
2780  //const LO numElts = static_cast<LO> (inds.extent (0));
2781  const bool sorted = graph.isSorted ();
2782 
2783  LO numValid = 0; // number of valid input column indices
2784  size_t hint = 0; // Guess for the current index k into rowVals
2785 
2786  if (graph.isLocallyIndexed ()) {
2787  // Get a view of the column indices in the row. This amortizes
2788  // the cost of getting the view over all the entries of inds.
2789  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2790 
2791  for (LO j = 0; j < numElts; ++j) {
2792  const LO lclColInd = inds[j];
2793  const size_t offset =
2794  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2795  lclColInd, hint, sorted);
2796  if (offset != rowInfo.numEntries) {
2797  if (atomic) {
2798  // NOTE (mfh 30 Nov 2015) The commented-out code is
2799  // wrong because another thread may have changed
2800  // rowVals[offset] between those two lines of code.
2801  //
2802  //const ST newVal = f (rowVals[offset], newVals[j]);
2803  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2804 
2805  volatile ST* const dest = &rowVals[offset];
2806  (void) atomic_binary_function_update (dest, newVals[j], f);
2807  }
2808  else {
2809  // use binary function f
2810  rowVals[offset] = f (rowVals[offset], newVals[j]);
2811  }
2812  hint = offset + 1;
2813  ++numValid;
2814  }
2815  }
2816  }
2817  else if (graph.isGloballyIndexed ()) {
2818  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2819  // pointer does NOT change its reference count. Thus, this
2820  // code is still thread safe.
2821  if (graph.colMap_.is_null ()) {
2822  // NO input column indices are valid in this case. Either
2823  // the column Map hasn't been set yet (so local indices
2824  // don't exist yet), or the calling process owns no graph
2825  // entries.
2826  return numValid;
2827  }
2828  const map_type& colMap = * (graph.colMap_);
2829  // Get a view of the column indices in the row. This amortizes
2830  // the cost of getting the view over all the entries of inds.
2831  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2832 
2833  const GO GINV = Teuchos::OrdinalTraits<GO>::invalid ();
2834  for (LO j = 0; j < numElts; ++j) {
2835  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2836  if (gblColInd != GINV) {
2837  const size_t offset =
2838  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2839  gblColInd, hint, sorted);
2840  if (offset != rowInfo.numEntries) {
2841  if (atomic) {
2842  // NOTE (mfh 30 Nov 2015) The commented-out code is
2843  // wrong because another thread may have changed
2844  // rowVals[offset] between those two lines of code.
2845  //
2846  //const ST newVal = f (rowVals[offset], newVals[j]);
2847  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2848 
2849  volatile ST* const dest = &rowVals[offset];
2850  (void) atomic_binary_function_update (dest, newVals[j], f);
2851  }
2852  else {
2853  // use binary function f
2854  rowVals[offset] = f (rowVals[offset], newVals[j]);
2855  }
2856  hint = offset + 1;
2857  numValid++;
2858  }
2859  }
2860  }
2861  }
2862  // If the graph is neither locally nor globally indexed on the
2863  // calling process, that means the calling process has no graph
2864  // entries. Thus, none of the input column indices are valid.
2865 
2866  return numValid;
2867  }
2868 
2869  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2870  LocalOrdinal
2871  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2872  transformGlobalValues (impl_scalar_type rowVals[],
2873  const crs_graph_type& graph,
2874  const RowInfo& rowInfo,
2875  const GlobalOrdinal inds[],
2876  const impl_scalar_type newVals[],
2877  const LocalOrdinal numElts,
2878  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2879  const bool atomic)
2880  {
2881  typedef impl_scalar_type ST;
2882  typedef LocalOrdinal LO;
2883  typedef GlobalOrdinal GO;
2884 
2885  //if (newVals.extent (0) != inds.extent (0)) {
2886  // The sizes of the input arrays must match.
2887  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2888  //}
2889  //const LO numElts = static_cast<LO> (inds.extent (0));
2890  const bool sorted = graph.isSorted ();
2891 
2892  LO numValid = 0; // number of valid input column indices
2893  size_t hint = 0; // Guess for the current index k into rowVals
2894 
2895  if (graph.isGloballyIndexed ()) {
2896  // Get a view of the column indices in the row. This amortizes
2897  // the cost of getting the view over all the entries of inds.
2898  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2899 
2900  for (LO j = 0; j < numElts; ++j) {
2901  const GO gblColInd = inds[j];
2902  const size_t offset =
2903  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2904  gblColInd, hint, sorted);
2905  if (offset != rowInfo.numEntries) {
2906  if (atomic) {
2907  // NOTE (mfh 30 Nov 2015) The commented-out code is
2908  // wrong because another thread may have changed
2909  // rowVals[offset] between those two lines of code.
2910  //
2911  //const ST newVal = f (rowVals[offset], newVals[j]);
2912  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2913 
2914  volatile ST* const dest = &rowVals[offset];
2915  (void) atomic_binary_function_update (dest, newVals[j], f);
2916  }
2917  else {
2918  // use binary function f
2919  rowVals[offset] = f (rowVals[offset], newVals[j]);
2920  }
2921  hint = offset + 1;
2922  ++numValid;
2923  }
2924  }
2925  }
2926  else if (graph.isLocallyIndexed ()) {
2927  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2928  // pointer does NOT change its reference count. Thus, this
2929  // code is still thread safe.
2930  if (graph.colMap_.is_null ()) {
2931  // NO input column indices are valid in this case. Either the
2932  // column Map hasn't been set yet (so local indices don't
2933  // exist yet), or the calling process owns no graph entries.
2934  return numValid;
2935  }
2936  const map_type& colMap = * (graph.colMap_);
2937  // Get a view of the column indices in the row. This amortizes
2938  // the cost of getting the view over all the entries of inds.
2939  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2940 
2941  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2942  for (LO j = 0; j < numElts; ++j) {
2943  const LO lclColInd = colMap.getLocalElement (inds[j]);
2944  if (lclColInd != LINV) {
2945  const size_t offset =
2946  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2947  lclColInd, hint, sorted);
2948  if (offset != rowInfo.numEntries) {
2949  if (atomic) {
2950  // NOTE (mfh 30 Nov 2015) The commented-out code is
2951  // wrong because another thread may have changed
2952  // rowVals[offset] between those two lines of code.
2953  //
2954  //const ST newVal = f (rowVals[offset], newVals[j]);
2955  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2956 
2957  volatile ST* const dest = &rowVals[offset];
2958  (void) atomic_binary_function_update (dest, newVals[j], f);
2959  }
2960  else {
2961  // use binary function f
2962  rowVals[offset] = f (rowVals[offset], newVals[j]);
2963  }
2964  hint = offset + 1;
2965  numValid++;
2966  }
2967  }
2968  }
2969  }
2970  // If the graph is neither locally nor globally indexed on the
2971  // calling process, that means the calling process has no graph
2972  // entries. Thus, none of the input column indices are valid.
2973 
2974  return numValid;
2975  }
2976 
2977  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2978  LocalOrdinal
2981  const crs_graph_type& graph,
2982  const RowInfo& rowInfo,
2983  const LocalOrdinal inds[],
2984  const impl_scalar_type newVals[],
2985  const LocalOrdinal numElts,
2986  const bool atomic)
2987  {
2988  typedef LocalOrdinal LO;
2989  typedef GlobalOrdinal GO;
2990 
2991  const bool sorted = graph.isSorted ();
2992 
2993  size_t hint = 0; // Guess for the current index k into rowVals
2994  LO numValid = 0; // number of valid local column indices
2995 
2996  if (graph.isLocallyIndexed ()) {
2997  // Get a view of the column indices in the row. This amortizes
2998  // the cost of getting the view over all the entries of inds.
2999  auto colInds = graph.getLocalIndsViewHost (rowInfo);
3000 
3001  for (LO j = 0; j < numElts; ++j) {
3002  const LO lclColInd = inds[j];
3003  const size_t offset =
3004  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3005  lclColInd, hint, sorted);
3006  if (offset != rowInfo.numEntries) {
3007  if (atomic) {
3008  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3009  }
3010  else {
3011  rowVals[offset] += newVals[j];
3012  }
3013  hint = offset + 1;
3014  ++numValid;
3015  }
3016  }
3017  }
3018  else if (graph.isGloballyIndexed ()) {
3019  if (graph.colMap_.is_null ()) {
3020  return Teuchos::OrdinalTraits<LO>::invalid ();
3021  }
3022  const map_type colMap = * (graph.colMap_);
3023 
3024  // Get a view of the column indices in the row. This amortizes
3025  // the cost of getting the view over all the entries of inds.
3026  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
3027 
3028  for (LO j = 0; j < numElts; ++j) {
3029  const GO gblColInd = colMap.getGlobalElement (inds[j]);
3030  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
3031  const size_t offset =
3032  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3033  gblColInd, hint, sorted);
3034  if (offset != rowInfo.numEntries) {
3035  if (atomic) {
3036  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3037  }
3038  else {
3039  rowVals[offset] += newVals[j];
3040  }
3041  hint = offset + 1;
3042  ++numValid;
3043  }
3044  }
3045  }
3046  }
3047  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
3048  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
3049  // to be neither locally nor globally indexed on a process.
3050  // This means that the graph or matrix has no entries on that
3051  // process. Epetra also works like this. It's related to lazy
3052  // allocation (on first insertion, not at graph / matrix
3053  // construction). Lazy allocation will go away because it is
3054  // not thread scalable.
3055 
3056  return numValid;
3057  }
3058 
3059  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3060  LocalOrdinal
3062  sumIntoLocalValues (const LocalOrdinal localRow,
3063  const Teuchos::ArrayView<const LocalOrdinal>& indices,
3064  const Teuchos::ArrayView<const Scalar>& values,
3065  const bool atomic)
3066  {
3067  using LO = local_ordinal_type;
3068  const LO numInputEnt = static_cast<LO>(indices.size());
3069  if (static_cast<LO>(values.size()) != numInputEnt) {
3070  return Teuchos::OrdinalTraits<LO>::invalid();
3071  }
3072  const LO* const inputInds = indices.getRawPtr();
3073  const scalar_type* const inputVals = values.getRawPtr();
3074  return this->sumIntoLocalValues(localRow, numInputEnt,
3075  inputVals, inputInds, atomic);
3076  }
3077 
3078  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3080  local_ordinal_type
3083  const local_ordinal_type localRow,
3084  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
3085  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
3086  const bool atomic)
3087  {
3088  using LO = local_ordinal_type;
3089  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
3090  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
3091  return Teuchos::OrdinalTraits<LO>::invalid();
3092  }
3093  const scalar_type* inVals =
3094  reinterpret_cast<const scalar_type*>(inputVals.data());
3095  return this->sumIntoLocalValues(localRow, numInputEnt, inVals,
3096  inputInds.data(), atomic);
3097  }
3098 
3099  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3100  LocalOrdinal
3102  sumIntoLocalValues (const LocalOrdinal localRow,
3103  const LocalOrdinal numEnt,
3104  const Scalar vals[],
3105  const LocalOrdinal cols[],
3106  const bool atomic)
3107  {
3108  typedef impl_scalar_type IST;
3109  typedef LocalOrdinal LO;
3110 
3111  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
3112  // Fill must be active and the "nonconst" graph must exist.
3113  return Teuchos::OrdinalTraits<LO>::invalid ();
3114  }
3115  const crs_graph_type& graph = * (this->staticGraph_);
3116  const RowInfo rowInfo = graph.getRowInfo (localRow);
3117 
3118  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
3119  // The calling process does not own this row, so it is not
3120  // allowed to modify its values.
3121  return static_cast<LO> (0);
3122  }
3123  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
3124  const IST* const inputVals = reinterpret_cast<const IST*> (vals);
3125  return this->sumIntoLocalValuesImpl (curRowVals.data (), graph, rowInfo,
3126  cols, inputVals, numEnt, atomic);
3127  }
3128 
3129  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3131  values_dualv_type::t_host::const_type
3133  getValuesViewHost (const RowInfo& rowinfo) const
3134  {
3135  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3136  return typename values_dualv_type::t_host::const_type ();
3137  else
3138  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3139  rowinfo.allocSize,
3140  Access::ReadOnly);
3141  }
3142 
3143  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3145  values_dualv_type::t_host
3148  {
3149  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3150  return typename values_dualv_type::t_host ();
3151  else
3152  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3153  rowinfo.allocSize,
3154  Access::ReadWrite);
3155  }
3156 
3157  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3159  values_dualv_type::t_dev::const_type
3161  getValuesViewDevice (const RowInfo& rowinfo) const
3162  {
3163  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3164  return typename values_dualv_type::t_dev::const_type ();
3165  else
3166  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3167  rowinfo.allocSize,
3168  Access::ReadOnly);
3169  }
3170 
3171  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3173  values_dualv_type::t_dev
3176  {
3177  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3178  return typename values_dualv_type::t_dev ();
3179  else
3180  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3181  rowinfo.allocSize,
3182  Access::ReadWrite);
3183  }
3184 
3185 
3186  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3187  void
3190  nonconst_local_inds_host_view_type &indices,
3191  nonconst_values_host_view_type &values,
3192  size_t& numEntries) const
3193  {
3194  using Teuchos::ArrayView;
3195  using Teuchos::av_reinterpret_cast;
3196  const char tfecfFuncName[] = "getLocalRowCopy: ";
3197 
3198  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3199  (! this->hasColMap (), std::runtime_error,
3200  "The matrix does not have a column Map yet. This means we don't have "
3201  "local indices for columns yet, so it doesn't make sense to call this "
3202  "method. If the matrix doesn't have a column Map yet, you should call "
3203  "fillComplete on it first.");
3204 
3205  const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3206  const size_t theNumEntries = rowinfo.numEntries;
3207  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3208  (static_cast<size_t> (indices.size ()) < theNumEntries ||
3209  static_cast<size_t> (values.size ()) < theNumEntries,
3210  std::runtime_error, "Row with local index " << localRow << " has " <<
3211  theNumEntries << " entry/ies, but indices.size() = " <<
3212  indices.size () << " and values.size() = " << values.size () << ".");
3213  numEntries = theNumEntries; // first side effect
3214 
3215  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3216  if (staticGraph_->isLocallyIndexed ()) {
3217  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3218  auto curVals = getValuesViewHost(rowinfo);
3219 
3220  for (size_t j = 0; j < theNumEntries; ++j) {
3221  values[j] = curVals[j];
3222  indices[j] = curLclInds(j);
3223  }
3224  }
3225  else if (staticGraph_->isGloballyIndexed ()) {
3226  // Don't call getColMap(), because it touches RCP's reference count.
3227  const map_type& colMap = * (staticGraph_->colMap_);
3228  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3229  auto curVals = getValuesViewHost(rowinfo);
3230 
3231  for (size_t j = 0; j < theNumEntries; ++j) {
3232  values[j] = curVals[j];
3233  indices[j] = colMap.getLocalElement (curGblInds(j));
3234  }
3235  }
3236  }
3237  }
3238 
3239 
3240 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3241 void
3244  nonconst_global_inds_host_view_type &indices,
3245  nonconst_values_host_view_type &values,
3246  size_t& numEntries) const
3247  {
3248  using Teuchos::ArrayView;
3249  using Teuchos::av_reinterpret_cast;
3250  const char tfecfFuncName[] = "getGlobalRowCopy: ";
3251 
3252  const RowInfo rowinfo =
3253  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3254  const size_t theNumEntries = rowinfo.numEntries;
3255  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3256  static_cast<size_t> (indices.size ()) < theNumEntries ||
3257  static_cast<size_t> (values.size ()) < theNumEntries,
3258  std::runtime_error, "Row with global index " << globalRow << " has "
3259  << theNumEntries << " entry/ies, but indices.size() = " <<
3260  indices.size () << " and values.size() = " << values.size () << ".");
3261  numEntries = theNumEntries; // first side effect
3262 
3263  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3264  if (staticGraph_->isLocallyIndexed ()) {
3265  const map_type& colMap = * (staticGraph_->colMap_);
3266  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3267  auto curVals = getValuesViewHost(rowinfo);
3268 
3269  for (size_t j = 0; j < theNumEntries; ++j) {
3270  values[j] = curVals[j];
3271  indices[j] = colMap.getGlobalElement (curLclInds(j));
3272  }
3273  }
3274  else if (staticGraph_->isGloballyIndexed ()) {
3275  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3276  auto curVals = getValuesViewHost(rowinfo);
3277 
3278  for (size_t j = 0; j < theNumEntries; ++j) {
3279  values[j] = curVals[j];
3280  indices[j] = curGblInds(j);
3281  }
3282  }
3283  }
3284  }
3285 
3286 
3287  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3288  void
3290  getLocalRowView(LocalOrdinal localRow,
3291  local_inds_host_view_type &indices,
3292  values_host_view_type &values) const
3293  {
3294  const char tfecfFuncName[] = "getLocalRowView: ";
3295 
3296  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3297  isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
3298  "its indices as global indices, so you cannot get a view with local "
3299  "column indices. If the matrix has a column Map, you may call "
3300  "getLocalRowCopy() to get local column indices; otherwise, you may get "
3301  "a view with global column indices by calling getGlobalRowCopy().");
3302 
3303  const RowInfo rowInfo = staticGraph_->getRowInfo (localRow);
3304  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3305  rowInfo.numEntries > 0) {
3306  indices = staticGraph_->lclIndsUnpacked_wdv.getHostSubview(
3307  rowInfo.offset1D,
3308  rowInfo.numEntries,
3309  Access::ReadOnly);
3310  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3311  rowInfo.numEntries,
3312  Access::ReadOnly);
3313  }
3314  else {
3315  // This does the right thing (reports an empty row) if the input
3316  // row is invalid.
3317  indices = local_inds_host_view_type();
3318  values = values_host_view_type();
3319  }
3320 
3321 #ifdef HAVE_TPETRA_DEBUG
3322  const char suffix[] = ". This should never happen. Please report this "
3323  "bug to the Tpetra developers.";
3324  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3325  (static_cast<size_t> (indices.size ()) !=
3326  static_cast<size_t> (values.size ()), std::logic_error,
3327  "At the end of this method, for local row " << localRow << ", "
3328  "indices.size() = " << indices.size () << " != values.size () = "
3329  << values.size () << suffix);
3330  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3331  (static_cast<size_t> (indices.size ()) !=
3332  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3333  "At the end of this method, for local row " << localRow << ", "
3334  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3335  << rowInfo.numEntries << suffix);
3336  const size_t expectedNumEntries = getNumEntriesInLocalRow (localRow);
3337  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3338  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3339  "of this method, for local row " << localRow << ", rowInfo.numEntries = "
3340  << rowInfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " <<
3341  expectedNumEntries << suffix);
3342 #endif // HAVE_TPETRA_DEBUG
3343  }
3344 
3345 
3346  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3347  void
3349  getGlobalRowView (GlobalOrdinal globalRow,
3350  global_inds_host_view_type &indices,
3351  values_host_view_type &values) const
3352  {
3353  const char tfecfFuncName[] = "getGlobalRowView: ";
3354 
3355  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3356  isLocallyIndexed (), std::runtime_error,
3357  "The matrix is locally indexed, so we cannot return a view of the row "
3358  "with global column indices. Use getGlobalRowCopy() instead.");
3359 
3360  // This does the right thing (reports an empty row) if the input
3361  // row is invalid.
3362  const RowInfo rowInfo =
3363  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3364  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3365  rowInfo.numEntries > 0) {
3366  indices = staticGraph_->gblInds_wdv.getHostSubview(rowInfo.offset1D,
3367  rowInfo.numEntries,
3368  Access::ReadOnly);
3369  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3370  rowInfo.numEntries,
3371  Access::ReadOnly);
3372  }
3373  else {
3374  indices = global_inds_host_view_type();
3375  values = values_host_view_type();
3376  }
3377 
3378 #ifdef HAVE_TPETRA_DEBUG
3379  const char suffix[] = ". This should never happen. Please report this "
3380  "bug to the Tpetra developers.";
3381  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3382  (static_cast<size_t> (indices.size ()) !=
3383  static_cast<size_t> (values.size ()), std::logic_error,
3384  "At the end of this method, for global row " << globalRow << ", "
3385  "indices.size() = " << indices.size () << " != values.size () = "
3386  << values.size () << suffix);
3387  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3388  (static_cast<size_t> (indices.size ()) !=
3389  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3390  "At the end of this method, for global row " << globalRow << ", "
3391  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3392  << rowInfo.numEntries << suffix);
3393  const size_t expectedNumEntries = getNumEntriesInGlobalRow (globalRow);
3394  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3395  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3396  "of this method, for global row " << globalRow << ", rowInfo.numEntries "
3397  "= " << rowInfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
3398  " " << expectedNumEntries << suffix);
3399 #endif // HAVE_TPETRA_DEBUG
3400  }
3401 
3402 
3403  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3404  void
3406  scale (const Scalar& alpha)
3407  {
3408  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3409 
3410  const size_t nlrs = staticGraph_->getLocalNumRows ();
3411  const size_t numEntries = staticGraph_->getLocalNumEntries ();
3412  if (! staticGraph_->indicesAreAllocated () ||
3413  nlrs == 0 || numEntries == 0) {
3414  // do nothing
3415  }
3416  else {
3417 
3418  auto vals = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
3419  KokkosBlas::scal(vals, theAlpha, vals);
3420 
3421  }
3422  }
3423 
3424  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3425  void
3427  setAllToScalar (const Scalar& alpha)
3428  {
3429  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3430 
3431  // replace all values in the matrix
3432  // it is easiest to replace all allocated values, instead of replacing only the ones with valid entries
3433  // however, if there are no valid entries, we can short-circuit
3434  // furthermore, if the values aren't allocated, we can short-circuit (no entry have been inserted so far)
3435  const size_t numEntries = staticGraph_->getLocalNumEntries();
3436  if (! staticGraph_->indicesAreAllocated () || numEntries == 0) {
3437  // do nothing
3438  }
3439  else {
3440  // DEEP_COPY REVIEW - VALUE-TO-DEVICE
3441  Kokkos::deep_copy (execution_space(), valuesUnpacked_wdv.getDeviceView(Access::OverwriteAll),
3442  theAlpha);
3443  // CAG: This fence was found to be required on Cuda with UVM=on.
3444  Kokkos::fence("CrsMatrix::setAllToScalar");
3445  }
3446  }
3447 
3448  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3449  void
3451  setAllValues (const typename local_graph_device_type::row_map_type& rowPointers,
3452  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
3453  const typename local_matrix_device_type::values_type& values)
3454  {
3455  using ProfilingRegion=Details::ProfilingRegion;
3456  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues");
3457  const char tfecfFuncName[] = "setAllValues: ";
3458  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3459  (columnIndices.size () != values.size (), std::invalid_argument,
3460  "columnIndices.size() = " << columnIndices.size () << " != values.size()"
3461  " = " << values.size () << ".");
3462  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3463  (myGraph_.is_null (), std::runtime_error, "myGraph_ must not be null.");
3464 
3465  try {
3466  myGraph_->setAllIndices (rowPointers, columnIndices);
3467  }
3468  catch (std::exception &e) {
3469  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3470  (true, std::runtime_error, "myGraph_->setAllIndices() threw an "
3471  "exception: " << e.what ());
3472  }
3473 
3474  // Make sure that myGraph_ now has a local graph. It may not be
3475  // fillComplete yet, so it's important to check. We don't care
3476  // whether setAllIndices() did a shallow copy or a deep copy, so a
3477  // good way to check is to compare dimensions.
3478  auto lclGraph = myGraph_->getLocalGraphDevice ();
3479  const size_t numEnt = lclGraph.entries.extent (0);
3480  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3481  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
3482  numEnt != static_cast<size_t> (columnIndices.extent (0)),
3483  std::logic_error, "myGraph_->setAllIndices() did not correctly create "
3484  "local graph. Please report this bug to the Tpetra developers.");
3485 
3486  valuesPacked_wdv = values_wdv_type(values);
3487  valuesUnpacked_wdv = valuesPacked_wdv;
3488 
3489  // Storage MUST be packed, since the interface doesn't give any
3490  // way to indicate any extra space at the end of each row.
3491  this->storageStatus_ = Details::STORAGE_1D_PACKED;
3492 
3493  checkInternalState ();
3494  }
3495 
3496  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3497  void
3499  setAllValues ( const local_matrix_device_type& localDeviceMatrix)
3500  {
3501  using ProfilingRegion=Details::ProfilingRegion;
3502  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix");
3503 
3504  auto graph = localDeviceMatrix.graph;
3505  //FIXME how to check whether graph is allocated
3506 
3507  auto rows = graph.row_map;
3508  auto columns = graph.entries;
3509  auto values = localDeviceMatrix.values;
3510 
3511  setAllValues(rows,columns,values);
3512  }
3513 
3514  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3515  void
3517  setAllValues (const Teuchos::ArrayRCP<size_t>& ptr,
3518  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
3519  const Teuchos::ArrayRCP<Scalar>& val)
3520  {
3521  using Kokkos::Compat::getKokkosViewDeepCopy;
3522  using Teuchos::ArrayRCP;
3523  using Teuchos::av_reinterpret_cast;
3524  typedef device_type DT;
3525  typedef impl_scalar_type IST;
3526  typedef typename local_graph_device_type::row_map_type row_map_type;
3527  //typedef typename row_map_type::non_const_value_type row_offset_type;
3528  const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
3529 
3530  // The row offset type may depend on the execution space. It may
3531  // not necessarily be size_t. If it's not, we need to make a deep
3532  // copy. We need to make a deep copy anyway so that Kokkos can
3533  // own the memory. Regardless, ptrIn gets the copy.
3534  typename row_map_type::non_const_type ptrNative ("ptr", ptr.size ());
3535  Kokkos::View<const size_t*,
3536  typename row_map_type::array_layout,
3537  Kokkos::HostSpace,
3538  Kokkos::MemoryUnmanaged> ptrSizeT (ptr.getRawPtr (), ptr.size ());
3539  ::Tpetra::Details::copyOffsets (ptrNative, ptrSizeT);
3540 
3541  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3542  (ptrNative.extent (0) != ptrSizeT.extent (0),
3543  std::logic_error, "ptrNative.extent(0) = " <<
3544  ptrNative.extent (0) << " != ptrSizeT.extent(0) = "
3545  << ptrSizeT.extent (0) << ". Please report this bug to the "
3546  "Tpetra developers.");
3547 
3548  auto indIn = getKokkosViewDeepCopy<DT> (ind ());
3549  auto valIn = getKokkosViewDeepCopy<DT> (av_reinterpret_cast<IST> (val ()));
3550  this->setAllValues (ptrNative, indIn, valIn);
3551  }
3552 
3553  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3554  void
3556  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
3557  {
3558  const char tfecfFuncName[] = "getLocalDiagOffsets: ";
3559  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3560  (staticGraph_.is_null (), std::runtime_error, "The matrix has no graph.");
3561 
3562  // mfh 11 May 2016: We plan to deprecate the ArrayRCP version of
3563  // this method in CrsGraph too, so don't call it (otherwise build
3564  // warnings will show up and annoy users). Instead, copy results
3565  // in and out, if the memory space requires it.
3566 
3567  const size_t lclNumRows = staticGraph_->getLocalNumRows ();
3568  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
3569  offsets.resize (lclNumRows);
3570  }
3571 
3572  // The input ArrayRCP must always be a host pointer. Thus, if
3573  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
3574  // to write to that allocation directly as a Kokkos::View.
3575  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
3576  // It is always syntactically correct to assign a raw host
3577  // pointer to a device View, so this code will compile correctly
3578  // even if this branch never runs.
3579  typedef Kokkos::View<size_t*, device_type,
3580  Kokkos::MemoryUnmanaged> output_type;
3581  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3582  staticGraph_->getLocalDiagOffsets (offsetsOut);
3583  }
3584  else {
3585  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
3586  staticGraph_->getLocalDiagOffsets (offsetsTmp);
3587  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
3588  Kokkos::MemoryUnmanaged> output_type;
3589  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3590  // DEEP_COPY REVIEW - DEVICE-TO-HOST
3591  Kokkos::deep_copy (execution_space(), offsetsOut, offsetsTmp);
3592  }
3593  }
3594 
3595  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3596  void
3599  {
3600  using Teuchos::ArrayRCP;
3601  using Teuchos::ArrayView;
3602  using Teuchos::av_reinterpret_cast;
3603  const char tfecfFuncName[] = "getLocalDiagCopy (1-arg): ";
3604  typedef local_ordinal_type LO;
3605 
3606 
3607  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3608  staticGraph_.is_null (), std::runtime_error,
3609  "This method requires that the matrix have a graph.");
3610  auto rowMapPtr = this->getRowMap ();
3611  if (rowMapPtr.is_null () || rowMapPtr->getComm ().is_null ()) {
3612  // Processes on which the row Map or its communicator is null
3613  // don't participate. Users shouldn't even call this method on
3614  // those processes.
3615  return;
3616  }
3617  auto colMapPtr = this->getColMap ();
3618  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3619  (! this->hasColMap () || colMapPtr.is_null (), std::runtime_error,
3620  "This method requires that the matrix have a column Map.");
3621  const map_type& rowMap = * rowMapPtr;
3622  const map_type& colMap = * colMapPtr;
3623  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3624 
3625 #ifdef HAVE_TPETRA_DEBUG
3626  // isCompatible() requires an all-reduce, and thus this check
3627  // should only be done in debug mode.
3628  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3629  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3630  "The input Vector's Map must be compatible with the CrsMatrix's row "
3631  "Map. You may check this by using Map's isCompatible method: "
3632  "diag.getMap ()->isCompatible (A.getRowMap ());");
3633 #endif // HAVE_TPETRA_DEBUG
3634 
3635  if (this->isFillComplete ()) {
3636  const auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
3637  // 1-D subview of the first (and only) column of D_lcl.
3638  const auto D_lcl_1d =
3639  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3640 
3641  const auto lclRowMap = rowMap.getLocalMap ();
3642  const auto lclColMap = colMap.getLocalMap ();
3644  (void) getDiagCopyWithoutOffsets (D_lcl_1d, lclRowMap,
3645  lclColMap,
3646  getLocalMatrixDevice ());
3647  }
3648  else {
3650  (void) getLocalDiagCopyWithoutOffsetsNotFillComplete (diag, *this);
3651  }
3652  }
3653 
3654  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3655  void
3658  const Kokkos::View<const size_t*, device_type,
3659  Kokkos::MemoryUnmanaged>& offsets) const
3660  {
3661  typedef LocalOrdinal LO;
3662 
3663 #ifdef HAVE_TPETRA_DEBUG
3664  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3665  const map_type& rowMap = * (this->getRowMap ());
3666  // isCompatible() requires an all-reduce, and thus this check
3667  // should only be done in debug mode.
3668  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3669  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3670  "The input Vector's Map must be compatible with (in the sense of Map::"
3671  "isCompatible) the CrsMatrix's row Map.");
3672 #endif // HAVE_TPETRA_DEBUG
3673 
3674  // For now, we fill the Vector on the host and sync to device.
3675  // Later, we may write a parallel kernel that works entirely on
3676  // device.
3677  //
3678  // NOTE (mfh 21 Jan 2016): The host kernel here assumes UVM. Once
3679  // we write a device kernel, it will not need to assume UVM.
3680 
3681  auto D_lcl = diag.getLocalViewDevice (Access::OverwriteAll);
3682  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3683  // Get 1-D subview of the first (and only) column of D_lcl.
3684  auto D_lcl_1d =
3685  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3686 
3687  KokkosSparse::getDiagCopy (D_lcl_1d, offsets,
3688  getLocalMatrixDevice ());
3689  }
3690 
3691  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3692  void
3695  const Teuchos::ArrayView<const size_t>& offsets) const
3696  {
3697  using LO = LocalOrdinal;
3698  using host_execution_space = Kokkos::DefaultHostExecutionSpace;
3699  using IST = impl_scalar_type;
3700 
3701 #ifdef HAVE_TPETRA_DEBUG
3702  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3703  const map_type& rowMap = * (this->getRowMap ());
3704  // isCompatible() requires an all-reduce, and thus this check
3705  // should only be done in debug mode.
3706  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3707  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3708  "The input Vector's Map must be compatible with (in the sense of Map::"
3709  "isCompatible) the CrsMatrix's row Map.");
3710 #endif // HAVE_TPETRA_DEBUG
3711 
3712  // See #1510. In case diag has already been marked modified on
3713  // device, we need to clear that flag, since the code below works
3714  // on host.
3715  //diag.clear_sync_state ();
3716 
3717  // For now, we fill the Vector on the host and sync to device.
3718  // Later, we may write a parallel kernel that works entirely on
3719  // device.
3720  auto lclVecHost = diag.getLocalViewHost(Access::OverwriteAll);
3721  // 1-D subview of the first (and only) column of lclVecHost.
3722  auto lclVecHost1d = Kokkos::subview (lclVecHost, Kokkos::ALL (), 0);
3723 
3724  using host_offsets_view_type =
3725  Kokkos::View<const size_t*, Kokkos::HostSpace,
3726  Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
3727  host_offsets_view_type h_offsets (offsets.getRawPtr (), offsets.size ());
3728  // Find the diagonal entries and put them in lclVecHost1d.
3729  using range_type = Kokkos::RangePolicy<host_execution_space, LO>;
3730  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3731  const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid ();
3732 
3733  auto rowPtrsPackedHost = staticGraph_->getRowPtrsPackedHost();
3734  auto valuesPackedHost = valuesPacked_wdv.getHostView(Access::ReadOnly);
3735  Kokkos::parallel_for
3736  ("Tpetra::CrsMatrix::getLocalDiagCopy",
3737  range_type (0, myNumRows),
3738  [&, INV, h_offsets] (const LO lclRow) { // Value capture is a workaround for cuda + gcc-7.2 compiler bug w/c++14
3739  lclVecHost1d(lclRow) = STS::zero (); // default value if no diag entry
3740  if (h_offsets[lclRow] != INV) {
3741  auto curRowOffset = rowPtrsPackedHost (lclRow);
3742  lclVecHost1d(lclRow) =
3743  static_cast<IST> (valuesPackedHost(curRowOffset+h_offsets[lclRow]));
3744  }
3745  });
3746  //diag.sync_device ();
3747  }
3748 
3749 
3750  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3751  void
3754  {
3755  using ::Tpetra::Details::ProfilingRegion;
3756  using Teuchos::ArrayRCP;
3757  using Teuchos::ArrayView;
3758  using Teuchos::null;
3759  using Teuchos::RCP;
3760  using Teuchos::rcp;
3761  using Teuchos::rcpFromRef;
3763  const char tfecfFuncName[] = "leftScale: ";
3764 
3765  ProfilingRegion region ("Tpetra::CrsMatrix::leftScale");
3766 
3767  RCP<const vec_type> xp;
3768  if (this->getRangeMap ()->isSameAs (* (x.getMap ()))) {
3769  // Take from Epetra: If we have a non-trivial exporter, we must
3770  // import elements that are permuted or are on other processors.
3771  auto exporter = this->getCrsGraphRef ().getExporter ();
3772  if (exporter.get () != nullptr) {
3773  RCP<vec_type> tempVec (new vec_type (this->getRowMap ()));
3774  tempVec->doImport (x, *exporter, REPLACE); // reverse mode
3775  xp = tempVec;
3776  }
3777  else {
3778  xp = rcpFromRef (x);
3779  }
3780  }
3781  else if (this->getRowMap ()->isSameAs (* (x.getMap ()))) {
3782  xp = rcpFromRef (x);
3783  }
3784  else {
3785  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3786  (true, std::invalid_argument, "x's Map must be the same as "
3787  "either the row Map or the range Map of the CrsMatrix.");
3788  }
3789 
3790  if (this->isFillComplete()) {
3791  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3792  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3794  leftScaleLocalCrsMatrix (getLocalMatrixDevice (),
3795  x_lcl_1d, false, false);
3796  }
3797  else {
3798  // 6/2020 Disallow leftScale of non-fillComplete matrices #7446
3799  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3800  (true, std::runtime_error, "CrsMatrix::leftScale requires matrix to be"
3801  " fillComplete");
3802  }
3803  }
3804 
3805  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3806  void
3809  {
3810  using ::Tpetra::Details::ProfilingRegion;
3811  using Teuchos::ArrayRCP;
3812  using Teuchos::ArrayView;
3813  using Teuchos::null;
3814  using Teuchos::RCP;
3815  using Teuchos::rcp;
3816  using Teuchos::rcpFromRef;
3818  const char tfecfFuncName[] = "rightScale: ";
3819 
3820  ProfilingRegion region ("Tpetra::CrsMatrix::rightScale");
3821 
3822  RCP<const vec_type> xp;
3823  if (this->getDomainMap ()->isSameAs (* (x.getMap ()))) {
3824  // Take from Epetra: If we have a non-trivial exporter, we must
3825  // import elements that are permuted or are on other processors.
3826  auto importer = this->getCrsGraphRef ().getImporter ();
3827  if (importer.get () != nullptr) {
3828  RCP<vec_type> tempVec (new vec_type (this->getColMap ()));
3829  tempVec->doImport (x, *importer, REPLACE);
3830  xp = tempVec;
3831  }
3832  else {
3833  xp = rcpFromRef (x);
3834  }
3835  }
3836  else if (this->getColMap ()->isSameAs (* (x.getMap ()))) {
3837  xp = rcpFromRef (x);
3838  } else {
3839  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3840  (true, std::runtime_error, "x's Map must be the same as "
3841  "either the domain Map or the column Map of the CrsMatrix.");
3842  }
3843 
3844  if (this->isFillComplete()) {
3845  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3846  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3848  rightScaleLocalCrsMatrix (getLocalMatrixDevice (),
3849  x_lcl_1d, false, false);
3850  }
3851  else {
3852  // 6/2020 Disallow rightScale of non-fillComplete matrices #7446
3853  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3854  (true, std::runtime_error, "CrsMatrix::rightScale requires matrix to be"
3855  " fillComplete");
3856  }
3857  }
3858 
3859  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3863  {
3864  using Teuchos::ArrayView;
3865  using Teuchos::outArg;
3866  using Teuchos::REDUCE_SUM;
3867  using Teuchos::reduceAll;
3868 
3869  // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
3870  // local part of this computation. It could make sense to put
3871  // this operation in the Kokkos::CrsMatrix.
3872 
3873  // check the cache first
3874  mag_type mySum = STM::zero ();
3875  if (getLocalNumEntries() > 0) {
3876  if (isStorageOptimized ()) {
3877  // "Optimized" storage is packed storage. That means we can
3878  // iterate in one pass through the 1-D values array.
3879  const size_t numEntries = getLocalNumEntries ();
3880  auto values = valuesPacked_wdv.getHostView(Access::ReadOnly);
3881  for (size_t k = 0; k < numEntries; ++k) {
3882  auto val = values[k];
3883  // Note (etp 06 Jan 2015) We need abs() here for composite types
3884  // (in general, if mag_type is on the left-hand-side, we need
3885  // abs() on the right-hand-side)
3886  const mag_type val_abs = STS::abs (val);
3887  mySum += val_abs * val_abs;
3888  }
3889  }
3890  else {
3891  const LocalOrdinal numRows =
3892  static_cast<LocalOrdinal> (this->getLocalNumRows ());
3893  for (LocalOrdinal r = 0; r < numRows; ++r) {
3894  const RowInfo rowInfo = myGraph_->getRowInfo (r);
3895  const size_t numEntries = rowInfo.numEntries;
3896  auto A_r = this->getValuesViewHost(rowInfo);
3897  for (size_t k = 0; k < numEntries; ++k) {
3898  const impl_scalar_type val = A_r[k];
3899  const mag_type val_abs = STS::abs (val);
3900  mySum += val_abs * val_abs;
3901  }
3902  }
3903  }
3904  }
3905  mag_type totalSum = STM::zero ();
3906  reduceAll<int, mag_type> (* (getComm ()), REDUCE_SUM,
3907  mySum, outArg (totalSum));
3908  return STM::sqrt (totalSum);
3909  }
3910 
3911  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3912  void
3914  replaceColMap (const Teuchos::RCP<const map_type>& newColMap)
3915  {
3916  const char tfecfFuncName[] = "replaceColMap: ";
3917  // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
3918  // Then replacing the column Map might mean that we need to
3919  // reindex the column indices.
3920  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3921  myGraph_.is_null (), std::runtime_error,
3922  "This method does not work if the matrix has a const graph. The whole "
3923  "idea of a const graph is that you are not allowed to change it, but "
3924  "this method necessarily must modify the graph, since the graph owns "
3925  "the matrix's column Map.");
3926  myGraph_->replaceColMap (newColMap);
3927  }
3928 
3929  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3930  void
3933  const Teuchos::RCP<const map_type>& newColMap,
3934  const Teuchos::RCP<const import_type>& newImport,
3935  const bool sortEachRow)
3936  {
3937  const char tfecfFuncName[] = "reindexColumns: ";
3938  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3939  graph == nullptr && myGraph_.is_null (), std::invalid_argument,
3940  "The input graph is null, but the matrix does not own its graph.");
3941 
3942  crs_graph_type& theGraph = (graph == nullptr) ? *myGraph_ : *graph;
3943  const bool sortGraph = false; // we'll sort graph & matrix together below
3944 
3945  theGraph.reindexColumns (newColMap, newImport, sortGraph);
3946 
3947  if (sortEachRow && theGraph.isLocallyIndexed () && ! theGraph.isSorted ()) {
3948  const LocalOrdinal lclNumRows =
3949  static_cast<LocalOrdinal> (theGraph.getLocalNumRows ());
3950 
3951  for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
3952 
3953  const RowInfo rowInfo = theGraph.getRowInfo (row);
3954  auto lclColInds = theGraph.getLocalIndsViewHostNonConst (rowInfo);
3955  auto vals = this->getValuesViewHostNonConst (rowInfo);
3956 
3957  sort2 (lclColInds.data (),
3958  lclColInds.data () + rowInfo.numEntries,
3959  vals.data ());
3960  }
3961  theGraph.indicesAreSorted_ = true;
3962  }
3963  }
3964 
3965  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3966  void
3968  replaceDomainMap (const Teuchos::RCP<const map_type>& newDomainMap)
3969  {
3970  const char tfecfFuncName[] = "replaceDomainMap: ";
3971  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3972  myGraph_.is_null (), std::runtime_error,
3973  "This method does not work if the matrix has a const graph. The whole "
3974  "idea of a const graph is that you are not allowed to change it, but this"
3975  " method necessarily must modify the graph, since the graph owns the "
3976  "matrix's domain Map and Import objects.");
3977  myGraph_->replaceDomainMap (newDomainMap);
3978  }
3979 
3980  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3981  void
3983  replaceDomainMapAndImporter (const Teuchos::RCP<const map_type>& newDomainMap,
3984  Teuchos::RCP<const import_type>& newImporter)
3985  {
3986  const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
3987  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3988  myGraph_.is_null (), std::runtime_error,
3989  "This method does not work if the matrix has a const graph. The whole "
3990  "idea of a const graph is that you are not allowed to change it, but this"
3991  " method necessarily must modify the graph, since the graph owns the "
3992  "matrix's domain Map and Import objects.");
3993  myGraph_->replaceDomainMapAndImporter (newDomainMap, newImporter);
3994  }
3995 
3996  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3997  void
3999  replaceRangeMap (const Teuchos::RCP<const map_type>& newRangeMap)
4000  {
4001  const char tfecfFuncName[] = "replaceRangeMap: ";
4002  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4003  myGraph_.is_null (), std::runtime_error,
4004  "This method does not work if the matrix has a const graph. The whole "
4005  "idea of a const graph is that you are not allowed to change it, but this"
4006  " method necessarily must modify the graph, since the graph owns the "
4007  "matrix's domain Map and Import objects.");
4008  myGraph_->replaceRangeMap (newRangeMap);
4009  }
4010 
4011  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4012  void
4014  replaceRangeMapAndExporter (const Teuchos::RCP<const map_type>& newRangeMap,
4015  Teuchos::RCP<const export_type>& newExporter)
4016  {
4017  const char tfecfFuncName[] = "replaceRangeMapAndExporter: ";
4018  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4019  myGraph_.is_null (), std::runtime_error,
4020  "This method does not work if the matrix has a const graph. The whole "
4021  "idea of a const graph is that you are not allowed to change it, but this"
4022  " method necessarily must modify the graph, since the graph owns the "
4023  "matrix's domain Map and Import objects.");
4024  myGraph_->replaceRangeMapAndExporter (newRangeMap, newExporter);
4025  }
4026 
4027  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4028  void
4030  insertNonownedGlobalValues (const GlobalOrdinal globalRow,
4031  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
4032  const Teuchos::ArrayView<const Scalar>& values)
4033  {
4034  using Teuchos::Array;
4035  typedef GlobalOrdinal GO;
4036  typedef typename Array<GO>::size_type size_type;
4037 
4038  const size_type numToInsert = indices.size ();
4039  // Add the new data to the list of nonlocals.
4040  // This creates the arrays if they don't exist yet.
4041  std::pair<Array<GO>, Array<Scalar> >& curRow = nonlocals_[globalRow];
4042  Array<GO>& curRowInds = curRow.first;
4043  Array<Scalar>& curRowVals = curRow.second;
4044  const size_type newCapacity = curRowInds.size () + numToInsert;
4045  curRowInds.reserve (newCapacity);
4046  curRowVals.reserve (newCapacity);
4047  for (size_type k = 0; k < numToInsert; ++k) {
4048  curRowInds.push_back (indices[k]);
4049  curRowVals.push_back (values[k]);
4050  }
4051  }
4052 
4053  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4054  void
4057  {
4058  using Details::Behavior;
4060  using Teuchos::Comm;
4061  using Teuchos::outArg;
4062  using Teuchos::RCP;
4063  using Teuchos::rcp;
4064  using Teuchos::REDUCE_MAX;
4065  using Teuchos::REDUCE_MIN;
4066  using Teuchos::reduceAll;
4067  using std::endl;
4069  //typedef LocalOrdinal LO;
4070  typedef GlobalOrdinal GO;
4071  typedef typename Teuchos::Array<GO>::size_type size_type;
4072  const char tfecfFuncName[] = "globalAssemble: "; // for exception macro
4073  ProfilingRegion regionGlobalAssemble ("Tpetra::CrsMatrix::globalAssemble");
4074 
4075  const bool verbose = Behavior::verbose("CrsMatrix");
4076  std::unique_ptr<std::string> prefix;
4077  if (verbose) {
4078  prefix = this->createPrefix("CrsMatrix", "globalAssemble");
4079  std::ostringstream os;
4080  os << *prefix << "nonlocals_.size()=" << nonlocals_.size()
4081  << endl;
4082  std::cerr << os.str();
4083  }
4084  RCP<const Comm<int> > comm = getComm ();
4085 
4086  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4087  (! isFillActive (), std::runtime_error, "Fill must be active before "
4088  "you may call this method.");
4089 
4090  const size_t myNumNonlocalRows = nonlocals_.size ();
4091 
4092  // If no processes have nonlocal rows, then we don't have to do
4093  // anything. Checking this is probably cheaper than constructing
4094  // the Map of nonlocal rows (see below) and noticing that it has
4095  // zero global entries.
4096  {
4097  const int iHaveNonlocalRows = (myNumNonlocalRows == 0) ? 0 : 1;
4098  int someoneHasNonlocalRows = 0;
4099  reduceAll<int, int> (*comm, REDUCE_MAX, iHaveNonlocalRows,
4100  outArg (someoneHasNonlocalRows));
4101  if (someoneHasNonlocalRows == 0) {
4102  return; // no process has nonlocal rows, so nothing to do
4103  }
4104  }
4105 
4106  // 1. Create a list of the "nonlocal" rows on each process. this
4107  // requires iterating over nonlocals_, so while we do this,
4108  // deduplicate the entries and get a count for each nonlocal
4109  // row on this process.
4110  // 2. Construct a new row Map corresponding to those rows. This
4111  // Map is likely overlapping. We know that the Map is not
4112  // empty on all processes, because the above all-reduce and
4113  // return exclude that case.
4114 
4115  RCP<const map_type> nonlocalRowMap;
4116  Teuchos::Array<size_t> numEntPerNonlocalRow (myNumNonlocalRows);
4117  {
4118  Teuchos::Array<GO> myNonlocalGblRows (myNumNonlocalRows);
4119  size_type curPos = 0;
4120  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4121  ++mapIter, ++curPos) {
4122  myNonlocalGblRows[curPos] = mapIter->first;
4123  // Get the values and column indices by reference, since we
4124  // intend to change them in place (that's what "erase" does).
4125  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4126  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4127 
4128  // Sort both arrays jointly, using the column indices as keys,
4129  // then merge them jointly. "Merge" here adds values
4130  // corresponding to the same column indices. The first 2 args
4131  // of merge2 are output arguments that work just like the
4132  // return value of std::unique.
4133  sort2 (gblCols.begin (), gblCols.end (), vals.begin ());
4134  typename Teuchos::Array<GO>::iterator gblCols_newEnd;
4135  typename Teuchos::Array<Scalar>::iterator vals_newEnd;
4136  merge2 (gblCols_newEnd, vals_newEnd,
4137  gblCols.begin (), gblCols.end (),
4138  vals.begin (), vals.end ());
4139  gblCols.erase (gblCols_newEnd, gblCols.end ());
4140  vals.erase (vals_newEnd, vals.end ());
4141  numEntPerNonlocalRow[curPos] = gblCols.size ();
4142  }
4143 
4144  // Currently, Map requires that its indexBase be the global min
4145  // of all its global indices. Map won't compute this for us, so
4146  // we must do it. If our process has no nonlocal rows, set the
4147  // "min" to the max possible GO value. This ensures that if
4148  // some process has at least one nonlocal row, then it will pick
4149  // that up as the min. We know that at least one process has a
4150  // nonlocal row, since the all-reduce and return at the top of
4151  // this method excluded that case.
4152  GO myMinNonlocalGblRow = std::numeric_limits<GO>::max ();
4153  {
4154  auto iter = std::min_element (myNonlocalGblRows.begin (),
4155  myNonlocalGblRows.end ());
4156  if (iter != myNonlocalGblRows.end ()) {
4157  myMinNonlocalGblRow = *iter;
4158  }
4159  }
4160  GO gblMinNonlocalGblRow = 0;
4161  reduceAll<int, GO> (*comm, REDUCE_MIN, myMinNonlocalGblRow,
4162  outArg (gblMinNonlocalGblRow));
4163  const GO indexBase = gblMinNonlocalGblRow;
4164  const global_size_t INV = Teuchos::OrdinalTraits<global_size_t>::invalid ();
4165  nonlocalRowMap = rcp (new map_type (INV, myNonlocalGblRows (), indexBase, comm));
4166  }
4167 
4168  // 3. Use the values and column indices for each nonlocal row, as
4169  // stored in nonlocals_, to construct a CrsMatrix corresponding
4170  // to nonlocal rows. We have
4171  // exact counts of the number of entries in each nonlocal row.
4172 
4173  if (verbose) {
4174  std::ostringstream os;
4175  os << *prefix << "Create nonlocal matrix" << endl;
4176  std::cerr << os.str();
4177  }
4178  RCP<crs_matrix_type> nonlocalMatrix =
4179  rcp (new crs_matrix_type (nonlocalRowMap, numEntPerNonlocalRow ()));
4180  {
4181  size_type curPos = 0;
4182  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4183  ++mapIter, ++curPos) {
4184  const GO gblRow = mapIter->first;
4185  // Get values & column indices by ref, just to avoid copy.
4186  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4187  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4188  //const LO numEnt = static_cast<LO> (numEntPerNonlocalRow[curPos]);
4189  nonlocalMatrix->insertGlobalValues (gblRow, gblCols (), vals ());
4190  }
4191  }
4192  // There's no need to fill-complete the nonlocals matrix.
4193  // We just use it as a temporary container for the Export.
4194 
4195  // 4. If the original row Map is one to one, then we can Export
4196  // directly from nonlocalMatrix into this. Otherwise, we have
4197  // to create a temporary matrix with a one-to-one row Map,
4198  // Export into that, then Import from the temporary matrix into
4199  // *this.
4200 
4201  auto origRowMap = this->getRowMap ();
4202  const bool origRowMapIsOneToOne = origRowMap->isOneToOne ();
4203 
4204  int isLocallyComplete = 1; // true by default
4205 
4206  if (origRowMapIsOneToOne) {
4207  if (verbose) {
4208  std::ostringstream os;
4209  os << *prefix << "Original row Map is 1-to-1" << endl;
4210  std::cerr << os.str();
4211  }
4212  export_type exportToOrig (nonlocalRowMap, origRowMap);
4213  if (! exportToOrig.isLocallyComplete ()) {
4214  isLocallyComplete = 0;
4215  }
4216  if (verbose) {
4217  std::ostringstream os;
4218  os << *prefix << "doExport from nonlocalMatrix" << endl;
4219  std::cerr << os.str();
4220  }
4221  this->doExport (*nonlocalMatrix, exportToOrig, Tpetra::ADD);
4222  // We're done at this point!
4223  }
4224  else {
4225  if (verbose) {
4226  std::ostringstream os;
4227  os << *prefix << "Original row Map is NOT 1-to-1" << endl;
4228  std::cerr << os.str();
4229  }
4230  // If you ask a Map whether it is one to one, it does some
4231  // communication and stashes intermediate results for later use
4232  // by createOneToOne. Thus, calling createOneToOne doesn't cost
4233  // much more then the original cost of calling isOneToOne.
4234  auto oneToOneRowMap = Tpetra::createOneToOne (origRowMap);
4235  export_type exportToOneToOne (nonlocalRowMap, oneToOneRowMap);
4236  if (! exportToOneToOne.isLocallyComplete ()) {
4237  isLocallyComplete = 0;
4238  }
4239 
4240  // Create a temporary matrix with the one-to-one row Map.
4241  //
4242  // TODO (mfh 09 Sep 2016, 12 Sep 2016) Estimate # entries in
4243  // each row, to avoid reallocation during the Export operation.
4244  if (verbose) {
4245  std::ostringstream os;
4246  os << *prefix << "Create & doExport into 1-to-1 matrix"
4247  << endl;
4248  std::cerr << os.str();
4249  }
4250  crs_matrix_type oneToOneMatrix (oneToOneRowMap, 0);
4251  // Export from matrix of nonlocals into the temp one-to-one matrix.
4252  oneToOneMatrix.doExport(*nonlocalMatrix, exportToOneToOne,
4253  Tpetra::ADD);
4254 
4255  // We don't need the matrix of nonlocals anymore, so get rid of
4256  // it, to keep the memory high-water mark down.
4257  if (verbose) {
4258  std::ostringstream os;
4259  os << *prefix << "Free nonlocalMatrix" << endl;
4260  std::cerr << os.str();
4261  }
4262  nonlocalMatrix = Teuchos::null;
4263 
4264  // Import from the one-to-one matrix to the original matrix.
4265  if (verbose) {
4266  std::ostringstream os;
4267  os << *prefix << "doImport from 1-to-1 matrix" << endl;
4268  std::cerr << os.str();
4269  }
4270  import_type importToOrig (oneToOneRowMap, origRowMap);
4271  this->doImport (oneToOneMatrix, importToOrig, Tpetra::ADD);
4272  }
4273 
4274  // It's safe now to clear out nonlocals_, since we've already
4275  // committed side effects to *this. The standard idiom for
4276  // clearing a Container like std::map, is to swap it with an empty
4277  // Container and let the swapped Container fall out of scope.
4278  if (verbose) {
4279  std::ostringstream os;
4280  os << *prefix << "Free nonlocals_ (std::map)" << endl;
4281  std::cerr << os.str();
4282  }
4283  decltype (nonlocals_) newNonlocals;
4284  std::swap (nonlocals_, newNonlocals);
4285 
4286  // FIXME (mfh 12 Sep 2016) I don't like this all-reduce, and I
4287  // don't like throwing an exception here. A local return value
4288  // would likely be more useful to users. However, if users find
4289  // themselves exercising nonlocal inserts often, then they are
4290  // probably novice users who need the help. See Gibhub Issues
4291  // #603 and #601 (esp. the latter) for discussion.
4292 
4293  int isGloballyComplete = 0; // output argument of reduceAll
4294  reduceAll<int, int> (*comm, REDUCE_MIN, isLocallyComplete,
4295  outArg (isGloballyComplete));
4296  TEUCHOS_TEST_FOR_EXCEPTION
4297  (isGloballyComplete != 1, std::runtime_error, "On at least one process, "
4298  "you called insertGlobalValues with a global row index which is not in "
4299  "the matrix's row Map on any process in its communicator.");
4300  }
4301 
4302  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4303  void
4305  resumeFill (const Teuchos::RCP<Teuchos::ParameterList>& params)
4306  {
4307  if (! isStaticGraph ()) { // Don't resume fill of a nonowned graph.
4308  myGraph_->resumeFill (params);
4309  }
4310  fillComplete_ = false;
4311  }
4312 
4313  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4314  bool
4317  return getCrsGraphRef ().haveGlobalConstants ();
4318  }
4319 
4320  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4321  void
4323  fillComplete (const Teuchos::RCP<Teuchos::ParameterList>& params)
4324  {
4325  const char tfecfFuncName[] = "fillComplete(params): ";
4326 
4327  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4328  (this->getCrsGraph ().is_null (), std::logic_error,
4329  "getCrsGraph() returns null. This should not happen at this point. "
4330  "Please report this bug to the Tpetra developers.");
4331 
4332  const crs_graph_type& graph = this->getCrsGraphRef ();
4333  if (this->isStaticGraph () && graph.isFillComplete ()) {
4334  // If this matrix's graph is fill complete and the user did not
4335  // supply a domain or range Map, use the graph's domain and
4336  // range Maps.
4337  this->fillComplete (graph.getDomainMap (), graph.getRangeMap (), params);
4338  }
4339  else { // assume that user's row Map is the domain and range Map
4340  Teuchos::RCP<const map_type> rangeMap = graph.getRowMap ();
4341  Teuchos::RCP<const map_type> domainMap = rangeMap;
4342  this->fillComplete (domainMap, rangeMap, params);
4343  }
4344  }
4345 
4346  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4347  void
4349  fillComplete (const Teuchos::RCP<const map_type>& domainMap,
4350  const Teuchos::RCP<const map_type>& rangeMap,
4351  const Teuchos::RCP<Teuchos::ParameterList>& params)
4352  {
4353  using Details::Behavior;
4355  using Teuchos::ArrayRCP;
4356  using Teuchos::RCP;
4357  using Teuchos::rcp;
4358  using std::endl;
4359  const char tfecfFuncName[] = "fillComplete: ";
4360  ProfilingRegion regionFillComplete
4361  ("Tpetra::CrsMatrix::fillComplete");
4362  const bool verbose = Behavior::verbose("CrsMatrix");
4363  std::unique_ptr<std::string> prefix;
4364  if (verbose) {
4365  prefix = this->createPrefix("CrsMatrix", "fillComplete(dom,ran,p)");
4366  std::ostringstream os;
4367  os << *prefix << endl;
4368  std::cerr << os.str ();
4369  }
4370  Details::ProfilingRegion region(
4371  "Tpetra::CrsMatrix::fillCompete",
4372  "fillCompete");
4373 
4374  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4375  (! this->isFillActive () || this->isFillComplete (), std::runtime_error,
4376  "Matrix fill state must be active (isFillActive() "
4377  "must be true) before you may call fillComplete().");
4378  const int numProcs = this->getComm ()->getSize ();
4379 
4380  //
4381  // Read parameters from the input ParameterList.
4382  //
4383  {
4384  Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");
4385 
4386  // If true, the caller promises that no process did nonlocal
4387  // changes since the last call to fillComplete.
4388  bool assertNoNonlocalInserts = false;
4389  // If true, makeColMap sorts remote GIDs (within each remote
4390  // process' group).
4391  bool sortGhosts = true;
4392 
4393  if (! params.is_null ()) {
4394  assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
4395  assertNoNonlocalInserts);
4396  if (params->isParameter ("sort column map ghost gids")) {
4397  sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
4398  }
4399  else if (params->isParameter ("Sort column Map ghost GIDs")) {
4400  sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
4401  }
4402  }
4403  // We also don't need to do global assembly if there is only one
4404  // process in the communicator.
4405  const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
4406  // This parameter only matters if this matrix owns its graph.
4407  if (! this->myGraph_.is_null ()) {
4408  this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
4409  }
4410 
4411  if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
4412  if (this->hasColMap ()) { // use local indices
4413  allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
4414  }
4415  else { // no column Map, so use global indices
4416  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
4417  }
4418  }
4419  // Global assemble, if we need to. This call only costs a single
4420  // all-reduce if we didn't need global assembly after all.
4421  if (needGlobalAssemble) {
4422  this->globalAssemble ();
4423  }
4424  else {
4425  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4426  (numProcs == 1 && nonlocals_.size() > 0,
4427  std::runtime_error, "Cannot have nonlocal entries on a serial run. "
4428  "An invalid entry (i.e., with row index not in the row Map) must have "
4429  "been submitted to the CrsMatrix.");
4430  }
4431  }
4432  if (this->isStaticGraph ()) {
4433  Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
4434  // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
4435  // checks below only in debug mode. It would be nicer to do a
4436  // local check, then propagate the error state in a deferred
4437  // way, whenever communication happens. That would reduce the
4438  // cost of checking, to the point where it may make sense to
4439  // enable it even in release mode.
4440 #ifdef HAVE_TPETRA_DEBUG
4441  // FIXME (mfh 18 Jun 2014) This check for correctness of the
4442  // input Maps incurs a penalty of two all-reduces for the
4443  // otherwise optimal const graph case.
4444  //
4445  // We could turn these (max) 2 all-reduces into (max) 1, by
4446  // fusing them. We could do this by adding a "locallySameAs"
4447  // method to Map, which would return one of four states:
4448  //
4449  // a. Certainly globally the same
4450  // b. Certainly globally not the same
4451  // c. Locally the same
4452  // d. Locally not the same
4453  //
4454  // The first two states don't require further communication.
4455  // The latter two states require an all-reduce to communicate
4456  // globally, but we only need one all-reduce, since we only need
4457  // to check whether at least one of the Maps is wrong.
4458  const bool domainMapsMatch =
4459  this->staticGraph_->getDomainMap ()->isSameAs (*domainMap);
4460  const bool rangeMapsMatch =
4461  this->staticGraph_->getRangeMap ()->isSameAs (*rangeMap);
4462 
4463  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4464  (! domainMapsMatch, std::runtime_error,
4465  "The CrsMatrix's domain Map does not match the graph's domain Map. "
4466  "The graph cannot be changed because it was given to the CrsMatrix "
4467  "constructor as const. You can fix this by passing in the graph's "
4468  "domain Map and range Map to the matrix's fillComplete call.");
4469 
4470  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4471  (! rangeMapsMatch, std::runtime_error,
4472  "The CrsMatrix's range Map does not match the graph's range Map. "
4473  "The graph cannot be changed because it was given to the CrsMatrix "
4474  "constructor as const. You can fix this by passing in the graph's "
4475  "domain Map and range Map to the matrix's fillComplete call.");
4476 #endif // HAVE_TPETRA_DEBUG
4477 
4478  // The matrix does _not_ own the graph, and the graph's
4479  // structure is already fixed, so just fill the local matrix.
4480  this->fillLocalMatrix (params);
4481  }
4482  else {
4483  Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
4484  // Set the graph's domain and range Maps. This will clear the
4485  // Import if the domain Map has changed (is a different
4486  // pointer), and the Export if the range Map has changed (is a
4487  // different pointer).
4488  this->myGraph_->setDomainRangeMaps (domainMap, rangeMap);
4489 
4490  // Make the graph's column Map, if necessary.
4491  Teuchos::Array<int> remotePIDs (0);
4492  const bool mustBuildColMap = ! this->hasColMap ();
4493  if (mustBuildColMap) {
4494  this->myGraph_->makeColMap (remotePIDs);
4495  }
4496 
4497  // Make indices local, if necessary. The method won't do
4498  // anything if the graph is already locally indexed.
4499  const std::pair<size_t, std::string> makeIndicesLocalResult =
4500  this->myGraph_->makeIndicesLocal(verbose);
4501  // TODO (mfh 20 Jul 2017) Instead of throwing here, pass along
4502  // the error state to makeImportExport
4503  // which may do all-reduces and thus may
4504  // have the opportunity to communicate that error state.
4505  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4506  (makeIndicesLocalResult.first != 0, std::runtime_error,
4507  makeIndicesLocalResult.second);
4508 
4509  const bool sorted = this->myGraph_->isSorted ();
4510  const bool merged = this->myGraph_->isMerged ();
4511  this->sortAndMergeIndicesAndValues (sorted, merged);
4512 
4513  // Make Import and Export objects, if they haven't been made
4514  // already. If we made a column Map above, reuse information
4515  // from that process to avoid communiation in the Import setup.
4516  this->myGraph_->makeImportExport (remotePIDs, mustBuildColMap);
4517 
4518  // The matrix _does_ own the graph, so fill the local graph at
4519  // the same time as the local matrix.
4520  this->fillLocalGraphAndMatrix (params);
4521 
4522  const bool callGraphComputeGlobalConstants = params.get () == nullptr ||
4523  params->get ("compute global constants", true);
4524  if (callGraphComputeGlobalConstants) {
4525  this->myGraph_->computeGlobalConstants ();
4526  }
4527  else {
4528  this->myGraph_->computeLocalConstants ();
4529  }
4530  this->myGraph_->fillComplete_ = true;
4531  this->myGraph_->checkInternalState ();
4532  }
4533 
4534  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4535 
4536  this->fillComplete_ = true; // Now we're fill complete!
4537  {
4538  Details::ProfilingRegion region_cis(
4539  "Tpetra::CrsMatrix::fillCompete", "checkInternalState"
4540  );
4541  this->checkInternalState ();
4542  }
4543  } //fillComplete(domainMap, rangeMap, params)
4544 
4545  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4546  void
4548  expertStaticFillComplete (const Teuchos::RCP<const map_type> & domainMap,
4549  const Teuchos::RCP<const map_type> & rangeMap,
4550  const Teuchos::RCP<const import_type>& importer,
4551  const Teuchos::RCP<const export_type>& exporter,
4552  const Teuchos::RCP<Teuchos::ParameterList> &params)
4553  {
4554 #ifdef HAVE_TPETRA_MMM_TIMINGS
4555  std::string label;
4556  if(!params.is_null())
4557  label = params->get("Timer Label",label);
4558  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
4559  using Teuchos::TimeMonitor;
4560 
4561  Teuchos::TimeMonitor all(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-all")));
4562 #endif
4563 
4564  const char tfecfFuncName[] = "expertStaticFillComplete: ";
4565  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( ! isFillActive() || isFillComplete(),
4566  std::runtime_error, "Matrix fill state must be active (isFillActive() "
4567  "must be true) before calling fillComplete().");
4568  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4569  myGraph_.is_null (), std::logic_error, "myGraph_ is null. This is not allowed.");
4570 
4571  {
4572 #ifdef HAVE_TPETRA_MMM_TIMINGS
4573  Teuchos::TimeMonitor graph(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-Graph")));
4574 #endif
4575  // We will presume globalAssemble is not needed, so we do the ESFC on the graph
4576  myGraph_->expertStaticFillComplete (domainMap, rangeMap, importer, exporter,params);
4577  }
4578 
4579  {
4580 #ifdef HAVE_TPETRA_MMM_TIMINGS
4581  TimeMonitor fLGAM(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-fLGAM")));
4582 #endif
4583  // Fill the local graph and matrix
4584  fillLocalGraphAndMatrix (params);
4585  }
4586  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4587 
4588  // Now we're fill complete!
4589  fillComplete_ = true;
4590 
4591  // Sanity checks at the end.
4592 #ifdef HAVE_TPETRA_DEBUG
4593  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
4594  ": We're at the end of fillComplete(), but isFillActive() is true. "
4595  "Please report this bug to the Tpetra developers.");
4596  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete(), std::logic_error,
4597  ": We're at the end of fillComplete(), but isFillActive() is true. "
4598  "Please report this bug to the Tpetra developers.");
4599 #endif // HAVE_TPETRA_DEBUG
4600  {
4601 #ifdef HAVE_TPETRA_MMM_TIMINGS
4602  Teuchos::TimeMonitor cIS(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-M-cIS")));
4603 #endif
4604 
4605  checkInternalState();
4606  }
4607  }
4608 
4609  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4611  mergeRowIndicesAndValues (size_t rowLen, LocalOrdinal* cols, impl_scalar_type* vals)
4612  {
4613  impl_scalar_type* rowValueIter = vals;
4614  // beg,end define a half-exclusive interval over which to iterate.
4615  LocalOrdinal* beg = cols;
4616  LocalOrdinal* end = cols + rowLen;
4617  LocalOrdinal* newend = beg;
4618  if (beg != end) {
4619  LocalOrdinal* cur = beg + 1;
4620  impl_scalar_type* vcur = rowValueIter + 1;
4621  impl_scalar_type* vend = rowValueIter;
4622  cur = beg+1;
4623  while (cur != end) {
4624  if (*cur != *newend) {
4625  // new entry; save it
4626  ++newend;
4627  ++vend;
4628  (*newend) = (*cur);
4629  (*vend) = (*vcur);
4630  }
4631  else {
4632  // old entry; merge it
4633  //(*vend) = f (*vend, *vcur);
4634  (*vend) += *vcur;
4635  }
4636  ++cur;
4637  ++vcur;
4638  }
4639  ++newend; // one past the last entry, per typical [beg,end) semantics
4640  }
4641  return newend - beg;
4642  }
4643 
4644  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4645  void
4647  sortAndMergeIndicesAndValues (const bool sorted, const bool merged)
4648  {
4649  using ::Tpetra::Details::ProfilingRegion;
4650  typedef LocalOrdinal LO;
4651  typedef typename Kokkos::View<LO*, device_type>::HostMirror::execution_space
4652  host_execution_space;
4653  typedef Kokkos::RangePolicy<host_execution_space, LO> range_type;
4654  const char tfecfFuncName[] = "sortAndMergeIndicesAndValues: ";
4655  ProfilingRegion regionSAM ("Tpetra::CrsMatrix::sortAndMergeIndicesAndValues");
4656 
4657  if (! sorted || ! merged) {
4658  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4659  (this->isStaticGraph (), std::runtime_error, "Cannot sort or merge with "
4660  "\"static\" (const) graph, since the matrix does not own the graph.");
4661  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4662  (this->myGraph_.is_null (), std::logic_error, "myGraph_ is null, but "
4663  "this matrix claims ! isStaticGraph(). "
4664  "Please report this bug to the Tpetra developers.");
4665  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4666  (this->isStorageOptimized (), std::logic_error, "It is invalid to call "
4667  "this method if the graph's storage has already been optimized. "
4668  "Please report this bug to the Tpetra developers.");
4669 
4670  crs_graph_type& graph = * (this->myGraph_);
4671  const LO lclNumRows = static_cast<LO> (this->getLocalNumRows ());
4672  size_t totalNumDups = 0;
4673  {
4674  //Accessing host unpacked (4-array CRS) local matrix.
4675  auto rowBegins_ = graph.getRowPtrsUnpackedHost();
4676  auto rowLengths_ = graph.k_numRowEntries_;
4677  auto vals_ = this->valuesUnpacked_wdv.getHostView(Access::ReadWrite);
4678  auto cols_ = graph.lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
4679  Kokkos::parallel_reduce ("sortAndMergeIndicesAndValues", range_type (0, lclNumRows),
4680  [=] (const LO lclRow, size_t& numDups) {
4681  size_t rowBegin = rowBegins_(lclRow);
4682  size_t rowLen = rowLengths_(lclRow);
4683  LO* cols = cols_.data() + rowBegin;
4684  impl_scalar_type* vals = vals_.data() + rowBegin;
4685  if (! sorted) {
4686  sort2 (cols, cols + rowLen, vals);
4687  }
4688  if (! merged) {
4689  size_t newRowLength = mergeRowIndicesAndValues (rowLen, cols, vals);
4690  rowLengths_(lclRow) = newRowLength;
4691  numDups += rowLen - newRowLength;
4692  }
4693  }, totalNumDups);
4694  }
4695  if (! sorted) {
4696  graph.indicesAreSorted_ = true; // we just sorted every row
4697  }
4698  if (! merged) {
4699  graph.noRedundancies_ = true; // we just merged every row
4700  }
4701  }
4702  }
4703 
4704  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4705  void
4709  Scalar alpha,
4710  Scalar beta) const
4711  {
4713  using Teuchos::RCP;
4714  using Teuchos::rcp;
4715  using Teuchos::rcp_const_cast;
4716  using Teuchos::rcpFromRef;
4717  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4718  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one ();
4719 
4720  // mfh 05 Jun 2014: Special case for alpha == 0. I added this to
4721  // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
4722  // failing only for the Kokkos refactor version of Tpetra. It's a
4723  // good idea regardless to have the bypass.
4724  if (alpha == ZERO) {
4725  if (beta == ZERO) {
4726  Y_in.putScalar (ZERO);
4727  } else if (beta != ONE) {
4728  Y_in.scale (beta);
4729  }
4730  return;
4731  }
4732 
4733  // It's possible that X is a view of Y or vice versa. We don't
4734  // allow this (apply() requires that X and Y not alias one
4735  // another), but it's helpful to detect and work around this case.
4736  // We don't try to to detect the more subtle cases (e.g., one is a
4737  // subview of the other, but their initial pointers differ). We
4738  // only need to do this if this matrix's Import is trivial;
4739  // otherwise, we don't actually apply the operator from X into Y.
4740 
4741  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4742  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4743 
4744  // If beta == 0, then the output MV will be overwritten; none of
4745  // its entries should be read. (Sparse BLAS semantics say that we
4746  // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
4747  // This matters if we need to do an Export operation; see below.
4748  const bool Y_is_overwritten = (beta == ZERO);
4749 
4750  // We treat the case of a replicated MV output specially.
4751  const bool Y_is_replicated =
4752  (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4753 
4754  // This is part of the special case for replicated MV output.
4755  // We'll let each process do its thing, but do an all-reduce at
4756  // the end to sum up the results. Setting beta=0 on all processes
4757  // but Proc 0 makes the math work out for the all-reduce. (This
4758  // assumes that the replicated data is correctly replicated, so
4759  // that the data are the same on all processes.)
4760  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4761  beta = ZERO;
4762  }
4763 
4764  // Temporary MV for Import operation. After the block of code
4765  // below, this will be an (Imported if necessary) column Map MV
4766  // ready to give to localApply(...).
4767  RCP<const MV> X_colMap;
4768  if (importer.is_null ()) {
4769  if (! X_in.isConstantStride ()) {
4770  // Not all sparse mat-vec kernels can handle an input MV with
4771  // nonconstant stride correctly, so we have to copy it in that
4772  // case into a constant stride MV. To make a constant stride
4773  // copy of X_in, we force creation of the column (== domain)
4774  // Map MV (if it hasn't already been created, else fetch the
4775  // cached copy). This avoids creating a new MV each time.
4776  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
4777  Tpetra::deep_copy (*X_colMapNonConst, X_in);
4778  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4779  }
4780  else {
4781  // The domain and column Maps are the same, so do the local
4782  // multiply using the domain Map input MV X_in.
4783  X_colMap = rcpFromRef (X_in);
4784  }
4785  }
4786  else { // need to Import source (multi)vector
4787  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply: Import");
4788 
4789  // We're doing an Import anyway, which will copy the relevant
4790  // elements of the domain Map MV X_in into a separate column Map
4791  // MV. Thus, we don't have to worry whether X_in is constant
4792  // stride.
4793  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
4794 
4795  // Import from the domain Map MV to the column Map MV.
4796  X_colMapNonConst->doImport (X_in, *importer, INSERT);
4797  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4798  }
4799 
4800  // Temporary MV for doExport (if needed), or for copying a
4801  // nonconstant stride output MV into a constant stride MV. This
4802  // is null if we don't need the temporary MV, that is, if the
4803  // Export is trivial (null).
4804  RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
4805 
4806  // If we have a nontrivial Export object, we must perform an
4807  // Export. In that case, the local multiply result will go into
4808  // the row Map multivector. We don't have to make a
4809  // constant-stride version of Y_in in this case, because we had to
4810  // make a constant stride Y_rowMap MV and do an Export anyway.
4811  if (! exporter.is_null ()) {
4812  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, ZERO);
4813  {
4814  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply: Export");
4815 
4816  // If we're overwriting the output MV Y_in completely (beta ==
4817  // 0), then make sure that it is filled with zeros before we
4818  // do the Export. Otherwise, the ADD combine mode will use
4819  // data in Y_in, which is supposed to be zero.
4820  if (Y_is_overwritten) {
4821  Y_in.putScalar (ZERO);
4822  }
4823  else {
4824  // Scale output MV by beta, so that doExport sums in the
4825  // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
4826  Y_in.scale (beta);
4827  }
4828  // Do the Export operation.
4829  Y_in.doExport (*Y_rowMap, *exporter, ADD_ASSIGN);
4830  }
4831  }
4832  else { // Don't do an Export: row Map and range Map are the same.
4833  //
4834  // If Y_in does not have constant stride, or if the column Map
4835  // MV aliases Y_in, then we can't let the kernel write directly
4836  // to Y_in. Instead, we have to use the cached row (== range)
4837  // Map MV as temporary storage.
4838  //
4839  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4840  // the user passed in the same MultiVector for both X and Y. It
4841  // won't detect whether one MultiVector views the other. We
4842  // should also check the MultiVectors' raw data pointers.
4843  if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
4844  // Force creating the MV if it hasn't been created already.
4845  // This will reuse a previously created cached MV.
4846  Y_rowMap = getRowMapMultiVector (Y_in, true);
4847 
4848  // If beta == 0, we don't need to copy Y_in into Y_rowMap,
4849  // since we're overwriting it anyway.
4850  if (beta != ZERO) {
4851  Tpetra::deep_copy (*Y_rowMap, Y_in);
4852  }
4853  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, beta);
4854  Tpetra::deep_copy (Y_in, *Y_rowMap);
4855  }
4856  else {
4857  this->localApply (*X_colMap, Y_in, Teuchos::NO_TRANS, alpha, beta);
4858  }
4859  }
4860 
4861  // If the range Map is a locally replicated Map, sum up
4862  // contributions from each process. We set beta = 0 on all
4863  // processes but Proc 0 initially, so this will handle the scaling
4864  // factor beta correctly.
4865  if (Y_is_replicated) {
4866  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply: Reduce Y");
4867  Y_in.reduce ();
4868  }
4869  }
4870 
4871  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4872  void
4876  const Teuchos::ETransp mode,
4877  Scalar alpha,
4878  Scalar beta) const
4879  {
4881  using Teuchos::null;
4882  using Teuchos::RCP;
4883  using Teuchos::rcp;
4884  using Teuchos::rcp_const_cast;
4885  using Teuchos::rcpFromRef;
4886  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4887 
4888  // Take shortcuts for alpha == 0.
4889  if (alpha == ZERO) {
4890  // Follow the Sparse BLAS convention by ignoring both the matrix
4891  // and X_in, in this case.
4892  if (beta == ZERO) {
4893  // Follow the Sparse BLAS convention by overwriting any Inf or
4894  // NaN values in Y_in, in this case.
4895  Y_in.putScalar (ZERO);
4896  }
4897  else {
4898  Y_in.scale (beta);
4899  }
4900  return;
4901  }
4902 
4903  const size_t numVectors = X_in.getNumVectors ();
4904 
4905  // We don't allow X_in and Y_in to alias one another. It's hard
4906  // to check this, because advanced users could create views from
4907  // raw pointers. However, if X_in and Y_in reference the same
4908  // object, we will do the user a favor by copying X into new
4909  // storage (with a warning). We only need to do this if we have
4910  // trivial importers; otherwise, we don't actually apply the
4911  // operator from X into Y.
4912  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4913  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4914  // access X indirectly, in case we need to create temporary storage
4915  RCP<const MV> X;
4916 
4917  // some parameters for below
4918  const bool Y_is_replicated = (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4919  const bool Y_is_overwritten = (beta == ZERO);
4920  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4921  beta = ZERO;
4922  }
4923 
4924  // The kernels do not allow input or output with nonconstant stride.
4925  if (! X_in.isConstantStride () && importer.is_null ()) {
4926  X = rcp (new MV (X_in, Teuchos::Copy)); // Constant-stride copy of X_in
4927  } else {
4928  X = rcpFromRef (X_in); // Reference to X_in
4929  }
4930 
4931  // Set up temporary multivectors for Import and/or Export.
4932  if (importer != Teuchos::null) {
4933  if (importMV_ != Teuchos::null && importMV_->getNumVectors() != numVectors) {
4934  importMV_ = null;
4935  }
4936  if (importMV_ == null) {
4937  importMV_ = rcp (new MV (this->getColMap (), numVectors));
4938  }
4939  }
4940  if (exporter != Teuchos::null) {
4941  if (exportMV_ != Teuchos::null && exportMV_->getNumVectors() != numVectors) {
4942  exportMV_ = null;
4943  }
4944  if (exportMV_ == null) {
4945  exportMV_ = rcp (new MV (this->getRowMap (), numVectors));
4946  }
4947  }
4948 
4949  // If we have a non-trivial exporter, we must import elements that
4950  // are permuted or are on other processors.
4951  if (! exporter.is_null ()) {
4952  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply (transpose): Import");
4953  exportMV_->doImport (X_in, *exporter, INSERT);
4954  X = exportMV_; // multiply out of exportMV_
4955  }
4956 
4957  // If we have a non-trivial importer, we must export elements that
4958  // are permuted or belong to other processors. We will compute
4959  // solution into the to-be-exported MV; get a view.
4960  if (importer != Teuchos::null) {
4961  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply (transpose): Export");
4962 
4963  // FIXME (mfh 18 Apr 2015) Temporary fix suggested by Clark
4964  // Dohrmann on Fri 17 Apr 2015. At some point, we need to go
4965  // back and figure out why this helps. importMV_ SHOULD be
4966  // completely overwritten in the localApply(...) call
4967  // below, because beta == ZERO there.
4968  importMV_->putScalar (ZERO);
4969  // Do the local computation.
4970  this->localApply (*X, *importMV_, mode, alpha, ZERO);
4971 
4972  if (Y_is_overwritten) {
4973  Y_in.putScalar (ZERO);
4974  } else {
4975  Y_in.scale (beta);
4976  }
4977  Y_in.doExport (*importMV_, *importer, ADD_ASSIGN);
4978  }
4979  // otherwise, multiply into Y
4980  else {
4981  // can't multiply in-situ; can't multiply into non-strided multivector
4982  //
4983  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4984  // the user passed in the same MultiVector for both X and Y. It
4985  // won't detect whether one MultiVector views the other. We
4986  // should also check the MultiVectors' raw data pointers.
4987  if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
4988  // Make a deep copy of Y_in, into which to write the multiply result.
4989  MV Y (Y_in, Teuchos::Copy);
4990  this->localApply (*X, Y, mode, alpha, beta);
4991  Tpetra::deep_copy (Y_in, Y);
4992  } else {
4993  this->localApply (*X, Y_in, mode, alpha, beta);
4994  }
4995  }
4996 
4997  // If the range Map is a locally replicated map, sum the
4998  // contributions from each process. (That's why we set beta=0
4999  // above for all processes but Proc 0.)
5000  if (Y_is_replicated) {
5001  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply (transpose): Reduce Y");
5002  Y_in.reduce ();
5003  }
5004  }
5005 
5006  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5007  void
5011  const Teuchos::ETransp mode,
5012  const Scalar& alpha,
5013  const Scalar& beta) const
5014  {
5016  using Teuchos::NO_TRANS;
5017  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localApply");
5018 
5019  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
5020  auto Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
5021  auto matrix_lcl = getLocalMultiplyOperator();
5022 
5023  const bool debug = ::Tpetra::Details::Behavior::debug ();
5024  if (debug) {
5025  const char tfecfFuncName[] = "localApply: ";
5026  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5027  (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
5028  "X.getNumVectors() = " << X.getNumVectors () << " != "
5029  "Y.getNumVectors() = " << Y.getNumVectors () << ".");
5030  const bool transpose = (mode != Teuchos::NO_TRANS);
5031  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5032  (! transpose && X.getLocalLength () !=
5033  getColMap ()->getLocalNumElements (), std::runtime_error,
5034  "NO_TRANS case: X has the wrong number of local rows. "
5035  "X.getLocalLength() = " << X.getLocalLength () << " != "
5036  "getColMap()->getLocalNumElements() = " <<
5037  getColMap ()->getLocalNumElements () << ".");
5038  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5039  (! transpose && Y.getLocalLength () !=
5040  getRowMap ()->getLocalNumElements (), std::runtime_error,
5041  "NO_TRANS case: Y has the wrong number of local rows. "
5042  "Y.getLocalLength() = " << Y.getLocalLength () << " != "
5043  "getRowMap()->getLocalNumElements() = " <<
5044  getRowMap ()->getLocalNumElements () << ".");
5045  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5046  (transpose && X.getLocalLength () !=
5047  getRowMap ()->getLocalNumElements (), std::runtime_error,
5048  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5049  "rows. X.getLocalLength() = " << X.getLocalLength ()
5050  << " != getRowMap()->getLocalNumElements() = "
5051  << getRowMap ()->getLocalNumElements () << ".");
5052  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5053  (transpose && Y.getLocalLength () !=
5054  getColMap ()->getLocalNumElements (), std::runtime_error,
5055  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5056  "rows. Y.getLocalLength() = " << Y.getLocalLength ()
5057  << " != getColMap()->getLocalNumElements() = "
5058  << getColMap ()->getLocalNumElements () << ".");
5059  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5060  (! isFillComplete (), std::runtime_error, "The matrix is not "
5061  "fill complete. You must call fillComplete() (possibly with "
5062  "domain and range Map arguments) without an intervening "
5063  "resumeFill() call before you may call this method.");
5064  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5065  (! X.isConstantStride () || ! Y.isConstantStride (),
5066  std::runtime_error, "X and Y must be constant stride.");
5067  // If the two pointers are null, then they don't alias one
5068  // another, even though they are equal.
5069  // Kokkos does not guarantee that zero row-extent vectors
5070  // point to different places, so we have to check that too.
5071  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5072  (X_lcl.data () == Y_lcl.data () && X_lcl.data () != nullptr
5073  && X_lcl.extent(0) != 0,
5074  std::runtime_error, "X and Y may not alias one another.");
5075  }
5076 
5077  LocalOrdinal nrows = getLocalNumRows();
5078  LocalOrdinal maxRowImbalance = 0;
5079  if(nrows != 0)
5080  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5081 
5082  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5083  matrix_lcl->applyImbalancedRows (X_lcl, Y_lcl, mode, alpha, beta);
5084  else
5085  matrix_lcl->apply (X_lcl, Y_lcl, mode, alpha, beta);
5086  }
5087 
5088  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5089  void
5093  Teuchos::ETransp mode,
5094  Scalar alpha,
5095  Scalar beta) const
5096  {
5098  const char fnName[] = "Tpetra::CrsMatrix::apply";
5099 
5100  TEUCHOS_TEST_FOR_EXCEPTION
5101  (! isFillComplete (), std::runtime_error,
5102  fnName << ": Cannot call apply() until fillComplete() "
5103  "has been called.");
5104 
5105  if (mode == Teuchos::NO_TRANS) {
5106  ProfilingRegion regionNonTranspose (fnName);
5107  this->applyNonTranspose (X, Y, alpha, beta);
5108  }
5109  else {
5110  ProfilingRegion regionTranspose ("Tpetra::CrsMatrix::apply (transpose)");
5111 
5112  //Thyra was implicitly assuming that Y gets set to zero / or is overwritten
5113  //when bets==0. This was not the case with transpose in a multithreaded
5114  //environment where a multiplication with subsequent atomic_adds is used
5115  //since 0 is effectively not special cased. Doing the explicit set to zero here
5116  //This catches cases where Y is nan or inf.
5117  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
5118  if (beta == ZERO) {
5119  Y.putScalar (ZERO);
5120  }
5121  this->applyTranspose (X, Y, mode, alpha, beta);
5122  }
5123  }
5124 
5125 
5126  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5127  template<class T>
5128  Teuchos::RCP<CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> >
5130  convert () const
5131  {
5132  using Teuchos::RCP;
5133  typedef CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> output_matrix_type;
5134  const char tfecfFuncName[] = "convert: ";
5135 
5136  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5137  (! this->isFillComplete (), std::runtime_error, "This matrix (the source "
5138  "of the conversion) is not fill complete. You must first call "
5139  "fillComplete() (possibly with the domain and range Map) without an "
5140  "intervening call to resumeFill(), before you may call this method.");
5141 
5142  RCP<output_matrix_type> newMatrix
5143  (new output_matrix_type (this->getCrsGraph ()));
5144  // Copy old values into new values. impl_scalar_type and T may
5145  // differ, so we can't use Kokkos::deep_copy.
5147  copyConvert (newMatrix->getLocalMatrixDevice ().values,
5148  this->getLocalMatrixDevice ().values);
5149  // Since newmat has a static (const) graph, the graph already has
5150  // a column Map, and Import and Export objects already exist (if
5151  // applicable). Thus, calling fillComplete is cheap.
5152  newMatrix->fillComplete (this->getDomainMap (), this->getRangeMap ());
5153 
5154  return newMatrix;
5155  }
5156 
5157 
5158  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5159  void
5162  {
5163  const bool debug = ::Tpetra::Details::Behavior::debug ("CrsGraph");
5164  if (debug) {
5165  const char tfecfFuncName[] = "checkInternalState: ";
5166  const char err[] = "Internal state is not consistent. "
5167  "Please report this bug to the Tpetra developers.";
5168 
5169  // This version of the graph (RCP<const crs_graph_type>) must
5170  // always be nonnull.
5171  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5172  (staticGraph_.is_null (), std::logic_error, err);
5173  // myGraph == null means that the matrix has a const ("static")
5174  // graph. Otherwise, the matrix has a dynamic graph (it owns its
5175  // graph).
5176  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5177  (! myGraph_.is_null () && myGraph_ != staticGraph_,
5178  std::logic_error, err);
5179  // if matrix is fill complete, then graph must be fill complete
5180  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5181  (isFillComplete () && ! staticGraph_->isFillComplete (),
5182  std::logic_error, err << " Specifically, the matrix is fill complete, "
5183  "but its graph is NOT fill complete.");
5184  // if values are allocated and they are non-zero in number, then
5185  // one of the allocations should be present
5186  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5187  (staticGraph_->indicesAreAllocated () &&
5188  staticGraph_->getLocalAllocationSize() > 0 &&
5189  staticGraph_->getLocalNumRows() > 0 &&
5190  valuesUnpacked_wdv.extent (0) == 0,
5191  std::logic_error, err);
5192  }
5193  }
5194 
5195  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5196  std::string
5199  {
5200  std::ostringstream os;
5201 
5202  os << "Tpetra::CrsMatrix (Kokkos refactor): {";
5203  if (this->getObjectLabel () != "") {
5204  os << "Label: \"" << this->getObjectLabel () << "\", ";
5205  }
5206  if (isFillComplete ()) {
5207  os << "isFillComplete: true"
5208  << ", global dimensions: [" << getGlobalNumRows () << ", "
5209  << getGlobalNumCols () << "]"
5210  << ", global number of entries: " << getGlobalNumEntries ()
5211  << "}";
5212  }
5213  else {
5214  os << "isFillComplete: false"
5215  << ", global dimensions: [" << getGlobalNumRows () << ", "
5216  << getGlobalNumCols () << "]}";
5217  }
5218  return os.str ();
5219  }
5220 
5221  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5222  void
5224  describe (Teuchos::FancyOStream &out,
5225  const Teuchos::EVerbosityLevel verbLevel) const
5226  {
5227  using std::endl;
5228  using std::setw;
5229  using Teuchos::ArrayView;
5230  using Teuchos::Comm;
5231  using Teuchos::RCP;
5232  using Teuchos::TypeNameTraits;
5233  using Teuchos::VERB_DEFAULT;
5234  using Teuchos::VERB_NONE;
5235  using Teuchos::VERB_LOW;
5236  using Teuchos::VERB_MEDIUM;
5237  using Teuchos::VERB_HIGH;
5238  using Teuchos::VERB_EXTREME;
5239 
5240  const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
5241 
5242  if (vl == VERB_NONE) {
5243  return; // Don't print anything at all
5244  }
5245 
5246  // By convention, describe() always begins with a tab.
5247  Teuchos::OSTab tab0 (out);
5248 
5249  RCP<const Comm<int> > comm = this->getComm();
5250  const int myRank = comm->getRank();
5251  const int numProcs = comm->getSize();
5252  size_t width = 1;
5253  for (size_t dec=10; dec<getGlobalNumRows(); dec *= 10) {
5254  ++width;
5255  }
5256  width = std::max<size_t> (width, static_cast<size_t> (11)) + 2;
5257 
5258  // none: print nothing
5259  // low: print O(1) info from node 0
5260  // medium: print O(P) info, num entries per process
5261  // high: print O(N) info, num entries per row
5262  // extreme: print O(NNZ) info: print indices and values
5263  //
5264  // for medium and higher, print constituent objects at specified verbLevel
5265  if (myRank == 0) {
5266  out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
5267  }
5268  Teuchos::OSTab tab1 (out);
5269 
5270  if (myRank == 0) {
5271  if (this->getObjectLabel () != "") {
5272  out << "Label: \"" << this->getObjectLabel () << "\", ";
5273  }
5274  {
5275  out << "Template parameters:" << endl;
5276  Teuchos::OSTab tab2 (out);
5277  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
5278  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
5279  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
5280  << "Node: " << TypeNameTraits<Node>::name () << endl;
5281  }
5282  if (isFillComplete()) {
5283  out << "isFillComplete: true" << endl
5284  << "Global dimensions: [" << getGlobalNumRows () << ", "
5285  << getGlobalNumCols () << "]" << endl
5286  << "Global number of entries: " << getGlobalNumEntries () << endl
5287  << endl << "Global max number of entries in a row: "
5288  << getGlobalMaxNumRowEntries () << endl;
5289  }
5290  else {
5291  out << "isFillComplete: false" << endl
5292  << "Global dimensions: [" << getGlobalNumRows () << ", "
5293  << getGlobalNumCols () << "]" << endl;
5294  }
5295  }
5296 
5297  if (vl < VERB_MEDIUM) {
5298  return; // all done!
5299  }
5300 
5301  // Describe the row Map.
5302  if (myRank == 0) {
5303  out << endl << "Row Map:" << endl;
5304  }
5305  if (getRowMap ().is_null ()) {
5306  if (myRank == 0) {
5307  out << "null" << endl;
5308  }
5309  }
5310  else {
5311  if (myRank == 0) {
5312  out << endl;
5313  }
5314  getRowMap ()->describe (out, vl);
5315  }
5316 
5317  // Describe the column Map.
5318  if (myRank == 0) {
5319  out << "Column Map: ";
5320  }
5321  if (getColMap ().is_null ()) {
5322  if (myRank == 0) {
5323  out << "null" << endl;
5324  }
5325  } else if (getColMap () == getRowMap ()) {
5326  if (myRank == 0) {
5327  out << "same as row Map" << endl;
5328  }
5329  } else {
5330  if (myRank == 0) {
5331  out << endl;
5332  }
5333  getColMap ()->describe (out, vl);
5334  }
5335 
5336  // Describe the domain Map.
5337  if (myRank == 0) {
5338  out << "Domain Map: ";
5339  }
5340  if (getDomainMap ().is_null ()) {
5341  if (myRank == 0) {
5342  out << "null" << endl;
5343  }
5344  } else if (getDomainMap () == getRowMap ()) {
5345  if (myRank == 0) {
5346  out << "same as row Map" << endl;
5347  }
5348  } else if (getDomainMap () == getColMap ()) {
5349  if (myRank == 0) {
5350  out << "same as column Map" << endl;
5351  }
5352  } else {
5353  if (myRank == 0) {
5354  out << endl;
5355  }
5356  getDomainMap ()->describe (out, vl);
5357  }
5358 
5359  // Describe the range Map.
5360  if (myRank == 0) {
5361  out << "Range Map: ";
5362  }
5363  if (getRangeMap ().is_null ()) {
5364  if (myRank == 0) {
5365  out << "null" << endl;
5366  }
5367  } else if (getRangeMap () == getDomainMap ()) {
5368  if (myRank == 0) {
5369  out << "same as domain Map" << endl;
5370  }
5371  } else if (getRangeMap () == getRowMap ()) {
5372  if (myRank == 0) {
5373  out << "same as row Map" << endl;
5374  }
5375  } else {
5376  if (myRank == 0) {
5377  out << endl;
5378  }
5379  getRangeMap ()->describe (out, vl);
5380  }
5381 
5382  // O(P) data
5383  for (int curRank = 0; curRank < numProcs; ++curRank) {
5384  if (myRank == curRank) {
5385  out << "Process rank: " << curRank << endl;
5386  Teuchos::OSTab tab2 (out);
5387  if (! staticGraph_->indicesAreAllocated ()) {
5388  out << "Graph indices not allocated" << endl;
5389  }
5390  else {
5391  out << "Number of allocated entries: "
5392  << staticGraph_->getLocalAllocationSize () << endl;
5393  }
5394  out << "Number of entries: " << getLocalNumEntries () << endl
5395  << "Max number of entries per row: " << getLocalMaxNumRowEntries ()
5396  << endl;
5397  }
5398  // Give output time to complete by executing some barriers.
5399  comm->barrier ();
5400  comm->barrier ();
5401  comm->barrier ();
5402  }
5403 
5404  if (vl < VERB_HIGH) {
5405  return; // all done!
5406  }
5407 
5408  // O(N) and O(NNZ) data
5409  for (int curRank = 0; curRank < numProcs; ++curRank) {
5410  if (myRank == curRank) {
5411  out << std::setw(width) << "Proc Rank"
5412  << std::setw(width) << "Global Row"
5413  << std::setw(width) << "Num Entries";
5414  if (vl == VERB_EXTREME) {
5415  out << std::setw(width) << "(Index,Value)";
5416  }
5417  out << endl;
5418  for (size_t r = 0; r < getLocalNumRows (); ++r) {
5419  const size_t nE = getNumEntriesInLocalRow(r);
5420  GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
5421  out << std::setw(width) << myRank
5422  << std::setw(width) << gid
5423  << std::setw(width) << nE;
5424  if (vl == VERB_EXTREME) {
5425  if (isGloballyIndexed()) {
5426  global_inds_host_view_type rowinds;
5427  values_host_view_type rowvals;
5428  getGlobalRowView (gid, rowinds, rowvals);
5429  for (size_t j = 0; j < nE; ++j) {
5430  out << " (" << rowinds[j]
5431  << ", " << rowvals[j]
5432  << ") ";
5433  }
5434  }
5435  else if (isLocallyIndexed()) {
5436  local_inds_host_view_type rowinds;
5437  values_host_view_type rowvals;
5438  getLocalRowView (r, rowinds, rowvals);
5439  for (size_t j=0; j < nE; ++j) {
5440  out << " (" << getColMap()->getGlobalElement(rowinds[j])
5441  << ", " << rowvals[j]
5442  << ") ";
5443  }
5444  } // globally or locally indexed
5445  } // vl == VERB_EXTREME
5446  out << endl;
5447  } // for each row r on this process
5448  } // if (myRank == curRank)
5449 
5450  // Give output time to complete
5451  comm->barrier ();
5452  comm->barrier ();
5453  comm->barrier ();
5454  } // for each process p
5455  }
5456 
5457  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5458  bool
5461  {
5462  // It's not clear what kind of compatibility checks on sizes can
5463  // be performed here. Epetra_CrsGraph doesn't check any sizes for
5464  // compatibility.
5465 
5466  // Currently, the source object must be a RowMatrix with the same
5467  // four template parameters as the target CrsMatrix. We might
5468  // relax this requirement later.
5469  const row_matrix_type* srcRowMat =
5470  dynamic_cast<const row_matrix_type*> (&source);
5471  return (srcRowMat != nullptr);
5472  }
5473 
5474  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5475  void
5478  const typename crs_graph_type::padding_type& padding,
5479  const bool verbose)
5480  {
5482  using Details::padCrsArrays;
5483  using std::endl;
5484  using LO = local_ordinal_type;
5485  using row_ptrs_type =
5486  typename local_graph_device_type::row_map_type::non_const_type;
5487  using range_policy =
5488  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
5489  const char tfecfFuncName[] = "applyCrsPadding";
5490  const char suffix[] =
5491  ". Please report this bug to the Tpetra developers.";
5492  ProfilingRegion regionCAP("Tpetra::CrsMatrix::applyCrsPadding");
5493 
5494  std::unique_ptr<std::string> prefix;
5495  if (verbose) {
5496  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
5497  std::ostringstream os;
5498  os << *prefix << "padding: ";
5499  padding.print(os);
5500  os << endl;
5501  std::cerr << os.str();
5502  }
5503  const int myRank = ! verbose ? -1 : [&] () {
5504  auto map = this->getMap();
5505  if (map.is_null()) {
5506  return -1;
5507  }
5508  auto comm = map->getComm();
5509  if (comm.is_null()) {
5510  return -1;
5511  }
5512  return comm->getRank();
5513  } ();
5514 
5515  // NOTE (mfh 29 Jan 2020) This allocates the values array.
5516  if (! myGraph_->indicesAreAllocated()) {
5517  if (verbose) {
5518  std::ostringstream os;
5519  os << *prefix << "Call allocateIndices" << endl;
5520  std::cerr << os.str();
5521  }
5522  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
5523  }
5524 
5525  // FIXME (mfh 10 Feb 2020) We shouldn't actually reallocate
5526  // row_ptrs_beg or allocate row_ptrs_end unless the allocation
5527  // size needs to increase. That should be the job of
5528  // padCrsArrays.
5529 
5530  // Making copies here because rowPtrsUnpacked_ has a const type. Otherwise, we
5531  // would use it directly.
5532 
5533  if (verbose) {
5534  std::ostringstream os;
5535  os << *prefix << "Allocate row_ptrs_beg: "
5536  << myGraph_->getRowPtrsUnpackedHost().extent(0) << endl;
5537  std::cerr << os.str();
5538  }
5539  using Kokkos::view_alloc;
5540  using Kokkos::WithoutInitializing;
5541  row_ptrs_type row_ptr_beg(view_alloc("row_ptr_beg", WithoutInitializing),
5542  myGraph_->rowPtrsUnpacked_dev_.extent(0));
5543  // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
5544  Kokkos::deep_copy(execution_space(),row_ptr_beg, myGraph_->rowPtrsUnpacked_dev_);
5545 
5546  const size_t N = row_ptr_beg.extent(0) == 0 ? size_t(0) :
5547  size_t(row_ptr_beg.extent(0) - 1);
5548  if (verbose) {
5549  std::ostringstream os;
5550  os << *prefix << "Allocate row_ptrs_end: " << N << endl;
5551  std::cerr << os.str();
5552  }
5553  row_ptrs_type row_ptr_end(
5554  view_alloc("row_ptr_end", WithoutInitializing), N);
5555 
5556  row_ptrs_type num_row_entries_d;
5557 
5558  const bool refill_num_row_entries =
5559  myGraph_->k_numRowEntries_.extent(0) != 0;
5560 
5561  if (refill_num_row_entries) { // unpacked storage
5562  // We can't assume correct *this capture until C++17, and it's
5563  // likely more efficient just to capture what we need anyway.
5564  num_row_entries_d = create_mirror_view_and_copy(memory_space(),
5565  myGraph_->k_numRowEntries_);
5566  Kokkos::parallel_for
5567  ("Fill end row pointers", range_policy(0, N),
5568  KOKKOS_LAMBDA (const size_t i) {
5569  row_ptr_end(i) = row_ptr_beg(i) + num_row_entries_d(i);
5570  });
5571  }
5572  else {
5573  // FIXME (mfh 04 Feb 2020) Fix padCrsArrays so that if packed
5574  // storage, we don't need row_ptr_end to be separate allocation;
5575  // could just have it alias row_ptr_beg+1.
5576  Kokkos::parallel_for
5577  ("Fill end row pointers", range_policy(0, N),
5578  KOKKOS_LAMBDA (const size_t i) {
5579  row_ptr_end(i) = row_ptr_beg(i+1);
5580  });
5581  }
5582 
5583  if (myGraph_->isGloballyIndexed()) {
5584  padCrsArrays(row_ptr_beg, row_ptr_end,
5585  myGraph_->gblInds_wdv,
5586  valuesUnpacked_wdv, padding, myRank, verbose);
5587  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5588  const auto newColIndsLen = myGraph_->gblInds_wdv.extent(0);
5589  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5590  (newValuesLen != newColIndsLen, std::logic_error,
5591  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5592  << " != myGraph_->gblInds_wdv.extent(0)=" << newColIndsLen
5593  << suffix);
5594  }
5595  else {
5596  padCrsArrays(row_ptr_beg, row_ptr_end,
5597  myGraph_->lclIndsUnpacked_wdv,
5598  valuesUnpacked_wdv, padding, myRank, verbose);
5599  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5600  const auto newColIndsLen = myGraph_->lclIndsUnpacked_wdv.extent(0);
5601  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5602  (newValuesLen != newColIndsLen, std::logic_error,
5603  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5604  << " != myGraph_->lclIndsUnpacked_wdv.extent(0)=" << newColIndsLen
5605  << suffix);
5606  }
5607 
5608  if (refill_num_row_entries) {
5609  Kokkos::parallel_for
5610  ("Fill num entries", range_policy(0, N),
5611  KOKKOS_LAMBDA (const size_t i) {
5612  num_row_entries_d(i) = row_ptr_end(i) - row_ptr_beg(i);
5613  });
5614  Kokkos::deep_copy(myGraph_->k_numRowEntries_, num_row_entries_d);
5615  }
5616 
5617  if (verbose) {
5618  std::ostringstream os;
5619  os << *prefix << "Assign myGraph_->rowPtrsUnpacked_; "
5620  << "old size: " << myGraph_->rowPtrsUnpacked_host_.extent(0)
5621  << ", new size: " << row_ptr_beg.extent(0) << endl;
5622  std::cerr << os.str();
5623  TEUCHOS_ASSERT( myGraph_->getRowPtrsUnpackedHost().extent(0) ==
5624  row_ptr_beg.extent(0) );
5625  }
5626  myGraph_->setRowPtrsUnpacked(row_ptr_beg);
5627  }
5628 
5629  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5630  void
5631  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5632  copyAndPermuteStaticGraph(
5633  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5634  const size_t numSameIDs,
5635  const LocalOrdinal permuteToLIDs[],
5636  const LocalOrdinal permuteFromLIDs[],
5637  const size_t numPermutes)
5638  {
5639  using Details::ProfilingRegion;
5640  using Teuchos::Array;
5641  using Teuchos::ArrayView;
5642  using std::endl;
5643  using LO = LocalOrdinal;
5644  using GO = GlobalOrdinal;
5645  const char tfecfFuncName[] = "copyAndPermuteStaticGraph";
5646  const char suffix[] =
5647  " Please report this bug to the Tpetra developers.";
5648  ProfilingRegion regionCAP
5649  ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph");
5650 
5651  const bool debug = Details::Behavior::debug("CrsGraph");
5652  const bool verbose = Details::Behavior::verbose("CrsGraph");
5653  std::unique_ptr<std::string> prefix;
5654  if (verbose) {
5655  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5656  std::ostringstream os;
5657  os << *prefix << "Start" << endl;
5658  }
5659  const char* const prefix_raw =
5660  verbose ? prefix.get()->c_str() : nullptr;
5661 
5662  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5663  //
5664  // Copy the first numSame row from source to target (this matrix).
5665  // This involves copying rows corresponding to LIDs [0, numSame-1].
5666  //
5667  const map_type& srcRowMap = * (srcMat.getRowMap ());
5668  nonconst_global_inds_host_view_type rowInds;
5669  nonconst_values_host_view_type rowVals;
5670  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5671  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5672  // Global ID for the current row index in the source matrix.
5673  // The first numSameIDs GIDs in the two input lists are the
5674  // same, so sourceGID == targetGID in this case.
5675  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5676  const GO targetGID = sourceGID;
5677 
5678  ArrayView<const GO>rowIndsConstView;
5679  ArrayView<const Scalar> rowValsConstView;
5680 
5681  if (sourceIsLocallyIndexed) {
5682  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5683  if (rowLength > static_cast<size_t> (rowInds.size())) {
5684  Kokkos::resize(rowInds,rowLength);
5685  Kokkos::resize(rowVals,rowLength);
5686  }
5687  // Resizing invalidates an Array's views, so we must make new
5688  // ones, even if rowLength hasn't changed.
5689  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5690  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5691 
5692  // The source matrix is locally indexed, so we have to get a
5693  // copy. Really it's the GIDs that have to be copied (because
5694  // they have to be converted from LIDs).
5695  size_t checkRowLength = 0;
5696  srcMat.getGlobalRowCopy (sourceGID, rowIndsView,
5697  rowValsView, checkRowLength);
5698  if (debug) {
5699  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5700  (rowLength != checkRowLength, std::logic_error, "For "
5701  "global row index " << sourceGID << ", the source "
5702  "matrix's getNumEntriesInGlobalRow returns a row length "
5703  "of " << rowLength << ", but getGlobalRowCopy reports "
5704  "a row length of " << checkRowLength << "." << suffix);
5705  }
5706 
5707  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5708  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5709  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5710  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5711  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5712  rowIndsView.data(), rowIndsView.extent(0),
5713  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5714  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5715  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5716  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5717  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5718  // KDDKDD UVM TEMPORARY: KokkosView interface
5719  }
5720  else { // source matrix is globally indexed.
5721  global_inds_host_view_type rowIndsView;
5722  values_host_view_type rowValsView;
5723  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5724  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5725  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5726  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5727  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5728  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5729  rowIndsView.data(), rowIndsView.extent(0),
5730  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5731  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5732  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5733  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5734  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5735  // KDDKDD UVM TEMPORARY: KokkosView interface
5736 
5737  }
5738 
5739  // Applying a permutation to a matrix with a static graph
5740  // means REPLACE-ing entries.
5741  combineGlobalValues(targetGID, rowIndsConstView,
5742  rowValsConstView, REPLACE,
5743  prefix_raw, debug, verbose);
5744  }
5745 
5746  if (verbose) {
5747  std::ostringstream os;
5748  os << *prefix << "Do permutes" << endl;
5749  }
5750 
5751  const map_type& tgtRowMap = * (this->getRowMap ());
5752  for (size_t p = 0; p < numPermutes; ++p) {
5753  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5754  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5755 
5756  ArrayView<const GO> rowIndsConstView;
5757  ArrayView<const Scalar> rowValsConstView;
5758 
5759  if (sourceIsLocallyIndexed) {
5760  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5761  if (rowLength > static_cast<size_t> (rowInds.size ())) {
5762  Kokkos::resize(rowInds,rowLength);
5763  Kokkos::resize(rowVals,rowLength);
5764  }
5765  // Resizing invalidates an Array's views, so we must make new
5766  // ones, even if rowLength hasn't changed.
5767  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5768  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5769 
5770  // The source matrix is locally indexed, so we have to get a
5771  // copy. Really it's the GIDs that have to be copied (because
5772  // they have to be converted from LIDs).
5773  size_t checkRowLength = 0;
5774  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5775  rowValsView, checkRowLength);
5776  if (debug) {
5777  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5778  (rowLength != checkRowLength, std::logic_error, "For "
5779  "source matrix global row index " << sourceGID << ", "
5780  "getNumEntriesInGlobalRow returns a row length of " <<
5781  rowLength << ", but getGlobalRowCopy a row length of "
5782  << checkRowLength << "." << suffix);
5783  }
5784 
5785  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5786  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5787  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5788  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5789  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5790  rowIndsView.data(), rowIndsView.extent(0),
5791  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5792  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5793  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5794  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5795  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5796  // KDDKDD UVM TEMPORARY: KokkosView interface
5797  }
5798  else {
5799  global_inds_host_view_type rowIndsView;
5800  values_host_view_type rowValsView;
5801  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5802  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5803  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5804  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5805  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5806  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5807  rowIndsView.data(), rowIndsView.extent(0),
5808  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5809  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5810  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5811  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5812  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5813  // KDDKDD UVM TEMPORARY: KokkosView interface
5814  }
5815 
5816  combineGlobalValues(targetGID, rowIndsConstView,
5817  rowValsConstView, REPLACE,
5818  prefix_raw, debug, verbose);
5819  }
5820 
5821  if (verbose) {
5822  std::ostringstream os;
5823  os << *prefix << "Done" << endl;
5824  }
5825  }
5826 
5827  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5828  void
5829  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5830  copyAndPermuteNonStaticGraph(
5831  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5832  const size_t numSameIDs,
5833  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs_dv,
5834  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs_dv,
5835  const size_t numPermutes)
5836  {
5837  using Details::ProfilingRegion;
5838  using Teuchos::Array;
5839  using Teuchos::ArrayView;
5840  using std::endl;
5841  using LO = LocalOrdinal;
5842  using GO = GlobalOrdinal;
5843  const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph";
5844  const char suffix[] =
5845  " Please report this bug to the Tpetra developers.";
5846  ProfilingRegion regionCAP
5847  ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph");
5848 
5849  const bool debug = Details::Behavior::debug("CrsGraph");
5850  const bool verbose = Details::Behavior::verbose("CrsGraph");
5851  std::unique_ptr<std::string> prefix;
5852  if (verbose) {
5853  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5854  std::ostringstream os;
5855  os << *prefix << "Start" << endl;
5856  }
5857  const char* const prefix_raw =
5858  verbose ? prefix.get()->c_str() : nullptr;
5859 
5860  {
5861  using row_graph_type = RowGraph<LO, GO, Node>;
5862  const row_graph_type& srcGraph = *(srcMat.getGraph());
5863  auto padding =
5864  myGraph_->computeCrsPadding(srcGraph, numSameIDs,
5865  permuteToLIDs_dv, permuteFromLIDs_dv, verbose);
5866  applyCrsPadding(*padding, verbose);
5867  }
5868  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5869  //
5870  // Copy the first numSame row from source to target (this matrix).
5871  // This involves copying rows corresponding to LIDs [0, numSame-1].
5872  //
5873  const map_type& srcRowMap = * (srcMat.getRowMap ());
5874  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5875  using gids_type = nonconst_global_inds_host_view_type;
5876  using vals_type = nonconst_values_host_view_type;
5877  gids_type rowInds;
5878  vals_type rowVals;
5879  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5880  // Global ID for the current row index in the source matrix.
5881  // The first numSameIDs GIDs in the two input lists are the
5882  // same, so sourceGID == targetGID in this case.
5883  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5884  const GO targetGID = sourceGID;
5885 
5886  ArrayView<const GO> rowIndsConstView;
5887  ArrayView<const Scalar> rowValsConstView;
5888 
5889  if (sourceIsLocallyIndexed) {
5890 
5891  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5892  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5893  Kokkos::resize(rowInds,rowLength);
5894  Kokkos::resize(rowVals,rowLength);
5895  }
5896  // Resizing invalidates an Array's views, so we must make new
5897  // ones, even if rowLength hasn't changed.
5898  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5899  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5900 
5901  // The source matrix is locally indexed, so we have to get a
5902  // copy. Really it's the GIDs that have to be copied (because
5903  // they have to be converted from LIDs).
5904  size_t checkRowLength = 0;
5905  srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView,
5906  checkRowLength);
5907  if (debug) {
5908  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5909  (rowLength != checkRowLength, std::logic_error, ": For "
5910  "global row index " << sourceGID << ", the source "
5911  "matrix's getNumEntriesInGlobalRow returns a row length "
5912  "of " << rowLength << ", but getGlobalRowCopy reports "
5913  "a row length of " << checkRowLength << "." << suffix);
5914  }
5915  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5916  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
5917  }
5918  else { // source matrix is globally indexed.
5919  global_inds_host_view_type rowIndsView;
5920  values_host_view_type rowValsView;
5921  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5922 
5923  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5924  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5925  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5926  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5927  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5928  rowIndsView.data(), rowIndsView.extent(0),
5929  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5930  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5931  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5932  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5933  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5934  // KDDKDD UVM TEMPORARY: KokkosView interface
5935  }
5936 
5937  // Combine the data into the target matrix.
5938  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5939  rowValsConstView, prefix_raw, debug, verbose);
5940  }
5941 
5942  if (verbose) {
5943  std::ostringstream os;
5944  os << *prefix << "Do permutes" << endl;
5945  }
5946  const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data();
5947  const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data();
5948 
5949  const map_type& tgtRowMap = * (this->getRowMap ());
5950  for (size_t p = 0; p < numPermutes; ++p) {
5951  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5952  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5953 
5954  ArrayView<const GO> rowIndsConstView;
5955  ArrayView<const Scalar> rowValsConstView;
5956 
5957  if (sourceIsLocallyIndexed) {
5958  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5959  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5960  Kokkos::resize(rowInds,rowLength);
5961  Kokkos::resize(rowVals,rowLength);
5962  }
5963  // Resizing invalidates an Array's views, so we must make new
5964  // ones, even if rowLength hasn't changed.
5965  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5966  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5967 
5968  // The source matrix is locally indexed, so we have to get a
5969  // copy. Really it's the GIDs that have to be copied (because
5970  // they have to be converted from LIDs).
5971  size_t checkRowLength = 0;
5972  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5973  rowValsView, checkRowLength);
5974  if (debug) {
5975  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5976  (rowLength != checkRowLength, std::logic_error, "For "
5977  "source matrix global row index " << sourceGID << ", "
5978  "getNumEntriesInGlobalRow returns a row length of " <<
5979  rowLength << ", but getGlobalRowCopy a row length of "
5980  << checkRowLength << "." << suffix);
5981  }
5982  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5983  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
5984  }
5985  else {
5986  global_inds_host_view_type rowIndsView;
5987  values_host_view_type rowValsView;
5988  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5989 
5990  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5991  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5992  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5993  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5994  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5995  rowIndsView.data(), rowIndsView.extent(0),
5996  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5997  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5998  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5999  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6000  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6001  // KDDKDD UVM TEMPORARY: KokkosView interface
6002  }
6003 
6004  // Combine the data into the target matrix.
6005  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
6006  rowValsConstView, prefix_raw, debug, verbose);
6007  }
6008 
6009  if (verbose) {
6010  std::ostringstream os;
6011  os << *prefix << "Done" << endl;
6012  }
6013  }
6014 
6015  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6016  void
6019  const SrcDistObject& srcObj,
6020  const size_t numSameIDs,
6021  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
6022  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
6023  const CombineMode /*CM*/)
6024  {
6025  using Details::Behavior;
6028  using std::endl;
6029 
6030  // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
6031  const char tfecfFuncName[] = "copyAndPermute: ";
6032  ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermute");
6033 
6034  const bool verbose = Behavior::verbose("CrsMatrix");
6035  std::unique_ptr<std::string> prefix;
6036  if (verbose) {
6037  prefix = this->createPrefix("CrsMatrix", "copyAndPermute");
6038  std::ostringstream os;
6039  os << *prefix << endl
6040  << *prefix << " numSameIDs: " << numSameIDs << endl
6041  << *prefix << " numPermute: " << permuteToLIDs.extent(0)
6042  << endl
6043  << *prefix << " "
6044  << dualViewStatusToString (permuteToLIDs, "permuteToLIDs")
6045  << endl
6046  << *prefix << " "
6047  << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs")
6048  << endl
6049  << *prefix << " "
6050  << "isStaticGraph: " << (isStaticGraph() ? "true" : "false")
6051  << endl;
6052  std::cerr << os.str ();
6053  }
6054 
6055  const auto numPermute = permuteToLIDs.extent (0);
6056  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6057  (numPermute != permuteFromLIDs.extent (0),
6058  std::invalid_argument, "permuteToLIDs.extent(0) = "
6059  << numPermute << "!= permuteFromLIDs.extent(0) = "
6060  << permuteFromLIDs.extent (0) << ".");
6061 
6062  // This dynamic cast should succeed, because we've already tested
6063  // it in checkSizes().
6065  const RMT& srcMat = dynamic_cast<const RMT&> (srcObj);
6066  if (isStaticGraph ()) {
6067  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
6068  auto permuteToLIDs_h = permuteToLIDs.view_host ();
6069  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
6070  auto permuteFromLIDs_h = permuteFromLIDs.view_host ();
6071 
6072  copyAndPermuteStaticGraph(srcMat, numSameIDs,
6073  permuteToLIDs_h.data(),
6074  permuteFromLIDs_h.data(),
6075  numPermute);
6076  }
6077  else {
6078  copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs,
6079  permuteFromLIDs, numPermute);
6080  }
6081 
6082  if (verbose) {
6083  std::ostringstream os;
6084  os << *prefix << "Done" << endl;
6085  std::cerr << os.str();
6086  }
6087  }
6088 
6089  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6090  void
6093  (const SrcDistObject& source,
6094  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6095  Kokkos::DualView<char*, buffer_device_type>& exports,
6096  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6097  size_t& constantNumPackets)
6098  {
6099  using Details::Behavior;
6102  using Teuchos::outArg;
6103  using Teuchos::REDUCE_MAX;
6104  using Teuchos::reduceAll;
6105  using std::endl;
6106  typedef LocalOrdinal LO;
6107  typedef GlobalOrdinal GO;
6108  const char tfecfFuncName[] = "packAndPrepare: ";
6109  ProfilingRegion regionPAP ("Tpetra::CrsMatrix::packAndPrepare");
6110 
6111  const bool debug = Behavior::debug("CrsMatrix");
6112  const bool verbose = Behavior::verbose("CrsMatrix");
6113 
6114  // Processes on which the communicator is null should not participate.
6115  Teuchos::RCP<const Teuchos::Comm<int> > pComm = this->getComm ();
6116  if (pComm.is_null ()) {
6117  return;
6118  }
6119  const Teuchos::Comm<int>& comm = *pComm;
6120  const int myRank = comm.getSize ();
6121 
6122  std::unique_ptr<std::string> prefix;
6123  if (verbose) {
6124  prefix = this->createPrefix("CrsMatrix", "packAndPrepare");
6125  std::ostringstream os;
6126  os << *prefix << "Start" << endl
6127  << *prefix << " "
6128  << dualViewStatusToString (exportLIDs, "exportLIDs")
6129  << endl
6130  << *prefix << " "
6131  << dualViewStatusToString (exports, "exports")
6132  << endl
6133  << *prefix << " "
6134  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6135  << endl;
6136  std::cerr << os.str ();
6137  }
6138 
6139  // Attempt to cast the source object to CrsMatrix. If successful,
6140  // use the source object's packNew() method to pack its data for
6141  // communication. Otherwise, attempt to cast to RowMatrix; if
6142  // successful, use the source object's pack() method. Otherwise,
6143  // the source object doesn't have the right type.
6144  //
6145  // FIXME (mfh 30 Jun 2013, 11 Sep 2017) We don't even need the
6146  // RowMatrix to have the same Node type. Unfortunately, we don't
6147  // have a way to ask if the RowMatrix is "a RowMatrix with any
6148  // Node type," since RowMatrix doesn't have a base class. A
6149  // hypothetical RowMatrixBase<Scalar, LO, GO> class, which does
6150  // not currently exist, would satisfy this requirement.
6151  //
6152  // Why RowMatrixBase<Scalar, LO, GO>? The source object's Scalar
6153  // type doesn't technically need to match the target object's
6154  // Scalar type, so we could just have RowMatrixBase<LO, GO>. LO
6155  // and GO need not be the same, as long as there is no overflow of
6156  // the indices. However, checking for index overflow is global
6157  // and therefore undesirable.
6158 
6159  std::ostringstream msg; // for collecting error messages
6160  int lclBad = 0; // to be set below
6161 
6162  using crs_matrix_type = CrsMatrix<Scalar, LO, GO, Node>;
6163  const crs_matrix_type* srcCrsMat =
6164  dynamic_cast<const crs_matrix_type*> (&source);
6165  if (srcCrsMat != nullptr) {
6166  if (verbose) {
6167  std::ostringstream os;
6168  os << *prefix << "Source matrix same (CrsMatrix) type as target; "
6169  "calling packNew" << endl;
6170  std::cerr << os.str ();
6171  }
6172  try {
6173  srcCrsMat->packNew (exportLIDs, exports, numPacketsPerLID,
6174  constantNumPackets);
6175  }
6176  catch (std::exception& e) {
6177  lclBad = 1;
6178  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6179  }
6180  }
6181  else {
6182  using Kokkos::HostSpace;
6183  using Kokkos::subview;
6184  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6185  using range_type = Kokkos::pair<size_t, size_t>;
6186 
6187  if (verbose) {
6188  std::ostringstream os;
6189  os << *prefix << "Source matrix NOT same (CrsMatrix) type as target"
6190  << endl;
6191  std::cerr << os.str ();
6192  }
6193 
6194  const row_matrix_type* srcRowMat =
6195  dynamic_cast<const row_matrix_type*> (&source);
6196  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6197  (srcRowMat == nullptr, std::invalid_argument,
6198  "The source object of the Import or Export operation is neither a "
6199  "CrsMatrix (with the same template parameters as the target object), "
6200  "nor a RowMatrix (with the same first four template parameters as the "
6201  "target object).");
6202 
6203  // For the RowMatrix case, we need to convert from
6204  // Kokkos::DualView to Teuchos::Array*. This doesn't need to be
6205  // so terribly efficient, since packing a non-CrsMatrix
6206  // RowMatrix for Import/Export into a CrsMatrix is not a
6207  // critical case. Thus, we may allocate Teuchos::Array objects
6208  // here and copy to and from Kokkos::*View.
6209 
6210  // View exportLIDs's host data as a Teuchos::ArrayView.
6211  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6212  auto exportLIDs_h = exportLIDs.view_host ();
6213  Teuchos::ArrayView<const LO> exportLIDs_av (exportLIDs_h.data (),
6214  exportLIDs_h.size ());
6215 
6216  // pack() will allocate exports_a as needed. We'll copy back
6217  // into exports (after (re)allocating exports if needed) below.
6218  Teuchos::Array<char> exports_a;
6219 
6220  // View exportLIDs' host data as a Teuchos::ArrayView. We don't
6221  // need to sync, since we're doing write-only access, but we do
6222  // need to mark the DualView as modified on host.
6223 
6224  numPacketsPerLID.clear_sync_state (); // write-only access
6225  numPacketsPerLID.modify_host ();
6226  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6227  Teuchos::ArrayView<size_t> numPacketsPerLID_av (numPacketsPerLID_h.data (),
6228  numPacketsPerLID_h.size ());
6229 
6230  // Invoke RowMatrix's legacy pack() interface, using above
6231  // Teuchos::Array* objects.
6232  try {
6233  srcRowMat->pack (exportLIDs_av, exports_a, numPacketsPerLID_av,
6234  constantNumPackets);
6235  }
6236  catch (std::exception& e) {
6237  lclBad = 1;
6238  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6239  }
6240 
6241  // Allocate 'exports', and copy exports_a back into it.
6242  const size_t newAllocSize = static_cast<size_t> (exports_a.size ());
6243  if (static_cast<size_t> (exports.extent (0)) < newAllocSize) {
6244  const std::string oldLabel = exports.d_view.label ();
6245  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6246  exports = exports_type (newLabel, newAllocSize);
6247  }
6248  // It's safe to assume that we're working on host anyway, so
6249  // just keep exports sync'd to host.
6250  // ignore current device contents
6251  exports.modify_host();
6252 
6253  auto exports_h = exports.view_host ();
6254  auto exports_h_sub = subview (exports_h, range_type (0, newAllocSize));
6255 
6256  // Kokkos::deep_copy needs a Kokkos::View input, so turn
6257  // exports_a into a nonowning Kokkos::View first before copying.
6258  typedef typename exports_type::t_host::execution_space HES;
6259  typedef Kokkos::Device<HES, HostSpace> host_device_type;
6260  Kokkos::View<const char*, host_device_type>
6261  exports_a_kv (exports_a.getRawPtr (), newAllocSize);
6262  // DEEP_COPY REVIEW - NOT TESTED
6263  Kokkos::deep_copy (exports_h_sub, exports_a_kv);
6264  }
6265 
6266  if (debug) {
6267  int gblBad = 0; // output argument; to be set below
6268  reduceAll<int, int> (comm, REDUCE_MAX, lclBad, outArg (gblBad));
6269  if (gblBad != 0) {
6270  Tpetra::Details::gathervPrint (std::cerr, msg.str (), comm);
6271  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6272  (true, std::logic_error, "packNew() or pack() threw an exception on "
6273  "one or more participating processes.");
6274  }
6275  }
6276  else {
6277  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6278  (lclBad != 0, std::logic_error, "packNew threw an exception on one "
6279  "or more participating processes. Here is this process' error "
6280  "message: " << msg.str ());
6281  }
6282 
6283  if (verbose) {
6284  std::ostringstream os;
6285  os << *prefix << "packAndPrepare: Done!" << endl
6286  << *prefix << " "
6287  << dualViewStatusToString (exportLIDs, "exportLIDs")
6288  << endl
6289  << *prefix << " "
6290  << dualViewStatusToString (exports, "exports")
6291  << endl
6292  << *prefix << " "
6293  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6294  << endl;
6295  std::cerr << os.str ();
6296  }
6297  }
6298 
6299  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6300  size_t
6301  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6302  packRow (char exports[],
6303  const size_t offset,
6304  const size_t numEnt,
6305  const GlobalOrdinal gidsIn[],
6306  const impl_scalar_type valsIn[],
6307  const size_t numBytesPerValue) const
6308  {
6309  using Kokkos::View;
6310  using Kokkos::subview;
6312  typedef LocalOrdinal LO;
6313  typedef GlobalOrdinal GO;
6314  typedef impl_scalar_type ST;
6315 
6316  if (numEnt == 0) {
6317  // Empty rows always take zero bytes, to ensure sparsity.
6318  return 0;
6319  }
6320 
6321  const GO gid = 0; // packValueCount wants this
6322  const LO numEntLO = static_cast<size_t> (numEnt);
6323 
6324  const size_t numEntBeg = offset;
6325  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
6326  const size_t gidsBeg = numEntBeg + numEntLen;
6327  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6328  const size_t valsBeg = gidsBeg + gidsLen;
6329  const size_t valsLen = numEnt * numBytesPerValue;
6330