Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_CrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_CRSMATRIX_DEF_HPP
41 #define TPETRA_CRSMATRIX_DEF_HPP
42 
50 
51 #include "Tpetra_Import_Util.hpp"
52 #include "Tpetra_Import_Util2.hpp"
53 #include "Tpetra_RowMatrix.hpp"
54 #include "Tpetra_LocalCrsMatrixOperator.hpp"
55 
62 #include "Tpetra_Details_getDiagCopyWithoutOffsets.hpp"
70 #include "Tpetra_Details_packCrsMatrix.hpp"
71 #include "Tpetra_Details_unpackCrsMatrixAndCombine.hpp"
73 #include "Teuchos_FancyOStream.hpp"
74 #include "Teuchos_RCP.hpp"
75 #include "Teuchos_DataAccess.hpp"
76 #include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
77 #include "KokkosBlas1_scal.hpp"
78 #include "KokkosSparse_getDiagCopy.hpp"
79 #include "KokkosSparse_spmv.hpp"
80 
81 #include <memory>
82 #include <sstream>
83 #include <typeinfo>
84 #include <utility>
85 #include <vector>
86 
87 namespace Tpetra {
88 
89 namespace { // (anonymous)
90 
91  template<class T, class BinaryFunction>
92  T atomic_binary_function_update (volatile T* const dest,
93  const T& inputVal,
94  BinaryFunction f)
95  {
96  T oldVal = *dest;
97  T assume;
98 
99  // NOTE (mfh 30 Nov 2015) I do NOT need a fence here for IBM
100  // POWER architectures, because 'newval' depends on 'assume',
101  // which depends on 'oldVal', which depends on '*dest'. This
102  // sets up a chain of read dependencies that should ensure
103  // correct behavior given a sane memory model.
104  do {
105  assume = oldVal;
106  T newVal = f (assume, inputVal);
107  oldVal = Kokkos::atomic_compare_exchange (dest, assume, newVal);
108  } while (assume != oldVal);
109 
110  return oldVal;
111  }
112 } // namespace (anonymous)
113 
114 //
115 // Users must never rely on anything in the Details namespace.
116 //
117 namespace Details {
118 
128 template<class Scalar>
129 struct AbsMax {
131  Scalar operator() (const Scalar& x, const Scalar& y) {
132  typedef Teuchos::ScalarTraits<Scalar> STS;
133  return std::max (STS::magnitude (x), STS::magnitude (y));
134  }
135 };
136 
137 } // namespace Details
138 } // namespace Tpetra
139 
140 namespace Tpetra {
141 
142  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
143  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
144  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
145  size_t maxNumEntriesPerRow,
146  const Teuchos::RCP<Teuchos::ParameterList>& params) :
147  dist_object_type (rowMap)
148  {
149  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, size_t "
150  "[, RCP<ParameterList>]): ";
151  Teuchos::RCP<crs_graph_type> graph;
152  try {
153  graph = Teuchos::rcp (new crs_graph_type (rowMap, maxNumEntriesPerRow,
154  params));
155  }
156  catch (std::exception& e) {
157  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
158  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
159  "size_t [, RCP<ParameterList>]) threw an exception: "
160  << e.what ());
161  }
162  // myGraph_ not null means that the matrix owns the graph. That's
163  // different than the const CrsGraph constructor, where the matrix
164  // does _not_ own the graph.
165  myGraph_ = graph;
166  staticGraph_ = myGraph_;
167  resumeFill (params);
169  }
170 
171  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
173  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
174  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
175  const Teuchos::RCP<Teuchos::ParameterList>& params) :
176  dist_object_type (rowMap)
177  {
178  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
179  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
180  Teuchos::RCP<crs_graph_type> graph;
181  try {
182  using Teuchos::rcp;
183  graph = rcp(new crs_graph_type(rowMap, numEntPerRowToAlloc,
184  params));
185  }
186  catch (std::exception& e) {
187  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
188  (true, std::runtime_error, "CrsGraph constructor "
189  "(RCP<const Map>, ArrayView<const size_t>"
190  "[, RCP<ParameterList>]) threw an exception: "
191  << e.what ());
192  }
193  // myGraph_ not null means that the matrix owns the graph. That's
194  // different than the const CrsGraph constructor, where the matrix
195  // does _not_ own the graph.
196  myGraph_ = graph;
197  staticGraph_ = graph;
198  resumeFill (params);
200  }
201 
202  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
204  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
205  const Teuchos::RCP<const map_type>& colMap,
206  const size_t maxNumEntPerRow,
207  const Teuchos::RCP<Teuchos::ParameterList>& params) :
208  dist_object_type (rowMap)
209  {
210  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
211  "RCP<const Map>, size_t[, RCP<ParameterList>]): ";
212  const char suffix[] =
213  " Please report this bug to the Tpetra developers.";
214 
215  // An artifact of debugging something a while back.
216  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
217  (! staticGraph_.is_null (), std::logic_error,
218  "staticGraph_ is not null at the beginning of the constructor."
219  << suffix);
220  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
221  (! myGraph_.is_null (), std::logic_error,
222  "myGraph_ is not null at the beginning of the constructor."
223  << suffix);
224  Teuchos::RCP<crs_graph_type> graph;
225  try {
226  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
227  maxNumEntPerRow,
228  params));
229  }
230  catch (std::exception& e) {
231  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
232  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
233  "RCP<const Map>, size_t[, RCP<ParameterList>]) threw an "
234  "exception: " << e.what ());
235  }
236  // myGraph_ not null means that the matrix owns the graph. That's
237  // different than the const CrsGraph constructor, where the matrix
238  // does _not_ own the graph.
239  myGraph_ = graph;
240  staticGraph_ = myGraph_;
241  resumeFill (params);
243  }
244 
245  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
247  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
248  const Teuchos::RCP<const map_type>& colMap,
249  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
250  const Teuchos::RCP<Teuchos::ParameterList>& params) :
251  dist_object_type (rowMap)
252  {
253  const char tfecfFuncName[] =
254  "CrsMatrix(RCP<const Map>, RCP<const Map>, "
255  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
256  Teuchos::RCP<crs_graph_type> graph;
257  try {
258  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
259  numEntPerRowToAlloc,
260  params));
261  }
262  catch (std::exception& e) {
263  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
264  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
265  "RCP<const Map>, ArrayView<const size_t>[, "
266  "RCP<ParameterList>]) threw an exception: " << e.what ());
267  }
268  // myGraph_ not null means that the matrix owns the graph. That's
269  // different than the const CrsGraph constructor, where the matrix
270  // does _not_ own the graph.
271  myGraph_ = graph;
272  staticGraph_ = graph;
273  resumeFill (params);
275  }
276 
277 
278  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
280  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
281  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
282  dist_object_type (graph->getRowMap ()),
283  staticGraph_ (graph),
284  storageStatus_ (Details::STORAGE_1D_PACKED)
285  {
286  using std::endl;
287  typedef typename local_matrix_device_type::values_type values_type;
288  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>[, "
289  "RCP<ParameterList>]): ";
290  const bool verbose = Details::Behavior::verbose("CrsMatrix");
291 
292  std::unique_ptr<std::string> prefix;
293  if (verbose) {
294  prefix = this->createPrefix("CrsMatrix", "CrsMatrix(graph,params)");
295  std::ostringstream os;
296  os << *prefix << "Start" << endl;
297  std::cerr << os.str ();
298  }
299 
300  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
301  (graph.is_null (), std::runtime_error, "Input graph is null.");
302  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
303  (! graph->isFillComplete (), std::runtime_error, "Input graph "
304  "is not fill complete. You must call fillComplete on the "
305  "graph before using it to construct a CrsMatrix. Note that "
306  "calling resumeFill on the graph makes it not fill complete, "
307  "even if you had previously called fillComplete. In that "
308  "case, you must call fillComplete on the graph again.");
309 
310  // The graph is fill complete, so it is locally indexed and has a
311  // fixed structure. This means we can allocate the (1-D) array of
312  // values and build the local matrix right now. Note that the
313  // local matrix's number of columns comes from the column Map, not
314  // the domain Map.
315 
316  const size_t numEnt = graph->lclIndsPacked_wdv.extent (0);
317  if (verbose) {
318  std::ostringstream os;
319  os << *prefix << "Allocate values: " << numEnt << endl;
320  std::cerr << os.str ();
321  }
322 
323  values_type val ("Tpetra::CrsMatrix::values", numEnt);
324  valuesPacked_wdv = values_wdv_type(val);
325  valuesUnpacked_wdv = valuesPacked_wdv;
326 
328 
329  if (verbose) {
330  std::ostringstream os;
331  os << *prefix << "Done" << endl;
332  std::cerr << os.str ();
333  }
334  }
335 
336  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
339  const Teuchos::RCP<const crs_graph_type>& graph,
340  const Teuchos::RCP<Teuchos::ParameterList>& params) :
341  dist_object_type (graph->getRowMap ()),
342  staticGraph_ (graph),
343  storageStatus_ (matrix.storageStatus_)
344  {
345  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
346  "local_matrix_device_type::values_type, "
347  "[,RCP<ParameterList>]): ";
348  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
349  (graph.is_null (), std::runtime_error, "Input graph is null.");
350  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
351  (! graph->isFillComplete (), std::runtime_error, "Input graph "
352  "is not fill complete. You must call fillComplete on the "
353  "graph before using it to construct a CrsMatrix. Note that "
354  "calling resumeFill on the graph makes it not fill complete, "
355  "even if you had previously called fillComplete. In that "
356  "case, you must call fillComplete on the graph again.");
357 
358  size_t numValuesPacked = graph->lclIndsPacked_wdv.extent(0);
359  valuesPacked_wdv = values_wdv_type(matrix.valuesPacked_wdv, 0, numValuesPacked);
360 
361  size_t numValuesUnpacked = graph->lclIndsUnpacked_wdv.extent(0);
362  valuesUnpacked_wdv = values_wdv_type(matrix.valuesUnpacked_wdv, 0, numValuesUnpacked);
363 
365  }
366 
367 
368  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
370  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
371  const typename local_matrix_device_type::values_type& values,
372  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
373  dist_object_type (graph->getRowMap ()),
374  staticGraph_ (graph),
375  storageStatus_ (Details::STORAGE_1D_PACKED)
376  {
377  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
378  "local_matrix_device_type::values_type, "
379  "[,RCP<ParameterList>]): ";
380  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
381  (graph.is_null (), std::runtime_error, "Input graph is null.");
382  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
383  (! graph->isFillComplete (), std::runtime_error, "Input graph "
384  "is not fill complete. You must call fillComplete on the "
385  "graph before using it to construct a CrsMatrix. Note that "
386  "calling resumeFill on the graph makes it not fill complete, "
387  "even if you had previously called fillComplete. In that "
388  "case, you must call fillComplete on the graph again.");
389 
390  // The graph is fill complete, so it is locally indexed and has a
391  // fixed structure. This means we can allocate the (1-D) array of
392  // values and build the local matrix right now. Note that the
393  // local matrix's number of columns comes from the column Map, not
394  // the domain Map.
395 
396  valuesPacked_wdv = values_wdv_type(values);
397  valuesUnpacked_wdv = valuesPacked_wdv;
398 
399  // FIXME (22 Jun 2016) I would very much like to get rid of
400  // k_values1D_ at some point. I find it confusing to have all
401  // these extra references lying around.
402  // KDDKDD ALMOST THERE, MARK!
403 // k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
404 
406  }
407 
408  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
410  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
411  const Teuchos::RCP<const map_type>& colMap,
412  const typename local_graph_device_type::row_map_type& rowPointers,
413  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
414  const typename local_matrix_device_type::values_type& values,
415  const Teuchos::RCP<Teuchos::ParameterList>& params) :
416  dist_object_type (rowMap),
417  storageStatus_ (Details::STORAGE_1D_PACKED)
418  {
419  using Details::getEntryOnHost;
420  using Teuchos::RCP;
421  using std::endl;
422  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
423  "RCP<const Map>, ptr, ind, val[, params]): ";
424  const char suffix[] =
425  ". Please report this bug to the Tpetra developers.";
426  const bool debug = Details::Behavior::debug("CrsMatrix");
427  const bool verbose = Details::Behavior::verbose("CrsMatrix");
428 
429  std::unique_ptr<std::string> prefix;
430  if (verbose) {
431  prefix = this->createPrefix(
432  "CrsMatrix", "CrsMatrix(rowMap,colMap,ptr,ind,val[,params])");
433  std::ostringstream os;
434  os << *prefix << "Start" << endl;
435  std::cerr << os.str ();
436  }
437 
438  // Check the user's input. Note that this might throw only on
439  // some processes but not others, causing deadlock. We prefer
440  // deadlock due to exceptions to segfaults, because users can
441  // catch exceptions.
442  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
443  (values.extent(0) != columnIndices.extent(0),
444  std::invalid_argument, "values.extent(0)=" << values.extent(0)
445  << " != columnIndices.extent(0) = " << columnIndices.extent(0)
446  << ".");
447  if (debug && rowPointers.extent(0) != 0) {
448  const size_t numEnt =
449  getEntryOnHost(rowPointers, rowPointers.extent(0) - 1);
450  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
451  (numEnt != size_t(columnIndices.extent(0)) ||
452  numEnt != size_t(values.extent(0)),
453  std::invalid_argument, "Last entry of rowPointers says that "
454  "the matrix has " << numEnt << " entr"
455  << (numEnt != 1 ? "ies" : "y") << ", but the dimensions of "
456  "columnIndices and values don't match this. "
457  "columnIndices.extent(0)=" << columnIndices.extent (0)
458  << " and values.extent(0)=" << values.extent (0) << ".");
459  }
460 
461  RCP<crs_graph_type> graph;
462  try {
463  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, rowPointers,
464  columnIndices, params));
465  }
466  catch (std::exception& e) {
467  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
468  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
469  "RCP<const Map>, ptr, ind[, params]) threw an exception: "
470  << e.what ());
471  }
472  // The newly created CrsGraph _must_ have a local graph at this
473  // point. We don't really care whether CrsGraph's constructor
474  // deep-copies or shallow-copies the input, but the dimensions
475  // have to be right. That's how we tell whether the CrsGraph has
476  // a local graph.
477  auto lclGraph = graph->getLocalGraphDevice ();
478  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
479  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
480  lclGraph.entries.extent (0) != columnIndices.extent (0),
481  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, "
482  "ind[, params]) did not set the local graph correctly." << suffix);
483  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
484  (lclGraph.entries.extent (0) != values.extent (0),
485  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, ind[, "
486  "params]) did not set the local graph correctly. "
487  "lclGraph.entries.extent(0) = " << lclGraph.entries.extent (0)
488  << " != values.extent(0) = " << values.extent (0) << suffix);
489 
490  // myGraph_ not null means that the matrix owns the graph. This
491  // is true because the column indices come in as nonconst,
492  // implying shared ownership.
493  myGraph_ = graph;
494  staticGraph_ = graph;
495 
496  // The graph may not be fill complete yet. However, it is locally
497  // indexed (since we have a column Map) and has a fixed structure
498  // (due to the input arrays). This means we can allocate the
499  // (1-D) array of values and build the local matrix right now.
500  // Note that the local matrix's number of columns comes from the
501  // column Map, not the domain Map.
502 
503  valuesPacked_wdv = values_wdv_type(values);
504  valuesUnpacked_wdv = valuesPacked_wdv;
505 
506  // FIXME (22 Jun 2016) I would very much like to get rid of
507  // k_values1D_ at some point. I find it confusing to have all
508  // these extra references lying around.
509 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
510 
512  if (verbose) {
513  std::ostringstream os;
514  os << *prefix << "Done" << endl;
515  std::cerr << os.str();
516  }
517  }
518 
519  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
521  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
522  const Teuchos::RCP<const map_type>& colMap,
523  const Teuchos::ArrayRCP<size_t>& ptr,
524  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
525  const Teuchos::ArrayRCP<Scalar>& val,
526  const Teuchos::RCP<Teuchos::ParameterList>& params) :
527  dist_object_type (rowMap),
528  storageStatus_ (Details::STORAGE_1D_PACKED)
529  {
530  using Kokkos::Compat::getKokkosViewDeepCopy;
531  using Teuchos::av_reinterpret_cast;
532  using Teuchos::RCP;
533  using values_type = typename local_matrix_device_type::values_type;
534  using IST = impl_scalar_type;
535  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
536  "RCP<const Map>, ptr, ind, val[, params]): ";
537 
538  RCP<crs_graph_type> graph;
539  try {
540  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, ptr,
541  ind, params));
542  }
543  catch (std::exception& e) {
544  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
545  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
546  "RCP<const Map>, ArrayRCP<size_t>, ArrayRCP<LocalOrdinal>[, "
547  "RCP<ParameterList>]) threw an exception: " << e.what ());
548  }
549  // myGraph_ not null means that the matrix owns the graph. This
550  // is true because the column indices come in as nonconst,
551  // implying shared ownership.
552  myGraph_ = graph;
553  staticGraph_ = graph;
554 
555  // The graph may not be fill complete yet. However, it is locally
556  // indexed (since we have a column Map) and has a fixed structure
557  // (due to the input arrays). This means we can allocate the
558  // (1-D) array of values and build the local matrix right now.
559  // Note that the local matrix's number of columns comes from the
560  // column Map, not the domain Map.
561 
562  // The graph _must_ have a local graph at this point. We don't
563  // really care whether CrsGraph's constructor deep-copies or
564  // shallow-copies the input, but the dimensions have to be right.
565  // That's how we tell whether the CrsGraph has a local graph.
566  auto lclGraph = staticGraph_->getLocalGraphDevice ();
567  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
568  (size_t (lclGraph.row_map.extent (0)) != size_t (ptr.size ()) ||
569  size_t (lclGraph.entries.extent (0)) != size_t (ind.size ()),
570  std::logic_error, "CrsGraph's constructor (rowMap, colMap, "
571  "ptr, ind[, params]) did not set the local graph correctly. "
572  "Please report this bug to the Tpetra developers.");
573 
574  values_type valIn =
575  getKokkosViewDeepCopy<device_type> (av_reinterpret_cast<IST> (val ()));
576  valuesPacked_wdv = values_wdv_type(valIn);
577  valuesUnpacked_wdv = valuesPacked_wdv;
578 
579  // FIXME (22 Jun 2016) I would very much like to get rid of
580  // k_values1D_ at some point. I find it confusing to have all
581  // these extra references lying around.
582 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
583 
585  }
586 
587  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
589  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
590  const Teuchos::RCP<const map_type>& colMap,
591  const local_matrix_device_type& lclMatrix,
592  const Teuchos::RCP<Teuchos::ParameterList>& params) :
593  dist_object_type (rowMap),
594  storageStatus_ (Details::STORAGE_1D_PACKED),
595  fillComplete_ (true)
596  {
597  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
598  "RCP<const Map>, local_matrix_device_type[, RCP<ParameterList>]): ";
599  const char suffix[] =
600  " Please report this bug to the Tpetra developers.";
601 
602  Teuchos::RCP<crs_graph_type> graph;
603  try {
604  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
605  lclMatrix.graph, params));
606  }
607  catch (std::exception& e) {
608  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
609  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
610  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) threw an "
611  "exception: " << e.what ());
612  }
613  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
614  (!graph->isFillComplete (), std::logic_error, "CrsGraph constructor (RCP"
615  "<const Map>, RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) "
616  "did not produce a fill-complete graph. Please report this bug to the "
617  "Tpetra developers.");
618  // myGraph_ not null means that the matrix owns the graph. This
619  // is true because the column indices come in as nonconst through
620  // the matrix, implying shared ownership.
621  myGraph_ = graph;
622  staticGraph_ = graph;
623 
624  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
625  valuesUnpacked_wdv = valuesPacked_wdv;
626 
627  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
628  (isFillActive (), std::logic_error,
629  "At the end of a CrsMatrix constructor that should produce "
630  "a fillComplete matrix, isFillActive() is true." << suffix);
631  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
632  (! isFillComplete (), std::logic_error, "At the end of a "
633  "CrsMatrix constructor that should produce a fillComplete "
634  "matrix, isFillComplete() is false." << suffix);
636  }
637 
638  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
641  const Teuchos::RCP<const map_type>& rowMap,
642  const Teuchos::RCP<const map_type>& colMap,
643  const Teuchos::RCP<const map_type>& domainMap,
644  const Teuchos::RCP<const map_type>& rangeMap,
645  const Teuchos::RCP<Teuchos::ParameterList>& params) :
646  dist_object_type (rowMap),
647  storageStatus_ (Details::STORAGE_1D_PACKED),
648  fillComplete_ (true)
649  {
650  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
651  "RCP<const Map>, RCP<const Map>, RCP<const Map>, "
652  "local_matrix_device_type[, RCP<ParameterList>]): ";
653  const char suffix[] =
654  " Please report this bug to the Tpetra developers.";
655 
656  Teuchos::RCP<crs_graph_type> graph;
657  try {
658  graph = Teuchos::rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
659  domainMap, rangeMap, params));
660  }
661  catch (std::exception& e) {
662  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
663  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
664  "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_device_type[, "
665  "RCP<ParameterList>]) threw an exception: " << e.what ());
666  }
667  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
668  (! graph->isFillComplete (), std::logic_error, "CrsGraph "
669  "constructor (RCP<const Map>, RCP<const Map>, RCP<const Map>, "
670  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) did "
671  "not produce a fillComplete graph." << suffix);
672  // myGraph_ not null means that the matrix owns the graph. This
673  // is true because the column indices come in as nonconst through
674  // the matrix, implying shared ownership.
675  myGraph_ = graph;
676  staticGraph_ = graph;
677 
678  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
679  valuesUnpacked_wdv = valuesPacked_wdv;
680 
681  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
682  (isFillActive (), std::logic_error,
683  "At the end of a CrsMatrix constructor that should produce "
684  "a fillComplete matrix, isFillActive() is true." << suffix);
685  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
686  (! isFillComplete (), std::logic_error, "At the end of a "
687  "CrsMatrix constructor that should produce a fillComplete "
688  "matrix, isFillComplete() is false." << suffix);
690  }
691 
692  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
695  const Teuchos::RCP<const map_type>& rowMap,
696  const Teuchos::RCP<const map_type>& colMap,
697  const Teuchos::RCP<const map_type>& domainMap,
698  const Teuchos::RCP<const map_type>& rangeMap,
699  const Teuchos::RCP<const import_type>& importer,
700  const Teuchos::RCP<const export_type>& exporter,
701  const Teuchos::RCP<Teuchos::ParameterList>& params) :
702  dist_object_type (rowMap),
703  storageStatus_ (Details::STORAGE_1D_PACKED),
704  fillComplete_ (true)
705  {
706  using Teuchos::rcp;
707  const char tfecfFuncName[] = "Tpetra::CrsMatrix"
708  "(lclMat,Map,Map,Map,Map,Import,Export,params): ";
709  const char suffix[] =
710  " Please report this bug to the Tpetra developers.";
711 
712  Teuchos::RCP<crs_graph_type> graph;
713  try {
714  graph = rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
715  domainMap, rangeMap, importer,
716  exporter, params));
717  }
718  catch (std::exception& e) {
719  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
720  (true, std::runtime_error, "CrsGraph constructor "
721  "(local_graph_device_type, Map, Map, Map, Map, Import, Export, "
722  "params) threw: " << e.what ());
723  }
724  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
725  (!graph->isFillComplete (), std::logic_error, "CrsGraph "
726  "constructor (local_graph_device_type, Map, Map, Map, Map, Import, "
727  "Export, params) did not produce a fill-complete graph. "
728  "Please report this bug to the Tpetra developers.");
729  // myGraph_ not null means that the matrix owns the graph. This
730  // is true because the column indices come in as nonconst through
731  // the matrix, implying shared ownership.
732  myGraph_ = graph;
733  staticGraph_ = graph;
734 
735  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
736  valuesUnpacked_wdv = valuesPacked_wdv;
737 
738  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
739  (isFillActive (), std::logic_error,
740  "At the end of a CrsMatrix constructor that should produce "
741  "a fillComplete matrix, isFillActive() is true." << suffix);
742  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
743  (! isFillComplete (), std::logic_error, "At the end of a "
744  "CrsMatrix constructor that should produce a fillComplete "
745  "matrix, isFillComplete() is false." << suffix);
747  }
748 
749  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
752  const Teuchos::DataAccess copyOrView):
753  dist_object_type (source.getCrsGraph()->getRowMap ()),
754  staticGraph_ (source.getCrsGraph()),
755  storageStatus_ (source.storageStatus_)
756  {
757  const char tfecfFuncName[] = "Tpetra::CrsMatrix("
758  "const CrsMatrix&, const Teuchos::DataAccess): ";
759  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
760  (! source.isFillComplete (), std::invalid_argument,
761  "Source graph must be fillComplete().");
762 
763  if (copyOrView == Teuchos::Copy) {
764  using values_type = typename local_matrix_device_type::values_type;
765  auto vals = source.getLocalValuesDevice (Access::ReadOnly);
766  using Kokkos::view_alloc;
767  using Kokkos::WithoutInitializing;
768  values_type newvals (view_alloc ("val", WithoutInitializing),
769  vals.extent (0));
770  // DEEP_COPY REVIEW - DEVICE-TO_DEVICE
771  Kokkos::deep_copy (newvals, vals);
772  valuesPacked_wdv = values_wdv_type(newvals);
773  valuesUnpacked_wdv = valuesPacked_wdv;
774  fillComplete (source.getDomainMap (), source.getRangeMap ());
775  }
776  else if (copyOrView == Teuchos::View) {
777  valuesPacked_wdv = values_wdv_type(source.valuesPacked_wdv);
778  valuesUnpacked_wdv = values_wdv_type(source.valuesUnpacked_wdv);
779  fillComplete (source.getDomainMap (), source.getRangeMap ());
780  }
781  else {
782  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
783  (true, std::invalid_argument, "Second argument 'copyOrView' "
784  "has an invalid value " << copyOrView << ". Valid values "
785  "include Teuchos::Copy = " << Teuchos::Copy << " and "
786  "Teuchos::View = " << Teuchos::View << ".");
787  }
789  }
790 
791  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
792  void
795  {
796  std::swap(crs_matrix.importMV_, this->importMV_);
797  std::swap(crs_matrix.exportMV_, this->exportMV_);
798  std::swap(crs_matrix.staticGraph_, this->staticGraph_);
799  std::swap(crs_matrix.myGraph_, this->myGraph_);
800  std::swap(crs_matrix.valuesPacked_wdv, this->valuesPacked_wdv);
801  std::swap(crs_matrix.valuesUnpacked_wdv, this->valuesUnpacked_wdv);
802  std::swap(crs_matrix.storageStatus_, this->storageStatus_);
803  std::swap(crs_matrix.fillComplete_, this->fillComplete_);
804  std::swap(crs_matrix.nonlocals_, this->nonlocals_);
805  }
806 
807  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
808  Teuchos::RCP<const Teuchos::Comm<int> >
810  getComm () const {
811  return getCrsGraphRef ().getComm ();
812  }
813 
814  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
815  bool
817  isFillComplete () const {
818  return fillComplete_;
819  }
820 
821  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
822  bool
824  isFillActive () const {
825  return ! fillComplete_;
826  }
827 
828  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
829  bool
832  return this->getCrsGraphRef ().isStorageOptimized ();
833  }
834 
835  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
836  bool
839  return getCrsGraphRef ().isLocallyIndexed ();
840  }
841 
842  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
843  bool
846  return getCrsGraphRef ().isGloballyIndexed ();
847  }
848 
849  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
850  bool
852  hasColMap () const {
853  return getCrsGraphRef ().hasColMap ();
854  }
855 
856  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
860  return getCrsGraphRef ().getGlobalNumEntries ();
861  }
862 
863  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
864  size_t
867  return getCrsGraphRef ().getLocalNumEntries ();
868  }
869 
870  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
874  return getCrsGraphRef ().getGlobalNumRows ();
875  }
876 
877  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
881  return getCrsGraphRef ().getGlobalNumCols ();
882  }
883 
884  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
885  size_t
888  return getCrsGraphRef ().getLocalNumRows ();
889  }
890 
891 
892  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
893  size_t
896  return getCrsGraphRef ().getLocalNumCols ();
897  }
898 
899 
900  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
901  size_t
903  getNumEntriesInGlobalRow (GlobalOrdinal globalRow) const {
904  return getCrsGraphRef ().getNumEntriesInGlobalRow (globalRow);
905  }
906 
907  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
908  size_t
910  getNumEntriesInLocalRow (LocalOrdinal localRow) const {
911  return getCrsGraphRef ().getNumEntriesInLocalRow (localRow);
912  }
913 
914  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
915  size_t
918  return getCrsGraphRef ().getGlobalMaxNumRowEntries ();
919  }
920 
921  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
922  size_t
925  return getCrsGraphRef ().getLocalMaxNumRowEntries ();
926  }
927 
928  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
929  GlobalOrdinal
931  getIndexBase () const {
932  return getRowMap ()->getIndexBase ();
933  }
934 
935  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
936  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
938  getRowMap () const {
939  return getCrsGraphRef ().getRowMap ();
940  }
941 
942  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
943  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
945  getColMap () const {
946  return getCrsGraphRef ().getColMap ();
947  }
948 
949  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
950  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
952  getDomainMap () const {
953  return getCrsGraphRef ().getDomainMap ();
954  }
955 
956  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
957  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
959  getRangeMap () const {
960  return getCrsGraphRef ().getRangeMap ();
961  }
962 
963  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
964  Teuchos::RCP<const RowGraph<LocalOrdinal, GlobalOrdinal, Node> >
966  getGraph () const {
967  if (staticGraph_ != Teuchos::null) {
968  return staticGraph_;
969  }
970  return myGraph_;
971  }
972 
973  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
974  Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
976  getCrsGraph () const {
977  if (staticGraph_ != Teuchos::null) {
978  return staticGraph_;
979  }
980  return myGraph_;
981  }
982 
983  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
986  getCrsGraphRef () const
987  {
988 #ifdef HAVE_TPETRA_DEBUG
989  constexpr bool debug = true;
990 #else
991  constexpr bool debug = false;
992 #endif // HAVE_TPETRA_DEBUG
993 
994  if (! this->staticGraph_.is_null ()) {
995  return * (this->staticGraph_);
996  }
997  else {
998  if (debug) {
999  const char tfecfFuncName[] = "getCrsGraphRef: ";
1000  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1001  (this->myGraph_.is_null (), std::logic_error,
1002  "Both staticGraph_ and myGraph_ are null. "
1003  "Please report this bug to the Tpetra developers.");
1004  }
1005  return * (this->myGraph_);
1006  }
1007  }
1008 
1009  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1010  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type
1013  {
1014  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
1015  return local_matrix_device_type("Tpetra::CrsMatrix::lclMatrixDevice",
1016  numCols,
1017  valuesPacked_wdv.getDeviceView(Access::ReadWrite),
1018  staticGraph_->getLocalGraphDevice());
1019  }
1020 
1021  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1022  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type
1024  getLocalMatrixHost () const
1025  {
1026  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
1027  return local_matrix_host_type("Tpetra::CrsMatrix::lclMatrixHost", numCols,
1028  valuesPacked_wdv.getHostView(Access::ReadWrite),
1029  staticGraph_->getLocalGraphHost());
1030  }
1031 
1032 #if KOKKOSKERNELS_VERSION < 40299
1033 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1034  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1035  std::shared_ptr<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_multiply_op_type>
1038  {
1039  auto localMatrix = getLocalMatrixDevice();
1040 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) || defined(KOKKOSKERNELS_ENABLE_TPL_MKL)
1041  if(this->getLocalNumEntries() <= size_t(Teuchos::OrdinalTraits<LocalOrdinal>::max()))
1042  {
1043  if(this->ordinalRowptrs.data() == nullptr)
1044  {
1045  auto originalRowptrs = localMatrix.graph.row_map;
1046  //create LocalOrdinal-typed copy of the local graph's rowptrs.
1047  //This enables the LocalCrsMatrixOperator to use cuSPARSE SpMV.
1048  this->ordinalRowptrs = ordinal_rowptrs_type(
1049  Kokkos::ViewAllocateWithoutInitializing("CrsMatrix::ordinalRowptrs"), originalRowptrs.extent(0));
1050  auto ordinalRowptrs_ = this->ordinalRowptrs; //don't want to capture 'this'
1051  Kokkos::parallel_for("CrsMatrix::getLocalMultiplyOperator::convertRowptrs",
1052  Kokkos::RangePolicy<execution_space>(0, originalRowptrs.extent(0)),
1053  KOKKOS_LAMBDA(LocalOrdinal i)
1054  {
1055  ordinalRowptrs_(i) = originalRowptrs(i);
1056  });
1057  }
1058  //return local operator using ordinalRowptrs
1059  return std::make_shared<local_multiply_op_type>(
1060  std::make_shared<local_matrix_device_type>(localMatrix), this->ordinalRowptrs);
1061  }
1062 #endif
1063 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1064  return std::make_shared<local_multiply_op_type>(
1065  std::make_shared<local_matrix_device_type>(localMatrix));
1066  }
1067 #endif
1068 
1069  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1070  bool
1072  isStaticGraph () const {
1073  return myGraph_.is_null ();
1074  }
1075 
1076  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1077  bool
1080  return true;
1081  }
1082 
1083  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1084  bool
1087  return true;
1088  }
1089 
1090  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1091  void
1093  allocateValues (ELocalGlobal lg, GraphAllocationStatus gas,
1094  const bool verbose)
1095  {
1096  using Details::Behavior;
1098  using std::endl;
1099  const char tfecfFuncName[] = "allocateValues: ";
1100  const char suffix[] =
1101  " Please report this bug to the Tpetra developers.";
1102  ProfilingRegion region("Tpetra::CrsMatrix::allocateValues");
1103 
1104  std::unique_ptr<std::string> prefix;
1105  if (verbose) {
1106  prefix = this->createPrefix("CrsMatrix", "allocateValues");
1107  std::ostringstream os;
1108  os << *prefix << "lg: "
1109  << (lg == LocalIndices ? "Local" : "Global") << "Indices"
1110  << ", gas: Graph"
1111  << (gas == GraphAlreadyAllocated ? "Already" : "NotYet")
1112  << "Allocated" << endl;
1113  std::cerr << os.str();
1114  }
1115 
1116  const bool debug = Behavior::debug("CrsMatrix");
1117  if (debug) {
1118  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1119  (this->staticGraph_.is_null (), std::logic_error,
1120  "staticGraph_ is null." << suffix);
1121 
1122  // If the graph indices are already allocated, then gas should be
1123  // GraphAlreadyAllocated. Otherwise, gas should be
1124  // GraphNotYetAllocated.
1125  if ((gas == GraphAlreadyAllocated) !=
1126  staticGraph_->indicesAreAllocated ()) {
1127  const char err1[] = "The caller has asserted that the graph "
1128  "is ";
1129  const char err2[] = "already allocated, but the static graph "
1130  "says that its indices are ";
1131  const char err3[] = "already allocated. ";
1132  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1133  (gas == GraphAlreadyAllocated &&
1134  ! staticGraph_->indicesAreAllocated (), std::logic_error,
1135  err1 << err2 << "not " << err3 << suffix);
1136  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1137  (gas != GraphAlreadyAllocated &&
1138  staticGraph_->indicesAreAllocated (), std::logic_error,
1139  err1 << "not " << err2 << err3 << suffix);
1140  }
1141 
1142  // If the graph is unallocated, then it had better be a
1143  // matrix-owned graph. ("Matrix-owned graph" means that the
1144  // matrix gets to define the graph structure. If the CrsMatrix
1145  // constructor that takes an RCP<const CrsGraph> was used, then
1146  // the matrix does _not_ own the graph.)
1147  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1148  (! this->staticGraph_->indicesAreAllocated () &&
1149  this->myGraph_.is_null (), std::logic_error,
1150  "The static graph says that its indices are not allocated, "
1151  "but the graph is not owned by the matrix." << suffix);
1152  }
1153 
1154  if (gas == GraphNotYetAllocated) {
1155  if (debug) {
1156  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1157  (this->myGraph_.is_null (), std::logic_error,
1158  "gas = GraphNotYetAllocated, but myGraph_ is null." << suffix);
1159  }
1160  try {
1161  this->myGraph_->allocateIndices (lg, verbose);
1162  }
1163  catch (std::exception& e) {
1164  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1165  (true, std::runtime_error, "CrsGraph::allocateIndices "
1166  "threw an exception: " << e.what ());
1167  }
1168  catch (...) {
1169  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1170  (true, std::runtime_error, "CrsGraph::allocateIndices "
1171  "threw an exception not a subclass of std::exception.");
1172  }
1173  }
1174 
1175  // Allocate matrix values.
1176  const size_t lclTotalNumEntries = this->staticGraph_->getLocalAllocationSize();
1177  if (debug) {
1178  const size_t lclNumRows = this->staticGraph_->getLocalNumRows ();
1179  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1180  (this->staticGraph_->getRowPtrsUnpackedHost()(lclNumRows) != lclTotalNumEntries, std::logic_error,
1181  "length of staticGraph's lclIndsUnpacked does not match final entry of rowPtrsUnapcked_host." << suffix);
1182  }
1183 
1184  // Allocate array of (packed???) matrix values.
1185  using values_type = typename local_matrix_device_type::values_type;
1186  if (verbose) {
1187  std::ostringstream os;
1188  os << *prefix << "Allocate values_wdv: Pre "
1189  << valuesUnpacked_wdv.extent(0) << ", post "
1190  << lclTotalNumEntries << endl;
1191  std::cerr << os.str();
1192  }
1193 // this->k_values1D_ =
1194  valuesUnpacked_wdv = values_wdv_type(
1195  values_type("Tpetra::CrsMatrix::values",
1196  lclTotalNumEntries));
1197  }
1198 
1199 
1200  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1201  void
1203  fillLocalGraphAndMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1204  {
1206  using ::Tpetra::Details::getEntryOnHost;
1207  using Teuchos::arcp_const_cast;
1208  using Teuchos::Array;
1209  using Teuchos::ArrayRCP;
1210  using Teuchos::null;
1211  using Teuchos::RCP;
1212  using Teuchos::rcp;
1213  using std::endl;
1214  using row_map_type = typename local_graph_device_type::row_map_type;
1215  using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
1216  using values_type = typename local_matrix_device_type::values_type;
1217  Details::ProfilingRegion regionFLGAM
1218  ("Tpetra::CrsMatrix::fillLocalGraphAndMatrix");
1219 
1220  const char tfecfFuncName[] = "fillLocalGraphAndMatrix (called from "
1221  "fillComplete or expertStaticFillComplete): ";
1222  const char suffix[] =
1223  " Please report this bug to the Tpetra developers.";
1224  const bool debug = Details::Behavior::debug("CrsMatrix");
1225  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1226 
1227  std::unique_ptr<std::string> prefix;
1228  if (verbose) {
1229  prefix = this->createPrefix("CrsMatrix", "fillLocalGraphAndMatrix");
1230  std::ostringstream os;
1231  os << *prefix << endl;
1232  std::cerr << os.str ();
1233  }
1234 
1235  if (debug) {
1236  // fillComplete() only calls fillLocalGraphAndMatrix() if the
1237  // matrix owns the graph, which means myGraph_ is not null.
1238  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1239  (myGraph_.is_null (), std::logic_error, "The nonconst graph "
1240  "(myGraph_) is null. This means that the matrix has a "
1241  "const (a.k.a. \"static\") graph. fillComplete or "
1242  "expertStaticFillComplete should never call "
1243  "fillLocalGraphAndMatrix in that case." << suffix);
1244  }
1245 
1246  const size_t lclNumRows = this->getLocalNumRows ();
1247 
1248  // This method's goal is to fill in the three arrays (compressed
1249  // sparse row format) that define the sparse graph's and matrix's
1250  // structure, and the sparse matrix's values.
1251  //
1252  // Get references to the data in myGraph_, so we can modify them
1253  // as well. Note that we only call fillLocalGraphAndMatrix() if
1254  // the matrix owns the graph, which means myGraph_ is not null.
1255 
1256  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1257  // See: https://github.com/trilinos/Trilinos/issues/12237
1258  //using row_entries_type = decltype (myGraph_->k_numRowEntries_);
1259  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1260 
1261  typename Graph::local_graph_device_type::row_map_type curRowOffsets =
1262  myGraph_->rowPtrsUnpacked_dev_;
1263 
1264  if (debug) {
1265  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1266  (curRowOffsets.extent (0) == 0, std::logic_error,
1267  "curRowOffsets.extent(0) == 0.");
1268  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1269  (curRowOffsets.extent (0) != lclNumRows + 1, std::logic_error,
1270  "curRowOffsets.extent(0) = "
1271  << curRowOffsets.extent (0) << " != lclNumRows + 1 = "
1272  << (lclNumRows + 1) << ".");
1273  const size_t numOffsets = curRowOffsets.extent (0);
1274  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1275  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1276  (numOffsets != 0 &&
1277  myGraph_->lclIndsUnpacked_wdv.extent (0) != valToCheck,
1278  std::logic_error, "numOffsets = " <<
1279  numOffsets << " != 0 and myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1280  << myGraph_->lclIndsUnpacked_wdv.extent (0) << " != curRowOffsets("
1281  << numOffsets << ") = " << valToCheck << ".");
1282  }
1283 
1284  if (myGraph_->getLocalNumEntries() !=
1285  myGraph_->getLocalAllocationSize()) {
1286 
1287  // Use the nonconst version of row_map_type for k_ptrs,
1288  // because row_map_type is const and we need to modify k_ptrs here.
1289  typename row_map_type::non_const_type k_ptrs;
1290  row_map_type k_ptrs_const;
1291  lclinds_1d_type k_inds;
1292  values_type k_vals;
1293 
1294  if (verbose) {
1295  std::ostringstream os;
1296  const auto numEnt = myGraph_->getLocalNumEntries();
1297  const auto allocSize = myGraph_->getLocalAllocationSize();
1298  os << *prefix << "Unpacked 1-D storage: numEnt=" << numEnt
1299  << ", allocSize=" << allocSize << endl;
1300  std::cerr << os.str ();
1301  }
1302  // The matrix's current 1-D storage is "unpacked." This means
1303  // the row offsets may differ from what the final row offsets
1304  // should be. This could happen, for example, if the user
1305  // set an upper
1306  // bound on the number of entries per row, but didn't fill all
1307  // those entries.
1308  if (debug && curRowOffsets.extent (0) != 0) {
1309  const size_t numOffsets =
1310  static_cast<size_t> (curRowOffsets.extent (0));
1311  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1312  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1313  (static_cast<size_t> (valToCheck) !=
1314  static_cast<size_t> (valuesUnpacked_wdv.extent (0)),
1315  std::logic_error, "(unpacked branch) Before "
1316  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1317  << ") = " << valToCheck << " != valuesUnpacked_wdv.extent(0)"
1318  " = " << valuesUnpacked_wdv.extent (0) << ".");
1319  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1320  (static_cast<size_t> (valToCheck) !=
1321  static_cast<size_t> (myGraph_->lclIndsUnpacked_wdv.extent (0)),
1322  std::logic_error, "(unpacked branch) Before "
1323  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1324  << ") = " << valToCheck
1325  << " != myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1326  << myGraph_->lclIndsUnpacked_wdv.extent (0) << ".");
1327  }
1328  // Pack the row offsets into k_ptrs, by doing a sum-scan of
1329  // the array of valid entry counts per row.
1330 
1331  // Total number of entries in the matrix on the calling
1332  // process. We will compute this in the loop below. It's
1333  // cheap to compute and useful as a sanity check.
1334  size_t lclTotalNumEntries = 0;
1335  {
1336  // Allocate the packed row offsets array. We use a nonconst
1337  // temporary (packedRowOffsets) here, because k_ptrs is
1338  // const. We will assign packedRowOffsets to k_ptrs below.
1339  if (verbose) {
1340  std::ostringstream os;
1341  os << *prefix << "Allocate packed row offsets: "
1342  << (lclNumRows+1) << endl;
1343  std::cerr << os.str ();
1344  }
1345  typename row_map_type::non_const_type
1346  packedRowOffsets ("Tpetra::CrsGraph::ptr", lclNumRows + 1);
1347  typename row_entries_type::const_type numRowEnt_h =
1348  myGraph_->k_numRowEntries_;
1349  // We're computing offsets on device. This function can
1350  // handle numRowEnt_h being a host View.
1351  lclTotalNumEntries =
1352  computeOffsetsFromCounts (packedRowOffsets, numRowEnt_h);
1353  // packedRowOffsets is modifiable; k_ptrs isn't, so we have
1354  // to use packedRowOffsets in the loop above and assign here.
1355  k_ptrs = packedRowOffsets;
1356  k_ptrs_const = k_ptrs;
1357  }
1358 
1359  if (debug) {
1360  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1361  (static_cast<size_t> (k_ptrs.extent (0)) != lclNumRows + 1,
1362  std::logic_error,
1363  "(unpacked branch) After packing k_ptrs, "
1364  "k_ptrs.extent(0) = " << k_ptrs.extent (0) << " != "
1365  "lclNumRows+1 = " << (lclNumRows+1) << ".");
1366  const auto valToCheck = getEntryOnHost (k_ptrs, lclNumRows);
1367  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1368  (valToCheck != lclTotalNumEntries, std::logic_error,
1369  "(unpacked branch) After filling k_ptrs, "
1370  "k_ptrs(lclNumRows=" << lclNumRows << ") = " << valToCheck
1371  << " != total number of entries on the calling process = "
1372  << lclTotalNumEntries << ".");
1373  }
1374 
1375  // Allocate the arrays of packed column indices and values.
1376  if (verbose) {
1377  std::ostringstream os;
1378  os << *prefix << "Allocate packed local column indices: "
1379  << lclTotalNumEntries << endl;
1380  std::cerr << os.str ();
1381  }
1382  k_inds = lclinds_1d_type ("Tpetra::CrsGraph::lclInds", lclTotalNumEntries);
1383  if (verbose) {
1384  std::ostringstream os;
1385  os << *prefix << "Allocate packed values: "
1386  << lclTotalNumEntries << endl;
1387  std::cerr << os.str ();
1388  }
1389  k_vals = values_type ("Tpetra::CrsMatrix::values", lclTotalNumEntries);
1390 
1391  // curRowOffsets (myGraph_->rowPtrsUnpacked_) (???), lclIndsUnpacked_wdv,
1392  // and valuesUnpacked_wdv are currently unpacked. Pack them, using
1393  // the packed row offsets array k_ptrs that we created above.
1394  //
1395  // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
1396  // need to keep around the unpacked row offsets, column
1397  // indices, and values arrays.
1398 
1399  // Pack the column indices from unpacked lclIndsUnpacked_wdv into
1400  // packed k_inds. We will replace lclIndsUnpacked_wdv below.
1401  using inds_packer_type = pack_functor<
1402  typename Graph::local_graph_device_type::entries_type::non_const_type,
1403  typename Graph::local_inds_dualv_type::t_dev::const_type,
1404  typename Graph::local_graph_device_type::row_map_type::non_const_type,
1405  typename Graph::local_graph_device_type::row_map_type>;
1406  inds_packer_type indsPacker (
1407  k_inds,
1408  myGraph_->lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
1409  k_ptrs, curRowOffsets);
1410  using exec_space = typename decltype (k_inds)::execution_space;
1411  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1412  Kokkos::parallel_for
1413  ("Tpetra::CrsMatrix pack column indices",
1414  range_type (0, lclNumRows), indsPacker);
1415 
1416  // Pack the values from unpacked valuesUnpacked_wdv into packed
1417  // k_vals. We will replace valuesPacked_wdv below.
1418  using vals_packer_type = pack_functor<
1419  typename values_type::non_const_type,
1420  typename values_type::const_type,
1421  typename row_map_type::non_const_type,
1422  typename row_map_type::const_type>;
1423  vals_packer_type valsPacker (
1424  k_vals,
1425  this->valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1426  k_ptrs, curRowOffsets);
1427  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1428  range_type (0, lclNumRows), valsPacker);
1429 
1430  if (debug) {
1431  const char myPrefix[] = "(\"Optimize Storage\""
1432  "=true branch) After packing, ";
1433  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1434  (k_ptrs.extent (0) == 0, std::logic_error, myPrefix
1435  << "k_ptrs.extent(0) = 0. This probably means that "
1436  "rowPtrsUnpacked_ was never allocated.");
1437  if (k_ptrs.extent (0) != 0) {
1438  const size_t numOffsets (k_ptrs.extent (0));
1439  const auto valToCheck =
1440  getEntryOnHost (k_ptrs, numOffsets - 1);
1441  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1442  (size_t (valToCheck) != k_vals.extent (0),
1443  std::logic_error, myPrefix <<
1444  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1445  " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
1446  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1447  (size_t (valToCheck) != k_inds.extent (0),
1448  std::logic_error, myPrefix <<
1449  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1450  " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
1451  }
1452  }
1453  // Build the local graph.
1454  myGraph_->setRowPtrsPacked(k_ptrs_const);
1455  myGraph_->lclIndsPacked_wdv =
1456  typename crs_graph_type::local_inds_wdv_type(k_inds);
1457  valuesPacked_wdv = values_wdv_type(k_vals);
1458  }
1459  else { // We don't have to pack, so just set the pointers.
1460  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1461  // FIXME? This is already done in the graph fill call - need to avoid the memcpy to host
1462  myGraph_->rowPtrsPacked_dev_ = myGraph_->rowPtrsUnpacked_dev_;
1463  myGraph_->rowPtrsPacked_host_ = myGraph_->rowPtrsUnpacked_host_;
1464  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1465  myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
1466  valuesPacked_wdv = valuesUnpacked_wdv;
1467 
1468  if (verbose) {
1469  std::ostringstream os;
1470  os << *prefix << "Storage already packed: rowPtrsUnpacked_: "
1471  << myGraph_->getRowPtrsUnpackedHost().extent(0) << ", lclIndsUnpacked_wdv: "
1472  << myGraph_->lclIndsUnpacked_wdv.extent(0) << ", valuesUnpacked_wdv: "
1473  << valuesUnpacked_wdv.extent(0) << endl;
1474  std::cerr << os.str();
1475  }
1476 
1477  if (debug) {
1478  const char myPrefix[] =
1479  "(\"Optimize Storage\"=false branch) ";
1480  auto rowPtrsUnpackedHost = myGraph_->getRowPtrsUnpackedHost();
1481  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1482  (myGraph_->rowPtrsUnpacked_dev_.extent (0) == 0, std::logic_error, myPrefix
1483  << "myGraph->rowPtrsUnpacked_dev_.extent(0) = 0. This probably means "
1484  "that rowPtrsUnpacked_ was never allocated.");
1485  if (myGraph_->rowPtrsUnpacked_dev_.extent (0) != 0) {
1486  const size_t numOffsets = rowPtrsUnpackedHost.extent (0);
1487  const auto valToCheck = rowPtrsUnpackedHost(numOffsets - 1);
1488  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1489  (size_t (valToCheck) != valuesPacked_wdv.extent (0),
1490  std::logic_error, myPrefix <<
1491  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1492  << " != valuesPacked_wdv.extent(0) = "
1493  << valuesPacked_wdv.extent (0) << ".");
1494  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1495  (size_t (valToCheck) != myGraph_->lclIndsPacked_wdv.extent (0),
1496  std::logic_error, myPrefix <<
1497  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1498  << " != myGraph_->lclIndsPacked.extent(0) = "
1499  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1500  }
1501  }
1502  }
1503 
1504  if (debug) {
1505  const char myPrefix[] = "After packing, ";
1506  auto rowPtrsPackedHost = myGraph_->getRowPtrsPackedHost();
1507  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1508  (size_t (rowPtrsPackedHost.extent (0)) != size_t (lclNumRows + 1),
1509  std::logic_error, myPrefix << "myGraph_->rowPtrsPacked_host_.extent(0) = "
1510  << rowPtrsPackedHost.extent (0) << " != lclNumRows+1 = " <<
1511  (lclNumRows+1) << ".");
1512  if (rowPtrsPackedHost.extent (0) != 0) {
1513  const size_t numOffsets (rowPtrsPackedHost.extent (0));
1514  const size_t valToCheck = rowPtrsPackedHost(numOffsets-1);
1515  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1516  (valToCheck != size_t (valuesPacked_wdv.extent (0)),
1517  std::logic_error, myPrefix << "k_ptrs_const(" <<
1518  (numOffsets-1) << ") = " << valToCheck
1519  << " != valuesPacked_wdv.extent(0) = "
1520  << valuesPacked_wdv.extent (0) << ".");
1521  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1522  (valToCheck != size_t (myGraph_->lclIndsPacked_wdv.extent (0)),
1523  std::logic_error, myPrefix << "k_ptrs_const(" <<
1524  (numOffsets-1) << ") = " << valToCheck
1525  << " != myGraph_->lclIndsPacked_wdvk_inds.extent(0) = "
1526  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1527  }
1528  }
1529 
1530  // May we ditch the old allocations for the packed (and otherwise
1531  // "optimized") allocations, later in this routine? Optimize
1532  // storage if the graph is not static, or if the graph already has
1533  // optimized storage.
1534  const bool defaultOptStorage =
1535  ! isStaticGraph () || staticGraph_->isStorageOptimized ();
1536  const bool requestOptimizedStorage =
1537  (! params.is_null () &&
1538  params->get ("Optimize Storage", defaultOptStorage)) ||
1539  (params.is_null () && defaultOptStorage);
1540 
1541  // The graph has optimized storage when indices are allocated,
1542  // myGraph_->k_numRowEntries_ is empty, and there are more than
1543  // zero rows on this process.
1544  if (requestOptimizedStorage) {
1545  // Free the old, unpacked, unoptimized allocations.
1546  // Free graph data structures that are only needed for
1547  // unpacked 1-D storage.
1548  if (verbose) {
1549  std::ostringstream os;
1550  os << *prefix << "Optimizing storage: free k_numRowEntries_: "
1551  << myGraph_->k_numRowEntries_.extent(0) << endl;
1552  std::cerr << os.str();
1553  }
1554 
1555  myGraph_->k_numRowEntries_ = row_entries_type ();
1556 
1557  // Keep the new 1-D packed allocations.
1558  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1559  // We directly set the memory spaces to avoid a memcpy from device to host
1560  myGraph_->rowPtrsUnpacked_dev_ = myGraph_->rowPtrsPacked_dev_;
1561  myGraph_->rowPtrsUnpacked_host_ = myGraph_->rowPtrsPacked_host_;
1562  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1563  myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
1564  valuesUnpacked_wdv = valuesPacked_wdv;
1565 
1566  myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
1567  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1568  }
1569  else {
1570  if (verbose) {
1571  std::ostringstream os;
1572  os << *prefix << "User requested NOT to optimize storage"
1573  << endl;
1574  std::cerr << os.str();
1575  }
1576  }
1577  }
1578 
1579  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1580  void
1582  fillLocalMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1583  {
1584  using ::Tpetra::Details::ProfilingRegion;
1585  using Teuchos::ArrayRCP;
1586  using Teuchos::Array;
1587  using Teuchos::null;
1588  using Teuchos::RCP;
1589  using Teuchos::rcp;
1590  using std::endl;
1591  using row_map_type = typename Graph::local_graph_device_type::row_map_type;
1592  using non_const_row_map_type = typename row_map_type::non_const_type;
1593  using values_type = typename local_matrix_device_type::values_type;
1594  ProfilingRegion regionFLM("Tpetra::CrsMatrix::fillLocalMatrix");
1595  const size_t lclNumRows = getLocalNumRows();
1596 
1597  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1598  std::unique_ptr<std::string> prefix;
1599  if (verbose) {
1600  prefix = this->createPrefix("CrsMatrix", "fillLocalMatrix");
1601  std::ostringstream os;
1602  os << *prefix << "lclNumRows: " << lclNumRows << endl;
1603  std::cerr << os.str ();
1604  }
1605 
1606  // The goals of this routine are first, to allocate and fill
1607  // packed 1-D storage (see below for an explanation) in the vals
1608  // array, and second, to give vals to the local matrix and
1609  // finalize the local matrix. We only need k_ptrs, the packed 1-D
1610  // row offsets, within the scope of this routine, since we're only
1611  // filling the local matrix here (use fillLocalGraphAndMatrix() to
1612  // fill both the graph and the matrix at the same time).
1613 
1614  // get data from staticGraph_
1615  size_t nodeNumEntries = staticGraph_->getLocalNumEntries ();
1616  size_t nodeNumAllocated = staticGraph_->getLocalAllocationSize ();
1617  row_map_type k_rowPtrs = staticGraph_->rowPtrsPacked_dev_;
1618 
1619  row_map_type k_ptrs; // "packed" row offsets array
1620  values_type k_vals; // "packed" values array
1621 
1622  // May we ditch the old allocations for the packed (and otherwise
1623  // "optimized") allocations, later in this routine? Request
1624  // optimized storage by default.
1625  bool requestOptimizedStorage = true;
1626  const bool default_OptimizeStorage =
1627  ! isStaticGraph() || staticGraph_->isStorageOptimized();
1628  if (! params.is_null() &&
1629  ! params->get("Optimize Storage", default_OptimizeStorage)) {
1630  requestOptimizedStorage = false;
1631  }
1632  // If we're not allowed to change a static graph, then we can't
1633  // change the storage of the matrix, either. This means that if
1634  // the graph's storage isn't already optimized, we can't optimize
1635  // the matrix's storage either. Check and give warning, as
1636  // appropriate.
1637  if (! staticGraph_->isStorageOptimized () &&
1638  requestOptimizedStorage) {
1640  (true, std::runtime_error, "You requested optimized storage "
1641  "by setting the \"Optimize Storage\" flag to \"true\" in "
1642  "the ParameterList, or by virtue of default behavior. "
1643  "However, the associated CrsGraph was filled separately and "
1644  "requested not to optimize storage. Therefore, the "
1645  "CrsMatrix cannot optimize storage.");
1646  requestOptimizedStorage = false;
1647  }
1648 
1649  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1650  // See: https://github.com/trilinos/Trilinos/issues/12237
1651  //using row_entries_type = decltype (staticGraph_->k_numRowEntries_);
1652  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1653 
1654  // The matrix's values are currently
1655  // stored in a 1-D format. However, this format is "unpacked";
1656  // it doesn't necessarily have the same row offsets as indicated
1657  // by the ptrs array returned by allocRowPtrs. This could
1658  // happen, for example, if the user
1659  // fixed the number of matrix entries in
1660  // each row, but didn't fill all those entries.
1661  //
1662  // As above, we don't need to keep the "packed" row offsets
1663  // array ptrs here, but we do need it here temporarily, so we
1664  // have to allocate it. We'll free ptrs later in this method.
1665  //
1666  // Note that this routine checks whether storage has already
1667  // been packed. This is a common case for solution of nonlinear
1668  // PDEs using the finite element method, as long as the
1669  // structure of the sparse matrix does not change between linear
1670  // solves.
1671  if (nodeNumEntries != nodeNumAllocated) {
1672  if (verbose) {
1673  std::ostringstream os;
1674  os << *prefix << "Unpacked 1-D storage: numEnt="
1675  << nodeNumEntries << ", allocSize=" << nodeNumAllocated
1676  << endl;
1677  std::cerr << os.str();
1678  }
1679  // We have to pack the 1-D storage, since the user didn't fill
1680  // up all requested storage.
1681  if (verbose) {
1682  std::ostringstream os;
1683  os << *prefix << "Allocate packed row offsets: "
1684  << (lclNumRows+1) << endl;
1685  std::cerr << os.str();
1686  }
1687  non_const_row_map_type tmpk_ptrs ("Tpetra::CrsGraph::ptr",
1688  lclNumRows+1);
1689  // Total number of entries in the matrix on the calling
1690  // process. We will compute this in the loop below. It's
1691  // cheap to compute and useful as a sanity check.
1692  size_t lclTotalNumEntries = 0;
1693  k_ptrs = tmpk_ptrs;
1694  {
1695  typename row_entries_type::const_type numRowEnt_h =
1696  staticGraph_->k_numRowEntries_;
1697  // This function can handle the counts being a host View.
1698  lclTotalNumEntries =
1699  Details::computeOffsetsFromCounts (tmpk_ptrs, numRowEnt_h);
1700  }
1701 
1702  // Allocate the "packed" values array.
1703  // It has exactly the right number of entries.
1704  if (verbose) {
1705  std::ostringstream os;
1706  os << *prefix << "Allocate packed values: "
1707  << lclTotalNumEntries << endl;
1708  std::cerr << os.str ();
1709  }
1710  k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1711 
1712  // Pack values_wdv into k_vals. We will replace values_wdv below.
1713  pack_functor<
1714  typename values_type::non_const_type,
1715  typename values_type::const_type,
1716  typename row_map_type::non_const_type,
1717  typename row_map_type::const_type> valsPacker
1718  (k_vals, valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1719  tmpk_ptrs, k_rowPtrs);
1720 
1721  using exec_space = typename decltype (k_vals)::execution_space;
1722  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1723  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1724  range_type (0, lclNumRows), valsPacker);
1725  valuesPacked_wdv = values_wdv_type(k_vals);
1726  }
1727  else { // We don't have to pack, so just set the pointer.
1728  valuesPacked_wdv = valuesUnpacked_wdv;
1729  if (verbose) {
1730  std::ostringstream os;
1731  os << *prefix << "Storage already packed: "
1732  << "valuesUnpacked_wdv: " << valuesUnpacked_wdv.extent(0) << endl;
1733  std::cerr << os.str();
1734  }
1735  }
1736 
1737  // May we ditch the old allocations for the packed one?
1738  if (requestOptimizedStorage) {
1739  // The user requested optimized storage, so we can dump the
1740  // unpacked 1-D storage, and keep the packed storage.
1741  valuesUnpacked_wdv = valuesPacked_wdv;
1742 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
1743  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1744  }
1745  }
1746 
1747  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1748  void
1750  insertIndicesAndValues (crs_graph_type& graph,
1751  RowInfo& rowInfo,
1752  const typename crs_graph_type::SLocalGlobalViews& newInds,
1753  const Teuchos::ArrayView<impl_scalar_type>& oldRowVals,
1754  const Teuchos::ArrayView<const impl_scalar_type>& newRowVals,
1755  const ELocalGlobal lg,
1756  const ELocalGlobal I)
1757  {
1758  const size_t oldNumEnt = rowInfo.numEntries;
1759  const size_t numInserted = graph.insertIndices (rowInfo, newInds, lg, I);
1760 
1761  // Use of memcpy here works around an issue with GCC >= 4.9.0,
1762  // that probably relates to scalar_type vs. impl_scalar_type
1763  // aliasing. See history of Tpetra_CrsGraph_def.hpp for
1764  // details; look for GCC_WORKAROUND macro definition.
1765  if (numInserted > 0) {
1766  const size_t startOffset = oldNumEnt;
1767  memcpy ((void*) &oldRowVals[startOffset], &newRowVals[0],
1768  numInserted * sizeof (impl_scalar_type));
1769  }
1770  }
1771 
1772  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1773  void
1775  insertLocalValues (const LocalOrdinal lclRow,
1776  const Teuchos::ArrayView<const LocalOrdinal>& indices,
1777  const Teuchos::ArrayView<const Scalar>& values,
1778  const CombineMode CM)
1779  {
1780  using std::endl;
1781  const char tfecfFuncName[] = "insertLocalValues: ";
1782 
1783  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1784  (! this->isFillActive (), std::runtime_error,
1785  "Fill is not active. After calling fillComplete, you must call "
1786  "resumeFill before you may insert entries into the matrix again.");
1787  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1788  (this->isStaticGraph (), std::runtime_error,
1789  "Cannot insert indices with static graph; use replaceLocalValues() "
1790  "instead.");
1791  // At this point, we know that myGraph_ is nonnull.
1792  crs_graph_type& graph = * (this->myGraph_);
1793  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1794  (graph.colMap_.is_null (), std::runtime_error,
1795  "Cannot insert local indices without a column map.");
1796  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1797  (graph.isGloballyIndexed (),
1798  std::runtime_error, "Graph indices are global; use "
1799  "insertGlobalValues().");
1800  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1801  (values.size () != indices.size (), std::runtime_error,
1802  "values.size() = " << values.size ()
1803  << " != indices.size() = " << indices.size () << ".");
1804  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1805  ! graph.rowMap_->isNodeLocalElement (lclRow), std::runtime_error,
1806  "Local row index " << lclRow << " does not belong to this process.");
1807 
1808  if (! graph.indicesAreAllocated ()) {
1809  // We only allocate values at most once per process, so it's OK
1810  // to check TPETRA_VERBOSE here.
1811  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1812  this->allocateValues (LocalIndices, GraphNotYetAllocated, verbose);
1813  }
1814 
1815 #ifdef HAVE_TPETRA_DEBUG
1816  const size_t numEntriesToAdd = static_cast<size_t> (indices.size ());
1817  // In a debug build, test whether any of the given column indices
1818  // are not in the column Map. Keep track of the invalid column
1819  // indices so we can tell the user about them.
1820  {
1821  using Teuchos::toString;
1822 
1823  const map_type& colMap = * (graph.colMap_);
1824  Teuchos::Array<LocalOrdinal> badColInds;
1825  bool allInColMap = true;
1826  for (size_t k = 0; k < numEntriesToAdd; ++k) {
1827  if (! colMap.isNodeLocalElement (indices[k])) {
1828  allInColMap = false;
1829  badColInds.push_back (indices[k]);
1830  }
1831  }
1832  if (! allInColMap) {
1833  std::ostringstream os;
1834  os << "You attempted to insert entries in owned row " << lclRow
1835  << ", at the following column indices: " << toString (indices)
1836  << "." << endl;
1837  os << "Of those, the following indices are not in the column Map on "
1838  "this process: " << toString (badColInds) << "." << endl << "Since "
1839  "the matrix has a column Map already, it is invalid to insert "
1840  "entries at those locations.";
1841  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1842  (true, std::invalid_argument, os.str ());
1843  }
1844  }
1845 #endif // HAVE_TPETRA_DEBUG
1846 
1847  RowInfo rowInfo = graph.getRowInfo (lclRow);
1848 
1849  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1850  if (CM == ADD) {
1851  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1852  valsView[offset] += values[k]; };
1853  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1854  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1855  } else if (CM == INSERT) {
1856  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1857  valsView[offset] = values[k]; };
1858  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1859  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1860  } else {
1861  std::ostringstream os;
1862  os << "You attempted to use insertLocalValues with CombineMode " << combineModeToString(CM)
1863  << "but this has not been implemented." << endl;
1864  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1865  (true, std::invalid_argument, os.str ());
1866  }
1867  }
1868 
1869  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1870  void
1872  insertLocalValues (const LocalOrdinal localRow,
1873  const LocalOrdinal numEnt,
1874  const Scalar vals[],
1875  const LocalOrdinal cols[],
1876  const CombineMode CM)
1877  {
1878  Teuchos::ArrayView<const LocalOrdinal> colsT (cols, numEnt);
1879  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
1880  this->insertLocalValues (localRow, colsT, valsT, CM);
1881  }
1882 
1883  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1884  void
1887  RowInfo& rowInfo,
1888  const GlobalOrdinal gblColInds[],
1889  const impl_scalar_type vals[],
1890  const size_t numInputEnt)
1891  {
1892 #ifdef HAVE_TPETRA_DEBUG
1893  const char tfecfFuncName[] = "insertGlobalValuesImpl: ";
1894  const size_t origNumEnt = graph.getNumEntriesInLocalRow (rowInfo.localRow);
1895  const size_t curNumEnt = rowInfo.numEntries;
1896 #endif // HAVE_TPETRA_DEBUG
1897 
1898  if (! graph.indicesAreAllocated ()) {
1899  // We only allocate values at most once per process, so it's OK
1900  // to check TPETRA_VERBOSE here.
1901  using ::Tpetra::Details::Behavior;
1902  const bool verbose = Behavior::verbose("CrsMatrix");
1903  this->allocateValues (GlobalIndices, GraphNotYetAllocated, verbose);
1904  // mfh 23 Jul 2017: allocateValues invalidates existing
1905  // getRowInfo results. Once we get rid of lazy graph
1906  // allocation, we'll be able to move the getRowInfo call outside
1907  // of this method.
1908  rowInfo = graph.getRowInfo (rowInfo.localRow);
1909  }
1910 
1911  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1912  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset){
1913  valsView[offset] += vals[k];
1914  };
1915  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1916 #ifdef HAVE_TPETRA_DEBUG
1917  //numInserted is only used inside the debug code below.
1918  auto numInserted =
1919 #endif
1920  graph.insertGlobalIndicesImpl(rowInfo, gblColInds, numInputEnt, cb);
1921 
1922 #ifdef HAVE_TPETRA_DEBUG
1923  size_t newNumEnt = curNumEnt + numInserted;
1924  const size_t chkNewNumEnt =
1925  graph.getNumEntriesInLocalRow (rowInfo.localRow);
1926  if (chkNewNumEnt != newNumEnt) {
1927  std::ostringstream os;
1928  os << std::endl << "newNumEnt = " << newNumEnt
1929  << " != graph.getNumEntriesInLocalRow(" << rowInfo.localRow
1930  << ") = " << chkNewNumEnt << "." << std::endl
1931  << "\torigNumEnt: " << origNumEnt << std::endl
1932  << "\tnumInputEnt: " << numInputEnt << std::endl
1933  << "\tgblColInds: [";
1934  for (size_t k = 0; k < numInputEnt; ++k) {
1935  os << gblColInds[k];
1936  if (k + size_t (1) < numInputEnt) {
1937  os << ",";
1938  }
1939  }
1940  os << "]" << std::endl
1941  << "\tvals: [";
1942  for (size_t k = 0; k < numInputEnt; ++k) {
1943  os << vals[k];
1944  if (k + size_t (1) < numInputEnt) {
1945  os << ",";
1946  }
1947  }
1948  os << "]" << std::endl;
1949 
1950  if (this->supportsRowViews ()) {
1951  values_host_view_type vals2;
1952  if (this->isGloballyIndexed ()) {
1953  global_inds_host_view_type gblColInds2;
1954  const GlobalOrdinal gblRow =
1955  graph.rowMap_->getGlobalElement (rowInfo.localRow);
1956  if (gblRow ==
1957  Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid ()) {
1958  os << "Local row index " << rowInfo.localRow << " is invalid!"
1959  << std::endl;
1960  }
1961  else {
1962  bool getViewThrew = false;
1963  try {
1964  this->getGlobalRowView (gblRow, gblColInds2, vals2);
1965  }
1966  catch (std::exception& e) {
1967  getViewThrew = true;
1968  os << "getGlobalRowView threw exception:" << std::endl
1969  << e.what () << std::endl;
1970  }
1971  if (! getViewThrew) {
1972  os << "\tNew global column indices: ";
1973  for (size_t jjj = 0; jjj < gblColInds2.extent(0); jjj++)
1974  os << gblColInds2[jjj] << " ";
1975  os << std::endl;
1976  os << "\tNew values: ";
1977  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1978  os << vals2[jjj] << " ";
1979  os << std::endl;
1980  }
1981  }
1982  }
1983  else if (this->isLocallyIndexed ()) {
1984  local_inds_host_view_type lclColInds2;
1985  this->getLocalRowView (rowInfo.localRow, lclColInds2, vals2);
1986  os << "\tNew local column indices: ";
1987  for (size_t jjj = 0; jjj < lclColInds2.extent(0); jjj++)
1988  os << lclColInds2[jjj] << " ";
1989  os << std::endl;
1990  os << "\tNew values: ";
1991  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1992  os << vals2[jjj] << " ";
1993  os << std::endl;
1994  }
1995  }
1996 
1997  os << "Please report this bug to the Tpetra developers.";
1998  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1999  (true, std::logic_error, os.str ());
2000  }
2001 #endif // HAVE_TPETRA_DEBUG
2002  }
2003 
2004  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2005  void
2007  insertGlobalValues (const GlobalOrdinal gblRow,
2008  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2009  const Teuchos::ArrayView<const Scalar>& values)
2010  {
2011  using Teuchos::toString;
2012  using std::endl;
2013  typedef impl_scalar_type IST;
2014  typedef LocalOrdinal LO;
2015  typedef GlobalOrdinal GO;
2016  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2017  typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
2018  const char tfecfFuncName[] = "insertGlobalValues: ";
2019 
2020 #ifdef HAVE_TPETRA_DEBUG
2021  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2022  (values.size () != indices.size (), std::runtime_error,
2023  "values.size() = " << values.size () << " != indices.size() = "
2024  << indices.size () << ".");
2025 #endif // HAVE_TPETRA_DEBUG
2026 
2027  // getRowMap() is not thread safe, because it increments RCP's
2028  // reference count. getCrsGraphRef() is thread safe.
2029  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2030  const LO lclRow = rowMap.getLocalElement (gblRow);
2031 
2032  if (lclRow == OTLO::invalid ()) {
2033  // Input row is _not_ owned by the calling process.
2034  //
2035  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2036  // is not in the row Map, it doesn't matter whether or not the
2037  // graph is static; the data just get stashed for later use by
2038  // globalAssemble().
2039  this->insertNonownedGlobalValues (gblRow, indices, values);
2040  }
2041  else { // Input row _is_ owned by the calling process
2042  if (this->isStaticGraph ()) {
2043  // Uh oh! Not allowed to insert into owned rows in that case.
2044  const int myRank = rowMap.getComm ()->getRank ();
2045  const int numProcs = rowMap.getComm ()->getSize ();
2046  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2047  (true, std::runtime_error,
2048  "The matrix was constructed with a constant (\"static\") graph, "
2049  "yet the given global row index " << gblRow << " is in the row "
2050  "Map on the calling process (with rank " << myRank << ", of " <<
2051  numProcs << " process(es)). In this case, you may not insert "
2052  "new entries into rows owned by the calling process.");
2053  }
2054 
2055  crs_graph_type& graph = * (this->myGraph_);
2056  const IST* const inputVals =
2057  reinterpret_cast<const IST*> (values.getRawPtr ());
2058  const GO* const inputGblColInds = indices.getRawPtr ();
2059  const size_t numInputEnt = indices.size ();
2060  RowInfo rowInfo = graph.getRowInfo (lclRow);
2061 
2062  // If the matrix has a column Map, check at this point whether
2063  // the column indices belong to the column Map.
2064  //
2065  // FIXME (mfh 16 May 2013) We may want to consider deferring the
2066  // test to the CrsGraph method, since it may have to do this
2067  // anyway.
2068  if (! graph.colMap_.is_null ()) {
2069  const map_type& colMap = * (graph.colMap_);
2070  // In a debug build, keep track of the nonowned ("bad") column
2071  // indices, so that we can display them in the exception
2072  // message. In a release build, just ditch the loop early if
2073  // we encounter a nonowned column index.
2074 #ifdef HAVE_TPETRA_DEBUG
2075  Teuchos::Array<GO> badColInds;
2076 #endif // HAVE_TPETRA_DEBUG
2077  const size_type numEntriesToInsert = indices.size ();
2078  bool allInColMap = true;
2079  for (size_type k = 0; k < numEntriesToInsert; ++k) {
2080  if (! colMap.isNodeGlobalElement (indices[k])) {
2081  allInColMap = false;
2082 #ifdef HAVE_TPETRA_DEBUG
2083  badColInds.push_back (indices[k]);
2084 #else
2085  break;
2086 #endif // HAVE_TPETRA_DEBUG
2087  }
2088  }
2089  if (! allInColMap) {
2090  std::ostringstream os;
2091  os << "You attempted to insert entries in owned row " << gblRow
2092  << ", at the following column indices: " << toString (indices)
2093  << "." << endl;
2094 #ifdef HAVE_TPETRA_DEBUG
2095  os << "Of those, the following indices are not in the column Map "
2096  "on this process: " << toString (badColInds) << "." << endl
2097  << "Since the matrix has a column Map already, it is invalid "
2098  "to insert entries at those locations.";
2099 #else
2100  os << "At least one of those indices is not in the column Map "
2101  "on this process." << endl << "It is invalid to insert into "
2102  "columns not in the column Map on the process that owns the "
2103  "row.";
2104 #endif // HAVE_TPETRA_DEBUG
2105  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2106  (true, std::invalid_argument, os.str ());
2107  }
2108  }
2109 
2110  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2111  inputVals, numInputEnt);
2112  }
2113  }
2114 
2115 
2116  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2117  void
2119  insertGlobalValues (const GlobalOrdinal globalRow,
2120  const LocalOrdinal numEnt,
2121  const Scalar vals[],
2122  const GlobalOrdinal inds[])
2123  {
2124  Teuchos::ArrayView<const GlobalOrdinal> indsT (inds, numEnt);
2125  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
2126  this->insertGlobalValues (globalRow, indsT, valsT);
2127  }
2128 
2129 
2130  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2131  void
2134  const GlobalOrdinal gblRow,
2135  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2136  const Teuchos::ArrayView<const Scalar>& values,
2137  const bool debug)
2138  {
2139  typedef impl_scalar_type IST;
2140  typedef LocalOrdinal LO;
2141  typedef GlobalOrdinal GO;
2142  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2143  const char tfecfFuncName[] = "insertGlobalValuesFiltered: ";
2144 
2145  if (debug) {
2146  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2147  (values.size () != indices.size (), std::runtime_error,
2148  "values.size() = " << values.size () << " != indices.size() = "
2149  << indices.size () << ".");
2150  }
2151 
2152  // getRowMap() is not thread safe, because it increments RCP's
2153  // reference count. getCrsGraphRef() is thread safe.
2154  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2155  const LO lclRow = rowMap.getLocalElement (gblRow);
2156  if (lclRow == OTLO::invalid ()) {
2157  // Input row is _not_ owned by the calling process.
2158  //
2159  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2160  // is not in the row Map, it doesn't matter whether or not the
2161  // graph is static; the data just get stashed for later use by
2162  // globalAssemble().
2163  this->insertNonownedGlobalValues (gblRow, indices, values);
2164  }
2165  else { // Input row _is_ owned by the calling process
2166  if (this->isStaticGraph ()) {
2167  // Uh oh! Not allowed to insert into owned rows in that case.
2168  const int myRank = rowMap.getComm ()->getRank ();
2169  const int numProcs = rowMap.getComm ()->getSize ();
2170  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2171  (true, std::runtime_error,
2172  "The matrix was constructed with a constant (\"static\") graph, "
2173  "yet the given global row index " << gblRow << " is in the row "
2174  "Map on the calling process (with rank " << myRank << ", of " <<
2175  numProcs << " process(es)). In this case, you may not insert "
2176  "new entries into rows owned by the calling process.");
2177  }
2178 
2179  crs_graph_type& graph = * (this->myGraph_);
2180  const IST* const inputVals =
2181  reinterpret_cast<const IST*> (values.getRawPtr ());
2182  const GO* const inputGblColInds = indices.getRawPtr ();
2183  const size_t numInputEnt = indices.size ();
2184  RowInfo rowInfo = graph.getRowInfo (lclRow);
2185 
2186  if (!graph.colMap_.is_null() && graph.isLocallyIndexed()) {
2187  // This branch is similar in function to the following branch, but for
2188  // the special case that the target graph is locally indexed.
2189  // In this case, we cannot simply filter
2190  // out global indices that don't exist on the receiving process and
2191  // insert the remaining (global) indices, but we must convert them (the
2192  // remaining global indices) to local and call `insertLocalValues`.
2193  const map_type& colMap = * (graph.colMap_);
2194  size_t curOffset = 0;
2195  while (curOffset < numInputEnt) {
2196  // Find a sequence of input indices that are in the column Map on the
2197  // calling process. Doing a sequence at a time, instead of one at a
2198  // time, amortizes some overhead.
2199  Teuchos::Array<LO> lclIndices;
2200  size_t endOffset = curOffset;
2201  for ( ; endOffset < numInputEnt; ++endOffset) {
2202  auto lclIndex = colMap.getLocalElement(inputGblColInds[endOffset]);
2203  if (lclIndex != OTLO::invalid())
2204  lclIndices.push_back(lclIndex);
2205  else
2206  break;
2207  }
2208  // curOffset, endOffset: half-exclusive range of indices in the column
2209  // Map on the calling process. If endOffset == curOffset, the range is
2210  // empty.
2211  const LO numIndInSeq = (endOffset - curOffset);
2212  if (numIndInSeq != 0) {
2213  this->insertLocalValues(lclRow, lclIndices(), values(curOffset, numIndInSeq));
2214  }
2215  // Invariant before the increment line: Either endOffset ==
2216  // numInputEnt, or inputGblColInds[endOffset] is not in the column Map
2217  // on the calling process.
2218  if (debug) {
2219  const bool invariant = endOffset == numInputEnt ||
2220  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2221  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2222  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2223  }
2224  curOffset = endOffset + 1;
2225  }
2226  }
2227  else if (! graph.colMap_.is_null ()) { // We have a column Map.
2228  const map_type& colMap = * (graph.colMap_);
2229  size_t curOffset = 0;
2230  while (curOffset < numInputEnt) {
2231  // Find a sequence of input indices that are in the column
2232  // Map on the calling process. Doing a sequence at a time,
2233  // instead of one at a time, amortizes some overhead.
2234  size_t endOffset = curOffset;
2235  for ( ; endOffset < numInputEnt &&
2236  colMap.getLocalElement (inputGblColInds[endOffset]) != OTLO::invalid ();
2237  ++endOffset)
2238  {}
2239  // curOffset, endOffset: half-exclusive range of indices in
2240  // the column Map on the calling process. If endOffset ==
2241  // curOffset, the range is empty.
2242  const LO numIndInSeq = (endOffset - curOffset);
2243  if (numIndInSeq != 0) {
2244  rowInfo = graph.getRowInfo(lclRow); // KDD 5/19 Need fresh RowInfo in each loop iteration
2245  this->insertGlobalValuesImpl (graph, rowInfo,
2246  inputGblColInds + curOffset,
2247  inputVals + curOffset,
2248  numIndInSeq);
2249  }
2250  // Invariant before the increment line: Either endOffset ==
2251  // numInputEnt, or inputGblColInds[endOffset] is not in the
2252  // column Map on the calling process.
2253  if (debug) {
2254  const bool invariant = endOffset == numInputEnt ||
2255  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2256  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2257  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2258  }
2259  curOffset = endOffset + 1;
2260  }
2261  }
2262  else { // we don't have a column Map.
2263  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2264  inputVals, numInputEnt);
2265  }
2266  }
2267  }
2268 
2269  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2270  void
2271  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2272  insertGlobalValuesFilteredChecked(
2273  const GlobalOrdinal gblRow,
2274  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2275  const Teuchos::ArrayView<const Scalar>& values,
2276  const char* const prefix,
2277  const bool debug,
2278  const bool verbose)
2279  {
2281  using std::endl;
2282 
2283  try {
2284  insertGlobalValuesFiltered(gblRow, indices, values, debug);
2285  }
2286  catch(std::exception& e) {
2287  std::ostringstream os;
2288  if (verbose) {
2289  const size_t maxNumToPrint =
2291  os << *prefix << ": insertGlobalValuesFiltered threw an "
2292  "exception: " << e.what() << endl
2293  << "Global row index: " << gblRow << endl;
2294  verbosePrintArray(os, indices, "Global column indices",
2295  maxNumToPrint);
2296  os << endl;
2297  verbosePrintArray(os, values, "Values", maxNumToPrint);
2298  os << endl;
2299  }
2300  else {
2301  os << ": insertGlobalValuesFiltered threw an exception: "
2302  << e.what();
2303  }
2304  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, os.str());
2305  }
2306  }
2307 
2308  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2309  LocalOrdinal
2312  const crs_graph_type& graph,
2313  const RowInfo& rowInfo,
2314  const LocalOrdinal inds[],
2315  const impl_scalar_type newVals[],
2316  const LocalOrdinal numElts)
2317  {
2318  typedef LocalOrdinal LO;
2319  typedef GlobalOrdinal GO;
2320  const bool sorted = graph.isSorted ();
2321 
2322  size_t hint = 0; // Guess for the current index k into rowVals
2323  LO numValid = 0; // number of valid local column indices
2324 
2325  if (graph.isLocallyIndexed ()) {
2326  // Get a view of the column indices in the row. This amortizes
2327  // the cost of getting the view over all the entries of inds.
2328  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2329 
2330  for (LO j = 0; j < numElts; ++j) {
2331  const LO lclColInd = inds[j];
2332  const size_t offset =
2333  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2334  lclColInd, hint, sorted);
2335  if (offset != rowInfo.numEntries) {
2336  rowVals[offset] = newVals[j];
2337  hint = offset + 1;
2338  ++numValid;
2339  }
2340  }
2341  }
2342  else if (graph.isGloballyIndexed ()) {
2343  if (graph.colMap_.is_null ()) {
2344  return Teuchos::OrdinalTraits<LO>::invalid ();
2345  }
2346  const map_type colMap = * (graph.colMap_);
2347 
2348  // Get a view of the column indices in the row. This amortizes
2349  // the cost of getting the view over all the entries of inds.
2350  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2351 
2352  for (LO j = 0; j < numElts; ++j) {
2353  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2354  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
2355  const size_t offset =
2356  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2357  gblColInd, hint, sorted);
2358  if (offset != rowInfo.numEntries) {
2359  rowVals[offset] = newVals[j];
2360  hint = offset + 1;
2361  ++numValid;
2362  }
2363  }
2364  }
2365  }
2366  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2367  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2368  // to be neither locally nor globally indexed on a process.
2369  // This means that the graph or matrix has no entries on that
2370  // process. Epetra also works like this. It's related to lazy
2371  // allocation (on first insertion, not at graph / matrix
2372  // construction). Lazy allocation will go away because it is
2373  // not thread scalable.
2374 
2375  return numValid;
2376  }
2377 
2378  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2379  LocalOrdinal
2381  replaceLocalValues (const LocalOrdinal localRow,
2382  const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
2383  const Teuchos::ArrayView<const Scalar>& vals)
2384  {
2385  typedef LocalOrdinal LO;
2386 
2387  const LO numInputEnt = static_cast<LO> (lclCols.size ());
2388  if (static_cast<LO> (vals.size ()) != numInputEnt) {
2389  return Teuchos::OrdinalTraits<LO>::invalid ();
2390  }
2391  const LO* const inputInds = lclCols.getRawPtr ();
2392  const Scalar* const inputVals = vals.getRawPtr ();
2393  return this->replaceLocalValues (localRow, numInputEnt,
2394  inputVals, inputInds);
2395  }
2396 
2397  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2399  local_ordinal_type
2402  const local_ordinal_type localRow,
2403  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2404  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2405  {
2406  using LO = local_ordinal_type;
2407  const LO numInputEnt = inputInds.extent(0);
2408  if (numInputEnt != static_cast<LO>(inputVals.extent(0))) {
2409  return Teuchos::OrdinalTraits<LO>::invalid();
2410  }
2411  const Scalar* const inVals =
2412  reinterpret_cast<const Scalar*>(inputVals.data());
2413  return this->replaceLocalValues(localRow, numInputEnt,
2414  inVals, inputInds.data());
2415  }
2416 
2417  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2418  LocalOrdinal
2420  replaceLocalValues (const LocalOrdinal localRow,
2421  const LocalOrdinal numEnt,
2422  const Scalar inputVals[],
2423  const LocalOrdinal inputCols[])
2424  {
2425  typedef impl_scalar_type IST;
2426  typedef LocalOrdinal LO;
2427 
2428  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2429  // Fill must be active and the "nonconst" graph must exist.
2430  return Teuchos::OrdinalTraits<LO>::invalid ();
2431  }
2432  const crs_graph_type& graph = * (this->staticGraph_);
2433  const RowInfo rowInfo = graph.getRowInfo (localRow);
2434 
2435  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2436  // The calling process does not own this row, so it is not
2437  // allowed to modify its values.
2438  return static_cast<LO> (0);
2439  }
2440  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2441  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2442  return this->replaceLocalValuesImpl (curRowVals.data (), graph, rowInfo,
2443  inputCols, inVals, numEnt);
2444  }
2445 
2446  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2447  LocalOrdinal
2450  const crs_graph_type& graph,
2451  const RowInfo& rowInfo,
2452  const GlobalOrdinal inds[],
2453  const impl_scalar_type newVals[],
2454  const LocalOrdinal numElts)
2455  {
2456  Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
2457  auto fun =
2458  [&](size_t const k, size_t const /*start*/, size_t const offset) {
2459  rowVals[offset] = newVals[k];
2460  };
2461  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2462  return graph.findGlobalIndices(rowInfo, indsT, cb);
2463  }
2464 
2465  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2466  LocalOrdinal
2468  replaceGlobalValues (const GlobalOrdinal globalRow,
2469  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2470  const Teuchos::ArrayView<const Scalar>& inputVals)
2471  {
2472  typedef LocalOrdinal LO;
2473 
2474  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2475  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2476  return Teuchos::OrdinalTraits<LO>::invalid ();
2477  }
2478  return this->replaceGlobalValues (globalRow, numInputEnt,
2479  inputVals.getRawPtr (),
2480  inputGblColInds.getRawPtr ());
2481  }
2482 
2483  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2484  LocalOrdinal
2486  replaceGlobalValues (const GlobalOrdinal globalRow,
2487  const LocalOrdinal numEnt,
2488  const Scalar inputVals[],
2489  const GlobalOrdinal inputGblColInds[])
2490  {
2491  typedef impl_scalar_type IST;
2492  typedef LocalOrdinal LO;
2493 
2494  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2495  // Fill must be active and the "nonconst" graph must exist.
2496  return Teuchos::OrdinalTraits<LO>::invalid ();
2497  }
2498  const crs_graph_type& graph = * (this->staticGraph_);
2499 
2500  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow);
2501  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2502  // The input local row is invalid on the calling process,
2503  // which means that the calling process summed 0 entries.
2504  return static_cast<LO> (0);
2505  }
2506 
2507  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2508  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2509  return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2510  inputGblColInds, inVals, numEnt);
2511  }
2512 
2513  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2515  local_ordinal_type
2518  const global_ordinal_type globalRow,
2519  const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2520  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2521  {
2522  // We use static_assert here to check the template parameters,
2523  // rather than std::enable_if (e.g., on the return value, to
2524  // enable compilation only if the template parameters match the
2525  // desired attributes). This turns obscure link errors into
2526  // clear compilation errors. It also makes the return value a
2527  // lot easier to see.
2528  using LO = local_ordinal_type;
2529  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2530  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2531  return Teuchos::OrdinalTraits<LO>::invalid();
2532  }
2533  const Scalar* const inVals =
2534  reinterpret_cast<const Scalar*>(inputVals.data());
2535  return this->replaceGlobalValues(globalRow, numInputEnt, inVals,
2536  inputInds.data());
2537  }
2538 
2539  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2540  LocalOrdinal
2543  const crs_graph_type& graph,
2544  const RowInfo& rowInfo,
2545  const GlobalOrdinal inds[],
2546  const impl_scalar_type newVals[],
2547  const LocalOrdinal numElts,
2548  const bool atomic)
2549  {
2550  typedef LocalOrdinal LO;
2551  typedef GlobalOrdinal GO;
2552 
2553  const bool sorted = graph.isSorted ();
2554 
2555  size_t hint = 0; // guess at the index's relative offset in the row
2556  LO numValid = 0; // number of valid input column indices
2557 
2558  if (graph.isLocallyIndexed ()) {
2559  // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2560  // pointer does NOT change its reference count. Thus, this
2561  // code is still thread safe.
2562  if (graph.colMap_.is_null ()) {
2563  // NO input column indices are valid in this case, since if
2564  // the column Map is null on the calling process, then the
2565  // calling process owns no graph entries.
2566  return numValid;
2567  }
2568  const map_type& colMap = * (graph.colMap_);
2569 
2570  // Get a view of the column indices in the row. This amortizes
2571  // the cost of getting the view over all the entries of inds.
2572  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2573  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2574 
2575  for (LO j = 0; j < numElts; ++j) {
2576  const LO lclColInd = colMap.getLocalElement (inds[j]);
2577  if (lclColInd != LINV) {
2578  const size_t offset =
2579  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2580  lclColInd, hint, sorted);
2581  if (offset != rowInfo.numEntries) {
2582  if (atomic) {
2583  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2584  }
2585  else {
2586  rowVals[offset] += newVals[j];
2587  }
2588  hint = offset + 1;
2589  numValid++;
2590  }
2591  }
2592  }
2593  }
2594  else if (graph.isGloballyIndexed ()) {
2595  // Get a view of the column indices in the row. This amortizes
2596  // the cost of getting the view over all the entries of inds.
2597  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2598 
2599  for (LO j = 0; j < numElts; ++j) {
2600  const GO gblColInd = inds[j];
2601  const size_t offset =
2602  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2603  gblColInd, hint, sorted);
2604  if (offset != rowInfo.numEntries) {
2605  if (atomic) {
2606  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2607  }
2608  else {
2609  rowVals[offset] += newVals[j];
2610  }
2611  hint = offset + 1;
2612  numValid++;
2613  }
2614  }
2615  }
2616  // If the graph is neither locally nor globally indexed on the
2617  // calling process, that means the calling process has no graph
2618  // entries. Thus, none of the input column indices are valid.
2619 
2620  return numValid;
2621  }
2622 
2623  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2624  LocalOrdinal
2626  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2627  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2628  const Teuchos::ArrayView<const Scalar>& inputVals,
2629  const bool atomic)
2630  {
2631  typedef LocalOrdinal LO;
2632 
2633  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2634  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2635  return Teuchos::OrdinalTraits<LO>::invalid ();
2636  }
2637  return this->sumIntoGlobalValues (gblRow, numInputEnt,
2638  inputVals.getRawPtr (),
2639  inputGblColInds.getRawPtr (),
2640  atomic);
2641  }
2642 
2643  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2644  LocalOrdinal
2646  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2647  const LocalOrdinal numInputEnt,
2648  const Scalar inputVals[],
2649  const GlobalOrdinal inputGblColInds[],
2650  const bool atomic)
2651  {
2652  typedef impl_scalar_type IST;
2653  typedef LocalOrdinal LO;
2654  typedef GlobalOrdinal GO;
2655 
2656  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2657  // Fill must be active and the "nonconst" graph must exist.
2658  return Teuchos::OrdinalTraits<LO>::invalid ();
2659  }
2660  const crs_graph_type& graph = * (this->staticGraph_);
2661 
2662  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2663  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2664  // mfh 23 Mar 2017, 26 Jul 2017: This branch may not be not
2665  // thread safe in a debug build, in part because it uses
2666  // Teuchos::ArrayView, and in part because of the data structure
2667  // used to stash outgoing entries.
2668  using Teuchos::ArrayView;
2669  ArrayView<const GO> inputGblColInds_av(
2670  numInputEnt == 0 ? nullptr : inputGblColInds,
2671  numInputEnt);
2672  ArrayView<const Scalar> inputVals_av(
2673  numInputEnt == 0 ? nullptr :
2674  inputVals, numInputEnt);
2675  // gblRow is not in the row Map on the calling process, so stash
2676  // the given entries away in a separate data structure.
2677  // globalAssemble() (called during fillComplete()) will exchange
2678  // that data and sum it in using sumIntoGlobalValues().
2679  this->insertNonownedGlobalValues (gblRow, inputGblColInds_av,
2680  inputVals_av);
2681  // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
2682  // since we won't know whether the given indices were valid
2683  // until globalAssemble (called in fillComplete) is called.
2684  // That's why insertNonownedGlobalValues doesn't return
2685  // anything. Just for consistency, I'll return the number of
2686  // entries that the user gave us.
2687  return numInputEnt;
2688  }
2689  else { // input row is in the row Map on the calling process
2690  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2691  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2692  return this->sumIntoGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2693  inputGblColInds, inVals,
2694  numInputEnt, atomic);
2695  }
2696  }
2697 
2698  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2699  LocalOrdinal
2701  transformLocalValues (const LocalOrdinal lclRow,
2702  const LocalOrdinal numInputEnt,
2703  const impl_scalar_type inputVals[],
2704  const LocalOrdinal inputCols[],
2705  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2706  const bool atomic)
2707  {
2708  using Tpetra::Details::OrdinalTraits;
2709  typedef LocalOrdinal LO;
2710 
2711  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2712  // Fill must be active and the "nonconst" graph must exist.
2713  return Teuchos::OrdinalTraits<LO>::invalid ();
2714  }
2715  const crs_graph_type& graph = * (this->staticGraph_);
2716  const RowInfo rowInfo = graph.getRowInfo (lclRow);
2717 
2718  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2719  // The calling process does not own this row, so it is not
2720  // allowed to modify its values.
2721  return static_cast<LO> (0);
2722  }
2723  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2724  return this->transformLocalValues (curRowVals.data (), graph,
2725  rowInfo, inputCols, inputVals,
2726  numInputEnt, f, atomic);
2727  }
2728 
2729  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2730  LocalOrdinal
2731  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2732  transformGlobalValues (const GlobalOrdinal gblRow,
2733  const LocalOrdinal numInputEnt,
2734  const impl_scalar_type inputVals[],
2735  const GlobalOrdinal inputCols[],
2736  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2737  const bool atomic)
2738  {
2739  using Tpetra::Details::OrdinalTraits;
2740  typedef LocalOrdinal LO;
2741 
2742  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2743  // Fill must be active and the "nonconst" graph must exist.
2744  return OrdinalTraits<LO>::invalid ();
2745  }
2746  const crs_graph_type& graph = * (this->staticGraph_);
2747  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2748 
2749  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2750  // The calling process does not own this row, so it is not
2751  // allowed to modify its values.
2752  return static_cast<LO> (0);
2753  }
2754  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2755  return this->transformGlobalValues (curRowVals.data (), graph,
2756  rowInfo, inputCols, inputVals,
2757  numInputEnt, f, atomic);
2758  }
2759 
2760  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2761  LocalOrdinal
2762  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2763  transformLocalValues (impl_scalar_type rowVals[],
2764  const crs_graph_type& graph,
2765  const RowInfo& rowInfo,
2766  const LocalOrdinal inds[],
2767  const impl_scalar_type newVals[],
2768  const LocalOrdinal numElts,
2769  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2770  const bool atomic)
2771  {
2772  typedef impl_scalar_type ST;
2773  typedef LocalOrdinal LO;
2774  typedef GlobalOrdinal GO;
2775 
2776  //if (newVals.extent (0) != inds.extent (0)) {
2777  // The sizes of the input arrays must match.
2778  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2779  //}
2780  //const LO numElts = static_cast<LO> (inds.extent (0));
2781  const bool sorted = graph.isSorted ();
2782 
2783  LO numValid = 0; // number of valid input column indices
2784  size_t hint = 0; // Guess for the current index k into rowVals
2785 
2786  if (graph.isLocallyIndexed ()) {
2787  // Get a view of the column indices in the row. This amortizes
2788  // the cost of getting the view over all the entries of inds.
2789  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2790 
2791  for (LO j = 0; j < numElts; ++j) {
2792  const LO lclColInd = inds[j];
2793  const size_t offset =
2794  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2795  lclColInd, hint, sorted);
2796  if (offset != rowInfo.numEntries) {
2797  if (atomic) {
2798  // NOTE (mfh 30 Nov 2015) The commented-out code is
2799  // wrong because another thread may have changed
2800  // rowVals[offset] between those two lines of code.
2801  //
2802  //const ST newVal = f (rowVals[offset], newVals[j]);
2803  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2804 
2805  volatile ST* const dest = &rowVals[offset];
2806  (void) atomic_binary_function_update (dest, newVals[j], f);
2807  }
2808  else {
2809  // use binary function f
2810  rowVals[offset] = f (rowVals[offset], newVals[j]);
2811  }
2812  hint = offset + 1;
2813  ++numValid;
2814  }
2815  }
2816  }
2817  else if (graph.isGloballyIndexed ()) {
2818  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2819  // pointer does NOT change its reference count. Thus, this
2820  // code is still thread safe.
2821  if (graph.colMap_.is_null ()) {
2822  // NO input column indices are valid in this case. Either
2823  // the column Map hasn't been set yet (so local indices
2824  // don't exist yet), or the calling process owns no graph
2825  // entries.
2826  return numValid;
2827  }
2828  const map_type& colMap = * (graph.colMap_);
2829  // Get a view of the column indices in the row. This amortizes
2830  // the cost of getting the view over all the entries of inds.
2831  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2832 
2833  const GO GINV = Teuchos::OrdinalTraits<GO>::invalid ();
2834  for (LO j = 0; j < numElts; ++j) {
2835  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2836  if (gblColInd != GINV) {
2837  const size_t offset =
2838  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2839  gblColInd, hint, sorted);
2840  if (offset != rowInfo.numEntries) {
2841  if (atomic) {
2842  // NOTE (mfh 30 Nov 2015) The commented-out code is
2843  // wrong because another thread may have changed
2844  // rowVals[offset] between those two lines of code.
2845  //
2846  //const ST newVal = f (rowVals[offset], newVals[j]);
2847  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2848 
2849  volatile ST* const dest = &rowVals[offset];
2850  (void) atomic_binary_function_update (dest, newVals[j], f);
2851  }
2852  else {
2853  // use binary function f
2854  rowVals[offset] = f (rowVals[offset], newVals[j]);
2855  }
2856  hint = offset + 1;
2857  numValid++;
2858  }
2859  }
2860  }
2861  }
2862  // If the graph is neither locally nor globally indexed on the
2863  // calling process, that means the calling process has no graph
2864  // entries. Thus, none of the input column indices are valid.
2865 
2866  return numValid;
2867  }
2868 
2869  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2870  LocalOrdinal
2871  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2872  transformGlobalValues (impl_scalar_type rowVals[],
2873  const crs_graph_type& graph,
2874  const RowInfo& rowInfo,
2875  const GlobalOrdinal inds[],
2876  const impl_scalar_type newVals[],
2877  const LocalOrdinal numElts,
2878  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2879  const bool atomic)
2880  {
2881  typedef impl_scalar_type ST;
2882  typedef LocalOrdinal LO;
2883  typedef GlobalOrdinal GO;
2884 
2885  //if (newVals.extent (0) != inds.extent (0)) {
2886  // The sizes of the input arrays must match.
2887  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2888  //}
2889  //const LO numElts = static_cast<LO> (inds.extent (0));
2890  const bool sorted = graph.isSorted ();
2891 
2892  LO numValid = 0; // number of valid input column indices
2893  size_t hint = 0; // Guess for the current index k into rowVals
2894 
2895  if (graph.isGloballyIndexed ()) {
2896  // Get a view of the column indices in the row. This amortizes
2897  // the cost of getting the view over all the entries of inds.
2898  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2899 
2900  for (LO j = 0; j < numElts; ++j) {
2901  const GO gblColInd = inds[j];
2902  const size_t offset =
2903  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2904  gblColInd, hint, sorted);
2905  if (offset != rowInfo.numEntries) {
2906  if (atomic) {
2907  // NOTE (mfh 30 Nov 2015) The commented-out code is
2908  // wrong because another thread may have changed
2909  // rowVals[offset] between those two lines of code.
2910  //
2911  //const ST newVal = f (rowVals[offset], newVals[j]);
2912  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2913 
2914  volatile ST* const dest = &rowVals[offset];
2915  (void) atomic_binary_function_update (dest, newVals[j], f);
2916  }
2917  else {
2918  // use binary function f
2919  rowVals[offset] = f (rowVals[offset], newVals[j]);
2920  }
2921  hint = offset + 1;
2922  ++numValid;
2923  }
2924  }
2925  }
2926  else if (graph.isLocallyIndexed ()) {
2927  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2928  // pointer does NOT change its reference count. Thus, this
2929  // code is still thread safe.
2930  if (graph.colMap_.is_null ()) {
2931  // NO input column indices are valid in this case. Either the
2932  // column Map hasn't been set yet (so local indices don't
2933  // exist yet), or the calling process owns no graph entries.
2934  return numValid;
2935  }
2936  const map_type& colMap = * (graph.colMap_);
2937  // Get a view of the column indices in the row. This amortizes
2938  // the cost of getting the view over all the entries of inds.
2939  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2940 
2941  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2942  for (LO j = 0; j < numElts; ++j) {
2943  const LO lclColInd = colMap.getLocalElement (inds[j]);
2944  if (lclColInd != LINV) {
2945  const size_t offset =
2946  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2947  lclColInd, hint, sorted);
2948  if (offset != rowInfo.numEntries) {
2949  if (atomic) {
2950  // NOTE (mfh 30 Nov 2015) The commented-out code is
2951  // wrong because another thread may have changed
2952  // rowVals[offset] between those two lines of code.
2953  //
2954  //const ST newVal = f (rowVals[offset], newVals[j]);
2955  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2956 
2957  volatile ST* const dest = &rowVals[offset];
2958  (void) atomic_binary_function_update (dest, newVals[j], f);
2959  }
2960  else {
2961  // use binary function f
2962  rowVals[offset] = f (rowVals[offset], newVals[j]);
2963  }
2964  hint = offset + 1;
2965  numValid++;
2966  }
2967  }
2968  }
2969  }
2970  // If the graph is neither locally nor globally indexed on the
2971  // calling process, that means the calling process has no graph
2972  // entries. Thus, none of the input column indices are valid.
2973 
2974  return numValid;
2975  }
2976 
2977  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2978  LocalOrdinal
2981  const crs_graph_type& graph,
2982  const RowInfo& rowInfo,
2983  const LocalOrdinal inds[],
2984  const impl_scalar_type newVals[],
2985  const LocalOrdinal numElts,
2986  const bool atomic)
2987  {
2988  typedef LocalOrdinal LO;
2989  typedef GlobalOrdinal GO;
2990 
2991  const bool sorted = graph.isSorted ();
2992 
2993  size_t hint = 0; // Guess for the current index k into rowVals
2994  LO numValid = 0; // number of valid local column indices
2995 
2996  if (graph.isLocallyIndexed ()) {
2997  // Get a view of the column indices in the row. This amortizes
2998  // the cost of getting the view over all the entries of inds.
2999  auto colInds = graph.getLocalIndsViewHost (rowInfo);
3000 
3001  for (LO j = 0; j < numElts; ++j) {
3002  const LO lclColInd = inds[j];
3003  const size_t offset =
3004  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3005  lclColInd, hint, sorted);
3006  if (offset != rowInfo.numEntries) {
3007  if (atomic) {
3008  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3009  }
3010  else {
3011  rowVals[offset] += newVals[j];
3012  }
3013  hint = offset + 1;
3014  ++numValid;
3015  }
3016  }
3017  }
3018  else if (graph.isGloballyIndexed ()) {
3019  if (graph.colMap_.is_null ()) {
3020  return Teuchos::OrdinalTraits<LO>::invalid ();
3021  }
3022  const map_type colMap = * (graph.colMap_);
3023 
3024  // Get a view of the column indices in the row. This amortizes
3025  // the cost of getting the view over all the entries of inds.
3026  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
3027 
3028  for (LO j = 0; j < numElts; ++j) {
3029  const GO gblColInd = colMap.getGlobalElement (inds[j]);
3030  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
3031  const size_t offset =
3032  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3033  gblColInd, hint, sorted);
3034  if (offset != rowInfo.numEntries) {
3035  if (atomic) {
3036  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3037  }
3038  else {
3039  rowVals[offset] += newVals[j];
3040  }
3041  hint = offset + 1;
3042  ++numValid;
3043  }
3044  }
3045  }
3046  }
3047  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
3048  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
3049  // to be neither locally nor globally indexed on a process.
3050  // This means that the graph or matrix has no entries on that
3051  // process. Epetra also works like this. It's related to lazy
3052  // allocation (on first insertion, not at graph / matrix
3053  // construction). Lazy allocation will go away because it is
3054  // not thread scalable.
3055 
3056  return numValid;
3057  }
3058 
3059  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3060  LocalOrdinal
3062  sumIntoLocalValues (const LocalOrdinal localRow,
3063  const Teuchos::ArrayView<const LocalOrdinal>& indices,
3064  const Teuchos::ArrayView<const Scalar>& values,
3065  const bool atomic)
3066  {
3067  using LO = local_ordinal_type;
3068  const LO numInputEnt = static_cast<LO>(indices.size());
3069  if (static_cast<LO>(values.size()) != numInputEnt) {
3070  return Teuchos::OrdinalTraits<LO>::invalid();
3071  }
3072  const LO* const inputInds = indices.getRawPtr();
3073  const scalar_type* const inputVals = values.getRawPtr();
3074  return this->sumIntoLocalValues(localRow, numInputEnt,
3075  inputVals, inputInds, atomic);
3076  }
3077 
3078  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3080  local_ordinal_type
3083  const local_ordinal_type localRow,
3084  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
3085  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
3086  const bool atomic)
3087  {
3088  using LO = local_ordinal_type;
3089  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
3090  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
3091  return Teuchos::OrdinalTraits<LO>::invalid();
3092  }
3093  const scalar_type* inVals =
3094  reinterpret_cast<const scalar_type*>(inputVals.data());
3095  return this->sumIntoLocalValues(localRow, numInputEnt, inVals,
3096  inputInds.data(), atomic);
3097  }
3098 
3099  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3100  LocalOrdinal
3102  sumIntoLocalValues (const LocalOrdinal localRow,
3103  const LocalOrdinal numEnt,
3104  const Scalar vals[],
3105  const LocalOrdinal cols[],
3106  const bool atomic)
3107  {
3108  typedef impl_scalar_type IST;
3109  typedef LocalOrdinal LO;
3110 
3111  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
3112  // Fill must be active and the "nonconst" graph must exist.
3113  return Teuchos::OrdinalTraits<LO>::invalid ();
3114  }
3115  const crs_graph_type& graph = * (this->staticGraph_);
3116  const RowInfo rowInfo = graph.getRowInfo (localRow);
3117 
3118  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
3119  // The calling process does not own this row, so it is not
3120  // allowed to modify its values.
3121  return static_cast<LO> (0);
3122  }
3123  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
3124  const IST* const inputVals = reinterpret_cast<const IST*> (vals);
3125  return this->sumIntoLocalValuesImpl (curRowVals.data (), graph, rowInfo,
3126  cols, inputVals, numEnt, atomic);
3127  }
3128 
3129  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3131  values_dualv_type::t_host::const_type
3133  getValuesViewHost (const RowInfo& rowinfo) const
3134  {
3135  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3136  return typename values_dualv_type::t_host::const_type ();
3137  else
3138  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3139  rowinfo.allocSize,
3140  Access::ReadOnly);
3141  }
3142 
3143  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3145  values_dualv_type::t_host
3148  {
3149  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3150  return typename values_dualv_type::t_host ();
3151  else
3152  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3153  rowinfo.allocSize,
3154  Access::ReadWrite);
3155  }
3156 
3157  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3159  values_dualv_type::t_dev::const_type
3161  getValuesViewDevice (const RowInfo& rowinfo) const
3162  {
3163  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3164  return typename values_dualv_type::t_dev::const_type ();
3165  else
3166  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3167  rowinfo.allocSize,
3168  Access::ReadOnly);
3169  }
3170 
3171  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3173  values_dualv_type::t_dev
3176  {
3177  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3178  return typename values_dualv_type::t_dev ();
3179  else
3180  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3181  rowinfo.allocSize,
3182  Access::ReadWrite);
3183  }
3184 
3185 
3186  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3187  void
3190  nonconst_local_inds_host_view_type &indices,
3191  nonconst_values_host_view_type &values,
3192  size_t& numEntries) const
3193  {
3194  using Teuchos::ArrayView;
3195  using Teuchos::av_reinterpret_cast;
3196  const char tfecfFuncName[] = "getLocalRowCopy: ";
3197 
3198  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3199  (! this->hasColMap (), std::runtime_error,
3200  "The matrix does not have a column Map yet. This means we don't have "
3201  "local indices for columns yet, so it doesn't make sense to call this "
3202  "method. If the matrix doesn't have a column Map yet, you should call "
3203  "fillComplete on it first.");
3204 
3205  const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3206  const size_t theNumEntries = rowinfo.numEntries;
3207  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3208  (static_cast<size_t> (indices.size ()) < theNumEntries ||
3209  static_cast<size_t> (values.size ()) < theNumEntries,
3210  std::runtime_error, "Row with local index " << localRow << " has " <<
3211  theNumEntries << " entry/ies, but indices.size() = " <<
3212  indices.size () << " and values.size() = " << values.size () << ".");
3213  numEntries = theNumEntries; // first side effect
3214 
3215  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3216  if (staticGraph_->isLocallyIndexed ()) {
3217  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3218  auto curVals = getValuesViewHost(rowinfo);
3219 
3220  for (size_t j = 0; j < theNumEntries; ++j) {
3221  values[j] = curVals[j];
3222  indices[j] = curLclInds(j);
3223  }
3224  }
3225  else if (staticGraph_->isGloballyIndexed ()) {
3226  // Don't call getColMap(), because it touches RCP's reference count.
3227  const map_type& colMap = * (staticGraph_->colMap_);
3228  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3229  auto curVals = getValuesViewHost(rowinfo);
3230 
3231  for (size_t j = 0; j < theNumEntries; ++j) {
3232  values[j] = curVals[j];
3233  indices[j] = colMap.getLocalElement (curGblInds(j));
3234  }
3235  }
3236  }
3237  }
3238 
3239 
3240 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3241 void
3244  nonconst_global_inds_host_view_type &indices,
3245  nonconst_values_host_view_type &values,
3246  size_t& numEntries) const
3247  {
3248  using Teuchos::ArrayView;
3249  using Teuchos::av_reinterpret_cast;
3250  const char tfecfFuncName[] = "getGlobalRowCopy: ";
3251 
3252  const RowInfo rowinfo =
3253  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3254  const size_t theNumEntries = rowinfo.numEntries;
3255  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3256  static_cast<size_t> (indices.size ()) < theNumEntries ||
3257  static_cast<size_t> (values.size ()) < theNumEntries,
3258  std::runtime_error, "Row with global index " << globalRow << " has "
3259  << theNumEntries << " entry/ies, but indices.size() = " <<
3260  indices.size () << " and values.size() = " << values.size () << ".");
3261  numEntries = theNumEntries; // first side effect
3262 
3263  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3264  if (staticGraph_->isLocallyIndexed ()) {
3265  const map_type& colMap = * (staticGraph_->colMap_);
3266  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3267  auto curVals = getValuesViewHost(rowinfo);
3268 
3269  for (size_t j = 0; j < theNumEntries; ++j) {
3270  values[j] = curVals[j];
3271  indices[j] = colMap.getGlobalElement (curLclInds(j));
3272  }
3273  }
3274  else if (staticGraph_->isGloballyIndexed ()) {
3275  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3276  auto curVals = getValuesViewHost(rowinfo);
3277 
3278  for (size_t j = 0; j < theNumEntries; ++j) {
3279  values[j] = curVals[j];
3280  indices[j] = curGblInds(j);
3281  }
3282  }
3283  }
3284  }
3285 
3286 
3287  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3288  void
3290  getLocalRowView(LocalOrdinal localRow,
3291  local_inds_host_view_type &indices,
3292  values_host_view_type &values) const
3293  {
3294  const char tfecfFuncName[] = "getLocalRowView: ";
3295 
3296  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3297  isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
3298  "its indices as global indices, so you cannot get a view with local "
3299  "column indices. If the matrix has a column Map, you may call "
3300  "getLocalRowCopy() to get local column indices; otherwise, you may get "
3301  "a view with global column indices by calling getGlobalRowCopy().");
3302 
3303  const RowInfo rowInfo = staticGraph_->getRowInfo (localRow);
3304  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3305  rowInfo.numEntries > 0) {
3306  indices = staticGraph_->lclIndsUnpacked_wdv.getHostSubview(
3307  rowInfo.offset1D,
3308  rowInfo.numEntries,
3309  Access::ReadOnly);
3310  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3311  rowInfo.numEntries,
3312  Access::ReadOnly);
3313  }
3314  else {
3315  // This does the right thing (reports an empty row) if the input
3316  // row is invalid.
3317  indices = local_inds_host_view_type();
3318  values = values_host_view_type();
3319  }
3320 
3321 #ifdef HAVE_TPETRA_DEBUG
3322  const char suffix[] = ". This should never happen. Please report this "
3323  "bug to the Tpetra developers.";
3324  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3325  (static_cast<size_t> (indices.size ()) !=
3326  static_cast<size_t> (values.size ()), std::logic_error,
3327  "At the end of this method, for local row " << localRow << ", "
3328  "indices.size() = " << indices.size () << " != values.size () = "
3329  << values.size () << suffix);
3330  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3331  (static_cast<size_t> (indices.size ()) !=
3332  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3333  "At the end of this method, for local row " << localRow << ", "
3334  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3335  << rowInfo.numEntries << suffix);
3336  const size_t expectedNumEntries = getNumEntriesInLocalRow (localRow);
3337  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3338  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3339  "of this method, for local row " << localRow << ", rowInfo.numEntries = "
3340  << rowInfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " <<
3341  expectedNumEntries << suffix);
3342 #endif // HAVE_TPETRA_DEBUG
3343  }
3344 
3345 
3346  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3347  void
3349  getGlobalRowView (GlobalOrdinal globalRow,
3350  global_inds_host_view_type &indices,
3351  values_host_view_type &values) const
3352  {
3353  const char tfecfFuncName[] = "getGlobalRowView: ";
3354 
3355  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3356  isLocallyIndexed (), std::runtime_error,
3357  "The matrix is locally indexed, so we cannot return a view of the row "
3358  "with global column indices. Use getGlobalRowCopy() instead.");
3359 
3360  // This does the right thing (reports an empty row) if the input
3361  // row is invalid.
3362  const RowInfo rowInfo =
3363  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3364  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3365  rowInfo.numEntries > 0) {
3366  indices = staticGraph_->gblInds_wdv.getHostSubview(rowInfo.offset1D,
3367  rowInfo.numEntries,
3368  Access::ReadOnly);
3369  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3370  rowInfo.numEntries,
3371  Access::ReadOnly);
3372  }
3373  else {
3374  indices = global_inds_host_view_type();
3375  values = values_host_view_type();
3376  }
3377 
3378 #ifdef HAVE_TPETRA_DEBUG
3379  const char suffix[] = ". This should never happen. Please report this "
3380  "bug to the Tpetra developers.";
3381  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3382  (static_cast<size_t> (indices.size ()) !=
3383  static_cast<size_t> (values.size ()), std::logic_error,
3384  "At the end of this method, for global row " << globalRow << ", "
3385  "indices.size() = " << indices.size () << " != values.size () = "
3386  << values.size () << suffix);
3387  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3388  (static_cast<size_t> (indices.size ()) !=
3389  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3390  "At the end of this method, for global row " << globalRow << ", "
3391  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3392  << rowInfo.numEntries << suffix);
3393  const size_t expectedNumEntries = getNumEntriesInGlobalRow (globalRow);
3394  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3395  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3396  "of this method, for global row " << globalRow << ", rowInfo.numEntries "
3397  "= " << rowInfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
3398  " " << expectedNumEntries << suffix);
3399 #endif // HAVE_TPETRA_DEBUG
3400  }
3401 
3402 
3403  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3404  void
3406  scale (const Scalar& alpha)
3407  {
3408  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3409 
3410  const size_t nlrs = staticGraph_->getLocalNumRows ();
3411  const size_t numEntries = staticGraph_->getLocalNumEntries ();
3412  if (! staticGraph_->indicesAreAllocated () ||
3413  nlrs == 0 || numEntries == 0) {
3414  // do nothing
3415  }
3416  else {
3417 
3418  auto vals = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
3419  KokkosBlas::scal(vals, theAlpha, vals);
3420 
3421  }
3422  }
3423 
3424  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3425  void
3427  setAllToScalar (const Scalar& alpha)
3428  {
3429  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3430 
3431  // replace all values in the matrix
3432  // it is easiest to replace all allocated values, instead of replacing only the ones with valid entries
3433  // however, if there are no valid entries, we can short-circuit
3434  // furthermore, if the values aren't allocated, we can short-circuit (no entry have been inserted so far)
3435  const size_t numEntries = staticGraph_->getLocalNumEntries();
3436  if (! staticGraph_->indicesAreAllocated () || numEntries == 0) {
3437  // do nothing
3438  }
3439  else {
3440  // DEEP_COPY REVIEW - VALUE-TO-DEVICE
3441  Kokkos::deep_copy (execution_space(), valuesUnpacked_wdv.getDeviceView(Access::OverwriteAll),
3442  theAlpha);
3443  // CAG: This fence was found to be required on Cuda with UVM=on.
3444  Kokkos::fence("CrsMatrix::setAllToScalar");
3445  }
3446  }
3447 
3448  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3449  void
3451  setAllValues (const typename local_graph_device_type::row_map_type& rowPointers,
3452  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
3453  const typename local_matrix_device_type::values_type& values)
3454  {
3455  using ProfilingRegion=Details::ProfilingRegion;
3456  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues");
3457  const char tfecfFuncName[] = "setAllValues: ";
3458  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3459  (columnIndices.size () != values.size (), std::invalid_argument,
3460  "columnIndices.size() = " << columnIndices.size () << " != values.size()"
3461  " = " << values.size () << ".");
3462  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3463  (myGraph_.is_null (), std::runtime_error, "myGraph_ must not be null.");
3464 
3465  try {
3466  myGraph_->setAllIndices (rowPointers, columnIndices);
3467  }
3468  catch (std::exception &e) {
3469  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3470  (true, std::runtime_error, "myGraph_->setAllIndices() threw an "
3471  "exception: " << e.what ());
3472  }
3473 
3474  // Make sure that myGraph_ now has a local graph. It may not be
3475  // fillComplete yet, so it's important to check. We don't care
3476  // whether setAllIndices() did a shallow copy or a deep copy, so a
3477  // good way to check is to compare dimensions.
3478  auto lclGraph = myGraph_->getLocalGraphDevice ();
3479  const size_t numEnt = lclGraph.entries.extent (0);
3480  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3481  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
3482  numEnt != static_cast<size_t> (columnIndices.extent (0)),
3483  std::logic_error, "myGraph_->setAllIndices() did not correctly create "
3484  "local graph. Please report this bug to the Tpetra developers.");
3485 
3486  valuesPacked_wdv = values_wdv_type(values);
3487  valuesUnpacked_wdv = valuesPacked_wdv;
3488 
3489  // Storage MUST be packed, since the interface doesn't give any
3490  // way to indicate any extra space at the end of each row.
3491  this->storageStatus_ = Details::STORAGE_1D_PACKED;
3492 
3493  checkInternalState ();
3494  }
3495 
3496  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3497  void
3499  setAllValues ( const local_matrix_device_type& localDeviceMatrix)
3500  {
3501  using ProfilingRegion=Details::ProfilingRegion;
3502  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix");
3503 
3504  auto graph = localDeviceMatrix.graph;
3505  //FIXME how to check whether graph is allocated
3506 
3507  auto rows = graph.row_map;
3508  auto columns = graph.entries;
3509  auto values = localDeviceMatrix.values;
3510 
3511  setAllValues(rows,columns,values);
3512  }
3513 
3514  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3515  void
3517  setAllValues (const Teuchos::ArrayRCP<size_t>& ptr,
3518  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
3519  const Teuchos::ArrayRCP<Scalar>& val)
3520  {
3521  using Kokkos::Compat::getKokkosViewDeepCopy;
3522  using Teuchos::ArrayRCP;
3523  using Teuchos::av_reinterpret_cast;
3524  typedef device_type DT;
3525  typedef impl_scalar_type IST;
3526  typedef typename local_graph_device_type::row_map_type row_map_type;
3527  //typedef typename row_map_type::non_const_value_type row_offset_type;
3528  const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
3529 
3530  // The row offset type may depend on the execution space. It may
3531  // not necessarily be size_t. If it's not, we need to make a deep
3532  // copy. We need to make a deep copy anyway so that Kokkos can
3533  // own the memory. Regardless, ptrIn gets the copy.
3534  typename row_map_type::non_const_type ptrNative ("ptr", ptr.size ());
3535  Kokkos::View<const size_t*,
3536  typename row_map_type::array_layout,
3537  Kokkos::HostSpace,
3538  Kokkos::MemoryUnmanaged> ptrSizeT (ptr.getRawPtr (), ptr.size ());
3539  ::Tpetra::Details::copyOffsets (ptrNative, ptrSizeT);
3540 
3541  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3542  (ptrNative.extent (0) != ptrSizeT.extent (0),
3543  std::logic_error, "ptrNative.extent(0) = " <<
3544  ptrNative.extent (0) << " != ptrSizeT.extent(0) = "
3545  << ptrSizeT.extent (0) << ". Please report this bug to the "
3546  "Tpetra developers.");
3547 
3548  auto indIn = getKokkosViewDeepCopy<DT> (ind ());
3549  auto valIn = getKokkosViewDeepCopy<DT> (av_reinterpret_cast<IST> (val ()));
3550  this->setAllValues (ptrNative, indIn, valIn);
3551  }
3552 
3553  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3554  void
3556  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
3557  {
3558  const char tfecfFuncName[] = "getLocalDiagOffsets: ";
3559  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3560  (staticGraph_.is_null (), std::runtime_error, "The matrix has no graph.");
3561 
3562  // mfh 11 May 2016: We plan to deprecate the ArrayRCP version of
3563  // this method in CrsGraph too, so don't call it (otherwise build
3564  // warnings will show up and annoy users). Instead, copy results
3565  // in and out, if the memory space requires it.
3566 
3567  const size_t lclNumRows = staticGraph_->getLocalNumRows ();
3568  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
3569  offsets.resize (lclNumRows);
3570  }
3571 
3572  // The input ArrayRCP must always be a host pointer. Thus, if
3573  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
3574  // to write to that allocation directly as a Kokkos::View.
3575  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
3576  // It is always syntactically correct to assign a raw host
3577  // pointer to a device View, so this code will compile correctly
3578  // even if this branch never runs.
3579  typedef Kokkos::View<size_t*, device_type,
3580  Kokkos::MemoryUnmanaged> output_type;
3581  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3582  staticGraph_->getLocalDiagOffsets (offsetsOut);
3583  }
3584  else {
3585  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
3586  staticGraph_->getLocalDiagOffsets (offsetsTmp);
3587  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
3588  Kokkos::MemoryUnmanaged> output_type;
3589  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3590  // DEEP_COPY REVIEW - DEVICE-TO-HOST
3591  Kokkos::deep_copy (execution_space(), offsetsOut, offsetsTmp);
3592  }
3593  }
3594 
3595  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3596  void
3599  {
3600  using Teuchos::ArrayRCP;
3601  using Teuchos::ArrayView;
3602  using Teuchos::av_reinterpret_cast;
3603  const char tfecfFuncName[] = "getLocalDiagCopy (1-arg): ";
3604  typedef local_ordinal_type LO;
3605 
3606 
3607  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3608  staticGraph_.is_null (), std::runtime_error,
3609  "This method requires that the matrix have a graph.");
3610  auto rowMapPtr = this->getRowMap ();
3611  if (rowMapPtr.is_null () || rowMapPtr->getComm ().is_null ()) {
3612  // Processes on which the row Map or its communicator is null
3613  // don't participate. Users shouldn't even call this method on
3614  // those processes.
3615  return;
3616  }
3617  auto colMapPtr = this->getColMap ();
3618  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3619  (! this->hasColMap () || colMapPtr.is_null (), std::runtime_error,
3620  "This method requires that the matrix have a column Map.");
3621  const map_type& rowMap = * rowMapPtr;
3622  const map_type& colMap = * colMapPtr;
3623  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3624 
3625 #ifdef HAVE_TPETRA_DEBUG
3626  // isCompatible() requires an all-reduce, and thus this check
3627  // should only be done in debug mode.
3628  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3629  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3630  "The input Vector's Map must be compatible with the CrsMatrix's row "
3631  "Map. You may check this by using Map's isCompatible method: "
3632  "diag.getMap ()->isCompatible (A.getRowMap ());");
3633 #endif // HAVE_TPETRA_DEBUG
3634 
3635  if (this->isFillComplete ()) {
3636  const auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
3637  // 1-D subview of the first (and only) column of D_lcl.
3638  const auto D_lcl_1d =
3639  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3640 
3641  const auto lclRowMap = rowMap.getLocalMap ();
3642  const auto lclColMap = colMap.getLocalMap ();
3644  (void) getDiagCopyWithoutOffsets (D_lcl_1d, lclRowMap,
3645  lclColMap,
3646  getLocalMatrixDevice ());
3647  }
3648  else {
3650  (void) getLocalDiagCopyWithoutOffsetsNotFillComplete (diag, *this);
3651  }
3652  }
3653 
3654  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3655  void
3658  const Kokkos::View<const size_t*, device_type,
3659  Kokkos::MemoryUnmanaged>& offsets) const
3660  {
3661  typedef LocalOrdinal LO;
3662 
3663 #ifdef HAVE_TPETRA_DEBUG
3664  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3665  const map_type& rowMap = * (this->getRowMap ());
3666  // isCompatible() requires an all-reduce, and thus this check
3667  // should only be done in debug mode.
3668  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3669  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3670  "The input Vector's Map must be compatible with (in the sense of Map::"
3671  "isCompatible) the CrsMatrix's row Map.");
3672 #endif // HAVE_TPETRA_DEBUG
3673 
3674  // For now, we fill the Vector on the host and sync to device.
3675  // Later, we may write a parallel kernel that works entirely on
3676  // device.
3677  //
3678  // NOTE (mfh 21 Jan 2016): The host kernel here assumes UVM. Once
3679  // we write a device kernel, it will not need to assume UVM.
3680 
3681  auto D_lcl = diag.getLocalViewDevice (Access::OverwriteAll);
3682  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3683  // Get 1-D subview of the first (and only) column of D_lcl.
3684  auto D_lcl_1d =
3685  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3686 
3687  KokkosSparse::getDiagCopy (D_lcl_1d, offsets,
3688  getLocalMatrixDevice ());
3689  }
3690 
3691  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3692  void
3695  const Teuchos::ArrayView<const size_t>& offsets) const
3696  {
3697  using LO = LocalOrdinal;
3698  using host_execution_space = Kokkos::DefaultHostExecutionSpace;
3699  using IST = impl_scalar_type;
3700 
3701 #ifdef HAVE_TPETRA_DEBUG
3702  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3703  const map_type& rowMap = * (this->getRowMap ());
3704  // isCompatible() requires an all-reduce, and thus this check
3705  // should only be done in debug mode.
3706  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3707  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3708  "The input Vector's Map must be compatible with (in the sense of Map::"
3709  "isCompatible) the CrsMatrix's row Map.");
3710 #endif // HAVE_TPETRA_DEBUG
3711 
3712  // See #1510. In case diag has already been marked modified on
3713  // device, we need to clear that flag, since the code below works
3714  // on host.
3715  //diag.clear_sync_state ();
3716 
3717  // For now, we fill the Vector on the host and sync to device.
3718  // Later, we may write a parallel kernel that works entirely on
3719  // device.
3720  auto lclVecHost = diag.getLocalViewHost(Access::OverwriteAll);
3721  // 1-D subview of the first (and only) column of lclVecHost.
3722  auto lclVecHost1d = Kokkos::subview (lclVecHost, Kokkos::ALL (), 0);
3723 
3724  using host_offsets_view_type =
3725  Kokkos::View<const size_t*, Kokkos::HostSpace,
3726  Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
3727  host_offsets_view_type h_offsets (offsets.getRawPtr (), offsets.size ());
3728  // Find the diagonal entries and put them in lclVecHost1d.
3729  using range_type = Kokkos::RangePolicy<host_execution_space, LO>;
3730  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3731  const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid ();
3732 
3733  auto rowPtrsPackedHost = staticGraph_->getRowPtrsPackedHost();
3734  auto valuesPackedHost = valuesPacked_wdv.getHostView(Access::ReadOnly);
3735  Kokkos::parallel_for
3736  ("Tpetra::CrsMatrix::getLocalDiagCopy",
3737  range_type (0, myNumRows),
3738  [&, INV, h_offsets] (const LO lclRow) { // Value capture is a workaround for cuda + gcc-7.2 compiler bug w/c++14
3739  lclVecHost1d(lclRow) = STS::zero (); // default value if no diag entry
3740  if (h_offsets[lclRow] != INV) {
3741  auto curRowOffset = rowPtrsPackedHost (lclRow);
3742  lclVecHost1d(lclRow) =
3743  static_cast<IST> (valuesPackedHost(curRowOffset+h_offsets[lclRow]));
3744  }
3745  });
3746  //diag.sync_device ();
3747  }
3748 
3749 
3750  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3751  void
3754  {
3755  using ::Tpetra::Details::ProfilingRegion;
3756  using Teuchos::ArrayRCP;
3757  using Teuchos::ArrayView;
3758  using Teuchos::null;
3759  using Teuchos::RCP;
3760  using Teuchos::rcp;
3761  using Teuchos::rcpFromRef;
3763  const char tfecfFuncName[] = "leftScale: ";
3764 
3765  ProfilingRegion region ("Tpetra::CrsMatrix::leftScale");
3766 
3767  RCP<const vec_type> xp;
3768  if (this->getRangeMap ()->isSameAs (* (x.getMap ()))) {
3769  // Take from Epetra: If we have a non-trivial exporter, we must
3770  // import elements that are permuted or are on other processors.
3771  auto exporter = this->getCrsGraphRef ().getExporter ();
3772  if (exporter.get () != nullptr) {
3773  RCP<vec_type> tempVec (new vec_type (this->getRowMap ()));
3774  tempVec->doImport (x, *exporter, REPLACE); // reverse mode
3775  xp = tempVec;
3776  }
3777  else {
3778  xp = rcpFromRef (x);
3779  }
3780  }
3781  else if (this->getRowMap ()->isSameAs (* (x.getMap ()))) {
3782  xp = rcpFromRef (x);
3783  }
3784  else {
3785  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3786  (true, std::invalid_argument, "x's Map must be the same as "
3787  "either the row Map or the range Map of the CrsMatrix.");
3788  }
3789 
3790  if (this->isFillComplete()) {
3791  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3792  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3794  leftScaleLocalCrsMatrix (getLocalMatrixDevice (),
3795  x_lcl_1d, false, false);
3796  }
3797  else {
3798  // 6/2020 Disallow leftScale of non-fillComplete matrices #7446
3799  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3800  (true, std::runtime_error, "CrsMatrix::leftScale requires matrix to be"
3801  " fillComplete");
3802  }
3803  }
3804 
3805  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3806  void
3809  {
3810  using ::Tpetra::Details::ProfilingRegion;
3811  using Teuchos::ArrayRCP;
3812  using Teuchos::ArrayView;
3813  using Teuchos::null;
3814  using Teuchos::RCP;
3815  using Teuchos::rcp;
3816  using Teuchos::rcpFromRef;
3818  const char tfecfFuncName[] = "rightScale: ";
3819 
3820  ProfilingRegion region ("Tpetra::CrsMatrix::rightScale");
3821 
3822  RCP<const vec_type> xp;
3823  if (this->getDomainMap ()->isSameAs (* (x.getMap ()))) {
3824  // Take from Epetra: If we have a non-trivial exporter, we must
3825  // import elements that are permuted or are on other processors.
3826  auto importer = this->getCrsGraphRef ().getImporter ();
3827  if (importer.get () != nullptr) {
3828  RCP<vec_type> tempVec (new vec_type (this->getColMap ()));
3829  tempVec->doImport (x, *importer, REPLACE);
3830  xp = tempVec;
3831  }
3832  else {
3833  xp = rcpFromRef (x);
3834  }
3835  }
3836  else if (this->getColMap ()->isSameAs (* (x.getMap ()))) {
3837  xp = rcpFromRef (x);
3838  } else {
3839  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3840  (true, std::runtime_error, "x's Map must be the same as "
3841  "either the domain Map or the column Map of the CrsMatrix.");
3842  }
3843 
3844  if (this->isFillComplete()) {
3845  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3846  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3848  rightScaleLocalCrsMatrix (getLocalMatrixDevice (),
3849  x_lcl_1d, false, false);
3850  }
3851  else {
3852  // 6/2020 Disallow rightScale of non-fillComplete matrices #7446
3853  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3854  (true, std::runtime_error, "CrsMatrix::rightScale requires matrix to be"
3855  " fillComplete");
3856  }
3857  }
3858 
3859  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3863  {
3864  using Teuchos::ArrayView;
3865  using Teuchos::outArg;
3866  using Teuchos::REDUCE_SUM;
3867  using Teuchos::reduceAll;
3868 
3869  // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
3870  // local part of this computation. It could make sense to put
3871  // this operation in the Kokkos::CrsMatrix.
3872 
3873  // check the cache first
3874  mag_type mySum = STM::zero ();
3875  if (getLocalNumEntries() > 0) {
3876  if (isStorageOptimized ()) {
3877  // "Optimized" storage is packed storage. That means we can
3878  // iterate in one pass through the 1-D values array.
3879  const size_t numEntries = getLocalNumEntries ();
3880  auto values = valuesPacked_wdv.getHostView(Access::ReadOnly);
3881  for (size_t k = 0; k < numEntries; ++k) {
3882  auto val = values[k];
3883  // Note (etp 06 Jan 2015) We need abs() here for composite types
3884  // (in general, if mag_type is on the left-hand-side, we need
3885  // abs() on the right-hand-side)
3886  const mag_type val_abs = STS::abs (val);
3887  mySum += val_abs * val_abs;
3888  }
3889  }
3890  else {
3891  const LocalOrdinal numRows =
3892  static_cast<LocalOrdinal> (this->getLocalNumRows ());
3893  for (LocalOrdinal r = 0; r < numRows; ++r) {
3894  const RowInfo rowInfo = myGraph_->getRowInfo (r);
3895  const size_t numEntries = rowInfo.numEntries;
3896  auto A_r = this->getValuesViewHost(rowInfo);
3897  for (size_t k = 0; k < numEntries; ++k) {
3898  const impl_scalar_type val = A_r[k];
3899  const mag_type val_abs = STS::abs (val);
3900  mySum += val_abs * val_abs;
3901  }
3902  }
3903  }
3904  }
3905  mag_type totalSum = STM::zero ();
3906  reduceAll<int, mag_type> (* (getComm ()), REDUCE_SUM,
3907  mySum, outArg (totalSum));
3908  return STM::sqrt (totalSum);
3909  }
3910 
3911  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3912  void
3914  replaceColMap (const Teuchos::RCP<const map_type>& newColMap)
3915  {
3916  const char tfecfFuncName[] = "replaceColMap: ";
3917  // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
3918  // Then replacing the column Map might mean that we need to
3919  // reindex the column indices.
3920  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3921  myGraph_.is_null (), std::runtime_error,
3922  "This method does not work if the matrix has a const graph. The whole "
3923  "idea of a const graph is that you are not allowed to change it, but "
3924  "this method necessarily must modify the graph, since the graph owns "
3925  "the matrix's column Map.");
3926  myGraph_->replaceColMap (newColMap);
3927  }
3928 
3929  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3930  void
3933  const Teuchos::RCP<const map_type>& newColMap,
3934  const Teuchos::RCP<const import_type>& newImport,
3935  const bool sortEachRow)
3936  {
3937  const char tfecfFuncName[] = "reindexColumns: ";
3938  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3939  graph == nullptr && myGraph_.is_null (), std::invalid_argument,
3940  "The input graph is null, but the matrix does not own its graph.");
3941 
3942  crs_graph_type& theGraph = (graph == nullptr) ? *myGraph_ : *graph;
3943  const bool sortGraph = false; // we'll sort graph & matrix together below
3944 
3945  theGraph.reindexColumns (newColMap, newImport, sortGraph);
3946 
3947  if (sortEachRow && theGraph.isLocallyIndexed () && ! theGraph.isSorted ()) {
3948  const LocalOrdinal lclNumRows =
3949  static_cast<LocalOrdinal> (theGraph.getLocalNumRows ());
3950 
3951  for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
3952 
3953  const RowInfo rowInfo = theGraph.getRowInfo (row);
3954  auto lclColInds = theGraph.getLocalIndsViewHostNonConst (rowInfo);
3955  auto vals = this->getValuesViewHostNonConst (rowInfo);
3956 
3957  sort2 (lclColInds.data (),
3958  lclColInds.data () + rowInfo.numEntries,
3959  vals.data ());
3960  }
3961  theGraph.indicesAreSorted_ = true;
3962  }
3963  }
3964 
3965  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3966  void
3968  replaceDomainMap (const Teuchos::RCP<const map_type>& newDomainMap)
3969  {
3970  const char tfecfFuncName[] = "replaceDomainMap: ";
3971  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3972  myGraph_.is_null (), std::runtime_error,
3973  "This method does not work if the matrix has a const graph. The whole "
3974  "idea of a const graph is that you are not allowed to change it, but this"
3975  " method necessarily must modify the graph, since the graph owns the "
3976  "matrix's domain Map and Import objects.");
3977  myGraph_->replaceDomainMap (newDomainMap);
3978  }
3979 
3980  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3981  void
3983  replaceDomainMapAndImporter (const Teuchos::RCP<const map_type>& newDomainMap,
3984  Teuchos::RCP<const import_type>& newImporter)
3985  {
3986  const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
3987  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3988  myGraph_.is_null (), std::runtime_error,
3989  "This method does not work if the matrix has a const graph. The whole "
3990  "idea of a const graph is that you are not allowed to change it, but this"
3991  " method necessarily must modify the graph, since the graph owns the "
3992  "matrix's domain Map and Import objects.");
3993  myGraph_->replaceDomainMapAndImporter (newDomainMap, newImporter);
3994  }
3995 
3996  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3997  void
3999  replaceRangeMap (const Teuchos::RCP<const map_type>& newRangeMap)
4000  {
4001  const char tfecfFuncName[] = "replaceRangeMap: ";
4002  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4003  myGraph_.is_null (), std::runtime_error,
4004  "This method does not work if the matrix has a const graph. The whole "
4005  "idea of a const graph is that you are not allowed to change it, but this"
4006  " method necessarily must modify the graph, since the graph owns the "
4007  "matrix's domain Map and Import objects.");
4008  myGraph_->replaceRangeMap (newRangeMap);
4009  }
4010 
4011  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4012  void
4014  replaceRangeMapAndExporter (const Teuchos::RCP<const map_type>& newRangeMap,
4015  Teuchos::RCP<const export_type>& newExporter)
4016  {
4017  const char tfecfFuncName[] = "replaceRangeMapAndExporter: ";
4018  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4019  myGraph_.is_null (), std::runtime_error,
4020  "This method does not work if the matrix has a const graph. The whole "
4021  "idea of a const graph is that you are not allowed to change it, but this"
4022  " method necessarily must modify the graph, since the graph owns the "
4023  "matrix's domain Map and Import objects.");
4024  myGraph_->replaceRangeMapAndExporter (newRangeMap, newExporter);
4025  }
4026 
4027  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4028  void
4030  insertNonownedGlobalValues (const GlobalOrdinal globalRow,
4031  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
4032  const Teuchos::ArrayView<const Scalar>& values)
4033  {
4034  using Teuchos::Array;
4035  typedef GlobalOrdinal GO;
4036  typedef typename Array<GO>::size_type size_type;
4037 
4038  const size_type numToInsert = indices.size ();
4039  // Add the new data to the list of nonlocals.
4040  // This creates the arrays if they don't exist yet.
4041  std::pair<Array<GO>, Array<Scalar> >& curRow = nonlocals_[globalRow];
4042  Array<GO>& curRowInds = curRow.first;
4043  Array<Scalar>& curRowVals = curRow.second;
4044  const size_type newCapacity = curRowInds.size () + numToInsert;
4045  curRowInds.reserve (newCapacity);
4046  curRowVals.reserve (newCapacity);
4047  for (size_type k = 0; k < numToInsert; ++k) {
4048  curRowInds.push_back (indices[k]);
4049  curRowVals.push_back (values[k]);
4050  }
4051  }
4052 
4053  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4054  void
4057  {
4058  using Details::Behavior;
4060  using Teuchos::Comm;
4061  using Teuchos::outArg;
4062  using Teuchos::RCP;
4063  using Teuchos::rcp;
4064  using Teuchos::REDUCE_MAX;
4065  using Teuchos::REDUCE_MIN;
4066  using Teuchos::reduceAll;
4067  using std::endl;
4069  //typedef LocalOrdinal LO;
4070  typedef GlobalOrdinal GO;
4071  typedef typename Teuchos::Array<GO>::size_type size_type;
4072  const char tfecfFuncName[] = "globalAssemble: "; // for exception macro
4073  ProfilingRegion regionGlobalAssemble ("Tpetra::CrsMatrix::globalAssemble");
4074 
4075  const bool verbose = Behavior::verbose("CrsMatrix");
4076  std::unique_ptr<std::string> prefix;
4077  if (verbose) {
4078  prefix = this->createPrefix("CrsMatrix", "globalAssemble");
4079  std::ostringstream os;
4080  os << *prefix << "nonlocals_.size()=" << nonlocals_.size()
4081  << endl;
4082  std::cerr << os.str();
4083  }
4084  RCP<const Comm<int> > comm = getComm ();
4085 
4086  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4087  (! isFillActive (), std::runtime_error, "Fill must be active before "
4088  "you may call this method.");
4089 
4090  const size_t myNumNonlocalRows = nonlocals_.size ();
4091 
4092  // If no processes have nonlocal rows, then we don't have to do
4093  // anything. Checking this is probably cheaper than constructing
4094  // the Map of nonlocal rows (see below) and noticing that it has
4095  // zero global entries.
4096  {
4097  const int iHaveNonlocalRows = (myNumNonlocalRows == 0) ? 0 : 1;
4098  int someoneHasNonlocalRows = 0;
4099  reduceAll<int, int> (*comm, REDUCE_MAX, iHaveNonlocalRows,
4100  outArg (someoneHasNonlocalRows));
4101  if (someoneHasNonlocalRows == 0) {
4102  return; // no process has nonlocal rows, so nothing to do
4103  }
4104  }
4105 
4106  // 1. Create a list of the "nonlocal" rows on each process. this
4107  // requires iterating over nonlocals_, so while we do this,
4108  // deduplicate the entries and get a count for each nonlocal
4109  // row on this process.
4110  // 2. Construct a new row Map corresponding to those rows. This
4111  // Map is likely overlapping. We know that the Map is not
4112  // empty on all processes, because the above all-reduce and
4113  // return exclude that case.
4114 
4115  RCP<const map_type> nonlocalRowMap;
4116  Teuchos::Array<size_t> numEntPerNonlocalRow (myNumNonlocalRows);
4117  {
4118  Teuchos::Array<GO> myNonlocalGblRows (myNumNonlocalRows);
4119  size_type curPos = 0;
4120  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4121  ++mapIter, ++curPos) {
4122  myNonlocalGblRows[curPos] = mapIter->first;
4123  // Get the values and column indices by reference, since we
4124  // intend to change them in place (that's what "erase" does).
4125  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4126  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4127 
4128  // Sort both arrays jointly, using the column indices as keys,
4129  // then merge them jointly. "Merge" here adds values
4130  // corresponding to the same column indices. The first 2 args
4131  // of merge2 are output arguments that work just like the
4132  // return value of std::unique.
4133  sort2 (gblCols.begin (), gblCols.end (), vals.begin ());
4134  typename Teuchos::Array<GO>::iterator gblCols_newEnd;
4135  typename Teuchos::Array<Scalar>::iterator vals_newEnd;
4136  merge2 (gblCols_newEnd, vals_newEnd,
4137  gblCols.begin (), gblCols.end (),
4138  vals.begin (), vals.end ());
4139  gblCols.erase (gblCols_newEnd, gblCols.end ());
4140  vals.erase (vals_newEnd, vals.end ());
4141  numEntPerNonlocalRow[curPos] = gblCols.size ();
4142  }
4143 
4144  // Currently, Map requires that its indexBase be the global min
4145  // of all its global indices. Map won't compute this for us, so
4146  // we must do it. If our process has no nonlocal rows, set the
4147  // "min" to the max possible GO value. This ensures that if
4148  // some process has at least one nonlocal row, then it will pick
4149  // that up as the min. We know that at least one process has a
4150  // nonlocal row, since the all-reduce and return at the top of
4151  // this method excluded that case.
4152  GO myMinNonlocalGblRow = std::numeric_limits<GO>::max ();
4153  {
4154  auto iter = std::min_element (myNonlocalGblRows.begin (),
4155  myNonlocalGblRows.end ());
4156  if (iter != myNonlocalGblRows.end ()) {
4157  myMinNonlocalGblRow = *iter;
4158  }
4159  }
4160  GO gblMinNonlocalGblRow = 0;
4161  reduceAll<int, GO> (*comm, REDUCE_MIN, myMinNonlocalGblRow,
4162  outArg (gblMinNonlocalGblRow));
4163  const GO indexBase = gblMinNonlocalGblRow;
4164  const global_size_t INV = Teuchos::OrdinalTraits<global_size_t>::invalid ();
4165  nonlocalRowMap = rcp (new map_type (INV, myNonlocalGblRows (), indexBase, comm));
4166  }
4167 
4168  // 3. Use the values and column indices for each nonlocal row, as
4169  // stored in nonlocals_, to construct a CrsMatrix corresponding
4170  // to nonlocal rows. We have
4171  // exact counts of the number of entries in each nonlocal row.
4172 
4173  if (verbose) {
4174  std::ostringstream os;
4175  os << *prefix << "Create nonlocal matrix" << endl;
4176  std::cerr << os.str();
4177  }
4178  RCP<crs_matrix_type> nonlocalMatrix =
4179  rcp (new crs_matrix_type (nonlocalRowMap, numEntPerNonlocalRow ()));
4180  {
4181  size_type curPos = 0;
4182  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4183  ++mapIter, ++curPos) {
4184  const GO gblRow = mapIter->first;
4185  // Get values & column indices by ref, just to avoid copy.
4186  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4187  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4188  //const LO numEnt = static_cast<LO> (numEntPerNonlocalRow[curPos]);
4189  nonlocalMatrix->insertGlobalValues (gblRow, gblCols (), vals ());
4190  }
4191  }
4192  // There's no need to fill-complete the nonlocals matrix.
4193  // We just use it as a temporary container for the Export.
4194 
4195  // 4. If the original row Map is one to one, then we can Export
4196  // directly from nonlocalMatrix into this. Otherwise, we have
4197  // to create a temporary matrix with a one-to-one row Map,
4198  // Export into that, then Import from the temporary matrix into
4199  // *this.
4200 
4201  auto origRowMap = this->getRowMap ();
4202  const bool origRowMapIsOneToOne = origRowMap->isOneToOne ();
4203 
4204  int isLocallyComplete = 1; // true by default
4205 
4206  if (origRowMapIsOneToOne) {
4207  if (verbose) {
4208  std::ostringstream os;
4209  os << *prefix << "Original row Map is 1-to-1" << endl;
4210  std::cerr << os.str();
4211  }
4212  export_type exportToOrig (nonlocalRowMap, origRowMap);
4213  if (! exportToOrig.isLocallyComplete ()) {
4214  isLocallyComplete = 0;
4215  }
4216  if (verbose) {
4217  std::ostringstream os;
4218  os << *prefix << "doExport from nonlocalMatrix" << endl;
4219  std::cerr << os.str();
4220  }
4221  this->doExport (*nonlocalMatrix, exportToOrig, Tpetra::ADD);
4222  // We're done at this point!
4223  }
4224  else {
4225  if (verbose) {
4226  std::ostringstream os;
4227  os << *prefix << "Original row Map is NOT 1-to-1" << endl;
4228  std::cerr << os.str();
4229  }
4230  // If you ask a Map whether it is one to one, it does some
4231  // communication and stashes intermediate results for later use
4232  // by createOneToOne. Thus, calling createOneToOne doesn't cost
4233  // much more then the original cost of calling isOneToOne.
4234  auto oneToOneRowMap = Tpetra::createOneToOne (origRowMap);
4235  export_type exportToOneToOne (nonlocalRowMap, oneToOneRowMap);
4236  if (! exportToOneToOne.isLocallyComplete ()) {
4237  isLocallyComplete = 0;
4238  }
4239 
4240  // Create a temporary matrix with the one-to-one row Map.
4241  //
4242  // TODO (mfh 09 Sep 2016, 12 Sep 2016) Estimate # entries in
4243  // each row, to avoid reallocation during the Export operation.
4244  if (verbose) {
4245  std::ostringstream os;
4246  os << *prefix << "Create & doExport into 1-to-1 matrix"
4247  << endl;
4248  std::cerr << os.str();
4249  }
4250  crs_matrix_type oneToOneMatrix (oneToOneRowMap, 0);
4251  // Export from matrix of nonlocals into the temp one-to-one matrix.
4252  oneToOneMatrix.doExport(*nonlocalMatrix, exportToOneToOne,
4253  Tpetra::ADD);
4254 
4255  // We don't need the matrix of nonlocals anymore, so get rid of
4256  // it, to keep the memory high-water mark down.
4257  if (verbose) {
4258  std::ostringstream os;
4259  os << *prefix << "Free nonlocalMatrix" << endl;
4260  std::cerr << os.str();
4261  }
4262  nonlocalMatrix = Teuchos::null;
4263 
4264  // Import from the one-to-one matrix to the original matrix.
4265  if (verbose) {
4266  std::ostringstream os;
4267  os << *prefix << "doImport from 1-to-1 matrix" << endl;
4268  std::cerr << os.str();
4269  }
4270  import_type importToOrig (oneToOneRowMap, origRowMap);
4271  this->doImport (oneToOneMatrix, importToOrig, Tpetra::ADD);
4272  }
4273 
4274  // It's safe now to clear out nonlocals_, since we've already
4275  // committed side effects to *this. The standard idiom for
4276  // clearing a Container like std::map, is to swap it with an empty
4277  // Container and let the swapped Container fall out of scope.
4278  if (verbose) {
4279  std::ostringstream os;
4280  os << *prefix << "Free nonlocals_ (std::map)" << endl;
4281  std::cerr << os.str();
4282  }
4283  decltype (nonlocals_) newNonlocals;
4284  std::swap (nonlocals_, newNonlocals);
4285 
4286  // FIXME (mfh 12 Sep 2016) I don't like this all-reduce, and I
4287  // don't like throwing an exception here. A local return value
4288  // would likely be more useful to users. However, if users find
4289  // themselves exercising nonlocal inserts often, then they are
4290  // probably novice users who need the help. See Gibhub Issues
4291  // #603 and #601 (esp. the latter) for discussion.
4292 
4293  int isGloballyComplete = 0; // output argument of reduceAll
4294  reduceAll<int, int> (*comm, REDUCE_MIN, isLocallyComplete,
4295  outArg (isGloballyComplete));
4296  TEUCHOS_TEST_FOR_EXCEPTION
4297  (isGloballyComplete != 1, std::runtime_error, "On at least one process, "
4298  "you called insertGlobalValues with a global row index which is not in "
4299  "the matrix's row Map on any process in its communicator.");
4300  }
4301 
4302  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4303  void
4305  resumeFill (const Teuchos::RCP<Teuchos::ParameterList>& params)
4306  {
4307  if (! isStaticGraph ()) { // Don't resume fill of a nonowned graph.
4308  myGraph_->resumeFill (params);
4309  }
4310 #if KOKKOSKERNELS_VERSION >= 40299
4311  // Delete the apply helper (if it exists)
4312  applyHelper.reset();
4313 #endif
4314  fillComplete_ = false;
4315  }
4316 
4317  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4318  bool
4321  return getCrsGraphRef ().haveGlobalConstants ();
4322  }
4323 
4324  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4325  void
4327  fillComplete (const Teuchos::RCP<Teuchos::ParameterList>& params)
4328  {
4329  const char tfecfFuncName[] = "fillComplete(params): ";
4330 
4331  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4332  (this->getCrsGraph ().is_null (), std::logic_error,
4333  "getCrsGraph() returns null. This should not happen at this point. "
4334  "Please report this bug to the Tpetra developers.");
4335 
4336  const crs_graph_type& graph = this->getCrsGraphRef ();
4337  if (this->isStaticGraph () && graph.isFillComplete ()) {
4338  // If this matrix's graph is fill complete and the user did not
4339  // supply a domain or range Map, use the graph's domain and
4340  // range Maps.
4341  this->fillComplete (graph.getDomainMap (), graph.getRangeMap (), params);
4342  }
4343  else { // assume that user's row Map is the domain and range Map
4344  Teuchos::RCP<const map_type> rangeMap = graph.getRowMap ();
4345  Teuchos::RCP<const map_type> domainMap = rangeMap;
4346  this->fillComplete (domainMap, rangeMap, params);
4347  }
4348  }
4349 
4350  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4351  void
4353  fillComplete (const Teuchos::RCP<const map_type>& domainMap,
4354  const Teuchos::RCP<const map_type>& rangeMap,
4355  const Teuchos::RCP<Teuchos::ParameterList>& params)
4356  {
4357  using Details::Behavior;
4359  using Teuchos::ArrayRCP;
4360  using Teuchos::RCP;
4361  using Teuchos::rcp;
4362  using std::endl;
4363  const char tfecfFuncName[] = "fillComplete: ";
4364  ProfilingRegion regionFillComplete
4365  ("Tpetra::CrsMatrix::fillComplete");
4366  const bool verbose = Behavior::verbose("CrsMatrix");
4367  std::unique_ptr<std::string> prefix;
4368  if (verbose) {
4369  prefix = this->createPrefix("CrsMatrix", "fillComplete(dom,ran,p)");
4370  std::ostringstream os;
4371  os << *prefix << endl;
4372  std::cerr << os.str ();
4373  }
4374  Details::ProfilingRegion region(
4375  "Tpetra::CrsMatrix::fillCompete",
4376  "fillCompete");
4377 
4378  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4379  (! this->isFillActive () || this->isFillComplete (), std::runtime_error,
4380  "Matrix fill state must be active (isFillActive() "
4381  "must be true) before you may call fillComplete().");
4382  const int numProcs = this->getComm ()->getSize ();
4383 
4384  //
4385  // Read parameters from the input ParameterList.
4386  //
4387  {
4388  Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");
4389 
4390  // If true, the caller promises that no process did nonlocal
4391  // changes since the last call to fillComplete.
4392  bool assertNoNonlocalInserts = false;
4393  // If true, makeColMap sorts remote GIDs (within each remote
4394  // process' group).
4395  bool sortGhosts = true;
4396 
4397  if (! params.is_null ()) {
4398  assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
4399  assertNoNonlocalInserts);
4400  if (params->isParameter ("sort column map ghost gids")) {
4401  sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
4402  }
4403  else if (params->isParameter ("Sort column Map ghost GIDs")) {
4404  sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
4405  }
4406  }
4407  // We also don't need to do global assembly if there is only one
4408  // process in the communicator.
4409  const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
4410  // This parameter only matters if this matrix owns its graph.
4411  if (! this->myGraph_.is_null ()) {
4412  this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
4413  }
4414 
4415  if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
4416  if (this->hasColMap ()) { // use local indices
4417  allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
4418  }
4419  else { // no column Map, so use global indices
4420  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
4421  }
4422  }
4423  // Global assemble, if we need to. This call only costs a single
4424  // all-reduce if we didn't need global assembly after all.
4425  if (needGlobalAssemble) {
4426  this->globalAssemble ();
4427  }
4428  else {
4429  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4430  (numProcs == 1 && nonlocals_.size() > 0,
4431  std::runtime_error, "Cannot have nonlocal entries on a serial run. "
4432  "An invalid entry (i.e., with row index not in the row Map) must have "
4433  "been submitted to the CrsMatrix.");
4434  }
4435  }
4436  if (this->isStaticGraph ()) {
4437  Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
4438  // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
4439  // checks below only in debug mode. It would be nicer to do a
4440  // local check, then propagate the error state in a deferred
4441  // way, whenever communication happens. That would reduce the
4442  // cost of checking, to the point where it may make sense to
4443  // enable it even in release mode.
4444 #ifdef HAVE_TPETRA_DEBUG
4445  // FIXME (mfh 18 Jun 2014) This check for correctness of the
4446  // input Maps incurs a penalty of two all-reduces for the
4447  // otherwise optimal const graph case.
4448  //
4449  // We could turn these (max) 2 all-reduces into (max) 1, by
4450  // fusing them. We could do this by adding a "locallySameAs"
4451  // method to Map, which would return one of four states:
4452  //
4453  // a. Certainly globally the same
4454  // b. Certainly globally not the same
4455  // c. Locally the same
4456  // d. Locally not the same
4457  //
4458  // The first two states don't require further communication.
4459  // The latter two states require an all-reduce to communicate
4460  // globally, but we only need one all-reduce, since we only need
4461  // to check whether at least one of the Maps is wrong.
4462  const bool domainMapsMatch =
4463  this->staticGraph_->getDomainMap ()->isSameAs (*domainMap);
4464  const bool rangeMapsMatch =
4465  this->staticGraph_->getRangeMap ()->isSameAs (*rangeMap);
4466 
4467  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4468  (! domainMapsMatch, std::runtime_error,
4469  "The CrsMatrix's domain Map does not match the graph's domain Map. "
4470  "The graph cannot be changed because it was given to the CrsMatrix "
4471  "constructor as const. You can fix this by passing in the graph's "
4472  "domain Map and range Map to the matrix's fillComplete call.");
4473 
4474  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4475  (! rangeMapsMatch, std::runtime_error,
4476  "The CrsMatrix's range Map does not match the graph's range Map. "
4477  "The graph cannot be changed because it was given to the CrsMatrix "
4478  "constructor as const. You can fix this by passing in the graph's "
4479  "domain Map and range Map to the matrix's fillComplete call.");
4480 #endif // HAVE_TPETRA_DEBUG
4481 
4482  // The matrix does _not_ own the graph, and the graph's
4483  // structure is already fixed, so just fill the local matrix.
4484  this->fillLocalMatrix (params);
4485  }
4486  else {
4487  Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
4488  // Set the graph's domain and range Maps. This will clear the
4489  // Import if the domain Map has changed (is a different
4490  // pointer), and the Export if the range Map has changed (is a
4491  // different pointer).
4492  this->myGraph_->setDomainRangeMaps (domainMap, rangeMap);
4493 
4494  // Make the graph's column Map, if necessary.
4495  Teuchos::Array<int> remotePIDs (0);
4496  const bool mustBuildColMap = ! this->hasColMap ();
4497  if (mustBuildColMap) {
4498  this->myGraph_->makeColMap (remotePIDs);
4499  }
4500 
4501  // Make indices local, if necessary. The method won't do
4502  // anything if the graph is already locally indexed.
4503  const std::pair<size_t, std::string> makeIndicesLocalResult =
4504  this->myGraph_->makeIndicesLocal(verbose);
4505  // TODO (mfh 20 Jul 2017) Instead of throwing here, pass along
4506  // the error state to makeImportExport
4507  // which may do all-reduces and thus may
4508  // have the opportunity to communicate that error state.
4509  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4510  (makeIndicesLocalResult.first != 0, std::runtime_error,
4511  makeIndicesLocalResult.second);
4512 
4513  const bool sorted = this->myGraph_->isSorted ();
4514  const bool merged = this->myGraph_->isMerged ();
4515  this->sortAndMergeIndicesAndValues (sorted, merged);
4516 
4517  // Make Import and Export objects, if they haven't been made
4518  // already. If we made a column Map above, reuse information
4519  // from that process to avoid communiation in the Import setup.
4520  this->myGraph_->makeImportExport (remotePIDs, mustBuildColMap);
4521 
4522  // The matrix _does_ own the graph, so fill the local graph at
4523  // the same time as the local matrix.
4524  this->fillLocalGraphAndMatrix (params);
4525 
4526  const bool callGraphComputeGlobalConstants = params.get () == nullptr ||
4527  params->get ("compute global constants", true);
4528  if (callGraphComputeGlobalConstants) {
4529  this->myGraph_->computeGlobalConstants ();
4530  }
4531  else {
4532  this->myGraph_->computeLocalConstants ();
4533  }
4534  this->myGraph_->fillComplete_ = true;
4535  this->myGraph_->checkInternalState ();
4536  }
4537 
4538  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4539 
4540  this->fillComplete_ = true; // Now we're fill complete!
4541  {
4542  Details::ProfilingRegion region_cis(
4543  "Tpetra::CrsMatrix::fillCompete", "checkInternalState"
4544  );
4545  this->checkInternalState ();
4546  }
4547  } //fillComplete(domainMap, rangeMap, params)
4548 
4549  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4550  void
4552  expertStaticFillComplete (const Teuchos::RCP<const map_type> & domainMap,
4553  const Teuchos::RCP<const map_type> & rangeMap,
4554  const Teuchos::RCP<const import_type>& importer,
4555  const Teuchos::RCP<const export_type>& exporter,
4556  const Teuchos::RCP<Teuchos::ParameterList> &params)
4557  {
4558 #ifdef HAVE_TPETRA_MMM_TIMINGS
4559  std::string label;
4560  if(!params.is_null())
4561  label = params->get("Timer Label",label);
4562  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
4563  using Teuchos::TimeMonitor;
4564 
4565  Teuchos::TimeMonitor all(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-all")));
4566 #endif
4567 
4568  const char tfecfFuncName[] = "expertStaticFillComplete: ";
4569  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( ! isFillActive() || isFillComplete(),
4570  std::runtime_error, "Matrix fill state must be active (isFillActive() "
4571  "must be true) before calling fillComplete().");
4572  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4573  myGraph_.is_null (), std::logic_error, "myGraph_ is null. This is not allowed.");
4574 
4575  {
4576 #ifdef HAVE_TPETRA_MMM_TIMINGS
4577  Teuchos::TimeMonitor graph(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-Graph")));
4578 #endif
4579  // We will presume globalAssemble is not needed, so we do the ESFC on the graph
4580  myGraph_->expertStaticFillComplete (domainMap, rangeMap, importer, exporter,params);
4581  }
4582 
4583  {
4584 #ifdef HAVE_TPETRA_MMM_TIMINGS
4585  TimeMonitor fLGAM(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-fLGAM")));
4586 #endif
4587  // Fill the local graph and matrix
4588  fillLocalGraphAndMatrix (params);
4589  }
4590  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4591 
4592  // Now we're fill complete!
4593  fillComplete_ = true;
4594 
4595  // Sanity checks at the end.
4596 #ifdef HAVE_TPETRA_DEBUG
4597  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
4598  ": We're at the end of fillComplete(), but isFillActive() is true. "
4599  "Please report this bug to the Tpetra developers.");
4600  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete(), std::logic_error,
4601  ": We're at the end of fillComplete(), but isFillActive() is true. "
4602  "Please report this bug to the Tpetra developers.");
4603 #endif // HAVE_TPETRA_DEBUG
4604  {
4605 #ifdef HAVE_TPETRA_MMM_TIMINGS
4606  Teuchos::TimeMonitor cIS(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-M-cIS")));
4607 #endif
4608 
4609  checkInternalState();
4610  }
4611  }
4612 
4613  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4615  mergeRowIndicesAndValues (size_t rowLen, LocalOrdinal* cols, impl_scalar_type* vals)
4616  {
4617  impl_scalar_type* rowValueIter = vals;
4618  // beg,end define a half-exclusive interval over which to iterate.
4619  LocalOrdinal* beg = cols;
4620  LocalOrdinal* end = cols + rowLen;
4621  LocalOrdinal* newend = beg;
4622  if (beg != end) {
4623  LocalOrdinal* cur = beg + 1;
4624  impl_scalar_type* vcur = rowValueIter + 1;
4625  impl_scalar_type* vend = rowValueIter;
4626  cur = beg+1;
4627  while (cur != end) {
4628  if (*cur != *newend) {
4629  // new entry; save it
4630  ++newend;
4631  ++vend;
4632  (*newend) = (*cur);
4633  (*vend) = (*vcur);
4634  }
4635  else {
4636  // old entry; merge it
4637  //(*vend) = f (*vend, *vcur);
4638  (*vend) += *vcur;
4639  }
4640  ++cur;
4641  ++vcur;
4642  }
4643  ++newend; // one past the last entry, per typical [beg,end) semantics
4644  }
4645  return newend - beg;
4646  }
4647 
4648  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4649  void
4651  sortAndMergeIndicesAndValues (const bool sorted, const bool merged)
4652  {
4653  using ::Tpetra::Details::ProfilingRegion;
4654  typedef LocalOrdinal LO;
4655  typedef typename Kokkos::View<LO*, device_type>::HostMirror::execution_space
4656  host_execution_space;
4657  typedef Kokkos::RangePolicy<host_execution_space, LO> range_type;
4658  const char tfecfFuncName[] = "sortAndMergeIndicesAndValues: ";
4659  ProfilingRegion regionSAM ("Tpetra::CrsMatrix::sortAndMergeIndicesAndValues");
4660 
4661  if (! sorted || ! merged) {
4662  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4663  (this->isStaticGraph (), std::runtime_error, "Cannot sort or merge with "
4664  "\"static\" (const) graph, since the matrix does not own the graph.");
4665  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4666  (this->myGraph_.is_null (), std::logic_error, "myGraph_ is null, but "
4667  "this matrix claims ! isStaticGraph(). "
4668  "Please report this bug to the Tpetra developers.");
4669  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4670  (this->isStorageOptimized (), std::logic_error, "It is invalid to call "
4671  "this method if the graph's storage has already been optimized. "
4672  "Please report this bug to the Tpetra developers.");
4673 
4674  crs_graph_type& graph = * (this->myGraph_);
4675  const LO lclNumRows = static_cast<LO> (this->getLocalNumRows ());
4676  size_t totalNumDups = 0;
4677  {
4678  //Accessing host unpacked (4-array CRS) local matrix.
4679  auto rowBegins_ = graph.getRowPtrsUnpackedHost();
4680  auto rowLengths_ = graph.k_numRowEntries_;
4681  auto vals_ = this->valuesUnpacked_wdv.getHostView(Access::ReadWrite);
4682  auto cols_ = graph.lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
4683  Kokkos::parallel_reduce ("sortAndMergeIndicesAndValues", range_type (0, lclNumRows),
4684  [=] (const LO lclRow, size_t& numDups) {
4685  size_t rowBegin = rowBegins_(lclRow);
4686  size_t rowLen = rowLengths_(lclRow);
4687  LO* cols = cols_.data() + rowBegin;
4688  impl_scalar_type* vals = vals_.data() + rowBegin;
4689  if (! sorted) {
4690  sort2 (cols, cols + rowLen, vals);
4691  }
4692  if (! merged) {
4693  size_t newRowLength = mergeRowIndicesAndValues (rowLen, cols, vals);
4694  rowLengths_(lclRow) = newRowLength;
4695  numDups += rowLen - newRowLength;
4696  }
4697  }, totalNumDups);
4698  }
4699  if (! sorted) {
4700  graph.indicesAreSorted_ = true; // we just sorted every row
4701  }
4702  if (! merged) {
4703  graph.noRedundancies_ = true; // we just merged every row
4704  }
4705  }
4706  }
4707 
4708  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4709  void
4713  Scalar alpha,
4714  Scalar beta) const
4715  {
4717  using Teuchos::RCP;
4718  using Teuchos::rcp;
4719  using Teuchos::rcp_const_cast;
4720  using Teuchos::rcpFromRef;
4721  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4722  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one ();
4723 
4724  // mfh 05 Jun 2014: Special case for alpha == 0. I added this to
4725  // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
4726  // failing only for the Kokkos refactor version of Tpetra. It's a
4727  // good idea regardless to have the bypass.
4728  if (alpha == ZERO) {
4729  if (beta == ZERO) {
4730  Y_in.putScalar (ZERO);
4731  } else if (beta != ONE) {
4732  Y_in.scale (beta);
4733  }
4734  return;
4735  }
4736 
4737  // It's possible that X is a view of Y or vice versa. We don't
4738  // allow this (apply() requires that X and Y not alias one
4739  // another), but it's helpful to detect and work around this case.
4740  // We don't try to to detect the more subtle cases (e.g., one is a
4741  // subview of the other, but their initial pointers differ). We
4742  // only need to do this if this matrix's Import is trivial;
4743  // otherwise, we don't actually apply the operator from X into Y.
4744 
4745  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4746  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4747 
4748  // If beta == 0, then the output MV will be overwritten; none of
4749  // its entries should be read. (Sparse BLAS semantics say that we
4750  // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
4751  // This matters if we need to do an Export operation; see below.
4752  const bool Y_is_overwritten = (beta == ZERO);
4753 
4754  // We treat the case of a replicated MV output specially.
4755  const bool Y_is_replicated =
4756  (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4757 
4758  // This is part of the special case for replicated MV output.
4759  // We'll let each process do its thing, but do an all-reduce at
4760  // the end to sum up the results. Setting beta=0 on all processes
4761  // but Proc 0 makes the math work out for the all-reduce. (This
4762  // assumes that the replicated data is correctly replicated, so
4763  // that the data are the same on all processes.)
4764  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4765  beta = ZERO;
4766  }
4767 
4768  // Temporary MV for Import operation. After the block of code
4769  // below, this will be an (Imported if necessary) column Map MV
4770  // ready to give to localApply(...).
4771  RCP<const MV> X_colMap;
4772  if (importer.is_null ()) {
4773  if (! X_in.isConstantStride ()) {
4774  // Not all sparse mat-vec kernels can handle an input MV with
4775  // nonconstant stride correctly, so we have to copy it in that
4776  // case into a constant stride MV. To make a constant stride
4777  // copy of X_in, we force creation of the column (== domain)
4778  // Map MV (if it hasn't already been created, else fetch the
4779  // cached copy). This avoids creating a new MV each time.
4780  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
4781  Tpetra::deep_copy (*X_colMapNonConst, X_in);
4782  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4783  }
4784  else {
4785  // The domain and column Maps are the same, so do the local
4786  // multiply using the domain Map input MV X_in.
4787  X_colMap = rcpFromRef (X_in);
4788  }
4789  }
4790  else { // need to Import source (multi)vector
4791  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply: Import");
4792 
4793  // We're doing an Import anyway, which will copy the relevant
4794  // elements of the domain Map MV X_in into a separate column Map
4795  // MV. Thus, we don't have to worry whether X_in is constant
4796  // stride.
4797  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
4798 
4799  // Import from the domain Map MV to the column Map MV.
4800  X_colMapNonConst->doImport (X_in, *importer, INSERT);
4801  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4802  }
4803 
4804  // Temporary MV for doExport (if needed), or for copying a
4805  // nonconstant stride output MV into a constant stride MV. This
4806  // is null if we don't need the temporary MV, that is, if the
4807  // Export is trivial (null).
4808  RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
4809 
4810  // If we have a nontrivial Export object, we must perform an
4811  // Export. In that case, the local multiply result will go into
4812  // the row Map multivector. We don't have to make a
4813  // constant-stride version of Y_in in this case, because we had to
4814  // make a constant stride Y_rowMap MV and do an Export anyway.
4815  if (! exporter.is_null ()) {
4816  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, ZERO);
4817  {
4818  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply: Export");
4819 
4820  // If we're overwriting the output MV Y_in completely (beta ==
4821  // 0), then make sure that it is filled with zeros before we
4822  // do the Export. Otherwise, the ADD combine mode will use
4823  // data in Y_in, which is supposed to be zero.
4824  if (Y_is_overwritten) {
4825  Y_in.putScalar (ZERO);
4826  }
4827  else {
4828  // Scale output MV by beta, so that doExport sums in the
4829  // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
4830  Y_in.scale (beta);
4831  }
4832  // Do the Export operation.
4833  Y_in.doExport (*Y_rowMap, *exporter, ADD_ASSIGN);
4834  }
4835  }
4836  else { // Don't do an Export: row Map and range Map are the same.
4837  //
4838  // If Y_in does not have constant stride, or if the column Map
4839  // MV aliases Y_in, then we can't let the kernel write directly
4840  // to Y_in. Instead, we have to use the cached row (== range)
4841  // Map MV as temporary storage.
4842  //
4843  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4844  // the user passed in the same MultiVector for both X and Y. It
4845  // won't detect whether one MultiVector views the other. We
4846  // should also check the MultiVectors' raw data pointers.
4847  if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
4848  // Force creating the MV if it hasn't been created already.
4849  // This will reuse a previously created cached MV.
4850  Y_rowMap = getRowMapMultiVector (Y_in, true);
4851 
4852  // If beta == 0, we don't need to copy Y_in into Y_rowMap,
4853  // since we're overwriting it anyway.
4854  if (beta != ZERO) {
4855  Tpetra::deep_copy (*Y_rowMap, Y_in);
4856  }
4857  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, beta);
4858  Tpetra::deep_copy (Y_in, *Y_rowMap);
4859  }
4860  else {
4861  this->localApply (*X_colMap, Y_in, Teuchos::NO_TRANS, alpha, beta);
4862  }
4863  }
4864 
4865  // If the range Map is a locally replicated Map, sum up
4866  // contributions from each process. We set beta = 0 on all
4867  // processes but Proc 0 initially, so this will handle the scaling
4868  // factor beta correctly.
4869  if (Y_is_replicated) {
4870  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply: Reduce Y");
4871  Y_in.reduce ();
4872  }
4873  }
4874 
4875  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4876  void
4880  const Teuchos::ETransp mode,
4881  Scalar alpha,
4882  Scalar beta) const
4883  {
4885  using Teuchos::null;
4886  using Teuchos::RCP;
4887  using Teuchos::rcp;
4888  using Teuchos::rcp_const_cast;
4889  using Teuchos::rcpFromRef;
4890  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4891 
4892  // Take shortcuts for alpha == 0.
4893  if (alpha == ZERO) {
4894  // Follow the Sparse BLAS convention by ignoring both the matrix
4895  // and X_in, in this case.
4896  if (beta == ZERO) {
4897  // Follow the Sparse BLAS convention by overwriting any Inf or
4898  // NaN values in Y_in, in this case.
4899  Y_in.putScalar (ZERO);
4900  }
4901  else {
4902  Y_in.scale (beta);
4903  }
4904  return;
4905  }
4906  else if (beta == ZERO) {
4907  //Thyra was implicitly assuming that Y gets set to zero / or is overwritten
4908  //when bets==0. This was not the case with transpose in a multithreaded
4909  //environment where a multiplication with subsequent atomic_adds is used
4910  //since 0 is effectively not special cased. Doing the explicit set to zero here
4911  //This catches cases where Y is nan or inf.
4912  Y_in.putScalar (ZERO);
4913  }
4914 
4915  const size_t numVectors = X_in.getNumVectors ();
4916 
4917  // We don't allow X_in and Y_in to alias one another. It's hard
4918  // to check this, because advanced users could create views from
4919  // raw pointers. However, if X_in and Y_in reference the same
4920  // object, we will do the user a favor by copying X into new
4921  // storage (with a warning). We only need to do this if we have
4922  // trivial importers; otherwise, we don't actually apply the
4923  // operator from X into Y.
4924  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4925  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4926  // access X indirectly, in case we need to create temporary storage
4927  RCP<const MV> X;
4928 
4929  // some parameters for below
4930  const bool Y_is_replicated = (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4931  const bool Y_is_overwritten = (beta == ZERO);
4932  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4933  beta = ZERO;
4934  }
4935 
4936  // The kernels do not allow input or output with nonconstant stride.
4937  if (! X_in.isConstantStride () && importer.is_null ()) {
4938  X = rcp (new MV (X_in, Teuchos::Copy)); // Constant-stride copy of X_in
4939  } else {
4940  X = rcpFromRef (X_in); // Reference to X_in
4941  }
4942 
4943  // Set up temporary multivectors for Import and/or Export.
4944  if (importer != Teuchos::null) {
4945  if (importMV_ != Teuchos::null && importMV_->getNumVectors() != numVectors) {
4946  importMV_ = null;
4947  }
4948  if (importMV_ == null) {
4949  importMV_ = rcp (new MV (this->getColMap (), numVectors));
4950  }
4951  }
4952  if (exporter != Teuchos::null) {
4953  if (exportMV_ != Teuchos::null && exportMV_->getNumVectors() != numVectors) {
4954  exportMV_ = null;
4955  }
4956  if (exportMV_ == null) {
4957  exportMV_ = rcp (new MV (this->getRowMap (), numVectors));
4958  }
4959  }
4960 
4961  // If we have a non-trivial exporter, we must import elements that
4962  // are permuted or are on other processors.
4963  if (! exporter.is_null ()) {
4964  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply (transpose): Import");
4965  exportMV_->doImport (X_in, *exporter, INSERT);
4966  X = exportMV_; // multiply out of exportMV_
4967  }
4968 
4969  // If we have a non-trivial importer, we must export elements that
4970  // are permuted or belong to other processors. We will compute
4971  // solution into the to-be-exported MV; get a view.
4972  if (importer != Teuchos::null) {
4973  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply (transpose): Export");
4974 
4975  // FIXME (mfh 18 Apr 2015) Temporary fix suggested by Clark
4976  // Dohrmann on Fri 17 Apr 2015. At some point, we need to go
4977  // back and figure out why this helps. importMV_ SHOULD be
4978  // completely overwritten in the localApply(...) call
4979  // below, because beta == ZERO there.
4980  importMV_->putScalar (ZERO);
4981  // Do the local computation.
4982  this->localApply (*X, *importMV_, mode, alpha, ZERO);
4983 
4984  if (Y_is_overwritten) {
4985  Y_in.putScalar (ZERO);
4986  } else {
4987  Y_in.scale (beta);
4988  }
4989  Y_in.doExport (*importMV_, *importer, ADD_ASSIGN);
4990  }
4991  // otherwise, multiply into Y
4992  else {
4993  // can't multiply in-situ; can't multiply into non-strided multivector
4994  //
4995  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4996  // the user passed in the same MultiVector for both X and Y. It
4997  // won't detect whether one MultiVector views the other. We
4998  // should also check the MultiVectors' raw data pointers.
4999  if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
5000  // Make a deep copy of Y_in, into which to write the multiply result.
5001  MV Y (Y_in, Teuchos::Copy);
5002  this->localApply (*X, Y, mode, alpha, beta);
5003  Tpetra::deep_copy (Y_in, Y);
5004  } else {
5005  this->localApply (*X, Y_in, mode, alpha, beta);
5006  }
5007  }
5008 
5009  // If the range Map is a locally replicated map, sum the
5010  // contributions from each process. (That's why we set beta=0
5011  // above for all processes but Proc 0.)
5012  if (Y_is_replicated) {
5013  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply (transpose): Reduce Y");
5014  Y_in.reduce ();
5015  }
5016  }
5017 
5018  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5019  void
5023  const Teuchos::ETransp mode,
5024  const Scalar& alpha,
5025  const Scalar& beta) const
5026  {
5028  using Teuchos::NO_TRANS;
5029  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localApply");
5030 
5031  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
5032  auto Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
5033 
5034  const bool debug = ::Tpetra::Details::Behavior::debug ();
5035  if (debug) {
5036  const char tfecfFuncName[] = "localApply: ";
5037  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5038  (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
5039  "X.getNumVectors() = " << X.getNumVectors () << " != "
5040  "Y.getNumVectors() = " << Y.getNumVectors () << ".");
5041  const bool transpose = (mode != Teuchos::NO_TRANS);
5042  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5043  (! transpose && X.getLocalLength () !=
5044  getColMap ()->getLocalNumElements (), std::runtime_error,
5045  "NO_TRANS case: X has the wrong number of local rows. "
5046  "X.getLocalLength() = " << X.getLocalLength () << " != "
5047  "getColMap()->getLocalNumElements() = " <<
5048  getColMap ()->getLocalNumElements () << ".");
5049  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5050  (! transpose && Y.getLocalLength () !=
5051  getRowMap ()->getLocalNumElements (), std::runtime_error,
5052  "NO_TRANS case: Y has the wrong number of local rows. "
5053  "Y.getLocalLength() = " << Y.getLocalLength () << " != "
5054  "getRowMap()->getLocalNumElements() = " <<
5055  getRowMap ()->getLocalNumElements () << ".");
5056  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5057  (transpose && X.getLocalLength () !=
5058  getRowMap ()->getLocalNumElements (), std::runtime_error,
5059  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5060  "rows. X.getLocalLength() = " << X.getLocalLength ()
5061  << " != getRowMap()->getLocalNumElements() = "
5062  << getRowMap ()->getLocalNumElements () << ".");
5063  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5064  (transpose && Y.getLocalLength () !=
5065  getColMap ()->getLocalNumElements (), std::runtime_error,
5066  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5067  "rows. Y.getLocalLength() = " << Y.getLocalLength ()
5068  << " != getColMap()->getLocalNumElements() = "
5069  << getColMap ()->getLocalNumElements () << ".");
5070  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5071  (! isFillComplete (), std::runtime_error, "The matrix is not "
5072  "fill complete. You must call fillComplete() (possibly with "
5073  "domain and range Map arguments) without an intervening "
5074  "resumeFill() call before you may call this method.");
5075  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5076  (! X.isConstantStride () || ! Y.isConstantStride (),
5077  std::runtime_error, "X and Y must be constant stride.");
5078  // If the two pointers are null, then they don't alias one
5079  // another, even though they are equal.
5080  // Kokkos does not guarantee that zero row-extent vectors
5081  // point to different places, so we have to check that too.
5082  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5083  (X_lcl.data () == Y_lcl.data () && X_lcl.data () != nullptr
5084  && X_lcl.extent(0) != 0,
5085  std::runtime_error, "X and Y may not alias one another.");
5086  }
5087 
5088 #if KOKKOSKERNELS_VERSION >= 40299
5089  auto A_lcl = getLocalMatrixDevice();
5090 
5091  if(!applyHelper.get()) {
5092  // The apply helper does not exist, so create it.
5093  // Decide now whether to use the imbalanced row path, or the default.
5094  bool useMergePath = false;
5095 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
5096  //TODO: when https://github.com/kokkos/kokkos-kernels/issues/2166 is fixed and,
5097  //we can use SPMV_MERGE_PATH for the native spmv as well.
5098  //Take out this ifdef to enable that.
5099  //
5100  //Until then, only use SPMV_MERGE_PATH when calling cuSPARSE.
5101  if constexpr(std::is_same_v<execution_space, Kokkos::Cuda>) {
5102  LocalOrdinal nrows = getLocalNumRows();
5103  LocalOrdinal maxRowImbalance = 0;
5104  if(nrows != 0)
5105  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5106 
5107  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5108  useMergePath = true;
5109  }
5110 #endif
5111  applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map,
5112  useMergePath ? KokkosSparse::SPMV_MERGE_PATH : KokkosSparse::SPMV_DEFAULT);
5113  }
5114 
5115  // Translate mode (Teuchos enum) to KokkosKernels (1-character string)
5116  const char* modeKK = nullptr;
5117  switch(mode)
5118  {
5119  case Teuchos::NO_TRANS:
5120  modeKK = KokkosSparse::NoTranspose; break;
5121  case Teuchos::TRANS:
5122  modeKK = KokkosSparse::Transpose; break;
5123  case Teuchos::CONJ_TRANS:
5124  modeKK = KokkosSparse::ConjugateTranspose; break;
5125  default:
5126  throw std::invalid_argument("Tpetra::CrsMatrix::localApply: invalid mode");
5127  }
5128 
5129  if(applyHelper->shouldUseIntRowptrs())
5130  {
5131  auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
5132  KokkosSparse::spmv(
5133  &applyHelper->handle_int, modeKK,
5134  impl_scalar_type(alpha), A_lcl_int_rowptrs, X_lcl, impl_scalar_type(beta), Y_lcl);
5135  }
5136  else
5137  {
5138  KokkosSparse::spmv(
5139  &applyHelper->handle, modeKK,
5140  impl_scalar_type(alpha), A_lcl, X_lcl, impl_scalar_type(beta), Y_lcl);
5141  }
5142 #else
5143  LocalOrdinal nrows = getLocalNumRows();
5144  LocalOrdinal maxRowImbalance = 0;
5145  if(nrows != 0)
5146  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5147 
5148  auto matrix_lcl = getLocalMultiplyOperator();
5149  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5150  matrix_lcl->applyImbalancedRows (X_lcl, Y_lcl, mode, alpha, beta);
5151  else
5152  matrix_lcl->apply (X_lcl, Y_lcl, mode, alpha, beta);
5153 #endif
5154  }
5155 
5156  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5157  void
5161  Teuchos::ETransp mode,
5162  Scalar alpha,
5163  Scalar beta) const
5164  {
5166  const char fnName[] = "Tpetra::CrsMatrix::apply";
5167 
5168  TEUCHOS_TEST_FOR_EXCEPTION
5169  (! isFillComplete (), std::runtime_error,
5170  fnName << ": Cannot call apply() until fillComplete() "
5171  "has been called.");
5172 
5173  if (mode == Teuchos::NO_TRANS) {
5174  ProfilingRegion regionNonTranspose (fnName);
5175  this->applyNonTranspose (X, Y, alpha, beta);
5176  }
5177  else {
5178  ProfilingRegion regionTranspose ("Tpetra::CrsMatrix::apply (transpose)");
5179  this->applyTranspose (X, Y, mode, alpha, beta);
5180  }
5181  }
5182 
5183 
5184  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5185  template<class T>
5186  Teuchos::RCP<CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> >
5188  convert () const
5189  {
5190  using Teuchos::RCP;
5191  typedef CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> output_matrix_type;
5192  const char tfecfFuncName[] = "convert: ";
5193 
5194  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5195  (! this->isFillComplete (), std::runtime_error, "This matrix (the source "
5196  "of the conversion) is not fill complete. You must first call "
5197  "fillComplete() (possibly with the domain and range Map) without an "
5198  "intervening call to resumeFill(), before you may call this method.");
5199 
5200  RCP<output_matrix_type> newMatrix
5201  (new output_matrix_type (this->getCrsGraph ()));
5202  // Copy old values into new values. impl_scalar_type and T may
5203  // differ, so we can't use Kokkos::deep_copy.
5205  copyConvert (newMatrix->getLocalMatrixDevice ().values,
5206  this->getLocalMatrixDevice ().values);
5207  // Since newmat has a static (const) graph, the graph already has
5208  // a column Map, and Import and Export objects already exist (if
5209  // applicable). Thus, calling fillComplete is cheap.
5210  newMatrix->fillComplete (this->getDomainMap (), this->getRangeMap ());
5211 
5212  return newMatrix;
5213  }
5214 
5215 
5216  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5217  void
5220  {
5221  const bool debug = ::Tpetra::Details::Behavior::debug ("CrsGraph");
5222  if (debug) {
5223  const char tfecfFuncName[] = "checkInternalState: ";
5224  const char err[] = "Internal state is not consistent. "
5225  "Please report this bug to the Tpetra developers.";
5226 
5227  // This version of the graph (RCP<const crs_graph_type>) must
5228  // always be nonnull.
5229  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5230  (staticGraph_.is_null (), std::logic_error, err);
5231  // myGraph == null means that the matrix has a const ("static")
5232  // graph. Otherwise, the matrix has a dynamic graph (it owns its
5233  // graph).
5234  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5235  (! myGraph_.is_null () && myGraph_ != staticGraph_,
5236  std::logic_error, err);
5237  // if matrix is fill complete, then graph must be fill complete
5238  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5239  (isFillComplete () && ! staticGraph_->isFillComplete (),
5240  std::logic_error, err << " Specifically, the matrix is fill complete, "
5241  "but its graph is NOT fill complete.");
5242  // if values are allocated and they are non-zero in number, then
5243  // one of the allocations should be present
5244  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5245  (staticGraph_->indicesAreAllocated () &&
5246  staticGraph_->getLocalAllocationSize() > 0 &&
5247  staticGraph_->getLocalNumRows() > 0 &&
5248  valuesUnpacked_wdv.extent (0) == 0,
5249  std::logic_error, err);
5250  }
5251  }
5252 
5253  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5254  std::string
5257  {
5258  std::ostringstream os;
5259 
5260  os << "Tpetra::CrsMatrix (Kokkos refactor): {";
5261  if (this->getObjectLabel () != "") {
5262  os << "Label: \"" << this->getObjectLabel () << "\", ";
5263  }
5264  if (isFillComplete ()) {
5265  os << "isFillComplete: true"
5266  << ", global dimensions: [" << getGlobalNumRows () << ", "
5267  << getGlobalNumCols () << "]"
5268  << ", global number of entries: " << getGlobalNumEntries ()
5269  << "}";
5270  }
5271  else {
5272  os << "isFillComplete: false"
5273  << ", global dimensions: [" << getGlobalNumRows () << ", "
5274  << getGlobalNumCols () << "]}";
5275  }
5276  return os.str ();
5277  }
5278 
5279  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5280  void
5282  describe (Teuchos::FancyOStream &out,
5283  const Teuchos::EVerbosityLevel verbLevel) const
5284  {
5285  using std::endl;
5286  using std::setw;
5287  using Teuchos::ArrayView;
5288  using Teuchos::Comm;
5289  using Teuchos::RCP;
5290  using Teuchos::TypeNameTraits;
5291  using Teuchos::VERB_DEFAULT;
5292  using Teuchos::VERB_NONE;
5293  using Teuchos::VERB_LOW;
5294  using Teuchos::VERB_MEDIUM;
5295  using Teuchos::VERB_HIGH;
5296  using Teuchos::VERB_EXTREME;
5297 
5298  const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
5299 
5300  if (vl == VERB_NONE) {
5301  return; // Don't print anything at all
5302  }
5303 
5304  // By convention, describe() always begins with a tab.
5305  Teuchos::OSTab tab0 (out);
5306 
5307  RCP<const Comm<int> > comm = this->getComm();
5308  const int myRank = comm->getRank();
5309  const int numProcs = comm->getSize();
5310  size_t width = 1;
5311  for (size_t dec=10; dec<getGlobalNumRows(); dec *= 10) {
5312  ++width;
5313  }
5314  width = std::max<size_t> (width, static_cast<size_t> (11)) + 2;
5315 
5316  // none: print nothing
5317  // low: print O(1) info from node 0
5318  // medium: print O(P) info, num entries per process
5319  // high: print O(N) info, num entries per row
5320  // extreme: print O(NNZ) info: print indices and values
5321  //
5322  // for medium and higher, print constituent objects at specified verbLevel
5323  if (myRank == 0) {
5324  out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
5325  }
5326  Teuchos::OSTab tab1 (out);
5327 
5328  if (myRank == 0) {
5329  if (this->getObjectLabel () != "") {
5330  out << "Label: \"" << this->getObjectLabel () << "\", ";
5331  }
5332  {
5333  out << "Template parameters:" << endl;
5334  Teuchos::OSTab tab2 (out);
5335  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
5336  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
5337  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
5338  << "Node: " << TypeNameTraits<Node>::name () << endl;
5339  }
5340  if (isFillComplete()) {
5341  out << "isFillComplete: true" << endl
5342  << "Global dimensions: [" << getGlobalNumRows () << ", "
5343  << getGlobalNumCols () << "]" << endl
5344  << "Global number of entries: " << getGlobalNumEntries () << endl
5345  << endl << "Global max number of entries in a row: "
5346  << getGlobalMaxNumRowEntries () << endl;
5347  }
5348  else {
5349  out << "isFillComplete: false" << endl
5350  << "Global dimensions: [" << getGlobalNumRows () << ", "
5351  << getGlobalNumCols () << "]" << endl;
5352  }
5353  }
5354 
5355  if (vl < VERB_MEDIUM) {
5356  return; // all done!
5357  }
5358 
5359  // Describe the row Map.
5360  if (myRank == 0) {
5361  out << endl << "Row Map:" << endl;
5362  }
5363  if (getRowMap ().is_null ()) {
5364  if (myRank == 0) {
5365  out << "null" << endl;
5366  }
5367  }
5368  else {
5369  if (myRank == 0) {
5370  out << endl;
5371  }
5372  getRowMap ()->describe (out, vl);
5373  }
5374 
5375  // Describe the column Map.
5376  if (myRank == 0) {
5377  out << "Column Map: ";
5378  }
5379  if (getColMap ().is_null ()) {
5380  if (myRank == 0) {
5381  out << "null" << endl;
5382  }
5383  } else if (getColMap () == getRowMap ()) {
5384  if (myRank == 0) {
5385  out << "same as row Map" << endl;
5386  }
5387  } else {
5388  if (myRank == 0) {
5389  out << endl;
5390  }
5391  getColMap ()->describe (out, vl);
5392  }
5393 
5394  // Describe the domain Map.
5395  if (myRank == 0) {
5396  out << "Domain Map: ";
5397  }
5398  if (getDomainMap ().is_null ()) {
5399  if (myRank == 0) {
5400  out << "null" << endl;
5401  }
5402  } else if (getDomainMap () == getRowMap ()) {
5403  if (myRank == 0) {
5404  out << "same as row Map" << endl;
5405  }
5406  } else if (getDomainMap () == getColMap ()) {
5407  if (myRank == 0) {
5408  out << "same as column Map" << endl;
5409  }
5410  } else {
5411  if (myRank == 0) {
5412  out << endl;
5413  }
5414  getDomainMap ()->describe (out, vl);
5415  }
5416 
5417  // Describe the range Map.
5418  if (myRank == 0) {
5419  out << "Range Map: ";
5420  }
5421  if (getRangeMap ().is_null ()) {
5422  if (myRank == 0) {
5423  out << "null" << endl;
5424  }
5425  } else if (getRangeMap () == getDomainMap ()) {
5426  if (myRank == 0) {
5427  out << "same as domain Map" << endl;
5428  }
5429  } else if (getRangeMap () == getRowMap ()) {
5430  if (myRank == 0) {
5431  out << "same as row Map" << endl;
5432  }
5433  } else {
5434  if (myRank == 0) {
5435  out << endl;
5436  }
5437  getRangeMap ()->describe (out, vl);
5438  }
5439 
5440  // O(P) data
5441  for (int curRank = 0; curRank < numProcs; ++curRank) {
5442  if (myRank == curRank) {
5443  out << "Process rank: " << curRank << endl;
5444  Teuchos::OSTab tab2 (out);
5445  if (! staticGraph_->indicesAreAllocated ()) {
5446  out << "Graph indices not allocated" << endl;
5447  }
5448  else {
5449  out << "Number of allocated entries: "
5450  << staticGraph_->getLocalAllocationSize () << endl;
5451  }
5452  out << "Number of entries: " << getLocalNumEntries () << endl
5453  << "Max number of entries per row: " << getLocalMaxNumRowEntries ()
5454  << endl;
5455  }
5456  // Give output time to complete by executing some barriers.
5457  comm->barrier ();
5458  comm->barrier ();
5459  comm->barrier ();
5460  }
5461 
5462  if (vl < VERB_HIGH) {
5463  return; // all done!
5464  }
5465 
5466  // O(N) and O(NNZ) data
5467  for (int curRank = 0; curRank < numProcs; ++curRank) {
5468  if (myRank == curRank) {
5469  out << std::setw(width) << "Proc Rank"
5470  << std::setw(width) << "Global Row"
5471  << std::setw(width) << "Num Entries";
5472  if (vl == VERB_EXTREME) {
5473  out << std::setw(width) << "(Index,Value)";
5474  }
5475  out << endl;
5476  for (size_t r = 0; r < getLocalNumRows (); ++r) {
5477  const size_t nE = getNumEntriesInLocalRow(r);
5478  GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
5479  out << std::setw(width) << myRank
5480  << std::setw(width) << gid
5481  << std::setw(width) << nE;
5482  if (vl == VERB_EXTREME) {
5483  if (isGloballyIndexed()) {
5484  global_inds_host_view_type rowinds;
5485  values_host_view_type rowvals;
5486  getGlobalRowView (gid, rowinds, rowvals);
5487  for (size_t j = 0; j < nE; ++j) {
5488  out << " (" << rowinds[j]
5489  << ", " << rowvals[j]
5490  << ") ";
5491  }
5492  }
5493  else if (isLocallyIndexed()) {
5494  local_inds_host_view_type rowinds;
5495  values_host_view_type rowvals;
5496  getLocalRowView (r, rowinds, rowvals);
5497  for (size_t j=0; j < nE; ++j) {
5498  out << " (" << getColMap()->getGlobalElement(rowinds[j])
5499  << ", " << rowvals[j]
5500  << ") ";
5501  }
5502  } // globally or locally indexed
5503  } // vl == VERB_EXTREME
5504  out << endl;
5505  } // for each row r on this process
5506  } // if (myRank == curRank)
5507 
5508  // Give output time to complete
5509  comm->barrier ();
5510  comm->barrier ();
5511  comm->barrier ();
5512  } // for each process p
5513  }
5514 
5515  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5516  bool
5519  {
5520  // It's not clear what kind of compatibility checks on sizes can
5521  // be performed here. Epetra_CrsGraph doesn't check any sizes for
5522  // compatibility.
5523 
5524  // Currently, the source object must be a RowMatrix with the same
5525  // four template parameters as the target CrsMatrix. We might
5526  // relax this requirement later.
5527  const row_matrix_type* srcRowMat =
5528  dynamic_cast<const row_matrix_type*> (&source);
5529  return (srcRowMat != nullptr);
5530  }
5531 
5532  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5533  void
5536  const typename crs_graph_type::padding_type& padding,
5537  const bool verbose)
5538  {
5540  using Details::padCrsArrays;
5541  using std::endl;
5542  using LO = local_ordinal_type;
5543  using row_ptrs_type =
5544  typename local_graph_device_type::row_map_type::non_const_type;
5545  using range_policy =
5546  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
5547  const char tfecfFuncName[] = "applyCrsPadding";
5548  const char suffix[] =
5549  ". Please report this bug to the Tpetra developers.";
5550  ProfilingRegion regionCAP("Tpetra::CrsMatrix::applyCrsPadding");
5551 
5552  std::unique_ptr<std::string> prefix;
5553  if (verbose) {
5554  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
5555  std::ostringstream os;
5556  os << *prefix << "padding: ";
5557  padding.print(os);
5558  os << endl;
5559  std::cerr << os.str();
5560  }
5561  const int myRank = ! verbose ? -1 : [&] () {
5562  auto map = this->getMap();
5563  if (map.is_null()) {
5564  return -1;
5565  }
5566  auto comm = map->getComm();
5567  if (comm.is_null()) {
5568  return -1;
5569  }
5570  return comm->getRank();
5571  } ();
5572 
5573  // NOTE (mfh 29 Jan 2020) This allocates the values array.
5574  if (! myGraph_->indicesAreAllocated()) {
5575  if (verbose) {
5576  std::ostringstream os;
5577  os << *prefix << "Call allocateIndices" << endl;
5578  std::cerr << os.str();
5579  }
5580  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
5581  }
5582 
5583  // FIXME (mfh 10 Feb 2020) We shouldn't actually reallocate
5584  // row_ptrs_beg or allocate row_ptrs_end unless the allocation
5585  // size needs to increase. That should be the job of
5586  // padCrsArrays.
5587 
5588  // Making copies here because rowPtrsUnpacked_ has a const type. Otherwise, we
5589  // would use it directly.
5590 
5591  if (verbose) {
5592  std::ostringstream os;
5593  os << *prefix << "Allocate row_ptrs_beg: "
5594  << myGraph_->getRowPtrsUnpackedHost().extent(0) << endl;
5595  std::cerr << os.str();
5596  }
5597  using Kokkos::view_alloc;
5598  using Kokkos::WithoutInitializing;
5599  row_ptrs_type row_ptr_beg(view_alloc("row_ptr_beg", WithoutInitializing),
5600  myGraph_->rowPtrsUnpacked_dev_.extent(0));
5601  // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
5602  Kokkos::deep_copy(execution_space(),row_ptr_beg, myGraph_->rowPtrsUnpacked_dev_);
5603 
5604  const size_t N = row_ptr_beg.extent(0) == 0 ? size_t(0) :
5605  size_t(row_ptr_beg.extent(0) - 1);
5606  if (verbose) {
5607  std::ostringstream os;
5608  os << *prefix << "Allocate row_ptrs_end: " << N << endl;
5609  std::cerr << os.str();
5610  }
5611  row_ptrs_type row_ptr_end(
5612  view_alloc("row_ptr_end", WithoutInitializing), N);
5613 
5614  row_ptrs_type num_row_entries_d;
5615 
5616  const bool refill_num_row_entries =
5617  myGraph_->k_numRowEntries_.extent(0) != 0;
5618 
5619  if (refill_num_row_entries) { // unpacked storage
5620  // We can't assume correct *this capture until C++17, and it's
5621  // likely more efficient just to capture what we need anyway.
5622  num_row_entries_d = create_mirror_view_and_copy(memory_space(),
5623  myGraph_->k_numRowEntries_);
5624  Kokkos::parallel_for
5625  ("Fill end row pointers", range_policy(0, N),
5626  KOKKOS_LAMBDA (const size_t i) {
5627  row_ptr_end(i) = row_ptr_beg(i) + num_row_entries_d(i);
5628  });
5629  }
5630  else {
5631  // FIXME (mfh 04 Feb 2020) Fix padCrsArrays so that if packed
5632  // storage, we don't need row_ptr_end to be separate allocation;
5633  // could just have it alias row_ptr_beg+1.
5634  Kokkos::parallel_for
5635  ("Fill end row pointers", range_policy(0, N),
5636  KOKKOS_LAMBDA (const size_t i) {
5637  row_ptr_end(i) = row_ptr_beg(i+1);
5638  });
5639  }
5640 
5641  if (myGraph_->isGloballyIndexed()) {
5642  padCrsArrays(row_ptr_beg, row_ptr_end,
5643  myGraph_->gblInds_wdv,
5644  valuesUnpacked_wdv, padding, myRank, verbose);
5645  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5646  const auto newColIndsLen = myGraph_->gblInds_wdv.extent(0);
5647  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5648  (newValuesLen != newColIndsLen, std::logic_error,
5649  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5650  << " != myGraph_->gblInds_wdv.extent(0)=" << newColIndsLen
5651  << suffix);
5652  }
5653  else {
5654  padCrsArrays(row_ptr_beg, row_ptr_end,
5655  myGraph_->lclIndsUnpacked_wdv,
5656  valuesUnpacked_wdv, padding, myRank, verbose);
5657  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5658  const auto newColIndsLen = myGraph_->lclIndsUnpacked_wdv.extent(0);
5659  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5660  (newValuesLen != newColIndsLen, std::logic_error,
5661  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5662  << " != myGraph_->lclIndsUnpacked_wdv.extent(0)=" << newColIndsLen
5663  << suffix);
5664  }
5665 
5666  if (refill_num_row_entries) {
5667  Kokkos::parallel_for
5668  ("Fill num entries", range_policy(0, N),
5669  KOKKOS_LAMBDA (const size_t i) {
5670  num_row_entries_d(i) = row_ptr_end(i) - row_ptr_beg(i);
5671  });
5672  Kokkos::deep_copy(myGraph_->k_numRowEntries_, num_row_entries_d);
5673  }
5674 
5675  if (verbose) {
5676  std::ostringstream os;
5677  os << *prefix << "Assign myGraph_->rowPtrsUnpacked_; "
5678  << "old size: " << myGraph_->rowPtrsUnpacked_host_.extent(0)
5679  << ", new size: " << row_ptr_beg.extent(0) << endl;
5680  std::cerr << os.str();
5681  TEUCHOS_ASSERT( myGraph_->getRowPtrsUnpackedHost().extent(0) ==
5682  row_ptr_beg.extent(0) );
5683  }
5684  myGraph_->setRowPtrsUnpacked(row_ptr_beg);
5685  }
5686 
5687  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5688  void
5689  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5690  copyAndPermuteStaticGraph(
5691  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5692  const size_t numSameIDs,
5693  const LocalOrdinal permuteToLIDs[],
5694  const LocalOrdinal permuteFromLIDs[],
5695  const size_t numPermutes)
5696  {
5697  using Details::ProfilingRegion;
5698  using Teuchos::Array;
5699  using Teuchos::ArrayView;
5700  using std::endl;
5701  using LO = LocalOrdinal;
5702  using GO = GlobalOrdinal;
5703  const char tfecfFuncName[] = "copyAndPermuteStaticGraph";
5704  const char suffix[] =
5705  " Please report this bug to the Tpetra developers.";
5706  ProfilingRegion regionCAP
5707  ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph");
5708 
5709  const bool debug = Details::Behavior::debug("CrsGraph");
5710  const bool verbose = Details::Behavior::verbose("CrsGraph");
5711  std::unique_ptr<std::string> prefix;
5712  if (verbose) {
5713  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5714  std::ostringstream os;
5715  os << *prefix << "Start" << endl;
5716  }
5717  const char* const prefix_raw =
5718  verbose ? prefix.get()->c_str() : nullptr;
5719 
5720  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5721  //
5722  // Copy the first numSame row from source to target (this matrix).
5723  // This involves copying rows corresponding to LIDs [0, numSame-1].
5724  //
5725  const map_type& srcRowMap = * (srcMat.getRowMap ());
5726  nonconst_global_inds_host_view_type rowInds;
5727  nonconst_values_host_view_type rowVals;
5728  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5729  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5730  // Global ID for the current row index in the source matrix.
5731  // The first numSameIDs GIDs in the two input lists are the
5732  // same, so sourceGID == targetGID in this case.
5733  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5734  const GO targetGID = sourceGID;
5735 
5736  ArrayView<const GO>rowIndsConstView;
5737  ArrayView<const Scalar> rowValsConstView;
5738 
5739  if (sourceIsLocallyIndexed) {
5740  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5741  if (rowLength > static_cast<size_t> (rowInds.size())) {
5742  Kokkos::resize(rowInds,rowLength);
5743  Kokkos::resize(rowVals,rowLength);
5744  }
5745  // Resizing invalidates an Array's views, so we must make new
5746  // ones, even if rowLength hasn't changed.
5747  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5748  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5749 
5750  // The source matrix is locally indexed, so we have to get a
5751  // copy. Really it's the GIDs that have to be copied (because
5752  // they have to be converted from LIDs).
5753  size_t checkRowLength = 0;
5754  srcMat.getGlobalRowCopy (sourceGID, rowIndsView,
5755  rowValsView, checkRowLength);
5756  if (debug) {
5757  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5758  (rowLength != checkRowLength, std::logic_error, "For "
5759  "global row index " << sourceGID << ", the source "
5760  "matrix's getNumEntriesInGlobalRow returns a row length "
5761  "of " << rowLength << ", but getGlobalRowCopy reports "
5762  "a row length of " << checkRowLength << "." << suffix);
5763  }
5764 
5765  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5766  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5767  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5768  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5769  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5770  rowIndsView.data(), rowIndsView.extent(0),
5771  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5772  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5773  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5774  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5775  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5776  // KDDKDD UVM TEMPORARY: KokkosView interface
5777  }
5778  else { // source matrix is globally indexed.
5779  global_inds_host_view_type rowIndsView;
5780  values_host_view_type rowValsView;
5781  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5782  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5783  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5784  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5785  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5786  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5787  rowIndsView.data(), rowIndsView.extent(0),
5788  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5789  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5790  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5791  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5792  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5793  // KDDKDD UVM TEMPORARY: KokkosView interface
5794 
5795  }
5796 
5797  // Applying a permutation to a matrix with a static graph
5798  // means REPLACE-ing entries.
5799  combineGlobalValues(targetGID, rowIndsConstView,
5800  rowValsConstView, REPLACE,
5801  prefix_raw, debug, verbose);
5802  }
5803 
5804  if (verbose) {
5805  std::ostringstream os;
5806  os << *prefix << "Do permutes" << endl;
5807  }
5808 
5809  const map_type& tgtRowMap = * (this->getRowMap ());
5810  for (size_t p = 0; p < numPermutes; ++p) {
5811  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5812  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5813 
5814  ArrayView<const GO> rowIndsConstView;
5815  ArrayView<const Scalar> rowValsConstView;
5816 
5817  if (sourceIsLocallyIndexed) {
5818  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5819  if (rowLength > static_cast<size_t> (rowInds.size ())) {
5820  Kokkos::resize(rowInds,rowLength);
5821  Kokkos::resize(rowVals,rowLength);
5822  }
5823  // Resizing invalidates an Array's views, so we must make new
5824  // ones, even if rowLength hasn't changed.
5825  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5826  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5827 
5828  // The source matrix is locally indexed, so we have to get a
5829  // copy. Really it's the GIDs that have to be copied (because
5830  // they have to be converted from LIDs).
5831  size_t checkRowLength = 0;
5832  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5833  rowValsView, checkRowLength);
5834  if (debug) {
5835  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5836  (rowLength != checkRowLength, std::logic_error, "For "
5837  "source matrix global row index " << sourceGID << ", "
5838  "getNumEntriesInGlobalRow returns a row length of " <<
5839  rowLength << ", but getGlobalRowCopy a row length of "
5840  << checkRowLength << "." << suffix);
5841  }
5842 
5843  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5844  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5845  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5846  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5847  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5848  rowIndsView.data(), rowIndsView.extent(0),
5849  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5850  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5851  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5852  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5853  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5854  // KDDKDD UVM TEMPORARY: KokkosView interface
5855  }
5856  else {
5857  global_inds_host_view_type rowIndsView;
5858  values_host_view_type rowValsView;
5859  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5860  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5861  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5862  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5863  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5864  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5865  rowIndsView.data(), rowIndsView.extent(0),
5866  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5867  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5868  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5869  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5870  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5871  // KDDKDD UVM TEMPORARY: KokkosView interface
5872  }
5873 
5874  combineGlobalValues(targetGID, rowIndsConstView,
5875  rowValsConstView, REPLACE,
5876  prefix_raw, debug, verbose);
5877  }
5878 
5879  if (verbose) {
5880  std::ostringstream os;
5881  os << *prefix << "Done" << endl;
5882  }
5883  }
5884 
5885  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5886  void
5887  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5888  copyAndPermuteNonStaticGraph(
5889  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5890  const size_t numSameIDs,
5891  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs_dv,
5892  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs_dv,
5893  const size_t numPermutes)
5894  {
5895  using Details::ProfilingRegion;
5896  using Teuchos::Array;
5897  using Teuchos::ArrayView;
5898  using std::endl;
5899  using LO = LocalOrdinal;
5900  using GO = GlobalOrdinal;
5901  const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph";
5902  const char suffix[] =
5903  " Please report this bug to the Tpetra developers.";
5904  ProfilingRegion regionCAP
5905  ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph");
5906 
5907  const bool debug = Details::Behavior::debug("CrsGraph");
5908  const bool verbose = Details::Behavior::verbose("CrsGraph");
5909  std::unique_ptr<std::string> prefix;
5910  if (verbose) {
5911  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5912  std::ostringstream os;
5913  os << *prefix << "Start" << endl;
5914  }
5915  const char* const prefix_raw =
5916  verbose ? prefix.get()->c_str() : nullptr;
5917 
5918  {
5919  using row_graph_type = RowGraph<LO, GO, Node>;
5920  const row_graph_type& srcGraph = *(srcMat.getGraph());
5921  auto padding =
5922  myGraph_->computeCrsPadding(srcGraph, numSameIDs,
5923  permuteToLIDs_dv, permuteFromLIDs_dv, verbose);
5924  applyCrsPadding(*padding, verbose);
5925  }
5926  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5927  //
5928  // Copy the first numSame row from source to target (this matrix).
5929  // This involves copying rows corresponding to LIDs [0, numSame-1].
5930  //
5931  const map_type& srcRowMap = * (srcMat.getRowMap ());
5932  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5933  using gids_type = nonconst_global_inds_host_view_type;
5934  using vals_type = nonconst_values_host_view_type;
5935  gids_type rowInds;
5936  vals_type rowVals;
5937  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5938  // Global ID for the current row index in the source matrix.
5939  // The first numSameIDs GIDs in the two input lists are the
5940  // same, so sourceGID == targetGID in this case.
5941  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5942  const GO targetGID = sourceGID;
5943 
5944  ArrayView<const GO> rowIndsConstView;
5945  ArrayView<const Scalar> rowValsConstView;
5946 
5947  if (sourceIsLocallyIndexed) {
5948 
5949  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5950  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5951  Kokkos::resize(rowInds,rowLength);
5952  Kokkos::resize(rowVals,rowLength);
5953  }
5954  // Resizing invalidates an Array's views, so we must make new
5955  // ones, even if rowLength hasn't changed.
5956  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5957  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5958 
5959  // The source matrix is locally indexed, so we have to get a
5960  // copy. Really it's the GIDs that have to be copied (because
5961  // they have to be converted from LIDs).
5962  size_t checkRowLength = 0;
5963  srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView,
5964  checkRowLength);
5965  if (debug) {
5966  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5967  (rowLength != checkRowLength, std::logic_error, ": For "
5968  "global row index " << sourceGID << ", the source "
5969  "matrix's getNumEntriesInGlobalRow returns a row length "
5970  "of " << rowLength << ", but getGlobalRowCopy reports "
5971  "a row length of " << checkRowLength << "." << suffix);
5972  }
5973  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5974  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
5975  }
5976  else { // source matrix is globally indexed.
5977  global_inds_host_view_type rowIndsView;
5978  values_host_view_type rowValsView;
5979  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5980 
5981  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5982  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5983  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5984  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5985  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5986  rowIndsView.data(), rowIndsView.extent(0),
5987  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5988  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5989  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5990  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5991  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5992  // KDDKDD UVM TEMPORARY: KokkosView interface
5993  }
5994 
5995  // Combine the data into the target matrix.
5996  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5997  rowValsConstView, prefix_raw, debug, verbose);
5998  }
5999 
6000  if (verbose) {
6001  std::ostringstream os;
6002  os << *prefix << "Do permutes" << endl;
6003  }
6004  const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data();
6005  const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data();
6006 
6007  const map_type& tgtRowMap = * (this->getRowMap ());
6008  for (size_t p = 0; p < numPermutes; ++p) {
6009  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
6010  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
6011 
6012  ArrayView<const GO> rowIndsConstView;
6013  ArrayView<const Scalar> rowValsConstView;
6014 
6015  if (sourceIsLocallyIndexed) {
6016  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
6017  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
6018  Kokkos::resize(rowInds,rowLength);
6019  Kokkos::resize(rowVals,rowLength);
6020  }
6021  // Resizing invalidates an Array's views, so we must make new
6022  // ones, even if rowLength hasn't changed.
6023  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
6024  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
6025 
6026  // The source matrix is locally indexed, so we have to get a
6027  // copy. Really it's the GIDs that have to be copied (because
6028  // they have to be converted from LIDs).
6029  size_t checkRowLength = 0;
6030  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
6031  rowValsView, checkRowLength);
6032  if (debug) {
6033  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6034  (rowLength != checkRowLength, std::logic_error, "For "
6035  "source matrix global row index " << sourceGID << ", "
6036  "getNumEntriesInGlobalRow returns a row length of " <<
6037  rowLength << ", but getGlobalRowCopy a row length of "
6038  << checkRowLength << "." << suffix);
6039  }
6040  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
6041  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
6042  }
6043  else {
6044  global_inds_host_view_type rowIndsView;
6045  values_host_view_type rowValsView;
6046  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
6047 
6048  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
6049  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
6050  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
6051  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
6052  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
6053  rowIndsView.data(), rowIndsView.extent(0),
6054  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6055  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
6056  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
6057  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6058  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6059  // KDDKDD UVM TEMPORARY: KokkosView interface
6060  }
6061 
6062  // Combine the data into the target matrix.
6063  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
6064  rowValsConstView, prefix_raw, debug, verbose);
6065  }
6066 
6067  if (verbose) {
6068  std::ostringstream os;
6069  os << *prefix << "Done" << endl;
6070  }
6071  }
6072 
6073  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6074  void
6077  const SrcDistObject& srcObj,
6078  const size_t numSameIDs,
6079  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
6080  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
6081  const CombineMode /*CM*/)
6082  {
6083  using Details::Behavior;
6086  using std::endl;
6087 
6088  // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
6089  const char tfecfFuncName[] = "copyAndPermute: ";
6090  ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermute");
6091 
6092  const bool verbose = Behavior::verbose("CrsMatrix");
6093  std::unique_ptr<std::string> prefix;
6094  if (verbose) {
6095  prefix = this->createPrefix("CrsMatrix", "copyAndPermute");
6096  std::ostringstream os;
6097  os << *prefix << endl
6098  << *prefix << " numSameIDs: " << numSameIDs << endl
6099  << *prefix << " numPermute: " << permuteToLIDs.extent(0)
6100  << endl
6101  << *prefix << " "
6102  << dualViewStatusToString (permuteToLIDs, "permuteToLIDs")
6103  << endl
6104  << *prefix << " "
6105  << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs")
6106  << endl
6107  << *prefix << " "
6108  << "isStaticGraph: " << (isStaticGraph() ? "true" : "false")
6109  << endl;
6110  std::cerr << os.str ();
6111  }
6112 
6113  const auto numPermute = permuteToLIDs.extent (0);
6114  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6115  (numPermute != permuteFromLIDs.extent (0),
6116  std::invalid_argument, "permuteToLIDs.extent(0) = "
6117  << numPermute << "!= permuteFromLIDs.extent(0) = "
6118  << permuteFromLIDs.extent (0) << ".");
6119 
6120  // This dynamic cast should succeed, because we've already tested
6121  // it in checkSizes().
6123  const RMT& srcMat = dynamic_cast<const RMT&> (srcObj);
6124  if (isStaticGraph ()) {
6125  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
6126  auto permuteToLIDs_h = permuteToLIDs.view_host ();
6127  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
6128  auto permuteFromLIDs_h = permuteFromLIDs.view_host ();
6129 
6130  copyAndPermuteStaticGraph(srcMat, numSameIDs,
6131  permuteToLIDs_h.data(),
6132  permuteFromLIDs_h.data(),
6133  numPermute);
6134  }
6135  else {
6136  copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs,
6137  permuteFromLIDs, numPermute);
6138  }
6139 
6140  if (verbose) {
6141  std::ostringstream os;
6142  os << *prefix << "Done" << endl;
6143  std::cerr << os.str();
6144  }
6145  }
6146 
6147  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6148  void
6151  (const SrcDistObject& source,
6152  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6153  Kokkos::DualView<char*, buffer_device_type>& exports,
6154  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6155  size_t& constantNumPackets)
6156  {
6157  using Details::Behavior;
6160  using Teuchos::outArg;
6161  using Teuchos::REDUCE_MAX;
6162  using Teuchos::reduceAll;
6163  using std::endl;
6164  typedef LocalOrdinal LO;
6165  typedef GlobalOrdinal GO;
6166  const char tfecfFuncName[] = "packAndPrepare: ";
6167  ProfilingRegion regionPAP ("Tpetra::CrsMatrix::packAndPrepare");
6168 
6169  const bool debug = Behavior::debug("CrsMatrix");
6170  const bool verbose = Behavior::verbose("CrsMatrix");
6171 
6172  // Processes on which the communicator is null should not participate.
6173  Teuchos::RCP<const Teuchos::Comm<int> > pComm = this->getComm ();
6174  if (pComm.is_null ()) {
6175  return;
6176  }
6177  const Teuchos::Comm<int>& comm = *pComm;
6178  const int myRank = comm.getSize ();
6179 
6180  std::unique_ptr<std::string> prefix;
6181  if (verbose) {
6182  prefix = this->createPrefix("CrsMatrix", "packAndPrepare");
6183  std::ostringstream os;
6184  os << *prefix << "Start" << endl
6185  << *prefix << " "
6186  << dualViewStatusToString (exportLIDs, "exportLIDs")
6187  << endl
6188  << *prefix << " "
6189  << dualViewStatusToString (exports, "exports")
6190  << endl
6191  << *prefix << " "
6192  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6193  << endl;
6194  std::cerr << os.str ();
6195  }
6196 
6197  // Attempt to cast the source object to CrsMatrix. If successful,
6198  // use the source object's packNew() method to pack its data for
6199  // communication. Otherwise, attempt to cast to RowMatrix; if
6200  // successful, use the source object's pack() method. Otherwise,
6201  // the source object doesn't have the right type.
6202  //
6203  // FIXME (mfh 30 Jun 2013, 11 Sep 2017) We don't even need the
6204  // RowMatrix to have the same Node type. Unfortunately, we don't
6205  // have a way to ask if the RowMatrix is "a RowMatrix with any
6206  // Node type," since RowMatrix doesn't have a base class. A
6207  // hypothetical RowMatrixBase<Scalar, LO, GO> class, which does
6208  // not currently exist, would satisfy this requirement.
6209  //
6210  // Why RowMatrixBase<Scalar, LO, GO>? The source object's Scalar
6211  // type doesn't technically need to match the target object's
6212  // Scalar type, so we could just have RowMatrixBase<LO, GO>. LO
6213  // and GO need not be the same, as long as there is no overflow of
6214  // the indices. However, checking for index overflow is global
6215  // and therefore undesirable.
6216 
6217  std::ostringstream msg; // for collecting error messages
6218  int lclBad = 0; // to be set below
6219 
6220  using crs_matrix_type = CrsMatrix<Scalar, LO, GO, Node>;
6221  const crs_matrix_type* srcCrsMat =
6222  dynamic_cast<const crs_matrix_type*> (&source);
6223  if (srcCrsMat != nullptr) {
6224  if (verbose) {
6225  std::ostringstream os;
6226  os << *prefix << "Source matrix same (CrsMatrix) type as target; "
6227  "calling packNew" << endl;
6228  std::cerr << os.str ();
6229  }
6230  try {
6231  srcCrsMat->packNew (exportLIDs, exports, numPacketsPerLID,
6232  constantNumPackets);
6233  }
6234  catch (std::exception& e) {
6235  lclBad = 1;
6236  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6237  }
6238  }
6239  else {
6240  using Kokkos::HostSpace;
6241  using Kokkos::subview;
6242  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6243  using range_type = Kokkos::pair<size_t, size_t>;
6244 
6245  if (verbose) {
6246  std::ostringstream os;
6247  os << *prefix << "Source matrix NOT same (CrsMatrix) type as target"
6248  << endl;
6249  std::cerr << os.str ();
6250  }
6251 
6252  const row_matrix_type* srcRowMat =
6253  dynamic_cast<const row_matrix_type*> (&source);
6254  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6255  (srcRowMat == nullptr, std::invalid_argument,
6256  "The source object of the Import or Export operation is neither a "
6257  "CrsMatrix (with the same template parameters as the target object), "
6258  "nor a RowMatrix (with the same first four template parameters as the "
6259  "target object).");
6260 
6261  // For the RowMatrix case, we need to convert from
6262  // Kokkos::DualView to Teuchos::Array*. This doesn't need to be
6263  // so terribly efficient, since packing a non-CrsMatrix
6264  // RowMatrix for Import/Export into a CrsMatrix is not a
6265  // critical case. Thus, we may allocate Teuchos::Array objects
6266  // here and copy to and from Kokkos::*View.
6267 
6268  // View exportLIDs's host data as a Teuchos::ArrayView.
6269  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6270  auto exportLIDs_h = exportLIDs.view_host ();
6271  Teuchos::ArrayView<const LO> exportLIDs_av (exportLIDs_h.data (),
6272  exportLIDs_h.size ());
6273 
6274  // pack() will allocate exports_a as needed. We'll copy back
6275  // into exports (after (re)allocating exports if needed) below.
6276  Teuchos::Array<char> exports_a;
6277 
6278  // View exportLIDs' host data as a Teuchos::ArrayView. We don't
6279  // need to sync, since we're doing write-only access, but we do
6280  // need to mark the DualView as modified on host.
6281 
6282  numPacketsPerLID.clear_sync_state (); // write-only access
6283  numPacketsPerLID.modify_host ();
6284  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6285  Teuchos::ArrayView<size_t> numPacketsPerLID_av (numPacketsPerLID_h.data (),
6286  numPacketsPerLID_h.size ());
6287 
6288  // Invoke RowMatrix's legacy pack() interface, using above
6289  // Teuchos::Array* objects.
6290  try {
6291  srcRowMat->pack (exportLIDs_av, exports_a, numPacketsPerLID_av,
6292  constantNumPackets);
6293  }
6294  catch (std::exception& e) {
6295  lclBad = 1;
6296  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6297  }
6298 
6299  // Allocate 'exports', and copy exports_a back into it.
6300  const size_t newAllocSize = static_cast<size_t> (exports_a.size ());
6301  if (static_cast<size_t> (exports.extent (0)) < newAllocSize) {
6302  const std::string oldLabel = exports.d_view.label ();
6303  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6304  exports = exports_type (newLabel, newAllocSize);
6305  }
6306  // It's safe to assume that we're working on host anyway, so
6307  // just keep exports sync'd to host.
6308  // ignore current device contents
6309  exports.modify_host();
6310 
6311  auto exports_h = exports.view_host ();
6312  auto exports_h_sub = subview (exports_h, range_type (0, newAllocSize));
6313 
6314  // Kokkos::deep_copy needs a Kokkos::View input, so turn
6315  // exports_a into a nonowning Kokkos::View first before copying.
6316  typedef typename exports_type::t_host::execution_space HES;
6317  typedef Kokkos::Device<HES, HostSpace> host_device_type;
6318  Kokkos::View<const char*, host_device_type>
6319  exports_a_kv (exports_a.getRawPtr (), newAllocSize);
6320  // DEEP_COPY REVIEW - NOT TESTED
6321  Kokkos::deep_copy (exports_h_sub, exports_a_kv);
6322  }
6323 
6324  if (debug) {
6325  int gblBad = 0; // output argument; to be set below
6326  reduceAll<int, int> (comm, REDUCE_MAX, lclBad, outArg (gblBad));
6327  if (gblBad != 0) {
6328  Tpetra::Details::gathervPrint (std::cerr, msg.str (), comm);
6329  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6330  (true, std::logic_error, "packNew() or pack() threw an exception on "
6331  "one or more participating processes.");
6332  }
6333  }
6334  else {
6335  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6336  (lclBad != 0, std::logic_error, "packNew threw an exception on one "
6337  "or more participating processes. Here is this process' error "
6338  "message: " << msg.str ());
6339  }
6340 
6341  if (verbose) {
6342  std::ostringstream os;
6343  os << *prefix << "packAndPrepare: Done!" << endl
6344  << *prefix << " "
6345  << dualViewStatusToString (exportLIDs, "exportLIDs")
6346  << endl
6347  << *prefix << " "
6348  << dualViewStatusToString (exports, "exports")
6349  << endl
6350  << *prefix << " "
6351  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6352  << endl;
6353  std::cerr << os.str ();
6354  }
6355  }
6356 
6357  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6358  size_t
6359  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6360  packRow (char exports[],
6361  const size_t offset,
6362  const size_t numEnt,
6363  const GlobalOrdinal gidsIn[],
6364  const impl_scalar_type valsIn[],
6365  const size_t numBytesPerValue) const
6366  {
6367  using Kokkos::View;
6368  using Kokkos::subview;
6370  typedef LocalOrdinal LO;
6371  typedef GlobalOrdinal GO;
6372  typedef impl_scalar_type ST;
6373 
6374  if (numEnt == 0) {
6375  // Empty rows always take zero bytes, to ensure sparsity.
6376  return 0;
6377  }
6378 
6379  const GO gid = 0; // packValueCount wants this
6380  const LO numEntLO = static_cast<size_t> (numEnt);
6381 
6382  const size_t numEntBeg = offset;
6383  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
6384  const size_t gidsBeg = numEntBeg + numEntLen;
6385  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6386  const size_t valsBeg = gidsBeg + gidsLen;
6387  const size_t valsLen = numEnt * numBytesPerValue;
6388 
6389  char* const numEntOut = exports + numEntBeg;
6390  char* const gidsOut = exports + gidsBeg;
6391  char* const valsOut = exports + valsBeg;
6392 
6393  size_t numBytesOut = 0;
6394  int errorCode = 0;
6395  numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
6396 
6397  {
6398  Kokkos::pair<int, size_t> p;
6399  p = PackTraits<GO>::packArray (gidsOut, gidsIn, numEnt);
6400  errorCode += p.first;
6401  numBytesOut += p.second;
6402 
6403  p = PackTraits<ST>::packArray (valsOut, valsIn, numEnt);
6404  errorCode += p.first;
6405  numBytesOut += p.second;
6406  }
6407 
6408  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6409  TEUCHOS_TEST_FOR_EXCEPTION
6410  (numBytesOut != expectedNumBytes, std::logic_error, "packRow: "
6411  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
6412  << expectedNumBytes << ".");
6413  TEUCHOS_TEST_FOR_EXCEPTION
6414  (errorCode != 0, std::runtime_error, "packRow: "
6415  "PackTraits::packArray returned a nonzero error code");
6416 
6417  return numBytesOut;
6418  }
6419 
6420  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6421  size_t
6422  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6423  unpackRow (GlobalOrdinal gidsOut[],
6424  impl_scalar_type valsOut[],
6425  const char imports[],
6426  const size_t offset,
6427  const size_t numBytes,
6428  const size_t numEnt,
6429  const size_t numBytesPerValue)
6430  {
6431  using Kokkos::View;
6432  using Kokkos::subview;
6434  typedef LocalOrdinal LO;
6435  typedef GlobalOrdinal GO;
6436  typedef impl_scalar_type ST;
6437 
6438  Details::ProfilingRegion region_upack_row(
6439  "Tpetra::CrsMatrix::unpackRow",
6440  "Import/Export"
6441  );
6442 
6443  if (numBytes == 0) {
6444  // Rows with zero bytes should always have zero entries.
6445  if (numEnt != 0) {
6446  const int myRank = this->getMap ()->getComm ()->getRank ();
6447  TEUCHOS_TEST_FOR_EXCEPTION
6448  (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6449  "unpackRow: The number of bytes to unpack numBytes=0, but the "
6450  "number of entries to unpack (as reported by numPacketsPerLID) "
6451  "for this row numEnt=" << numEnt << " != 0.");
6452  }
6453  return 0;
6454  }
6455 
6456  if (numEnt == 0 && numBytes != 0) {
6457  const int myRank = this->getMap ()->getComm ()->getRank ();
6458  TEUCHOS_TEST_FOR_EXCEPTION
6459  (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6460  "unpackRow: The number of entries to unpack (as reported by "
6461  "numPacketsPerLID) numEnt=0, but the number of bytes to unpack "
6462  "numBytes=" << numBytes << " != 0.");
6463  }
6464 
6465  const GO gid = 0; // packValueCount wants this
6466  const LO lid = 0; // packValueCount wants this
6467 
6468  const size_t numEntBeg = offset;
6469  const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
6470  const size_t gidsBeg = numEntBeg + numEntLen;
6471  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6472  const size_t valsBeg = gidsBeg + gidsLen;
6473  const size_t valsLen = numEnt * numBytesPerValue;
6474 
6475  const char* const numEntIn = imports + numEntBeg;
6476  const char* const gidsIn = imports + gidsBeg;
6477  const char* const valsIn = imports + valsBeg;
6478 
6479  size_t numBytesOut = 0;
6480  int errorCode = 0;
6481  LO numEntOut;
6482  numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
6483  if (static_cast<size_t> (numEntOut) != numEnt ||
6484  numEntOut == static_cast<LO> (0)) {
6485  const int myRank = this->getMap ()->getComm ()->getRank ();
6486  std::ostringstream os;
6487  os << "(Proc " << myRank << ") CrsMatrix::unpackRow: ";
6488  bool firstErrorCondition = false;
6489  if (static_cast<size_t> (numEntOut) != numEnt) {
6490  os << "Number of entries from numPacketsPerLID numEnt=" << numEnt
6491  << " does not equal number of entries unpacked from imports "
6492  "buffer numEntOut=" << numEntOut << ".";
6493  firstErrorCondition = true;
6494  }
6495  if (numEntOut == static_cast<LO> (0)) {
6496  if (firstErrorCondition) {
6497  os << " Also, ";
6498  }
6499  os << "Number of entries unpacked from imports buffer numEntOut=0, "
6500  "but number of bytes to unpack for this row numBytes=" << numBytes
6501  << " != 0. This should never happen, since packRow should only "
6502  "ever pack rows with a nonzero number of entries. In this case, "
6503  "the number of entries from numPacketsPerLID is numEnt=" << numEnt
6504  << ".";
6505  }
6506  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, os.str ());
6507  }
6508 
6509  {
6510  Kokkos::pair<int, size_t> p;
6511  p = PackTraits<GO>::unpackArray (gidsOut, gidsIn, numEnt);
6512  errorCode += p.first;
6513  numBytesOut += p.second;
6514 
6515  p = PackTraits<ST>::unpackArray (valsOut, valsIn, numEnt);
6516  errorCode += p.first;
6517  numBytesOut += p.second;
6518  }
6519 
6520  TEUCHOS_TEST_FOR_EXCEPTION
6521  (numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
6522  << numBytesOut << " != numBytes = " << numBytes << ".");
6523 
6524  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6525  TEUCHOS_TEST_FOR_EXCEPTION
6526  (numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
6527  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
6528  << expectedNumBytes << ".");
6529 
6530  TEUCHOS_TEST_FOR_EXCEPTION
6531  (errorCode != 0, std::runtime_error, "unpackRow: "
6532  "PackTraits::unpackArray returned a nonzero error code");
6533 
6534  return numBytesOut;
6535  }
6536 
6537  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6538  void
6539  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6540  allocatePackSpaceNew (Kokkos::DualView<char*, buffer_device_type>& exports,
6541  size_t& totalNumEntries,
6542  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs) const
6543  {
6544  using Details::Behavior;
6546  using std::endl;
6547  typedef impl_scalar_type IST;
6548  typedef LocalOrdinal LO;
6549  typedef GlobalOrdinal GO;
6550  //const char tfecfFuncName[] = "allocatePackSpaceNew: ";
6551 
6552  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
6553  // output to std::cerr on every MPI process. This is unwise for
6554  // runs with large numbers of MPI processes.
6555  const bool verbose = Behavior::verbose("CrsMatrix");
6556  std::unique_ptr<std::string> prefix;
6557  if (verbose) {
6558  prefix = this->createPrefix("CrsMatrix", "allocatePackSpaceNew");
6559  std::ostringstream os;
6560  os << *prefix << "Before:"
6561  << endl
6562  << *prefix << " "
6563  << dualViewStatusToString (exports, "exports")
6564  << endl
6565  << *prefix << " "
6566  << dualViewStatusToString (exportLIDs, "exportLIDs")
6567  << endl;
6568  std::cerr << os.str ();
6569  }
6570 
6571  // The number of export LIDs must fit in LocalOrdinal, assuming
6572  // that the LIDs are distinct and valid on the calling process.
6573  const LO numExportLIDs = static_cast<LO> (exportLIDs.extent (0));
6574 
6575  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6576  auto exportLIDs_h = exportLIDs.view_host ();
6577 
6578  // Count the total number of matrix entries to send.
6579  totalNumEntries = 0;
6580  for (LO i = 0; i < numExportLIDs; ++i) {
6581  const LO lclRow = exportLIDs_h[i];
6582  size_t curNumEntries = this->getNumEntriesInLocalRow (lclRow);
6583  // FIXME (mfh 25 Jan 2015) We should actually report invalid row
6584  // indices as an error. Just consider them nonowned for now.
6585  if (curNumEntries == Teuchos::OrdinalTraits<size_t>::invalid ()) {
6586  curNumEntries = 0;
6587  }
6588  totalNumEntries += curNumEntries;
6589  }
6590 
6591  // FIXME (mfh 24 Feb 2013, 24 Mar 2017) This code is only correct
6592  // if sizeof(IST) is a meaningful representation of the amount of
6593  // data in a Scalar instance. (LO and GO are always built-in
6594  // integer types.)
6595  //
6596  // Allocate the exports array. It does NOT need padding for
6597  // alignment, since we use memcpy to write to / read from send /
6598  // receive buffers.
6599  const size_t allocSize =
6600  static_cast<size_t> (numExportLIDs) * sizeof (LO) +
6601  totalNumEntries * (sizeof (IST) + sizeof (GO));
6602  if (static_cast<size_t> (exports.extent (0)) < allocSize) {
6603  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6604 
6605  const std::string oldLabel = exports.d_view.label ();
6606  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6607  exports = exports_type (newLabel, allocSize);
6608  }
6609 
6610  if (verbose) {
6611  std::ostringstream os;
6612  os << *prefix << "After:"
6613  << endl
6614  << *prefix << " "
6615  << dualViewStatusToString (exports, "exports")
6616  << endl
6617  << *prefix << " "
6618  << dualViewStatusToString (exportLIDs, "exportLIDs")
6619  << endl;
6620  std::cerr << os.str ();
6621  }
6622  }
6623 
6624  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6625  void
6627  packNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6628  Kokkos::DualView<char*, buffer_device_type>& exports,
6629  const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6630  size_t& constantNumPackets) const
6631  {
6632  // The call to packNew in packAndPrepare catches and handles any exceptions.
6633  Details::ProfilingRegion region_pack_new("Tpetra::CrsMatrix::packNew", "Import/Export");
6634  if (this->isStaticGraph ()) {
6636  packCrsMatrixNew (*this, exports, numPacketsPerLID, exportLIDs,
6637  constantNumPackets);
6638  }
6639  else {
6640  this->packNonStaticNew (exportLIDs, exports, numPacketsPerLID,
6641  constantNumPackets);
6642  }
6643  }
6644 
6645  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6646  void
6648  packNonStaticNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6649  Kokkos::DualView<char*, buffer_device_type>& exports,
6650  const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6651  size_t& constantNumPackets) const
6652  {
6653  using Details::Behavior;
6655  using Details::PackTraits;
6657  using Kokkos::View;
6658  using std::endl;
6659  using LO = LocalOrdinal;
6660  using GO = GlobalOrdinal;
6661  using ST = impl_scalar_type;
6662  const char tfecfFuncName[] = "packNonStaticNew: ";
6663 
6664  const bool verbose = Behavior::verbose("CrsMatrix");
6665  std::unique_ptr<std::string> prefix;
6666  if (verbose) {
6667  prefix = this->createPrefix("CrsMatrix", "packNonStaticNew");
6668  std::ostringstream os;
6669  os << *prefix << "Start" << endl;
6670  std::cerr << os.str ();
6671  }
6672 
6673  const size_t numExportLIDs = static_cast<size_t> (exportLIDs.extent (0));
6674  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6675  (numExportLIDs != static_cast<size_t> (numPacketsPerLID.extent (0)),
6676  std::invalid_argument, "exportLIDs.size() = " << numExportLIDs
6677  << " != numPacketsPerLID.size() = " << numPacketsPerLID.extent (0)
6678  << ".");
6679 
6680  // Setting this to zero tells the caller to expect a possibly
6681  // different ("nonconstant") number of packets per local index
6682  // (i.e., a possibly different number of entries per row).
6683  constantNumPackets = 0;
6684 
6685  // The pack buffer 'exports' enters this method possibly
6686  // unallocated. Do the first two parts of "Count, allocate, fill,
6687  // compute."
6688  size_t totalNumEntries = 0;
6689  this->allocatePackSpaceNew (exports, totalNumEntries, exportLIDs);
6690  const size_t bufSize = static_cast<size_t> (exports.extent (0));
6691 
6692  // Write-only host access
6693  exports.clear_sync_state();
6694  exports.modify_host();
6695  auto exports_h = exports.view_host ();
6696  if (verbose) {
6697  std::ostringstream os;
6698  os << *prefix << "After marking exports as modified on host, "
6699  << dualViewStatusToString (exports, "exports") << endl;
6700  std::cerr << os.str ();
6701  }
6702 
6703  // Read-only host access
6704  auto exportLIDs_h = exportLIDs.view_host ();
6705 
6706  // Write-only host access
6707  const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->clear_sync_state();
6708  const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->modify_host();
6709  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6710 
6711  // Compute the number of "packets" (in this case, bytes) per
6712  // export LID (in this case, local index of the row to send), and
6713  // actually pack the data.
6714  auto maxRowNumEnt = this->getLocalMaxNumRowEntries();
6715 
6716 
6717  // Temporary buffer for global column indices.
6718  typename global_inds_host_view_type::non_const_type gidsIn_k;
6719  if (this->isLocallyIndexed()) { // Need storage for Global IDs
6720  gidsIn_k =
6721  typename global_inds_host_view_type::non_const_type("packGids",
6722  maxRowNumEnt);
6723  }
6724 
6725  size_t offset = 0; // current index into 'exports' array.
6726  for (size_t i = 0; i < numExportLIDs; ++i) {
6727  const LO lclRow = exportLIDs_h[i];
6728 
6729  size_t numBytes = 0;
6730  size_t numEnt = this->getNumEntriesInLocalRow (lclRow);
6731 
6732  // Only pack this row's data if it has a nonzero number of
6733  // entries. We can do this because receiving processes get the
6734  // number of packets, and will know that zero packets means zero
6735  // entries.
6736  if (numEnt == 0) {
6737  numPacketsPerLID_h[i] = 0;
6738  continue;
6739  }
6740 
6741  if (this->isLocallyIndexed ()) {
6742  typename global_inds_host_view_type::non_const_type gidsIn;
6743  values_host_view_type valsIn;
6744  // If the matrix is locally indexed on the calling process, we
6745  // have to use its column Map (which it _must_ have in this
6746  // case) to convert to global indices.
6747  local_inds_host_view_type lidsIn;
6748  this->getLocalRowView (lclRow, lidsIn, valsIn);
6749  const map_type& colMap = * (this->getColMap ());
6750  for (size_t k = 0; k < numEnt; ++k) {
6751  gidsIn_k[k] = colMap.getGlobalElement (lidsIn[k]);
6752  }
6753  gidsIn = Kokkos::subview(gidsIn_k, Kokkos::make_pair(GO(0),GO(numEnt)));
6754 
6755  const size_t numBytesPerValue =
6756  PackTraits<ST>::packValueCount (valsIn[0]);
6757  numBytes = this->packRow (exports_h.data (), offset, numEnt,
6758  gidsIn.data (), valsIn.data (),
6759  numBytesPerValue);
6760  }
6761  else if (this->isGloballyIndexed ()) {
6762  global_inds_host_view_type gidsIn;
6763  values_host_view_type valsIn;
6764  // If the matrix is globally indexed on the calling process,
6765  // then we can use the column indices directly. However, we
6766  // have to get the global row index. The calling process must
6767  // have a row Map, since otherwise it shouldn't be participating
6768  // in packing operations.
6769  const map_type& rowMap = * (this->getRowMap ());
6770  const GO gblRow = rowMap.getGlobalElement (lclRow);
6771  this->getGlobalRowView (gblRow, gidsIn, valsIn);
6772 
6773  const size_t numBytesPerValue =
6774  PackTraits<ST>::packValueCount (valsIn[0]);
6775  numBytes = this->packRow (exports_h.data (), offset, numEnt,
6776  gidsIn.data (), valsIn.data (),
6777  numBytesPerValue);
6778  }
6779  // mfh 11 Sep 2017: Currently, if the matrix is neither globally
6780  // nor locally indexed, then it has no entries. Therefore,
6781  // there is nothing to pack. No worries!
6782 
6783  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6784  (offset > bufSize || offset + numBytes > bufSize, std::logic_error,
6785  "First invalid offset into 'exports' pack buffer at index i = " << i
6786  << ". exportLIDs_h[i]: " << exportLIDs_h[i] << ", bufSize: " <<
6787  bufSize << ", offset: " << offset << ", numBytes: " << numBytes <<
6788  ".");
6789  // numPacketsPerLID_h[i] is the number of "packets" in the
6790  // current local row i. Packet=char (really "byte") so use the
6791  // number of bytes of the packed data for that row.
6792  numPacketsPerLID_h[i] = numBytes;
6793  offset += numBytes;
6794  }
6795 
6796  if (verbose) {
6797  std::ostringstream os;
6798  os << *prefix << "Tpetra::CrsMatrix::packNonStaticNew: After:" << endl
6799  << *prefix << " "
6800  << dualViewStatusToString (exports, "exports")
6801  << endl
6802  << *prefix << " "
6803  << dualViewStatusToString (exportLIDs, "exportLIDs")
6804  << endl;
6805  std::cerr << os.str ();
6806  }
6807  }
6808 
6809  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6810  LocalOrdinal
6811  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6812  combineGlobalValuesRaw(const LocalOrdinal lclRow,
6813  const LocalOrdinal numEnt,
6814  const impl_scalar_type vals[],
6815  const GlobalOrdinal cols[],
6816  const Tpetra::CombineMode combMode,
6817  const char* const prefix,
6818  const bool debug,
6819  const bool verbose)
6820  {
6821  using GO = GlobalOrdinal;
6822 
6823  // mfh 23 Mar 2017: This branch is not thread safe in a debug
6824  // build, due to use of Teuchos::ArrayView; see #229.
6825  const GO gblRow = myGraph_->rowMap_->getGlobalElement(lclRow);
6826  Teuchos::ArrayView<const GO> cols_av
6827  (numEnt == 0 ? nullptr : cols, numEnt);
6828  Teuchos::ArrayView<const Scalar> vals_av
6829  (numEnt == 0 ? nullptr : reinterpret_cast<const Scalar*> (vals), numEnt);
6830 
6831  // FIXME (mfh 23 Mar 2017) This is a work-around for less common
6832  // combine modes. combineGlobalValues throws on error; it does
6833  // not return an error code. Thus, if it returns, it succeeded.
6834  combineGlobalValues(gblRow, cols_av, vals_av, combMode,
6835  prefix, debug, verbose);
6836  return numEnt;
6837  }
6838 
6839  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6840  void
6841  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6842  combineGlobalValues(
6843  const GlobalOrdinal globalRowIndex,
6844  const Teuchos::ArrayView<const GlobalOrdinal>& columnIndices,
6845  const Teuchos::ArrayView<const Scalar>& values,
6846  const Tpetra::CombineMode combineMode,
6847  const char* const prefix,
6848  const bool debug,
6849  const bool verbose)
6850  {
6851  const char tfecfFuncName[] = "combineGlobalValues: ";
6852 
6853  if (isStaticGraph ()) {
6854  // INSERT doesn't make sense for a static graph, since you
6855  // aren't allowed to change the structure of the graph.
6856  // However, all the other combine modes work.
6857  if (combineMode == ADD) {
6858  sumIntoGlobalValues (globalRowIndex, columnIndices, values);
6859  }
6860  else if (combineMode == REPLACE) {
6861  replaceGlobalValues (globalRowIndex, columnIndices, values);
6862  }
6863  else if (combineMode == ABSMAX) {
6864  using ::Tpetra::Details::AbsMax;
6865  AbsMax<Scalar> f;
6866  this->template transformGlobalValues<AbsMax<Scalar> > (globalRowIndex,
6867  columnIndices,
6868  values, f);
6869  }
6870  else if (combineMode == INSERT) {
6871  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6872  (isStaticGraph() && combineMode == INSERT,
6873  std::invalid_argument, "INSERT combine mode is forbidden "
6874  "if the matrix has a static (const) graph (i.e., was "
6875  "constructed with the CrsMatrix constructor that takes a "
6876  "const CrsGraph pointer).");
6877  }
6878  else {
6879  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6880  (true, std::logic_error, "Invalid combine mode; should "
6881  "never get here! "
6882  "Please report this bug to the Tpetra developers.");
6883  }
6884  }
6885  else { // The matrix has a dynamic graph.
6886  if (combineMode == ADD || combineMode == INSERT) {
6887  // For a dynamic graph, all incoming column indices are
6888  // inserted into the target graph. Duplicate indices will
6889  // have their values summed. In this context, ADD and INSERT
6890  // are equivalent. We need to call insertGlobalValues()
6891  // anyway if the column indices don't yet exist in this row,
6892  // so we just call insertGlobalValues() for both cases.
6893  insertGlobalValuesFilteredChecked(globalRowIndex,
6894  columnIndices, values, prefix, debug, verbose);
6895  }
6896  // FIXME (mfh 14 Mar 2012):
6897  //
6898  // Implementing ABSMAX or REPLACE for a dynamic graph would
6899  // require modifying assembly to attach a possibly different
6900  // combine mode to each inserted (i, j, A_ij) entry. For
6901  // example, consider two different Export operations to the same
6902  // target CrsMatrix, the first with ABSMAX combine mode and the
6903  // second with REPLACE. This isn't a common use case, so we
6904  // won't mess with it for now.
6905  else if (combineMode == ABSMAX) {
6906  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6907  ! isStaticGraph () && combineMode == ABSMAX, std::logic_error,
6908  "ABSMAX combine mode when the matrix has a dynamic graph is not yet "
6909  "implemented.");
6910  }
6911  else if (combineMode == REPLACE) {
6912  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6913  ! isStaticGraph () && combineMode == REPLACE, std::logic_error,
6914  "REPLACE combine mode when the matrix has a dynamic graph is not yet "
6915  "implemented.");
6916  }
6917  else {
6918  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6919  true, std::logic_error, "Should never get here! Please report this "
6920  "bug to the Tpetra developers.");
6921  }
6922  }
6923  }
6924 
6925  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6926  void
6929  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
6930  Kokkos::DualView<char*, buffer_device_type> imports,
6931  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6932  const size_t constantNumPackets,
6933  const CombineMode combineMode)
6934  {
6935  using Details::Behavior;
6938  using std::endl;
6939  const char tfecfFuncName[] = "unpackAndCombine: ";
6940  ProfilingRegion regionUAC ("Tpetra::CrsMatrix::unpackAndCombine");
6941 
6942  const bool debug = Behavior::debug("CrsMatrix");
6943  const bool verbose = Behavior::verbose("CrsMatrix");
6944  constexpr int numValidModes = 5;
6945  const CombineMode validModes[numValidModes] =
6946  {ADD, REPLACE, ABSMAX, INSERT, ZERO};
6947  const char* validModeNames[numValidModes] =
6948  {"ADD", "REPLACE", "ABSMAX", "INSERT", "ZERO"};
6949 
6950  std::unique_ptr<std::string> prefix;
6951  if (verbose) {
6952  prefix = this->createPrefix("CrsMatrix", "unpackAndCombine");
6953  std::ostringstream os;
6954  os << *prefix << "Start:" << endl
6955  << *prefix << " "
6956  << dualViewStatusToString (importLIDs, "importLIDs")
6957  << endl
6958  << *prefix << " "
6959  << dualViewStatusToString (imports, "imports")
6960  << endl
6961  << *prefix << " "
6962  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6963  << endl
6964  << *prefix << " constantNumPackets: " << constantNumPackets
6965  << endl
6966  << *prefix << " combineMode: " << combineModeToString (combineMode)
6967  << endl;
6968  std::cerr << os.str ();
6969  }
6970 
6971  if (debug) {
6972  if (std::find (validModes, validModes+numValidModes, combineMode) ==
6973  validModes+numValidModes) {
6974  std::ostringstream os;
6975  os << "Invalid combine mode. Valid modes are {";
6976  for (int k = 0; k < numValidModes; ++k) {
6977  os << validModeNames[k];
6978  if (k < numValidModes - 1) {
6979  os << ", ";
6980  }
6981  }
6982  os << "}.";
6983  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6984  (true, std::invalid_argument, os.str ());
6985  }
6986  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6987  (importLIDs.extent(0) != numPacketsPerLID.extent(0),
6988  std::invalid_argument, "importLIDs.extent(0)="
6989  << importLIDs.extent(0)
6990  << " != numPacketsPerLID.extent(0)="
6991  << numPacketsPerLID.extent(0) << ".");
6992  }
6993 
6994  if (combineMode == ZERO) {
6995  return; // nothing to do
6996  }
6997 
6998  if (debug) {
6999  using Teuchos::reduceAll;
7000  std::unique_ptr<std::ostringstream> msg (new std::ostringstream ());
7001  int lclBad = 0;
7002  try {
7003  unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
7004  constantNumPackets, combineMode,
7005  verbose);
7006  } catch (std::exception& e) {
7007  lclBad = 1;
7008  *msg << e.what ();
7009  }
7010  int gblBad = 0;
7011  const Teuchos::Comm<int>& comm = * (this->getComm ());
7012  reduceAll<int, int> (comm, Teuchos::REDUCE_MAX,
7013  lclBad, Teuchos::outArg (gblBad));
7014  if (gblBad != 0) {
7015  // mfh 22 Oct 2017: 'prefix' might be null, since it is only
7016  // initialized in a debug build. Thus, we get the process
7017  // rank again here. This is an error message, so the small
7018  // run-time cost doesn't matter. See #1887.
7019  std::ostringstream os;
7020  os << "Proc " << comm.getRank () << ": " << msg->str () << endl;
7021  msg = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
7022  ::Tpetra::Details::gathervPrint (*msg, os.str (), comm);
7023  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7024  (true, std::logic_error, std::endl << "unpackAndCombineImpl "
7025  "threw an exception on one or more participating processes: "
7026  << endl << msg->str ());
7027  }
7028  }
7029  else {
7030  unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
7031  constantNumPackets, combineMode,
7032  verbose);
7033  }
7034 
7035  if (verbose) {
7036  std::ostringstream os;
7037  os << *prefix << "Done!" << endl
7038  << *prefix << " "
7039  << dualViewStatusToString (importLIDs, "importLIDs")
7040  << endl
7041  << *prefix << " "
7042  << dualViewStatusToString (imports, "imports")
7043  << endl
7044  << *prefix << " "
7045  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
7046  << endl;
7047  std::cerr << os.str ();
7048  }
7049  }
7050 
7051  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7052  void
7055  const Kokkos::DualView<const local_ordinal_type*,
7056  buffer_device_type>& importLIDs,
7057  Kokkos::DualView<char*, buffer_device_type> imports,
7058  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7059  const size_t constantNumPackets,
7060  const CombineMode combineMode,
7061  const bool verbose)
7062  {
7063  Details::ProfilingRegion region_unpack_and_combine_impl(
7064  "Tpetra::CrsMatrix::unpackAndCombineImpl",
7065  "Import/Export"
7066  );
7067  using std::endl;
7068  const char tfecfFuncName[] = "unpackAndCombineImpl";
7069  std::unique_ptr<std::string> prefix;
7070  if (verbose) {
7071  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
7072  std::ostringstream os;
7073  os << *prefix << "isStaticGraph(): "
7074  << (isStaticGraph() ? "true" : "false")
7075  << ", importLIDs.extent(0): "
7076  << importLIDs.extent(0)
7077  << ", imports.extent(0): "
7078  << imports.extent(0)
7079  << ", numPacketsPerLID.extent(0): "
7080  << numPacketsPerLID.extent(0)
7081  << endl;
7082  std::cerr << os.str();
7083  }
7084 
7085  if (isStaticGraph ()) {
7086  using Details::unpackCrsMatrixAndCombineNew;
7087  unpackCrsMatrixAndCombineNew(*this, imports, numPacketsPerLID,
7088  importLIDs, constantNumPackets,
7089  combineMode);
7090  }
7091  else {
7092  {
7093  using padding_type = typename crs_graph_type::padding_type;
7094  std::unique_ptr<padding_type> padding;
7095  try {
7096  padding = myGraph_->computePaddingForCrsMatrixUnpack(
7097  importLIDs, imports, numPacketsPerLID, verbose);
7098  }
7099  catch (std::exception& e) {
7100  const auto rowMap = getRowMap();
7101  const auto comm = rowMap.is_null() ? Teuchos::null :
7102  rowMap->getComm();
7103  const int myRank = comm.is_null() ? -1 : comm->getRank();
7104  TEUCHOS_TEST_FOR_EXCEPTION
7105  (true, std::runtime_error, "Proc " << myRank << ": "
7106  "Tpetra::CrsGraph::computePaddingForCrsMatrixUnpack "
7107  "threw an exception: " << e.what());
7108  }
7109  if (verbose) {
7110  std::ostringstream os;
7111  os << *prefix << "Call applyCrsPadding" << endl;
7112  std::cerr << os.str();
7113  }
7114  applyCrsPadding(*padding, verbose);
7115  }
7116  if (verbose) {
7117  std::ostringstream os;
7118  os << *prefix << "Call unpackAndCombineImplNonStatic" << endl;
7119  std::cerr << os.str();
7120  }
7121  unpackAndCombineImplNonStatic(importLIDs, imports,
7122  numPacketsPerLID,
7123  constantNumPackets,
7124  combineMode);
7125  }
7126 
7127  if (verbose) {
7128  std::ostringstream os;
7129  os << *prefix << "Done" << endl;
7130  std::cerr << os.str();
7131  }
7132  }
7133 
7134  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7135  void
7136  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7137  unpackAndCombineImplNonStatic(
7138  const Kokkos::DualView<const local_ordinal_type*,
7139  buffer_device_type>& importLIDs,
7140  Kokkos::DualView<char*, buffer_device_type> imports,
7141  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7142  const size_t constantNumPackets,
7143  const CombineMode combineMode)
7144  {
7145  using Kokkos::View;
7146  using Kokkos::subview;
7147  using Kokkos::MemoryUnmanaged;
7148  using Details::Behavior;
7151  using Details::PackTraits;
7152  using Details::ScalarViewTraits;
7153  using std::endl;
7154  using LO = LocalOrdinal;
7155  using GO = GlobalOrdinal;
7156  using ST = impl_scalar_type;
7157  using size_type = typename Teuchos::ArrayView<LO>::size_type;
7158  using HES =
7159  typename View<int*, device_type>::HostMirror::execution_space;
7160  using pair_type = std::pair<typename View<int*, HES>::size_type,
7161  typename View<int*, HES>::size_type>;
7162  using gids_out_type = View<GO*, HES, MemoryUnmanaged>;
7163  using vals_out_type = View<ST*, HES, MemoryUnmanaged>;
7164  const char tfecfFuncName[] = "unpackAndCombineImplNonStatic";
7165 
7166  const bool debug = Behavior::debug("CrsMatrix");
7167  const bool verbose = Behavior::verbose("CrsMatrix");
7168  std::unique_ptr<std::string> prefix;
7169  if (verbose) {
7170  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
7171  std::ostringstream os;
7172  os << *prefix << endl; // we've already printed DualViews' statuses
7173  std::cerr << os.str ();
7174  }
7175  const char* const prefix_raw =
7176  verbose ? prefix.get()->c_str() : nullptr;
7177 
7178  const size_type numImportLIDs = importLIDs.extent (0);
7179  if (combineMode == ZERO || numImportLIDs == 0) {
7180  return; // nothing to do; no need to combine entries
7181  }
7182 
7183  Details::ProfilingRegion region_unpack_and_combine_impl_non_static(
7184  "Tpetra::CrsMatrix::unpackAndCombineImplNonStatic",
7185  "Import/Export"
7186  );
7187 
7188  // We're unpacking on host. This is read-only host access.
7189  if (imports.need_sync_host()) {
7190  imports.sync_host ();
7191  }
7192  auto imports_h = imports.view_host();
7193 
7194  // Read-only host access.
7195  if (numPacketsPerLID.need_sync_host()) {
7196  numPacketsPerLID.sync_host ();
7197  }
7198  auto numPacketsPerLID_h = numPacketsPerLID.view_host();
7199 
7200  TEUCHOS_ASSERT( ! importLIDs.need_sync_host() );
7201  auto importLIDs_h = importLIDs.view_host();
7202 
7203  size_t numBytesPerValue;
7204  {
7205  // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
7206  // with run-time size? We already assume that all entries in both the
7207  // source and target matrices have the same size. If the calling process
7208  // owns at least one entry in either matrix, we can use that entry to set
7209  // the size. However, it is possible that the calling process owns no
7210  // entries. In that case, we're in trouble. One way to fix this would be
7211  // for each row's data to contain the run-time size. This is only
7212  // necessary if the size is not a compile-time constant.
7213  Scalar val;
7214  numBytesPerValue = PackTraits<ST>::packValueCount (val);
7215  }
7216 
7217  // Determine the maximum number of entries in any one row
7218  size_t offset = 0;
7219  size_t maxRowNumEnt = 0;
7220  for (size_type i = 0; i < numImportLIDs; ++i) {
7221  const size_t numBytes = numPacketsPerLID_h[i];
7222  if (numBytes == 0) {
7223  continue; // empty buffer for that row means that the row is empty
7224  }
7225  // We need to unpack a nonzero number of entries for this row.
7226  if (debug) {
7227  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7228  (offset + numBytes > size_t(imports_h.extent (0)),
7229  std::logic_error, ": At local row index importLIDs_h[i="
7230  << i << "]=" << importLIDs_h[i] << ", offset (=" << offset
7231  << ") + numBytes (=" << numBytes << ") > "
7232  "imports_h.extent(0)=" << imports_h.extent (0) << ".");
7233  }
7234  LO numEntLO = 0;
7235 
7236  if (debug) {
7237  const size_t theNumBytes =
7238  PackTraits<LO>::packValueCount (numEntLO);
7239  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7240  (theNumBytes > numBytes, std::logic_error, ": theNumBytes="
7241  << theNumBytes << " > numBytes = " << numBytes << ".");
7242  }
7243  const char* const inBuf = imports_h.data () + offset;
7244  const size_t actualNumBytes =
7245  PackTraits<LO>::unpackValue (numEntLO, inBuf);
7246 
7247  if (debug) {
7248  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7249  (actualNumBytes > numBytes, std::logic_error, ": At i=" << i
7250  << ", actualNumBytes=" << actualNumBytes
7251  << " > numBytes=" << numBytes << ".");
7252  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7253  (numEntLO == 0, std::logic_error, ": At local row index "
7254  "importLIDs_h[i=" << i << "]=" << importLIDs_h[i] << ", "
7255  "the number of entries read from the packed data is "
7256  "numEntLO=" << numEntLO << ", but numBytes=" << numBytes
7257  << " != 0.");
7258  }
7259 
7260  maxRowNumEnt = std::max(size_t(numEntLO), maxRowNumEnt);
7261  offset += numBytes;
7262  }
7263 
7264  // Temporary space to cache incoming global column indices and
7265  // values. Column indices come in as global indices, in case the
7266  // source object's column Map differs from the target object's
7267  // (this's) column Map.
7268  View<GO*, HES> gblColInds;
7269  View<LO*, HES> lclColInds;
7270  View<ST*, HES> vals;
7271  {
7272  GO gid = 0;
7273  LO lid = 0;
7274  // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
7275  // with run-time size? We already assume that all entries in both the
7276  // source and target matrices have the same size. If the calling process
7277  // owns at least one entry in either matrix, we can use that entry to set
7278  // the size. However, it is possible that the calling process owns no
7279  // entries. In that case, we're in trouble. One way to fix this would be
7280  // for each row's data to contain the run-time size. This is only
7281  // necessary if the size is not a compile-time constant.
7282  Scalar val;
7283  gblColInds = ScalarViewTraits<GO, HES>::allocateArray(
7284  gid, maxRowNumEnt, "gids");
7285  lclColInds = ScalarViewTraits<LO, HES>::allocateArray(
7286  lid, maxRowNumEnt, "lids");
7287  vals = ScalarViewTraits<ST, HES>::allocateArray(
7288  val, maxRowNumEnt, "vals");
7289  }
7290 
7291  offset = 0;
7292  for (size_type i = 0; i < numImportLIDs; ++i) {
7293  const size_t numBytes = numPacketsPerLID_h[i];
7294  if (numBytes == 0) {
7295  continue; // empty buffer for that row means that the row is empty
7296  }
7297  LO numEntLO = 0;
7298  const char* const inBuf = imports_h.data () + offset;
7299  (void) PackTraits<LO>::unpackValue (numEntLO, inBuf);
7300 
7301  const size_t numEnt = static_cast<size_t>(numEntLO);;
7302  const LO lclRow = importLIDs_h[i];
7303 
7304  gids_out_type gidsOut = subview (gblColInds, pair_type (0, numEnt));
7305  vals_out_type valsOut = subview (vals, pair_type (0, numEnt));
7306 
7307  const size_t numBytesOut =
7308  unpackRow (gidsOut.data (), valsOut.data (), imports_h.data (),
7309  offset, numBytes, numEnt, numBytesPerValue);
7310  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7311  (numBytes != numBytesOut, std::logic_error, ": At i=" << i
7312  << ", numBytes=" << numBytes << " != numBytesOut="
7313  << numBytesOut << ".");
7314 
7315  const ST* const valsRaw = const_cast<const ST*> (valsOut.data ());
7316  const GO* const gidsRaw = const_cast<const GO*> (gidsOut.data ());
7317  combineGlobalValuesRaw(lclRow, numEnt, valsRaw, gidsRaw,
7318  combineMode, prefix_raw, debug, verbose);
7319  // Don't update offset until current LID has succeeded.
7320  offset += numBytes;
7321  } // for each import LID i
7322 
7323  if (verbose) {
7324  std::ostringstream os;
7325  os << *prefix << "Done" << endl;
7326  std::cerr << os.str();
7327  }
7328  }
7329 
7330  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7331  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7333  getColumnMapMultiVector (const MV& X_domainMap,
7334  const bool force) const
7335  {
7336  using Teuchos::null;
7337  using Teuchos::RCP;
7338  using Teuchos::rcp;
7339 
7340  TEUCHOS_TEST_FOR_EXCEPTION(
7341  ! this->hasColMap (), std::runtime_error, "Tpetra::CrsMatrix::getColumn"
7342  "MapMultiVector: You may only call this method if the matrix has a "
7343  "column Map. If the matrix does not yet have a column Map, you should "
7344  "first call fillComplete (with domain and range Map if necessary).");
7345 
7346  // If the graph is not fill complete, then the Import object (if
7347  // one should exist) hasn't been constructed yet.
7348  TEUCHOS_TEST_FOR_EXCEPTION(
7349  ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
7350  "CrsMatrix::getColumnMapMultiVector: You may only call this method if "
7351  "this matrix's graph is fill complete.");
7352 
7353  const size_t numVecs = X_domainMap.getNumVectors ();
7354  RCP<const import_type> importer = this->getGraph ()->getImporter ();
7355  RCP<const map_type> colMap = this->getColMap ();
7356 
7357  RCP<MV> X_colMap; // null by default
7358 
7359  // If the Import object is trivial (null), then we don't need a
7360  // separate column Map multivector. Just return null in that
7361  // case. The caller is responsible for knowing not to use the
7362  // returned null pointer.
7363  //
7364  // If the Import is nontrivial, then we do need a separate
7365  // column Map multivector for the Import operation. Check in
7366  // that case if we have to (re)create the column Map
7367  // multivector.
7368  if (! importer.is_null () || force) {
7369  if (importMV_.is_null () || importMV_->getNumVectors () != numVecs) {
7370  X_colMap = rcp (new MV (colMap, numVecs));
7371 
7372  // Cache the newly created multivector for later reuse.
7373  importMV_ = X_colMap;
7374  }
7375  else { // Yay, we can reuse the cached multivector!
7376  X_colMap = importMV_;
7377  // mfh 09 Jan 2013: We don't have to fill with zeros first,
7378  // because the Import uses INSERT combine mode, which overwrites
7379  // existing entries.
7380  //
7381  //X_colMap->putScalar (ZERO);
7382  }
7383  }
7384  return X_colMap;
7385  }
7386 
7387  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7388  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7391  const bool force) const
7392  {
7393  using Teuchos::null;
7394  using Teuchos::RCP;
7395  using Teuchos::rcp;
7396 
7397  // If the graph is not fill complete, then the Export object (if
7398  // one should exist) hasn't been constructed yet.
7399  TEUCHOS_TEST_FOR_EXCEPTION(
7400  ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
7401  "CrsMatrix::getRowMapMultiVector: You may only call this method if this "
7402  "matrix's graph is fill complete.");
7403 
7404  const size_t numVecs = Y_rangeMap.getNumVectors ();
7405  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
7406  // Every version of the constructor takes either a row Map, or a
7407  // graph (all of whose constructors take a row Map). Thus, the
7408  // matrix always has a row Map.
7409  RCP<const map_type> rowMap = this->getRowMap ();
7410 
7411  RCP<MV> Y_rowMap; // null by default
7412 
7413  // If the Export object is trivial (null), then we don't need a
7414  // separate row Map multivector. Just return null in that case.
7415  // The caller is responsible for knowing not to use the returned
7416  // null pointer.
7417  //
7418  // If the Export is nontrivial, then we do need a separate row
7419  // Map multivector for the Export operation. Check in that case
7420  // if we have to (re)create the row Map multivector.
7421  if (! exporter.is_null () || force) {
7422  if (exportMV_.is_null () || exportMV_->getNumVectors () != numVecs) {
7423  Y_rowMap = rcp (new MV (rowMap, numVecs));
7424  exportMV_ = Y_rowMap; // Cache the newly created MV for later reuse.
7425  }
7426  else { // Yay, we can reuse the cached multivector!
7427  Y_rowMap = exportMV_;
7428  }
7429  }
7430  return Y_rowMap;
7431  }
7432 
7433  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7434  void
7436  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
7437  {
7438  TEUCHOS_TEST_FOR_EXCEPTION(
7439  myGraph_.is_null (), std::logic_error, "Tpetra::CrsMatrix::"
7440  "removeEmptyProcessesInPlace: This method does not work when the matrix "
7441  "was created with a constant graph (that is, when it was created using "
7442  "the version of its constructor that takes an RCP<const CrsGraph>). "
7443  "This is because the matrix is not allowed to modify the graph in that "
7444  "case, but removing empty processes requires modifying the graph.");
7445  myGraph_->removeEmptyProcessesInPlace (newMap);
7446  // Even though CrsMatrix's row Map (as returned by getRowMap())
7447  // comes from its CrsGraph, CrsMatrix still implements DistObject,
7448  // so we also have to change the DistObject's Map.
7449  this->map_ = this->getRowMap ();
7450  // In the nonconst graph case, staticGraph_ is just a const
7451  // pointer to myGraph_. This assignment is probably redundant,
7452  // but it doesn't hurt.
7453  staticGraph_ = Teuchos::rcp_const_cast<const Graph> (myGraph_);
7454  }
7455 
7456  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7457  Teuchos::RCP<RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7459  add (const Scalar& alpha,
7461  const Scalar& beta,
7462  const Teuchos::RCP<const map_type>& domainMap,
7463  const Teuchos::RCP<const map_type>& rangeMap,
7464  const Teuchos::RCP<Teuchos::ParameterList>& params) const
7465  {
7466  using Teuchos::Array;
7467  using Teuchos::ArrayView;
7468  using Teuchos::ParameterList;
7469  using Teuchos::RCP;
7470  using Teuchos::rcp;
7471  using Teuchos::rcp_implicit_cast;
7472  using Teuchos::sublist;
7473  using std::endl;
7474  using LO = local_ordinal_type;
7475  using GO = global_ordinal_type;
7476  using crs_matrix_type =
7478  const char errPfx[] = "Tpetra::CrsMatrix::add: ";
7479 
7480  const bool debug = Details::Behavior::debug("CrsMatrix");
7481  const bool verbose = Details::Behavior::verbose("CrsMatrix");
7482  std::unique_ptr<std::string> prefix;
7483  if (verbose) {
7484  prefix = this->createPrefix("CrsMatrix", "add");
7485  std::ostringstream os;
7486  os << *prefix << "Start" << endl;
7487  std::cerr << os.str ();
7488  }
7489 
7490  const crs_matrix_type& B = *this; // a convenient abbreviation
7491  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero();
7492  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one();
7493 
7494  // If the user didn't supply a domain or range Map, then try to
7495  // get one from B first (if it has them), then from A (if it has
7496  // them). If we don't have any domain or range Maps, scold the
7497  // user.
7498  RCP<const map_type> A_domainMap = A.getDomainMap ();
7499  RCP<const map_type> A_rangeMap = A.getRangeMap ();
7500  RCP<const map_type> B_domainMap = B.getDomainMap ();
7501  RCP<const map_type> B_rangeMap = B.getRangeMap ();
7502 
7503  RCP<const map_type> theDomainMap = domainMap;
7504  RCP<const map_type> theRangeMap = rangeMap;
7505 
7506  if (domainMap.is_null ()) {
7507  if (B_domainMap.is_null ()) {
7508  TEUCHOS_TEST_FOR_EXCEPTION(
7509  A_domainMap.is_null (), std::invalid_argument,
7510  "Tpetra::CrsMatrix::add: If neither A nor B have a domain Map, "
7511  "then you must supply a nonnull domain Map to this method.");
7512  theDomainMap = A_domainMap;
7513  } else {
7514  theDomainMap = B_domainMap;
7515  }
7516  }
7517  if (rangeMap.is_null ()) {
7518  if (B_rangeMap.is_null ()) {
7519  TEUCHOS_TEST_FOR_EXCEPTION(
7520  A_rangeMap.is_null (), std::invalid_argument,
7521  "Tpetra::CrsMatrix::add: If neither A nor B have a range Map, "
7522  "then you must supply a nonnull range Map to this method.");
7523  theRangeMap = A_rangeMap;
7524  } else {
7525  theRangeMap = B_rangeMap;
7526  }
7527  }
7528 
7529  if (debug) {
7530  // In debug mode, check that A and B have matching domain and
7531  // range Maps, if they have domain and range Maps at all. (If
7532  // they aren't fill complete, then they may not yet have them.)
7533  if (! A_domainMap.is_null() && ! A_rangeMap.is_null()) {
7534  if (! B_domainMap.is_null() && ! B_rangeMap.is_null()) {
7535  TEUCHOS_TEST_FOR_EXCEPTION
7536  (! B_domainMap->isSameAs(*A_domainMap),
7537  std::invalid_argument,
7538  errPfx << "The input RowMatrix A must have a domain Map "
7539  "which is the same as (isSameAs) this RowMatrix's "
7540  "domain Map.");
7541  TEUCHOS_TEST_FOR_EXCEPTION
7542  (! B_rangeMap->isSameAs(*A_rangeMap), std::invalid_argument,
7543  errPfx << "The input RowMatrix A must have a range Map "
7544  "which is the same as (isSameAs) this RowMatrix's range "
7545  "Map.");
7546  TEUCHOS_TEST_FOR_EXCEPTION
7547  (! domainMap.is_null() &&
7548  ! domainMap->isSameAs(*B_domainMap),
7549  std::invalid_argument,
7550  errPfx << "The input domain Map must be the same as "
7551  "(isSameAs) this RowMatrix's domain Map.");
7552  TEUCHOS_TEST_FOR_EXCEPTION
7553  (! rangeMap.is_null() &&
7554  ! rangeMap->isSameAs(*B_rangeMap),
7555  std::invalid_argument,
7556  errPfx << "The input range Map must be the same as "
7557  "(isSameAs) this RowMatrix's range Map.");
7558  }
7559  }
7560  else if (! B_domainMap.is_null() && ! B_rangeMap.is_null()) {
7561  TEUCHOS_TEST_FOR_EXCEPTION
7562  (! domainMap.is_null() &&
7563  ! domainMap->isSameAs(*B_domainMap),
7564  std::invalid_argument,
7565  errPfx << "The input domain Map must be the same as "
7566  "(isSameAs) this RowMatrix's domain Map.");
7567  TEUCHOS_TEST_FOR_EXCEPTION
7568  (! rangeMap.is_null() && ! rangeMap->isSameAs(*B_rangeMap),
7569  std::invalid_argument,
7570  errPfx << "The input range Map must be the same as "
7571  "(isSameAs) this RowMatrix's range Map.");
7572  }
7573  else {
7574  TEUCHOS_TEST_FOR_EXCEPTION
7575  (domainMap.is_null() || rangeMap.is_null(),
7576  std::invalid_argument, errPfx << "If neither A nor B "
7577  "have a domain and range Map, then you must supply a "
7578  "nonnull domain and range Map to this method.");
7579  }
7580  }
7581 
7582  // What parameters do we pass to C's constructor? Do we call
7583  // fillComplete on C after filling it? And if so, what parameters
7584  // do we pass to C's fillComplete call?
7585  bool callFillComplete = true;
7586  RCP<ParameterList> constructorSublist;
7587  RCP<ParameterList> fillCompleteSublist;
7588  if (! params.is_null()) {
7589  callFillComplete =
7590  params->get("Call fillComplete", callFillComplete);
7591  constructorSublist = sublist(params, "Constructor parameters");
7592  fillCompleteSublist = sublist(params, "fillComplete parameters");
7593  }
7594 
7595  RCP<const map_type> A_rowMap = A.getRowMap ();
7596  RCP<const map_type> B_rowMap = B.getRowMap ();
7597  RCP<const map_type> C_rowMap = B_rowMap; // see discussion in documentation
7598  RCP<crs_matrix_type> C; // The result matrix.
7599 
7600  // If A and B's row Maps are the same, we can compute an upper
7601  // bound on the number of entries in each row of C, before
7602  // actually computing the sum. A reasonable upper bound is the
7603  // sum of the two entry counts in each row.
7604  if (A_rowMap->isSameAs (*B_rowMap)) {
7605  const LO localNumRows = static_cast<LO> (A_rowMap->getLocalNumElements ());
7606  Array<size_t> C_maxNumEntriesPerRow (localNumRows, 0);
7607 
7608  // Get the number of entries in each row of A.
7609  if (alpha != ZERO) {
7610  for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7611  const size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
7612  C_maxNumEntriesPerRow[localRow] += A_numEntries;
7613  }
7614  }
7615  // Get the number of entries in each row of B.
7616  if (beta != ZERO) {
7617  for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7618  const size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
7619  C_maxNumEntriesPerRow[localRow] += B_numEntries;
7620  }
7621  }
7622  // Construct the result matrix C.
7623  if (constructorSublist.is_null ()) {
7624  C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow ()));
7625  } else {
7626  C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow (),
7627  constructorSublist));
7628  }
7629  // Since A and B have the same row Maps, we could add them
7630  // together all at once and merge values before we call
7631  // insertGlobalValues. However, we don't really need to, since
7632  // we've already allocated enough space in each row of C for C
7633  // to do the merge itself.
7634  }
7635  else { // the row Maps of A and B are not the same
7636  // Construct the result matrix C.
7637  // true: !A_rowMap->isSameAs (*B_rowMap)
7638  TEUCHOS_TEST_FOR_EXCEPTION
7639  (true, std::invalid_argument, errPfx << "The row maps must "
7640  "be the same for statically allocated matrices, to ensure "
7641  "that there is sufficient space to do the addition.");
7642  }
7643 
7644  TEUCHOS_TEST_FOR_EXCEPTION
7645  (C.is_null (), std::logic_error,
7646  errPfx << "C should not be null at this point. "
7647  "Please report this bug to the Tpetra developers.");
7648 
7649  if (verbose) {
7650  std::ostringstream os;
7651  os << *prefix << "Compute C = alpha*A + beta*B" << endl;
7652  std::cerr << os.str ();
7653  }
7654  using gids_type = nonconst_global_inds_host_view_type;
7655  using vals_type = nonconst_values_host_view_type;
7656  gids_type ind;
7657  vals_type val;
7658 
7659  if (alpha != ZERO) {
7660  const LO A_localNumRows = static_cast<LO> (A_rowMap->getLocalNumElements ());
7661  for (LO localRow = 0; localRow < A_localNumRows; ++localRow) {
7662  size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
7663  const GO globalRow = A_rowMap->getGlobalElement (localRow);
7664  if (A_numEntries > static_cast<size_t> (ind.size ())) {
7665  Kokkos::resize(ind,A_numEntries);
7666  Kokkos::resize(val,A_numEntries);
7667  }
7668  gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, A_numEntries));
7669  vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, A_numEntries));
7670  A.getGlobalRowCopy (globalRow, indView, valView, A_numEntries);
7671 
7672  if (alpha != ONE) {
7673  for (size_t k = 0; k < A_numEntries; ++k) {
7674  valView[k] *= alpha;
7675  }
7676  }
7677  C->insertGlobalValues (globalRow, A_numEntries,
7678  reinterpret_cast<Scalar *>(valView.data()),
7679  indView.data());
7680  }
7681  }
7682 
7683  if (beta != ZERO) {
7684  const LO B_localNumRows = static_cast<LO> (B_rowMap->getLocalNumElements ());
7685  for (LO localRow = 0; localRow < B_localNumRows; ++localRow) {
7686  size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
7687  const GO globalRow = B_rowMap->getGlobalElement (localRow);
7688  if (B_numEntries > static_cast<size_t> (ind.size ())) {
7689  Kokkos::resize(ind,B_numEntries);
7690  Kokkos::resize(val,B_numEntries);
7691  }
7692  gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, B_numEntries));
7693  vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, B_numEntries));
7694  B.getGlobalRowCopy (globalRow, indView, valView, B_numEntries);
7695 
7696  if (beta != ONE) {
7697  for (size_t k = 0; k < B_numEntries; ++k) {
7698  valView[k] *= beta;
7699  }
7700  }
7701  C->insertGlobalValues (globalRow, B_numEntries,
7702  reinterpret_cast<Scalar *>(valView.data()),
7703  indView.data());
7704  }
7705  }
7706 
7707  if (callFillComplete) {
7708  if (verbose) {
7709  std::ostringstream os;
7710  os << *prefix << "Call fillComplete on C" << endl;
7711  std::cerr << os.str ();
7712  }
7713  if (fillCompleteSublist.is_null ()) {
7714  C->fillComplete (theDomainMap, theRangeMap);
7715  } else {
7716  C->fillComplete (theDomainMap, theRangeMap, fillCompleteSublist);
7717  }
7718  }
7719  else if (verbose) {
7720  std::ostringstream os;
7721  os << *prefix << "Do NOT call fillComplete on C" << endl;
7722  std::cerr << os.str ();
7723  }
7724 
7725  if (verbose) {
7726  std::ostringstream os;
7727  os << *prefix << "Done" << endl;
7728  std::cerr << os.str ();
7729  }
7730  return rcp_implicit_cast<row_matrix_type> (C);
7731  }
7732 
7733 
7734 
7735  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7736  void
7739  const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& rowTransfer,
7740  const Teuchos::RCP<const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node> > & domainTransfer,
7741  const Teuchos::RCP<const map_type>& domainMap,
7742  const Teuchos::RCP<const map_type>& rangeMap,
7743  const Teuchos::RCP<Teuchos::ParameterList>& params) const
7744  {
7745  using Details::Behavior;
7750  using Teuchos::ArrayRCP;
7751  using Teuchos::ArrayView;
7752  using Teuchos::Comm;
7753  using Teuchos::ParameterList;
7754  using Teuchos::RCP;
7755  using std::endl;
7756  typedef LocalOrdinal LO;
7757  typedef GlobalOrdinal GO;
7758  typedef node_type NT;
7759  typedef CrsMatrix<Scalar, LO, GO, NT> this_CRS_type;
7760  typedef Vector<int, LO, GO, NT> IntVectorType;
7761  using Teuchos::as;
7762 
7763  const bool debug = Behavior::debug("CrsMatrix");
7764  const bool verbose = Behavior::verbose("CrsMatrix");
7765  int MyPID = getComm ()->getRank ();
7766 
7767  std::unique_ptr<std::string> verbosePrefix;
7768  if (verbose) {
7769  verbosePrefix =
7770  this->createPrefix("CrsMatrix", "transferAndFillComplete");
7771  std::ostringstream os;
7772  os << "Start" << endl;
7773  std::cerr << os.str();
7774  }
7775 
7776  //
7777  // Get the caller's parameters
7778  //
7779  bool isMM = false; // optimize for matrix-matrix ops.
7780  bool reverseMode = false; // Are we in reverse mode?
7781  bool restrictComm = false; // Do we need to restrict the communicator?
7782 
7783  int mm_optimization_core_count =
7784  Behavior::TAFC_OptimizationCoreCount();
7785  RCP<ParameterList> matrixparams; // parameters for the destination matrix
7786  bool overrideAllreduce = false;
7787  bool useKokkosPath = false;
7788  if (! params.is_null ()) {
7789  matrixparams = sublist (params, "CrsMatrix");
7790  reverseMode = params->get ("Reverse Mode", reverseMode);
7791  useKokkosPath = params->get ("TAFC: use kokkos path", useKokkosPath);
7792  restrictComm = params->get ("Restrict Communicator", restrictComm);
7793  auto & slist = params->sublist("matrixmatrix: kernel params",false);
7794  isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
7795  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
7796 
7797  overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
7798  if(getComm()->getSize() < mm_optimization_core_count && isMM) isMM = false;
7799  if(reverseMode) isMM = false;
7800  }
7801 
7802  // Only used in the sparse matrix-matrix multiply (isMM) case.
7803  std::shared_ptr< ::Tpetra::Details::CommRequest> iallreduceRequest;
7804  int mismatch = 0;
7805  int reduced_mismatch = 0;
7806  if (isMM && !overrideAllreduce) {
7807 
7808  // Test for pathological matrix transfer
7809  const bool source_vals = ! getGraph ()->getImporter ().is_null();
7810  const bool target_vals = ! (rowTransfer.getExportLIDs ().size() == 0 ||
7811  rowTransfer.getRemoteLIDs ().size() == 0);
7812  mismatch = (source_vals != target_vals) ? 1 : 0;
7813  iallreduceRequest =
7814  ::Tpetra::Details::iallreduce (mismatch, reduced_mismatch,
7815  Teuchos::REDUCE_MAX, * (getComm ()));
7816  }
7817 
7818 #ifdef HAVE_TPETRA_MMM_TIMINGS
7819  using Teuchos::TimeMonitor;
7820  std::string label;
7821  if(!params.is_null())
7822  label = params->get("Timer Label",label);
7823  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
7824  std::string tlstr;
7825  {
7826  std::ostringstream os;
7827  if(isMM) os<<":MMOpt";
7828  else os<<":MMLegacy";
7829  tlstr = os.str();
7830  }
7831 
7832  Teuchos::TimeMonitor MMall(*TimeMonitor::getNewTimer(prefix + std::string("TAFC All") +tlstr ));
7833 #endif
7834 
7835  // Make sure that the input argument rowTransfer is either an
7836  // Import or an Export. Import and Export are the only two
7837  // subclasses of Transfer that we defined, but users might
7838  // (unwisely, for now at least) decide to implement their own
7839  // subclasses. Exclude this possibility.
7840  const import_type* xferAsImport = dynamic_cast<const import_type*> (&rowTransfer);
7841  const export_type* xferAsExport = dynamic_cast<const export_type*> (&rowTransfer);
7842  TEUCHOS_TEST_FOR_EXCEPTION(
7843  xferAsImport == nullptr && xferAsExport == nullptr, std::invalid_argument,
7844  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' input "
7845  "argument must be either an Import or an Export, and its template "
7846  "parameters must match the corresponding template parameters of the "
7847  "CrsMatrix.");
7848 
7849  // Make sure that the input argument domainTransfer is either an
7850  // Import or an Export. Import and Export are the only two
7851  // subclasses of Transfer that we defined, but users might
7852  // (unwisely, for now at least) decide to implement their own
7853  // subclasses. Exclude this possibility.
7854  Teuchos::RCP<const import_type> xferDomainAsImport = Teuchos::rcp_dynamic_cast<const import_type> (domainTransfer);
7855  Teuchos::RCP<const export_type> xferDomainAsExport = Teuchos::rcp_dynamic_cast<const export_type> (domainTransfer);
7856 
7857  if(! domainTransfer.is_null()) {
7858  TEUCHOS_TEST_FOR_EXCEPTION(
7859  (xferDomainAsImport.is_null() && xferDomainAsExport.is_null()), std::invalid_argument,
7860  "Tpetra::CrsMatrix::transferAndFillComplete: The 'domainTransfer' input "
7861  "argument must be either an Import or an Export, and its template "
7862  "parameters must match the corresponding template parameters of the "
7863  "CrsMatrix.");
7864 
7865  TEUCHOS_TEST_FOR_EXCEPTION(
7866  ( xferAsImport != nullptr || ! xferDomainAsImport.is_null() ) &&
7867  (( xferAsImport != nullptr && xferDomainAsImport.is_null() ) ||
7868  ( xferAsImport == nullptr && ! xferDomainAsImport.is_null() )), std::invalid_argument,
7869  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7870  "arguments must be of the same type (either Import or Export).");
7871 
7872  TEUCHOS_TEST_FOR_EXCEPTION(
7873  ( xferAsExport != nullptr || ! xferDomainAsExport.is_null() ) &&
7874  (( xferAsExport != nullptr && xferDomainAsExport.is_null() ) ||
7875  ( xferAsExport == nullptr && ! xferDomainAsExport.is_null() )), std::invalid_argument,
7876  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7877  "arguments must be of the same type (either Import or Export).");
7878  } // domainTransfer != null
7879 
7880 
7881  // FIXME (mfh 15 May 2014) Wouldn't communication still be needed,
7882  // if the source Map is not distributed but the target Map is?
7883  const bool communication_needed = rowTransfer.getSourceMap ()->isDistributed ();
7884 
7885  // Get the new domain and range Maps. We need some of them for
7886  // error checking, now that we have the reverseMode parameter.
7887  RCP<const map_type> MyRowMap = reverseMode ?
7888  rowTransfer.getSourceMap () : rowTransfer.getTargetMap ();
7889  RCP<const map_type> MyColMap; // create this below
7890  RCP<const map_type> MyDomainMap = ! domainMap.is_null () ?
7891  domainMap : getDomainMap ();
7892  RCP<const map_type> MyRangeMap = ! rangeMap.is_null () ?
7893  rangeMap : getRangeMap ();
7894  RCP<const map_type> BaseRowMap = MyRowMap;
7895  RCP<const map_type> BaseDomainMap = MyDomainMap;
7896 
7897  // If the user gave us a nonnull destMat, then check whether it's
7898  // "pristine." That means that it has no entries.
7899  //
7900  // FIXME (mfh 15 May 2014) If this is not true on all processes,
7901  // then this exception test may hang. It would be better to
7902  // forward an error flag to the next communication phase.
7903  if (! destMat.is_null ()) {
7904  // FIXME (mfh 15 May 2014): The Epetra idiom for checking
7905  // whether a graph or matrix has no entries on the calling
7906  // process, is that it is neither locally nor globally indexed.
7907  // This may change eventually with the Kokkos refactor version
7908  // of Tpetra, so it would be better just to check the quantity
7909  // of interest directly. Note that with the Kokkos refactor
7910  // version of Tpetra, asking for the total number of entries in
7911  // a graph or matrix that is not fill complete might require
7912  // computation (kernel launch), since it is not thread scalable
7913  // to update a count every time an entry is inserted.
7914  const bool NewFlag = ! destMat->getGraph ()->isLocallyIndexed () &&
7915  ! destMat->getGraph ()->isGloballyIndexed ();
7916  TEUCHOS_TEST_FOR_EXCEPTION(
7917  ! NewFlag, std::invalid_argument, "Tpetra::CrsMatrix::"
7918  "transferAndFillComplete: The input argument 'destMat' is only allowed "
7919  "to be nonnull, if its graph is empty (neither locally nor globally "
7920  "indexed).");
7921  // FIXME (mfh 15 May 2014) At some point, we want to change
7922  // graphs and matrices so that their DistObject Map
7923  // (this->getMap()) may differ from their row Map. This will
7924  // make redistribution for 2-D distributions more efficient. I
7925  // hesitate to change this check, because I'm not sure how much
7926  // the code here depends on getMap() and getRowMap() being the
7927  // same.
7928  TEUCHOS_TEST_FOR_EXCEPTION(
7929  ! destMat->getRowMap ()->isSameAs (*MyRowMap), std::invalid_argument,
7930  "Tpetra::CrsMatrix::transferAndFillComplete: The (row) Map of the "
7931  "input argument 'destMat' is not the same as the (row) Map specified "
7932  "by the input argument 'rowTransfer'.");
7933  TEUCHOS_TEST_FOR_EXCEPTION(
7934  ! destMat->checkSizes (*this), std::invalid_argument,
7935  "Tpetra::CrsMatrix::transferAndFillComplete: You provided a nonnull "
7936  "destination matrix, but checkSizes() indicates that it is not a legal "
7937  "legal target for redistribution from the source matrix (*this). This "
7938  "may mean that they do not have the same dimensions.");
7939  }
7940 
7941  // If forward mode (the default), then *this's (row) Map must be
7942  // the same as the source Map of the Transfer. If reverse mode,
7943  // then *this's (row) Map must be the same as the target Map of
7944  // the Transfer.
7945  //
7946  // FIXME (mfh 15 May 2014) At some point, we want to change graphs
7947  // and matrices so that their DistObject Map (this->getMap()) may
7948  // differ from their row Map. This will make redistribution for
7949  // 2-D distributions more efficient. I hesitate to change this
7950  // check, because I'm not sure how much the code here depends on
7951  // getMap() and getRowMap() being the same.
7952  TEUCHOS_TEST_FOR_EXCEPTION(
7953  ! (reverseMode || getRowMap ()->isSameAs (*rowTransfer.getSourceMap ())),
7954  std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
7955  "rowTransfer->getSourceMap() must match this->getRowMap() in forward mode.");
7956  TEUCHOS_TEST_FOR_EXCEPTION(
7957  ! (! reverseMode || getRowMap ()->isSameAs (*rowTransfer.getTargetMap ())),
7958  std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
7959  "rowTransfer->getTargetMap() must match this->getRowMap() in reverse mode.");
7960 
7961  // checks for domainTransfer
7962  TEUCHOS_TEST_FOR_EXCEPTION(
7963  ! xferDomainAsImport.is_null() && ! xferDomainAsImport->getTargetMap()->isSameAs(*domainMap),
7964  std::invalid_argument,
7965  "Tpetra::CrsMatrix::transferAndFillComplete: The target map of the 'domainTransfer' input "
7966  "argument must be the same as the rebalanced domain map 'domainMap'");
7967 
7968  TEUCHOS_TEST_FOR_EXCEPTION(
7969  ! xferDomainAsExport.is_null() && ! xferDomainAsExport->getSourceMap()->isSameAs(*domainMap),
7970  std::invalid_argument,
7971  "Tpetra::CrsMatrix::transferAndFillComplete: The source map of the 'domainTransfer' input "
7972  "argument must be the same as the rebalanced domain map 'domainMap'");
7973 
7974  // The basic algorithm here is:
7975  //
7976  // 1. Call the moral equivalent of "Distor.do" to handle the import.
7977  // 2. Copy all the Imported and Copy/Permuted data into the raw
7978  // CrsMatrix / CrsGraphData pointers, still using GIDs.
7979  // 3. Call an optimized version of MakeColMap that avoids the
7980  // Directory lookups (since the importer knows who owns all the
7981  // GIDs) AND reindexes to LIDs.
7982  // 4. Call expertStaticFillComplete()
7983 
7984  // Get information from the Importer
7985  const size_t NumSameIDs = rowTransfer.getNumSameIDs();
7986  ArrayView<const LO> ExportLIDs = reverseMode ?
7987  rowTransfer.getRemoteLIDs () : rowTransfer.getExportLIDs ();
7988  auto RemoteLIDs = reverseMode ?
7989  rowTransfer.getExportLIDs_dv() : rowTransfer.getRemoteLIDs_dv();
7990  auto PermuteToLIDs = reverseMode ?
7991  rowTransfer.getPermuteFromLIDs_dv() : rowTransfer.getPermuteToLIDs_dv();
7992  auto PermuteFromLIDs = reverseMode ?
7993  rowTransfer.getPermuteToLIDs_dv() : rowTransfer.getPermuteFromLIDs_dv();
7994  Distributor& Distor = rowTransfer.getDistributor ();
7995 
7996  // Owning PIDs
7997  Teuchos::Array<int> SourcePids;
7998 
7999  // Temp variables for sub-communicators
8000  RCP<const map_type> ReducedRowMap, ReducedColMap,
8001  ReducedDomainMap, ReducedRangeMap;
8002  RCP<const Comm<int> > ReducedComm;
8003 
8004  // If the user gave us a null destMat, then construct the new
8005  // destination matrix. We will replace its column Map later.
8006  if (destMat.is_null ()) {
8007  destMat = rcp (new this_CRS_type (MyRowMap, 0, matrixparams));
8008  }
8009 
8010  /***************************************************/
8011  /***** 1) First communicator restriction phase ****/
8012  /***************************************************/
8013  if (restrictComm) {
8014 #ifdef HAVE_TPETRA_MMM_TIMINGS
8015  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrictComm")));
8016 #endif
8017  ReducedRowMap = MyRowMap->removeEmptyProcesses ();
8018  ReducedComm = ReducedRowMap.is_null () ?
8019  Teuchos::null :
8020  ReducedRowMap->getComm ();
8021  destMat->removeEmptyProcessesInPlace (ReducedRowMap);
8022 
8023  ReducedDomainMap = MyRowMap.getRawPtr () == MyDomainMap.getRawPtr () ?
8024  ReducedRowMap :
8025  MyDomainMap->replaceCommWithSubset (ReducedComm);
8026  ReducedRangeMap = MyRowMap.getRawPtr () == MyRangeMap.getRawPtr () ?
8027  ReducedRowMap :
8028  MyRangeMap->replaceCommWithSubset (ReducedComm);
8029 
8030  // Reset the "my" maps
8031  MyRowMap = ReducedRowMap;
8032  MyDomainMap = ReducedDomainMap;
8033  MyRangeMap = ReducedRangeMap;
8034 
8035  // Update my PID, if we've restricted the communicator
8036  if (! ReducedComm.is_null ()) {
8037  MyPID = ReducedComm->getRank ();
8038  }
8039  else {
8040  MyPID = -2; // For debugging
8041  }
8042  }
8043  else {
8044  ReducedComm = MyRowMap->getComm ();
8045  }
8046 
8047 
8048 
8049  /***************************************************/
8050  /***** 2) From Tpetra::DistObject::doTransfer() ****/
8051  /***************************************************/
8052  // Get the owning PIDs
8053  RCP<const import_type> MyImporter = getGraph ()->getImporter ();
8054 
8055  // check whether domain maps of source matrix and base domain map is the same
8056  bool bSameDomainMap = BaseDomainMap->isSameAs (*getDomainMap ());
8057 
8058  if (! restrictComm && ! MyImporter.is_null () && bSameDomainMap ) {
8059 #ifdef HAVE_TPETRA_MMM_TIMINGS
8060  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs same map")));
8061 #endif
8062  // Same domain map as source matrix
8063  //
8064  // NOTE: This won't work for restrictComm (because the Import
8065  // doesn't know the restricted PIDs), though writing an
8066  // optimized version for that case would be easy (Import an
8067  // IntVector of the new PIDs). Might want to add this later.
8068  Import_Util::getPids (*MyImporter, SourcePids, false);
8069  }
8070  else if (restrictComm && ! MyImporter.is_null () && bSameDomainMap) {
8071  // Same domain map as source matrix (restricted communicator)
8072  // We need one import from the domain to the column map
8073 #ifdef HAVE_TPETRA_MMM_TIMINGS
8074  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs restricted comm")));
8075 #endif
8076  IntVectorType SourceDomain_pids(getDomainMap (),true);
8077  IntVectorType SourceCol_pids(getColMap());
8078  // SourceDomain_pids contains the restricted pids
8079  SourceDomain_pids.putScalar(MyPID);
8080 
8081  SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8082  SourcePids.resize (getColMap ()->getLocalNumElements ());
8083  SourceCol_pids.get1dCopy (SourcePids ());
8084  }
8085  else if (MyImporter.is_null ()) {
8086  // Matrix has no off-process entries
8087 #ifdef HAVE_TPETRA_MMM_TIMINGS
8088  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs all local entries")));
8089 #endif
8090  SourcePids.resize (getColMap ()->getLocalNumElements ());
8091  SourcePids.assign (getColMap ()->getLocalNumElements (), MyPID);
8092  }
8093  else if ( ! MyImporter.is_null () &&
8094  ! domainTransfer.is_null () ) {
8095  // general implementation for rectangular matrices with
8096  // domain map different than SourceMatrix domain map.
8097  // User has to provide a DomainTransfer object. We need
8098  // to communications (import/export)
8099 #ifdef HAVE_TPETRA_MMM_TIMINGS
8100  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs rectangular case")));
8101 #endif
8102 
8103  // TargetDomain_pids lives on the rebalanced new domain map
8104  IntVectorType TargetDomain_pids (domainMap);
8105  TargetDomain_pids.putScalar (MyPID);
8106 
8107  // SourceDomain_pids lives on the non-rebalanced old domain map
8108  IntVectorType SourceDomain_pids (getDomainMap ());
8109 
8110  // SourceCol_pids lives on the non-rebalanced old column map
8111  IntVectorType SourceCol_pids (getColMap ());
8112 
8113  if (! reverseMode && ! xferDomainAsImport.is_null() ) {
8114  SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8115  }
8116  else if (reverseMode && ! xferDomainAsExport.is_null() ) {
8117  SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8118  }
8119  else if (! reverseMode && ! xferDomainAsExport.is_null() ) {
8120  SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8121  }
8122  else if (reverseMode && ! xferDomainAsImport.is_null() ) {
8123  SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8124  }
8125  else {
8126  TEUCHOS_TEST_FOR_EXCEPTION(
8127  true, std::logic_error, "Tpetra::CrsMatrix::"
8128  "transferAndFillComplete: Should never get here! "
8129  "Please report this bug to a Tpetra developer.");
8130  }
8131  SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8132  SourcePids.resize (getColMap ()->getLocalNumElements ());
8133  SourceCol_pids.get1dCopy (SourcePids ());
8134  }
8135  else if ( ! MyImporter.is_null () &&
8136  BaseDomainMap->isSameAs (*BaseRowMap) &&
8137  getDomainMap ()->isSameAs (*getRowMap ())) {
8138  // We can use the rowTransfer + SourceMatrix's Import to find out who owns what.
8139 #ifdef HAVE_TPETRA_MMM_TIMINGS
8140  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs query import")));
8141 #endif
8142 
8143  IntVectorType TargetRow_pids (domainMap);
8144  IntVectorType SourceRow_pids (getRowMap ());
8145  IntVectorType SourceCol_pids (getColMap ());
8146 
8147  TargetRow_pids.putScalar (MyPID);
8148  if (! reverseMode && xferAsImport != nullptr) {
8149  SourceRow_pids.doExport (TargetRow_pids, *xferAsImport, INSERT);
8150  }
8151  else if (reverseMode && xferAsExport != nullptr) {
8152  SourceRow_pids.doExport (TargetRow_pids, *xferAsExport, INSERT);
8153  }
8154  else if (! reverseMode && xferAsExport != nullptr) {
8155  SourceRow_pids.doImport (TargetRow_pids, *xferAsExport, INSERT);
8156  }
8157  else if (reverseMode && xferAsImport != nullptr) {
8158  SourceRow_pids.doImport (TargetRow_pids, *xferAsImport, INSERT);
8159  }
8160  else {
8161  TEUCHOS_TEST_FOR_EXCEPTION(
8162  true, std::logic_error, "Tpetra::CrsMatrix::"
8163  "transferAndFillComplete: Should never get here! "
8164  "Please report this bug to a Tpetra developer.");
8165  }
8166 
8167  SourceCol_pids.doImport (SourceRow_pids, *MyImporter, INSERT);
8168  SourcePids.resize (getColMap ()->getLocalNumElements ());
8169  SourceCol_pids.get1dCopy (SourcePids ());
8170  }
8171  else {
8172  TEUCHOS_TEST_FOR_EXCEPTION(
8173  true, std::invalid_argument, "Tpetra::CrsMatrix::"
8174  "transferAndFillComplete: This method only allows either domainMap == "
8175  "getDomainMap (), or (domainMap == rowTransfer.getTargetMap () and "
8176  "getDomainMap () == getRowMap ()).");
8177  }
8178 
8179  // Tpetra-specific stuff
8180  size_t constantNumPackets = destMat->constantNumberOfPackets ();
8181  {
8182 #ifdef HAVE_TPETRA_MMM_TIMINGS
8183  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC reallocate buffers")));
8184 #endif
8185  if (constantNumPackets == 0) {
8186  destMat->reallocArraysForNumPacketsPerLid (ExportLIDs.size (),
8187  RemoteLIDs.view_host().size ());
8188  }
8189  else {
8190  // There are a constant number of packets per element. We
8191  // already know (from the number of "remote" (incoming)
8192  // elements) how many incoming elements we expect, so we can
8193  // resize the buffer accordingly.
8194  const size_t rbufLen = RemoteLIDs.view_host().size() * constantNumPackets;
8195  destMat->reallocImportsIfNeeded (rbufLen, false, nullptr);
8196  }
8197  }
8198 
8199  // Pack & Prepare w/ owning PIDs
8200  {
8201 #ifdef HAVE_TPETRA_MMM_TIMINGS
8202  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC pack and prepare")));
8203 #endif
8204  if (debug) {
8205  using Teuchos::outArg;
8206  using Teuchos::REDUCE_MAX;
8207  using Teuchos::reduceAll;
8208  using std::cerr;
8209  using std::endl;
8210  RCP<const Teuchos::Comm<int> > comm = this->getComm ();
8211  const int myRank = comm->getRank ();
8212 
8213  std::ostringstream errStrm;
8214  int lclErr = 0;
8215  int gblErr = 0;
8216 
8217  Teuchos::ArrayView<size_t> numExportPacketsPerLID;
8218  try {
8219  // packAndPrepare* methods modify numExportPacketsPerLID_.
8220  destMat->numExportPacketsPerLID_.modify_host ();
8221  numExportPacketsPerLID =
8222  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8223  }
8224  catch (std::exception& e) {
8225  errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw: "
8226  << e.what () << std::endl;
8227  lclErr = 1;
8228  }
8229  catch (...) {
8230  errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw "
8231  "an exception not a subclass of std::exception" << std::endl;
8232  lclErr = 1;
8233  }
8234 
8235  if (! comm.is_null ()) {
8236  reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
8237  }
8238  if (gblErr != 0) {
8239  ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
8240  TEUCHOS_TEST_FOR_EXCEPTION(
8241  true, std::runtime_error, "getArrayViewFromDualView threw an "
8242  "exception on at least one process.");
8243  }
8244 
8245  if (verbose) {
8246  std::ostringstream os;
8247  os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
8248  << std::endl;
8249  std::cerr << os.str ();
8250  }
8251  try {
8253  destMat->exports_,
8254  numExportPacketsPerLID,
8255  ExportLIDs,
8256  SourcePids,
8257  constantNumPackets);
8258  }
8259  catch (std::exception& e) {
8260  errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw: "
8261  << e.what () << std::endl;
8262  lclErr = 1;
8263  }
8264  catch (...) {
8265  errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw "
8266  "an exception not a subclass of std::exception" << std::endl;
8267  lclErr = 1;
8268  }
8269 
8270  if (verbose) {
8271  std::ostringstream os;
8272  os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
8273  << std::endl;
8274  std::cerr << os.str ();
8275  }
8276 
8277  if (! comm.is_null ()) {
8278  reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
8279  }
8280  if (gblErr != 0) {
8281  ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
8282  TEUCHOS_TEST_FOR_EXCEPTION(
8283  true, std::runtime_error, "packCrsMatrixWithOwningPIDs threw an "
8284  "exception on at least one process.");
8285  }
8286  }
8287  else {
8288  // packAndPrepare* methods modify numExportPacketsPerLID_.
8289  destMat->numExportPacketsPerLID_.modify_host ();
8290  Teuchos::ArrayView<size_t> numExportPacketsPerLID =
8291  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8292  if (verbose) {
8293  std::ostringstream os;
8294  os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
8295  << std::endl;
8296  std::cerr << os.str ();
8297  }
8299  destMat->exports_,
8300  numExportPacketsPerLID,
8301  ExportLIDs,
8302  SourcePids,
8303  constantNumPackets);
8304  if (verbose) {
8305  std::ostringstream os;
8306  os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
8307  << std::endl;
8308  std::cerr << os.str ();
8309  }
8310  }
8311  }
8312 
8313  // Do the exchange of remote data.
8314  {
8315 #ifdef HAVE_TPETRA_MMM_TIMINGS
8316  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs exchange remote data")));
8317 #endif
8318  if (! communication_needed) {
8319  if (verbose) {
8320  std::ostringstream os;
8321  os << *verbosePrefix << "Communication not needed" << std::endl;
8322  std::cerr << os.str ();
8323  }
8324  }
8325  else {
8326  if (reverseMode) {
8327  if (constantNumPackets == 0) { // variable number of packets per LID
8328  if (verbose) {
8329  std::ostringstream os;
8330  os << *verbosePrefix << "Reverse mode, variable # packets / LID"
8331  << std::endl;
8332  std::cerr << os.str ();
8333  }
8334  // Make sure that host has the latest version, since we're
8335  // using the version on host. If host has the latest
8336  // version, syncing to host does nothing.
8337  destMat->numExportPacketsPerLID_.sync_host ();
8338  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
8339  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8340  destMat->numImportPacketsPerLID_.sync_host ();
8341  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
8342  getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
8343 
8344  if (verbose) {
8345  std::ostringstream os;
8346  os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
8347  << std::endl;
8348  std::cerr << os.str ();
8349  }
8350  Distor.doReversePostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1,
8351  destMat->numImportPacketsPerLID_.view_host());
8352  if (verbose) {
8353  std::ostringstream os;
8354  os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
8355  << std::endl;
8356  std::cerr << os.str ();
8357  }
8358 
8359  size_t totalImportPackets = 0;
8360  for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
8361  totalImportPackets += numImportPacketsPerLID[i];
8362  }
8363 
8364  // Reallocation MUST go before setting the modified flag,
8365  // because it may clear out the flags.
8366  destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
8367  verbosePrefix.get ());
8368  destMat->imports_.modify_host ();
8369  auto hostImports = destMat->imports_.view_host();
8370  // This is a legacy host pack/unpack path, so use the host
8371  // version of exports_.
8372  destMat->exports_.sync_host ();
8373  auto hostExports = destMat->exports_.view_host();
8374  if (verbose) {
8375  std::ostringstream os;
8376  os << *verbosePrefix << "Calling 4-arg doReversePostsAndWaits"
8377  << std::endl;
8378  std::cerr << os.str ();
8379  }
8380  Distor.doReversePostsAndWaits (hostExports,
8381  numExportPacketsPerLID,
8382  hostImports,
8383  numImportPacketsPerLID);
8384  if (verbose) {
8385  std::ostringstream os;
8386  os << *verbosePrefix << "Finished 4-arg doReversePostsAndWaits"
8387  << std::endl;
8388  std::cerr << os.str ();
8389  }
8390  }
8391  else { // constant number of packets per LID
8392  if (verbose) {
8393  std::ostringstream os;
8394  os << *verbosePrefix << "Reverse mode, constant # packets / LID"
8395  << std::endl;
8396  std::cerr << os.str ();
8397  }
8398  destMat->imports_.modify_host ();
8399  auto hostImports = destMat->imports_.view_host();
8400  // This is a legacy host pack/unpack path, so use the host
8401  // version of exports_.
8402  destMat->exports_.sync_host ();
8403  auto hostExports = destMat->exports_.view_host();
8404  if (verbose) {
8405  std::ostringstream os;
8406  os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
8407  << std::endl;
8408  std::cerr << os.str ();
8409  }
8410  Distor.doReversePostsAndWaits (hostExports,
8411  constantNumPackets,
8412  hostImports);
8413  if (verbose) {
8414  std::ostringstream os;
8415  os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
8416  << std::endl;
8417  std::cerr << os.str ();
8418  }
8419  }
8420  }
8421  else { // forward mode (the default)
8422  if (constantNumPackets == 0) { // variable number of packets per LID
8423  if (verbose) {
8424  std::ostringstream os;
8425  os << *verbosePrefix << "Forward mode, variable # packets / LID"
8426  << std::endl;
8427  std::cerr << os.str ();
8428  }
8429  // Make sure that host has the latest version, since we're
8430  // using the version on host. If host has the latest
8431  // version, syncing to host does nothing.
8432  destMat->numExportPacketsPerLID_.sync_host ();
8433  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
8434  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8435  destMat->numImportPacketsPerLID_.sync_host ();
8436  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
8437  getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
8438  if (verbose) {
8439  std::ostringstream os;
8440  os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8441  << std::endl;
8442  std::cerr << os.str ();
8443  }
8444  Distor.doPostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1,
8445  destMat->numImportPacketsPerLID_.view_host());
8446  if (verbose) {
8447  std::ostringstream os;
8448  os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8449  << std::endl;
8450  std::cerr << os.str ();
8451  }
8452 
8453  size_t totalImportPackets = 0;
8454  for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
8455  totalImportPackets += numImportPacketsPerLID[i];
8456  }
8457 
8458  // Reallocation MUST go before setting the modified flag,
8459  // because it may clear out the flags.
8460  destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
8461  verbosePrefix.get ());
8462  destMat->imports_.modify_host ();
8463  auto hostImports = destMat->imports_.view_host();
8464  // This is a legacy host pack/unpack path, so use the host
8465  // version of exports_.
8466  destMat->exports_.sync_host ();
8467  auto hostExports = destMat->exports_.view_host();
8468  if (verbose) {
8469  std::ostringstream os;
8470  os << *verbosePrefix << "Calling 4-arg doPostsAndWaits"
8471  << std::endl;
8472  std::cerr << os.str ();
8473  }
8474  Distor.doPostsAndWaits (hostExports,
8475  numExportPacketsPerLID,
8476  hostImports,
8477  numImportPacketsPerLID);
8478  if (verbose) {
8479  std::ostringstream os;
8480  os << *verbosePrefix << "Finished 4-arg doPostsAndWaits"
8481  << std::endl;
8482  std::cerr << os.str ();
8483  }
8484  }
8485  else { // constant number of packets per LID
8486  if (verbose) {
8487  std::ostringstream os;
8488  os << *verbosePrefix << "Forward mode, constant # packets / LID"
8489  << std::endl;
8490  std::cerr << os.str ();
8491  }
8492  destMat->imports_.modify_host ();
8493  auto hostImports = destMat->imports_.view_host();
8494  // This is a legacy host pack/unpack path, so use the host
8495  // version of exports_.
8496  destMat->exports_.sync_host ();
8497  auto hostExports = destMat->exports_.view_host();
8498  if (verbose) {
8499  std::ostringstream os;
8500  os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8501  << std::endl;
8502  std::cerr << os.str ();
8503  }
8504  Distor.doPostsAndWaits (hostExports,
8505  constantNumPackets,
8506  hostImports);
8507  if (verbose) {
8508  std::ostringstream os;
8509  os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8510  << std::endl;
8511  std::cerr << os.str ();
8512  }
8513  }
8514  }
8515  }
8516  }
8517 
8518  /*********************************************************************/
8519  /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/
8520  /*********************************************************************/
8521 
8522  bool runOnHost = std::is_same_v<typename device_type::memory_space, Kokkos::HostSpace> && !useKokkosPath;
8523 
8524  Teuchos::Array<int> RemotePids;
8525  if (runOnHost) {
8526  Teuchos::Array<int> TargetPids;
8527  // Backwards compatibility measure. We'll use this again below.
8528 
8529  // TODO JHU Need to track down why numImportPacketsPerLID_ has not been corrently marked as modified on host (which it has been)
8530  // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
8531  // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
8532  destMat->numImportPacketsPerLID_.modify_host(); //FIXME
8533 
8534 # ifdef HAVE_TPETRA_MMM_TIMINGS
8535  RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
8536 # endif
8537  ArrayRCP<size_t> CSR_rowptr;
8538  ArrayRCP<GO> CSR_colind_GID;
8539  ArrayRCP<LO> CSR_colind_LID;
8540  ArrayRCP<Scalar> CSR_vals;
8541 
8542  destMat->imports_.sync_device ();
8543  destMat->numImportPacketsPerLID_.sync_device ();
8544 
8545  size_t N = BaseRowMap->getLocalNumElements ();
8546 
8547  auto RemoteLIDs_d = RemoteLIDs.view_device();
8548  auto PermuteToLIDs_d = PermuteToLIDs.view_device();
8549  auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();
8550 
8552  *this,
8553  RemoteLIDs_d,
8554  destMat->imports_.view_device(), //hostImports
8555  destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID
8556  NumSameIDs,
8557  PermuteToLIDs_d,
8558  PermuteFromLIDs_d,
8559  N,
8560  MyPID,
8561  CSR_rowptr,
8562  CSR_colind_GID,
8563  CSR_vals,
8564  SourcePids(),
8565  TargetPids);
8566 
8567  // If LO and GO are the same, we can reuse memory when
8568  // converting the column indices from global to local indices.
8569  if (typeid (LO) == typeid (GO)) {
8570  CSR_colind_LID = Teuchos::arcp_reinterpret_cast<LO> (CSR_colind_GID);
8571  }
8572  else {
8573  CSR_colind_LID.resize (CSR_colind_GID.size());
8574  }
8575  CSR_colind_LID.resize (CSR_colind_GID.size());
8576 
8577  // On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally
8578  // owned entries. Convert them to the actual PID.
8579  // JHU FIXME This can be done within unpackAndCombineIntoCrsArrays with a parallel_for.
8580  for(size_t i=0; i<static_cast<size_t>(TargetPids.size()); i++)
8581  {
8582  if(TargetPids[i] == -1) TargetPids[i] = MyPID;
8583  }
8584 #ifdef HAVE_TPETRA_MMM_TIMINGS
8585  tmCopySPRdata = Teuchos::null;
8586 #endif
8587  /**************************************************************/
8588  /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8589  /**************************************************************/
8590  // Call an optimized version of makeColMap that avoids the
8591  // Directory lookups (since the Import object knows who owns all
8592  // the GIDs).
8593  if (verbose) {
8594  std::ostringstream os;
8595  os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8596  << std::endl;
8597  std::cerr << os.str ();
8598  }
8599  {
8600 #ifdef HAVE_TPETRA_MMM_TIMINGS
8601  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC makeColMap")));
8602 #endif
8603  Import_Util::lowCommunicationMakeColMapAndReindexSerial(CSR_rowptr (),
8604  CSR_colind_LID (),
8605  CSR_colind_GID (),
8606  BaseDomainMap,
8607  TargetPids,
8608  RemotePids,
8609  MyColMap);
8610  }
8611 
8612  if (verbose) {
8613  std::ostringstream os;
8614  os << *verbosePrefix << "restrictComm="
8615  << (restrictComm ? "true" : "false") << std::endl;
8616  std::cerr << os.str ();
8617  }
8618 
8619  /*******************************************************/
8620  /**** 4) Second communicator restriction phase ****/
8621  /*******************************************************/
8622  {
8623 #ifdef HAVE_TPETRA_MMM_TIMINGS
8624  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrict colmap")));
8625 #endif
8626  if (restrictComm) {
8627  ReducedColMap = (MyRowMap.getRawPtr () == MyColMap.getRawPtr ()) ?
8628  ReducedRowMap :
8629  MyColMap->replaceCommWithSubset (ReducedComm);
8630  MyColMap = ReducedColMap; // Reset the "my" maps
8631  }
8632 
8633  // Replace the col map
8634  if (verbose) {
8635  std::ostringstream os;
8636  os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8637  std::cerr << os.str ();
8638  }
8639  destMat->replaceColMap (MyColMap);
8640 
8641  // Short circuit if the processor is no longer in the communicator
8642  //
8643  // NOTE: Epetra replaces modifies all "removed" processes so they
8644  // have a dummy (serial) Map that doesn't touch the original
8645  // communicator. Duplicating that here might be a good idea.
8646  if (ReducedComm.is_null ()) {
8647  if (verbose) {
8648  std::ostringstream os;
8649  os << *verbosePrefix << "I am no longer in the communicator; "
8650  "returning" << std::endl;
8651  std::cerr << os.str ();
8652  }
8653  return;
8654  }
8655  }
8656 
8657  /***************************************************/
8658  /**** 5) Sort ****/
8659  /***************************************************/
8660  if ((! reverseMode && xferAsImport != nullptr) ||
8661  (reverseMode && xferAsExport != nullptr)) {
8662  if (verbose) {
8663  std::ostringstream os;
8664  os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8665  std::cerr << os.str ();
8666  }
8667 #ifdef HAVE_TPETRA_MMM_TIMINGS
8668  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortCrsEntries")));
8669 #endif
8670  Import_Util::sortCrsEntries (CSR_rowptr(),
8671  CSR_colind_LID(),
8672  CSR_vals());
8673  }
8674  else if ((! reverseMode && xferAsExport != nullptr) ||
8675  (reverseMode && xferAsImport != nullptr)) {
8676  if (verbose) {
8677  std::ostringstream os;
8678  os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8679  << endl;
8680  std::cerr << os.str();
8681  }
8682 #ifdef HAVE_TPETRA_MMM_TIMINGS
8683  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries")));
8684 #endif
8685  Import_Util::sortAndMergeCrsEntries (CSR_rowptr(),
8686  CSR_colind_LID(),
8687  CSR_vals());
8688  if (CSR_rowptr[N] != static_cast<size_t>(CSR_vals.size())) {
8689  CSR_colind_LID.resize (CSR_rowptr[N]);
8690  CSR_vals.resize (CSR_rowptr[N]);
8691  }
8692  }
8693  else {
8694  TEUCHOS_TEST_FOR_EXCEPTION(
8695  true, std::logic_error, "Tpetra::CrsMatrix::"
8696  "transferAndFillComplete: Should never get here! "
8697  "Please report this bug to a Tpetra developer.");
8698  }
8699  /***************************************************/
8700  /**** 6) Reset the colmap and the arrays ****/
8701  /***************************************************/
8702 
8703  if (verbose) {
8704  std::ostringstream os;
8705  os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8706  std::cerr << os.str ();
8707  }
8708 
8709  // Call constructor for the new matrix (restricted as needed)
8710  //
8711  // NOTE (mfh 15 May 2014) This should work fine for the Kokkos
8712  // refactor version of CrsMatrix, though it reserves the right to
8713  // make a deep copy of the arrays.
8714  {
8715 #ifdef HAVE_TPETRA_MMM_TIMINGS
8716  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC setAllValues")));
8717 #endif
8718  destMat->setAllValues (CSR_rowptr, CSR_colind_LID, CSR_vals);
8719  }
8720 
8721  } else {
8722  // run on device
8723 
8724 
8725  // Backwards compatibility measure. We'll use this again below.
8726 
8727  // TODO JHU Need to track down why numImportPacketsPerLID_ has not been corrently marked as modified on host (which it has been)
8728  // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
8729  // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
8730  destMat->numImportPacketsPerLID_.modify_host(); //FIXME
8731 
8732 # ifdef HAVE_TPETRA_MMM_TIMINGS
8733  RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
8734 # endif
8735  ArrayRCP<size_t> CSR_rowptr;
8736  ArrayRCP<GO> CSR_colind_GID;
8737  ArrayRCP<LO> CSR_colind_LID;
8738  ArrayRCP<Scalar> CSR_vals;
8739 
8740  destMat->imports_.sync_device ();
8741  destMat->numImportPacketsPerLID_.sync_device ();
8742 
8743  size_t N = BaseRowMap->getLocalNumElements ();
8744 
8745  auto RemoteLIDs_d = RemoteLIDs.view_device();
8746  auto PermuteToLIDs_d = PermuteToLIDs.view_device();
8747  auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();
8748 
8749  Kokkos::View<size_t*,device_type> CSR_rowptr_d;
8750  Kokkos::View<GO*,device_type> CSR_colind_GID_d;
8751  Kokkos::View<LO*,device_type> CSR_colind_LID_d;
8752  Kokkos::View<impl_scalar_type*,device_type> CSR_vals_d;
8753  Kokkos::View<int*,device_type> TargetPids_d;
8754 
8756  *this,
8757  RemoteLIDs_d,
8758  destMat->imports_.view_device(), //hostImports
8759  destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID
8760  NumSameIDs,
8761  PermuteToLIDs_d,
8762  PermuteFromLIDs_d,
8763  N,
8764  MyPID,
8765  CSR_rowptr_d,
8766  CSR_colind_GID_d,
8767  CSR_vals_d,
8768  SourcePids(),
8769  TargetPids_d);
8770 
8771  Kokkos::resize (CSR_colind_LID_d, CSR_colind_GID_d.size());
8772 
8773 #ifdef HAVE_TPETRA_MMM_TIMINGS
8774  tmCopySPRdata = Teuchos::null;
8775 #endif
8776  /**************************************************************/
8777  /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8778  /**************************************************************/
8779  // Call an optimized version of makeColMap that avoids the
8780  // Directory lookups (since the Import object knows who owns all
8781  // the GIDs).
8782  if (verbose) {
8783  std::ostringstream os;
8784  os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8785  << std::endl;
8786  std::cerr << os.str ();
8787  }
8788  {
8789 #ifdef HAVE_TPETRA_MMM_TIMINGS
8790  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC makeColMap")));
8791 #endif
8792  Import_Util::lowCommunicationMakeColMapAndReindex(CSR_rowptr_d,
8793  CSR_colind_LID_d,
8794  CSR_colind_GID_d,
8795  BaseDomainMap,
8796  TargetPids_d,
8797  RemotePids,
8798  MyColMap);
8799  }
8800 
8801  if (verbose) {
8802  std::ostringstream os;
8803  os << *verbosePrefix << "restrictComm="
8804  << (restrictComm ? "true" : "false") << std::endl;
8805  std::cerr << os.str ();
8806  }
8807 
8808  /*******************************************************/
8809  /**** 4) Second communicator restriction phase ****/
8810  /*******************************************************/
8811  {
8812 #ifdef HAVE_TPETRA_MMM_TIMINGS
8813  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrict colmap")));
8814 #endif
8815  if (restrictComm) {
8816  ReducedColMap = (MyRowMap.getRawPtr () == MyColMap.getRawPtr ()) ?
8817  ReducedRowMap :
8818  MyColMap->replaceCommWithSubset (ReducedComm);
8819  MyColMap = ReducedColMap; // Reset the "my" maps
8820  }
8821 
8822  // Replace the col map
8823  if (verbose) {
8824  std::ostringstream os;
8825  os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8826  std::cerr << os.str ();
8827  }
8828  destMat->replaceColMap (MyColMap);
8829 
8830  // Short circuit if the processor is no longer in the communicator
8831  //
8832  // NOTE: Epetra replaces modifies all "removed" processes so they
8833  // have a dummy (serial) Map that doesn't touch the original
8834  // communicator. Duplicating that here might be a good idea.
8835  if (ReducedComm.is_null ()) {
8836  if (verbose) {
8837  std::ostringstream os;
8838  os << *verbosePrefix << "I am no longer in the communicator; "
8839  "returning" << std::endl;
8840  std::cerr << os.str ();
8841  }
8842  return;
8843  }
8844  }
8845 
8846  /***************************************************/
8847  /**** 5) Sort ****/
8848  /***************************************************/
8849 
8850  if ((! reverseMode && xferAsImport != nullptr) ||
8851  (reverseMode && xferAsExport != nullptr)) {
8852  if (verbose) {
8853  std::ostringstream os;
8854  os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8855  std::cerr << os.str ();
8856  }
8857 #ifdef HAVE_TPETRA_MMM_TIMINGS
8858  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortCrsEntries")));
8859 #endif
8860  Import_Util::sortCrsEntries (CSR_rowptr_d,
8861  CSR_colind_LID_d,
8862  CSR_vals_d);
8863  }
8864  else if ((! reverseMode && xferAsExport != nullptr) ||
8865  (reverseMode && xferAsImport != nullptr)) {
8866  if (verbose) {
8867  std::ostringstream os;
8868  os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8869  << endl;
8870  std::cerr << os.str();
8871  }
8872 #ifdef HAVE_TPETRA_MMM_TIMINGS
8873  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries")));
8874 #endif
8875  Import_Util::sortAndMergeCrsEntries (CSR_rowptr_d,
8876  CSR_colind_LID_d,
8877  CSR_vals_d);
8878  }
8879  else {
8880  TEUCHOS_TEST_FOR_EXCEPTION(
8881  true, std::logic_error, "Tpetra::CrsMatrix::"
8882  "transferAndFillComplete: Should never get here! "
8883  "Please report this bug to a Tpetra developer.");
8884  }
8885 
8886  /***************************************************/
8887  /**** 6) Reset the colmap and the arrays ****/
8888  /***************************************************/
8889 
8890  if (verbose) {
8891  std::ostringstream os;
8892  os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8893  std::cerr << os.str ();
8894  }
8895 
8896  {
8897 #ifdef HAVE_TPETRA_MMM_TIMINGS
8898  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC setAllValues")));
8899 #endif
8900  destMat->setAllValues (CSR_rowptr_d, CSR_colind_LID_d, CSR_vals_d);
8901  }
8902 
8903  } //if (runOnHost) .. else ..
8904 
8905  /***************************************************/
8906  /**** 7) Build Importer & Call ESFC ****/
8907  /***************************************************/
8908 #ifdef HAVE_TPETRA_MMM_TIMINGS
8909  RCP<TimeMonitor> tmIESFC = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC build importer and esfc"))));
8910 #endif
8911  // Pre-build the importer using the existing PIDs
8912  Teuchos::ParameterList esfc_params;
8913 
8914  RCP<import_type> MyImport;
8915 
8916  // Fulfull the non-blocking allreduce on reduced_mismatch.
8917  if (iallreduceRequest.get () != nullptr) {
8918  if (verbose) {
8919  std::ostringstream os;
8920  os << *verbosePrefix << "Calling iallreduceRequest->wait()"
8921  << endl;
8922  std::cerr << os.str ();
8923  }
8924  iallreduceRequest->wait ();
8925  if (reduced_mismatch != 0) {
8926  isMM = false;
8927  }
8928  }
8929 
8930  if( isMM ) {
8931 #ifdef HAVE_TPETRA_MMM_TIMINGS
8932  Teuchos::TimeMonitor MMisMM (*TimeMonitor::getNewTimer(prefix + std::string("isMM Block")));
8933 #endif
8934  // Combine all type1/2/3 lists, [filter them], then call the expert import constructor.
8935 
8936  if (verbose) {
8937  std::ostringstream os;
8938  os << *verbosePrefix << "Getting CRS pointers" << endl;
8939  std::cerr << os.str ();
8940  }
8941 
8942  Teuchos::ArrayRCP<LocalOrdinal> type3LIDs;
8943  Teuchos::ArrayRCP<int> type3PIDs;
8944  auto rowptr = getCrsGraph()->getLocalRowPtrsHost();
8945  auto colind = getCrsGraph()->getLocalIndicesHost();
8946 
8947  if (verbose) {
8948  std::ostringstream os;
8949  os << *verbosePrefix << "Calling reverseNeighborDiscovery" << std::endl;
8950  std::cerr << os.str ();
8951  }
8952 
8953  {
8954 #ifdef HAVE_TPETRA_MMM_TIMINGS
8955  TimeMonitor tm_rnd (*TimeMonitor::getNewTimer(prefix + std::string("isMMrevNeighDis")));
8956 #endif
8957  Import_Util::reverseNeighborDiscovery(*this,
8958  rowptr,
8959  colind,
8960  rowTransfer,
8961  MyImporter,
8962  MyDomainMap,
8963  type3PIDs,
8964  type3LIDs,
8965  ReducedComm);
8966  }
8967 
8968  if (verbose) {
8969  std::ostringstream os;
8970  os << *verbosePrefix << "Done with reverseNeighborDiscovery" << std::endl;
8971  std::cerr << os.str ();
8972  }
8973 
8974  Teuchos::ArrayView<const int> EPID1 = MyImporter.is_null() ? Teuchos::ArrayView<const int>() : MyImporter->getExportPIDs();
8975  Teuchos::ArrayView<const LO> ELID1 = MyImporter.is_null() ? Teuchos::ArrayView<const LO>() : MyImporter->getExportLIDs();
8976 
8977  Teuchos::ArrayView<const int> TEPID2 = rowTransfer.getExportPIDs(); // row matrix
8978  Teuchos::ArrayView<const LO> TELID2 = rowTransfer.getExportLIDs();
8979 
8980  const int numCols = getGraph()->getColMap()->getLocalNumElements(); // may be dup
8981  // from EpetraExt_MMHelpers.cpp: build_type2_exports
8982  std::vector<bool> IsOwned(numCols,true);
8983  std::vector<int> SentTo(numCols,-1);
8984  if (! MyImporter.is_null ()) {
8985  for (auto && rlid : MyImporter->getRemoteLIDs()) { // the remoteLIDs must be from sourcematrix
8986  IsOwned[rlid]=false;
8987  }
8988  }
8989 
8990  std::vector<std::pair<int,GO> > usrtg;
8991  usrtg.reserve(TEPID2.size());
8992 
8993  {
8994  const auto& colMap = * (this->getColMap ()); // *this is sourcematrix
8995  for (Array_size_type i = 0; i < TEPID2.size (); ++i) {
8996  const LO row = TELID2[i];
8997  const int pid = TEPID2[i];
8998  for (auto j = rowptr[row]; j < rowptr[row+1]; ++j) {
8999  const int col = colind[j];
9000  if (IsOwned[col] && SentTo[col] != pid) {
9001  SentTo[col] = pid;
9002  GO gid = colMap.getGlobalElement (col);
9003  usrtg.push_back (std::pair<int,GO> (pid, gid));
9004  }
9005  }
9006  }
9007  }
9008 
9009 // This sort can _not_ be omitted.[
9010  std::sort(usrtg.begin(),usrtg.end()); // default comparator does the right thing, now sorted in gid order
9011  auto eopg = std ::unique(usrtg.begin(),usrtg.end());
9012  // 25 Jul 2018: Could just ignore the entries at and after eopg.
9013  usrtg.erase(eopg,usrtg.end());
9014 
9015  const Array_size_type type2_us_size = usrtg.size();
9016  Teuchos::ArrayRCP<int> EPID2=Teuchos::arcp(new int[type2_us_size],0,type2_us_size,true);
9017  Teuchos::ArrayRCP< LO> ELID2=Teuchos::arcp(new LO[type2_us_size],0,type2_us_size,true);
9018 
9019  int pos=0;
9020  for(auto && p : usrtg) {
9021  EPID2[pos]= p.first;
9022  ELID2[pos]= this->getDomainMap()->getLocalElement(p.second);
9023  pos++;
9024  }
9025 
9026  Teuchos::ArrayView<int> EPID3 = type3PIDs();
9027  Teuchos::ArrayView< LO> ELID3 = type3LIDs();
9028  GO InfGID = std::numeric_limits<GO>::max();
9029  int InfPID = INT_MAX;
9030 #ifdef TPETRA_MIN3
9031 # undef TPETRA_MIN3
9032 #endif // TPETRA_MIN3
9033 #define TPETRA_MIN3(x,y,z) ((x)<(y)?(std::min(x,z)):(std::min(y,z)))
9034  int i1=0, i2=0, i3=0;
9035  int Len1 = EPID1.size();
9036  int Len2 = EPID2.size();
9037  int Len3 = EPID3.size();
9038 
9039  int MyLen=Len1+Len2+Len3;
9040  Teuchos::ArrayRCP<LO> userExportLIDs = Teuchos::arcp(new LO[MyLen],0,MyLen,true);
9041  Teuchos::ArrayRCP<int> userExportPIDs = Teuchos::arcp(new int[MyLen],0,MyLen,true);
9042  int iloc = 0; // will be the size of the userExportLID/PIDs
9043 
9044  while(i1 < Len1 || i2 < Len2 || i3 < Len3){
9045  int PID1 = (i1<Len1)?(EPID1[i1]):InfPID;
9046  int PID2 = (i2<Len2)?(EPID2[i2]):InfPID;
9047  int PID3 = (i3<Len3)?(EPID3[i3]):InfPID;
9048 
9049  GO GID1 = (i1<Len1)?getDomainMap()->getGlobalElement(ELID1[i1]):InfGID;
9050  GO GID2 = (i2<Len2)?getDomainMap()->getGlobalElement(ELID2[i2]):InfGID;
9051  GO GID3 = (i3<Len3)?getDomainMap()->getGlobalElement(ELID3[i3]):InfGID;
9052 
9053  int MIN_PID = TPETRA_MIN3(PID1,PID2,PID3);
9054  GO MIN_GID = TPETRA_MIN3( ((PID1==MIN_PID)?GID1:InfGID), ((PID2==MIN_PID)?GID2:InfGID), ((PID3==MIN_PID)?GID3:InfGID));
9055 #ifdef TPETRA_MIN3
9056 # undef TPETRA_MIN3
9057 #endif // TPETRA_MIN3
9058  bool added_entry=false;
9059 
9060  if(PID1 == MIN_PID && GID1 == MIN_GID){
9061  userExportLIDs[iloc]=ELID1[i1];
9062  userExportPIDs[iloc]=EPID1[i1];
9063  i1++;
9064  added_entry=true;
9065  iloc++;
9066  }
9067  if(PID2 == MIN_PID && GID2 == MIN_GID){
9068  if(!added_entry) {
9069  userExportLIDs[iloc]=ELID2[i2];
9070  userExportPIDs[iloc]=EPID2[i2];
9071  added_entry=true;
9072  iloc++;
9073  }
9074  i2++;
9075  }
9076  if(PID3 == MIN_PID && GID3 == MIN_GID){
9077  if(!added_entry) {
9078  userExportLIDs[iloc]=ELID3[i3];
9079  userExportPIDs[iloc]=EPID3[i3];
9080  iloc++;
9081  }
9082  i3++;
9083  }
9084  }
9085 
9086  if (verbose) {
9087  std::ostringstream os;
9088  os << *verbosePrefix << "Create Import" << std::endl;
9089  std::cerr << os.str ();
9090  }
9091 
9092 #ifdef HAVE_TPETRA_MMM_TIMINGS
9093  auto ismmIctor(*TimeMonitor::getNewTimer(prefix + std::string("isMMIportCtor")));
9094 #endif
9095  Teuchos::RCP<Teuchos::ParameterList> plist = rcp(new Teuchos::ParameterList());
9096  // 25 Jul 2018: Test for equality with the non-isMM path's Import object.
9097  if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
9098  MyImport = rcp ( new import_type (MyDomainMap,
9099  MyColMap,
9100  RemotePids,
9101  userExportLIDs.view(0,iloc).getConst(),
9102  userExportPIDs.view(0,iloc).getConst(),
9103  plist)
9104  );
9105 
9106  if (verbose) {
9107  std::ostringstream os;
9108  os << *verbosePrefix << "Call expertStaticFillComplete" << std::endl;
9109  std::cerr << os.str ();
9110  }
9111 
9112  {
9113 #ifdef HAVE_TPETRA_MMM_TIMINGS
9114  TimeMonitor esfc (*TimeMonitor::getNewTimer(prefix + std::string("isMM::destMat->eSFC")));
9115  esfc_params.set("Timer Label",label+std::string("isMM eSFC"));
9116 #endif
9117  if(!params.is_null())
9118  esfc_params.set("compute global constants",params->get("compute global constants",true));
9119  destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap, MyImport,Teuchos::null,rcp(new Teuchos::ParameterList(esfc_params)));
9120 
9121  }
9122 
9123  } // if(isMM)
9124  else {
9125 #ifdef HAVE_TPETRA_MMM_TIMINGS
9126  TimeMonitor MMnotMMblock (*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMblock")));
9127 #endif
9128  if (verbose) {
9129  std::ostringstream os;
9130  os << *verbosePrefix << "Create Import" << std::endl;
9131  std::cerr << os.str ();
9132  }
9133 
9134 #ifdef HAVE_TPETRA_MMM_TIMINGS
9135  TimeMonitor notMMIcTor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMCreateImporter")));
9136 #endif
9137  Teuchos::RCP<Teuchos::ParameterList> mypars = rcp(new Teuchos::ParameterList);
9138  mypars->set("Timer Label","notMMFrom_tAFC");
9139  if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
9140  MyImport = rcp (new import_type (MyDomainMap, MyColMap, RemotePids, mypars));
9141 
9142  if (verbose) {
9143  std::ostringstream os;
9144  os << *verbosePrefix << "Call expertStaticFillComplete" << endl;
9145  std::cerr << os.str ();
9146  }
9147 
9148 #ifdef HAVE_TPETRA_MMM_TIMINGS
9149  TimeMonitor esfcnotmm(*TimeMonitor::getNewTimer(prefix + std::string("notMMdestMat->expertStaticFillComplete")));
9150  esfc_params.set("Timer Label",prefix+std::string("notMM eSFC"));
9151 #else
9152  esfc_params.set("Timer Label",std::string("notMM eSFC"));
9153 #endif
9154 
9155  if (!params.is_null ()) {
9156  esfc_params.set ("compute global constants",
9157  params->get ("compute global constants", true));
9158  }
9159  destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap,
9160  MyImport, Teuchos::null,
9161  rcp (new Teuchos::ParameterList (esfc_params)));
9162  }
9163 
9164 #ifdef HAVE_TPETRA_MMM_TIMINGS
9165  tmIESFC = Teuchos::null;
9166 #endif
9167 
9168  if (verbose) {
9169  std::ostringstream os;
9170  os << *verbosePrefix << "Done" << endl;
9171  std::cerr << os.str ();
9172  }
9173  } //transferAndFillComplete
9174 
9175 
9176  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9177  void
9180  const import_type& importer,
9181  const Teuchos::RCP<const map_type>& domainMap,
9182  const Teuchos::RCP<const map_type>& rangeMap,
9183  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9184  {
9185  transferAndFillComplete (destMatrix, importer, Teuchos::null, domainMap, rangeMap, params);
9186  }
9187 
9188  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9189  void
9192  const import_type& rowImporter,
9193  const import_type& domainImporter,
9194  const Teuchos::RCP<const map_type>& domainMap,
9195  const Teuchos::RCP<const map_type>& rangeMap,
9196  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9197  {
9198  transferAndFillComplete (destMatrix, rowImporter, Teuchos::rcpFromRef(domainImporter), domainMap, rangeMap, params);
9199  }
9200 
9201  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9202  void
9205  const export_type& exporter,
9206  const Teuchos::RCP<const map_type>& domainMap,
9207  const Teuchos::RCP<const map_type>& rangeMap,
9208  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9209  {
9210  transferAndFillComplete (destMatrix, exporter, Teuchos::null, domainMap, rangeMap, params);
9211  }
9212 
9213  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9214  void
9217  const export_type& rowExporter,
9218  const export_type& domainExporter,
9219  const Teuchos::RCP<const map_type>& domainMap,
9220  const Teuchos::RCP<const map_type>& rangeMap,
9221  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9222  {
9223  transferAndFillComplete (destMatrix, rowExporter, Teuchos::rcpFromRef(domainExporter), domainMap, rangeMap, params);
9224  }
9225 
9226 } // namespace Tpetra
9227 
9228 //
9229 // Explicit instantiation macro
9230 //
9231 // Must be expanded from within the Tpetra namespace!
9232 //
9233 
9234 #define TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR,LO,GO,NODE) \
9235  \
9236  template class CrsMatrix< SCALAR , LO , GO , NODE >;
9237 
9238 #define TPETRA_CRSMATRIX_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
9239  \
9240  template Teuchos::RCP< CrsMatrix< SO , LO , GO , NODE > > \
9241  CrsMatrix< SI , LO , GO , NODE >::convert< SO > () const;
9242 
9243 #define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9244  template<> \
9245  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9246  importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9247  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9248  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9249  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& importer, \
9250  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9251  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9252  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9253  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9254  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9255  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9256  const Teuchos::RCP<Teuchos::ParameterList>& params);
9257 
9258 #define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9259  template<> \
9260  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9261  importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9262  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9263  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9264  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowImporter, \
9265  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9266  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9267  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainImporter, \
9268  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9269  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9270  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9271  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9272  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9273  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9274  const Teuchos::RCP<Teuchos::ParameterList>& params);
9275 
9276 
9277 #define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9278  template<> \
9279  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9280  exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9281  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9282  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9283  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& exporter, \
9284  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9285  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9286  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9287  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9288  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9289  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9290  const Teuchos::RCP<Teuchos::ParameterList>& params);
9291 
9292 #define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9293  template<> \
9294  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9295  exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9296  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9297  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9298  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowExporter, \
9299  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9300  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9301  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainExporter, \
9302  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9303  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9304  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9305  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9306  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9307  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9308  const Teuchos::RCP<Teuchos::ParameterList>& params);
9309 
9310 
9311 #define TPETRA_CRSMATRIX_INSTANT(SCALAR, LO, GO ,NODE) \
9312  TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR, LO, GO, NODE) \
9313  TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9314  TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9315  TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9316  TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE)
9317 
9318 #endif // TPETRA_CRSMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Teuchos::RCP< const map_type > getRowMap() const override
Returns the Map that describes the row distribution in this graph.
bool hasColMap() const override
Whether the matrix has a well-defined column Map.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
bool indicesAreSorted_
Whether the graph&#39;s indices are sorted in each row, on this process.
global_size_t getGlobalNumCols() const override
The number of global columns in the matrix.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Functor for the the ABSMAX CombineMode of Import and Export operations.
void checkInternalState() const
Check that this object&#39;s state is sane; throw if it&#39;s not.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void copyOffsets(const OutputViewType &dst, const InputViewType &src)
Copy row offsets (in a sparse graph or matrix) from src to dst. The offsets may have different types...
CrsGraph< LocalOrdinal, GlobalOrdinal, Node > crs_graph_type
The CrsGraph specialization suitable for this CrsMatrix specialization.
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix&#39;s column Map with the given Map.
virtual LocalOrdinal replaceGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceGlobalValues.
virtual bool supportsRowViews() const override
Return true if getLocalRowView() and getGlobalRowView() are valid for this object.
void replaceDomainMapAndImporter(const Teuchos::RCP< const map_type > &newDomainMap, Teuchos::RCP< const import_type > &newImporter)
Replace the current domain Map and Import with the given objects.
local_inds_dualv_type::t_host::const_type getLocalIndsViewHost(const RowInfo &rowinfo) const
Get a const, locally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myRo...
void merge2(IT1 &indResultOut, IT2 &valResultOut, IT1 indBeg, IT1 indEnd, IT2 valBeg, IT2)
Merge values in place, additively, with the same index.
static size_t mergeRowIndicesAndValues(size_t rowLen, local_ordinal_type *cols, impl_scalar_type *vals)
Merge duplicate row indices in the given row, along with their corresponding values.
void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row...
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
size_t insertGlobalIndicesImpl(const local_ordinal_type lclRow, const global_ordinal_type inputGblColInds[], const size_t numInputInds)
Insert global indices, using an input local row index.
std::map< GlobalOrdinal, std::pair< Teuchos::Array< GlobalOrdinal >, Teuchos::Array< Scalar > > > nonlocals_
Nonlocal data added using insertGlobalValues().
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
void sortAndMergeIndicesAndValues(const bool sorted, const bool merged)
Sort and merge duplicate local column indices in all rows on the calling process, along with their co...
typename device_type::execution_space execution_space
The Kokkos execution space.
void packNew(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< char *, buffer_device_type > &exports, const Kokkos::DualView< size_t *, buffer_device_type > &numPacketsPerLID, size_t &constantNumPackets) const
Pack this object&#39;s data for an Import or Export.
virtual void insertGlobalValuesImpl(crs_graph_type &graph, RowInfo &rowInfo, const GlobalOrdinal gblColInds[], const impl_scalar_type vals[], const size_t numInputEnt)
Common implementation detail of insertGlobalValues and insertGlobalValuesFiltered.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
size_t getNumVectors() const
Number of columns in the multivector.
void getGlobalRowView(GlobalOrdinal GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting view of a row of this matrix, using global row and column indices...
size_t getLocalLength() const
Local number of rows on the calling process.
Declaration of a function that prints strings from each process.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
Traits class for packing / unpacking data of type T.
void replaceRangeMapAndExporter(const Teuchos::RCP< const map_type > &newRangeMap, Teuchos::RCP< const export_type > &newExporter)
Replace the current Range Map and Export with the given objects.
virtual size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const =0
The current number of entries on the calling process in the specified local row.
void scale(const Scalar &alpha)
Scale the matrix&#39;s values: this := alpha*this.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
size_t getLocalNumCols() const override
The number of columns connected to the locally owned rows of this matrix.
Declare and define Tpetra::Details::copyOffsets, an implementation detail of Tpetra (in particular...
void fillLocalGraphAndMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local graph and matrix.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
void leftScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the left with the given Vector.
bool noRedundancies_
Whether the graph&#39;s indices are non-redundant (merged) in each row, on this process.
GlobalOrdinal global_ordinal_type
The type of each global index in the matrix.
void padCrsArrays(const RowPtr &rowPtrBeg, const RowPtr &rowPtrEnd, Indices &indices_wdv, const Padding &padding, const int my_rank, const bool verbose)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries...
bool isDistributed() const
Whether this is a globally distributed object.
void reindexColumns(crs_graph_type *const graph, const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix&#39;s graph, as a CrsGraph.
global_inds_dualv_type::t_host::const_type getGlobalIndsViewHost(const RowInfo &rowinfo) const
Get a const, globally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myR...
static bool debug()
Whether Tpetra is in debug mode.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, over all processes in the matrix&#39;s communicator...
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRangeMap() const =0
The Map associated with the range of this operator, which must be compatible with Y...
void applyNonTranspose(const MV &X_in, MV &Y_in, Scalar alpha, Scalar beta) const
Special case of apply() for mode == Teuchos::NO_TRANS.
Teuchos::RCP< CrsMatrix< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another CrsMatrix with the same entries, but converted to a different Scalar type T...
Scalar scalar_type
The type of each entry in the matrix.
Allocation information for a locally owned row in a CrsGraph or CrsMatrix.
void swap(CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &matrix)
Swaps the data from *this with the data and maps from crsMatrix.
void fillLocalMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local matrix.
local_inds_wdv_type lclIndsUnpacked_wdv
Local ordinals of column indices for all rows Valid when isLocallyIndexed is true If OptimizedStorage...
void verbosePrintArray(std::ostream &out, const ArrayType &x, const char name[], const size_t maxNumToPrint)
Print min(x.size(), maxNumToPrint) entries of x.
bool isGloballyIndexed() const override
Whether the graph&#39;s column indices are stored as global indices.
void leftScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Left-scale a KokkosSparse::CrsMatrix.
Teuchos_Ordinal Array_size_type
Size type for Teuchos Array objects.
void globalAssemble()
Communicate nonlocal contributions to other processes.
void getLocalDiagCopy(Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &diag) const override
Get a constant, nonpersisting view of a row of this matrix, using local row and column indices...
size_t findGlobalIndices(const RowInfo &rowInfo, const Teuchos::ArrayView< const global_ordinal_type > &indices, std::function< void(const size_t, const size_t, const size_t)> fun) const
Finds indices in the given row.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
virtual LocalOrdinal sumIntoLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoLocalValues.
void sort(View &view, const size_t &size)
Convenience wrapper for std::sort for host-accessible views.
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator, in rank order.
bool isFillActive() const
Whether the matrix is not fill complete.
Teuchos::RCP< MV > importMV_
Column Map MultiVector used in apply().
Declare and define Tpetra::Details::copyConvert, an implementation detail of Tpetra (in particular...
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
size_t global_size_t
Global size_t object.
bool hasTransposeApply() const override
Whether apply() allows applying the transpose or conjugate transpose.
void reindexColumns(const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortIndicesInEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector&#39;s local data on host. This requires that ther...
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
void exportAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const export_type &exporter, const Teuchos::RCP< const map_type > &domainMap=Teuchos::null, const Teuchos::RCP< const map_type > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Export from this to the given destination matrix, and make the result fill complete.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
Insert new values that don&#39;t currently exist.
values_dualv_type::t_dev::const_type getValuesViewDevice(const RowInfo &rowinfo) const
Get a const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow)...
bool isFillComplete() const override
Whether the matrix is fill complete.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > createOneToOne(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &M)
Nonmember constructor for a contiguous Map with user-defined weights and a user-specified, possibly nondefault Kokkos Node type.
void importAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const import_type &importer, const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Import from this to the given destination matrix, and make the result fill complete.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const =0
Get a copy of the given global row&#39;s entries.
Teuchos::RCP< const map_type > rowMap_
The Map describing the distribution of rows of the graph.
local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
num_row_entries_type k_numRowEntries_
The number of local entries in each locally owned row.
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
Functions for manipulating CRS arrays.
Kokkos::View< size_t *, Kokkos::LayoutLeft, device_type >::HostMirror num_row_entries_type
Row offsets for &quot;1-D&quot; storage.
GlobalOrdinal getIndexBase() const override
The index base for global indices for this matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2, const bool stableSort=false)
Sort the first array, and apply the resulting permutation to the second array.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getDomainMap() const =0
The Map associated with the domain of this operator, which must be compatible with X...
Teuchos::RCP< MV > getColumnMapMultiVector(const MV &X_domainMap, const bool force=false) const
Create a (or fetch a cached) column Map MultiVector.
#define TPETRA_ABUSE_WARNING(throw_exception_test, Exception, msg)
Handle an abuse warning, according to HAVE_TPETRA_THROW_ABUSE_WARNINGS and HAVE_TPETRA_PRINT_ABUSE_WA...
bool isNodeGlobalElement(global_ordinal_type globalIndex) const
Whether the given global index is owned by this Map on the calling process.
void packCrsMatrixNew(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports, const Kokkos::DualView< size_t *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &numPacketsPerLID, const Kokkos::DualView< const LO *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication, for &quot;new&quot; DistObject inter...
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const override
Print this object with the given verbosity level to the given output stream.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix&#39;s graph, as a RowGraph.
static bool verbose()
Whether Tpetra is in verbose mode.
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< char *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode CM) override
Unpack the imported column indices and values, and combine into matrix.
bool isFillComplete() const override
Whether fillComplete() has been called and the graph is in compute mode.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
RowInfo getRowInfoFromGlobalRowIndex(const global_ordinal_type gblRow) const
Get information about the locally owned row with global index gblRow.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Utility functions for packing and unpacking sparse matrix entries.
void copyConvert(const OutputViewType &dst, const InputViewType &src)
Copy values from the 1-D Kokkos::View src, to the 1-D Kokkos::View dst, of the same length...
virtual Teuchos::RCP< RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params) const override
Implementation of RowMatrix::add: return alpha*A + beta*this.
bool fillComplete_
Whether the matrix is fill complete.
local_ordinal_type replaceGlobalValues(const global_ordinal_type globalRow, const Kokkos::View< const global_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries&#39; values, using global indices.
Replace old value with maximum of magnitudes of old and new values.
virtual LocalOrdinal sumIntoGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoGlobalValues.
Abstract base class for objects that can be the source of an Import or Export operation.
local_ordinal_type sumIntoLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using local row and column indices.
typename Node::device_type device_type
The Kokkos device type.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Number of entries in the sparse matrix in the given local row, on the calling (MPI) process...
static LocalMapType::local_ordinal_type getDiagCopyWithoutOffsets(const DiagType &D, const LocalMapType &rowMap, const LocalMapType &colMap, const CrsMatrixType &A)
Given a locally indexed, local sparse matrix, and corresponding local row and column Maps...
Teuchos::RCP< MV > getRowMapMultiVector(const MV &Y_rangeMap, const bool force=false) const
Create a (or fetch a cached) row Map MultiVector.
std::string description() const override
A one-line description of this object.
void getLocalDiagOffsets(Teuchos::ArrayRCP< size_t > &offsets) const
Get offsets of the diagonal entries in the matrix.
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
Compute a sparse matrix-MultiVector multiply.
size_t getLocalNumEntries() const override
The local number of entries in this matrix.
LO getLocalDiagCopyWithoutOffsetsNotFillComplete(::Tpetra::Vector< SC, LO, GO, NT > &diag, const ::Tpetra::RowMatrix< SC, LO, GO, NT > &A, const bool debug=false)
Given a locally indexed, global sparse matrix, extract the matrix&#39;s diagonal entries into a Tpetra::V...
Replace existing values with new values.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector&#39;s local data on device. This requires that th...
Replace old values with zero.
const row_ptrs_host_view_type & getRowPtrsUnpackedHost() const
Get the unpacked row pointers on host. Lazily make a copy from device.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
static size_t rowImbalanceThreshold()
Threshold for deciding if a local matrix is &quot;imbalanced&quot; in the number of entries per row...
bool isLocallyComplete() const
Is this Export or Import locally complete?
virtual LocalOrdinal replaceLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceLocalValues.
Declaration and definition of Tpetra::Details::leftScaleLocalCrsMatrix.
RowInfo getRowInfo(const local_ordinal_type myRow) const
Get information about the locally owned row with local index myRow.
values_dualv_type::t_dev getValuesViewDeviceNonConst(const RowInfo &rowinfo)
Get a non-const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myR...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap) override
Remove processes owning zero rows from the Maps and their communicator.
virtual bool checkSizes(const SrcDistObject &source) override
Compare the source and target (this) objects for compatibility.
local_map_type getLocalMap() const
Get the local Map for Kokkos kernels.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
A parallel distribution of indices over processes.
void getLocalRowCopy(LocalOrdinal LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row...
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object (&quot;forward mode&quot;).
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Teuchos::ArrayView< typename DualViewType::t_dev::value_type > getArrayViewFromDualView(const DualViewType &x)
Get a Teuchos::ArrayView which views the host Kokkos::View of the input 1-D Kokkos::DualView.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.
void insertLocalValues(const LocalOrdinal localRow, const Teuchos::ArrayView< const LocalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const CombineMode CM=ADD)
Insert one or more entries into the matrix, using local column indices.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
Details::EStorageStatus storageStatus_
Status of the matrix&#39;s storage, when not in a fill-complete state.
A read-only, row-oriented interface to a sparse matrix.
void rightScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the right with the given Vector.
void replaceDomainMap(const Teuchos::RCP< const map_type > &newDomainMap)
Replace the current domain Map with the given objects.
Scalar operator()(const Scalar &x, const Scalar &y)
Return the maximum of the magnitudes (absolute values) of x and y.
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
void getLocalRowView(LocalOrdinal LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant view of a row of this matrix, using local row and column indices.
values_dualv_type::t_host::const_type getValuesViewHost(const RowInfo &rowinfo) const
Get a const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow).
bool isLocallyIndexed() const override
Whether the graph&#39;s column indices are stored as local indices.
A distributed dense vector.
Declaration of Tpetra::Details::iallreduce.
void reduce()
Sum values of a locally replicated multivector across all processes.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
std::shared_ptr< CommRequest > iallreduce(const InputViewType &sendbuf, const OutputViewType &recvbuf, const ::Teuchos::EReductionType op, const ::Teuchos::Comm< int > &comm)
Nonblocking all-reduce, for either rank-1 or rank-0 Kokkos::View objects.
void applyTranspose(const MV &X_in, MV &Y_in, const Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Special case of apply() for mode != Teuchos::NO_TRANS.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
void allocateValues(ELocalGlobal lg, GraphAllocationStatus gas, const bool verbose)
Allocate values (and optionally indices) using the Node.
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
size_t getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const override
Number of entries in the sparse matrix in the given global row, on the calling (MPI) process...
static size_t verbosePrintCountThreshold()
Number of entries below which arrays, lists, etc. will be printed in debug mode.
local_matrix_device_type::values_type::const_type getLocalValuesDevice(Access::ReadOnlyStruct s) const
Get the Kokkos local values on device, read only.
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
local_ordinal_type replaceLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries&#39; values, using local row and column indices.
Declaration and definition of Tpetra::Details::rightScaleLocalCrsMatrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
void replaceRangeMap(const Teuchos::RCP< const map_type > &newRangeMap)
Replace the current range Map with the given objects.
std::shared_ptr< local_multiply_op_type > getLocalMultiplyOperator() const
The local sparse matrix operator (a wrapper of getLocalMatrixDevice() that supports local matrix-vect...
Teuchos::RCP< const map_type > colMap_
The Map describing the distribution of columns of the graph.
LocalOrdinal local_ordinal_type
The type of each local index in the matrix.
mag_type getFrobeniusNorm() const override
Compute and return the Frobenius norm of the matrix.
std::unique_ptr< std::string > createPrefix(const int myRank, const char prefix[])
Create string prefix for each line of verbose output.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRowMap() const =0
The Map that describes the distribution of rows over processes.
Accumulate new values into existing values (may not be supported in all classes)
void localApply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, const Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar &alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar &beta=Teuchos::ScalarTraits< Scalar >::zero()) const
Compute the local part of a sparse matrix-(Multi)Vector multiply.
void rightScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Right-scale a KokkosSparse::CrsMatrix.
bool isStorageOptimized() const
Returns true if storage has been optimized.
Description of Tpetra&#39;s behavior.
virtual void copyAndPermute(const SrcDistObject &source, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
values_dualv_type::t_host getValuesViewHostNonConst(const RowInfo &rowinfo)
Get a non-const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
Teuchos::RCP< MV > exportMV_
Row Map MultiVector used in apply().
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
global_size_t getGlobalNumEntries() const override
The global number of entries in this matrix.