Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 // clang-format off
11 #ifndef TPETRA_MULTIVECTOR_DEF_HPP
12 #define TPETRA_MULTIVECTOR_DEF_HPP
13 
21 
22 #include "Tpetra_Core.hpp"
23 #include "Tpetra_Util.hpp"
24 #include "Tpetra_Vector.hpp"
28 #include "Tpetra_Details_fill.hpp"
36 #include "Tpetra_Details_Random.hpp"
37 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
38 # include "Teuchos_SerialDenseMatrix.hpp"
39 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
40 #include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
41 #include "KokkosCompat_View.hpp"
42 #include "KokkosBlas.hpp"
43 #include "KokkosKernels_Utils.hpp"
44 #include "Kokkos_Random.hpp"
45 #include "Kokkos_ArithTraits.hpp"
46 #include <memory>
47 #include <sstream>
48 
49 #ifdef HAVE_TPETRA_INST_FLOAT128
50 namespace Kokkos {
51  // FIXME (mfh 04 Sep 2015) Just a stub for now!
52  template<class Generator>
53  struct rand<Generator, __float128> {
54  static KOKKOS_INLINE_FUNCTION __float128 max ()
55  {
56  return static_cast<__float128> (1.0);
57  }
58  static KOKKOS_INLINE_FUNCTION __float128
59  draw (Generator& gen)
60  {
61  // Half the smallest normalized double, is the scaling factor of
62  // the lower-order term in the double-double representation.
63  const __float128 scalingFactor =
64  static_cast<__float128> (std::numeric_limits<double>::min ()) /
65  static_cast<__float128> (2.0);
66  const __float128 higherOrderTerm = static_cast<__float128> (gen.drand ());
67  const __float128 lowerOrderTerm =
68  static_cast<__float128> (gen.drand ()) * scalingFactor;
69  return higherOrderTerm + lowerOrderTerm;
70  }
71  static KOKKOS_INLINE_FUNCTION __float128
72  draw (Generator& gen, const __float128& range)
73  {
74  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
75  const __float128 scalingFactor =
76  static_cast<__float128> (std::numeric_limits<double>::min ()) /
77  static_cast<__float128> (2.0);
78  const __float128 higherOrderTerm =
79  static_cast<__float128> (gen.drand (range));
80  const __float128 lowerOrderTerm =
81  static_cast<__float128> (gen.drand (range)) * scalingFactor;
82  return higherOrderTerm + lowerOrderTerm;
83  }
84  static KOKKOS_INLINE_FUNCTION __float128
85  draw (Generator& gen, const __float128& start, const __float128& end)
86  {
87  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
88  const __float128 scalingFactor =
89  static_cast<__float128> (std::numeric_limits<double>::min ()) /
90  static_cast<__float128> (2.0);
91  const __float128 higherOrderTerm =
92  static_cast<__float128> (gen.drand (start, end));
93  const __float128 lowerOrderTerm =
94  static_cast<__float128> (gen.drand (start, end)) * scalingFactor;
95  return higherOrderTerm + lowerOrderTerm;
96  }
97  };
98 } // namespace Kokkos
99 #endif // HAVE_TPETRA_INST_FLOAT128
100 
101 namespace { // (anonymous)
102 
117  template<class ST, class LO, class GO, class NT>
119  allocDualView (const size_t lclNumRows,
120  const size_t numCols,
121  const bool zeroOut = true,
122  const bool allowPadding = false)
123  {
124  using ::Tpetra::Details::Behavior;
125  using Kokkos::AllowPadding;
126  using Kokkos::view_alloc;
127  using Kokkos::WithoutInitializing;
128  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
129  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type wrapped_dual_view_type;
130  typedef typename dual_view_type::t_dev dev_view_type;
131 
132  // This needs to be a string and not a char*, if given as an
133  // argument to Kokkos::view_alloc. This is because view_alloc
134  // also allows a raw pointer as its first argument. See
135  // https://github.com/kokkos/kokkos/issues/434.
136  const std::string label ("MV::DualView");
137  const bool debug = Behavior::debug ();
138 
139  // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
140  // creation of the DualView's Views works around
141  // Kokkos::DualView's current inability to accept an
142  // AllocationProperties initial argument (as Kokkos::View does).
143  // However, the work-around is harmless, since it does what the
144  // (currently nonexistent) equivalent DualView constructor would
145  // have done anyway.
146 
147  dev_view_type d_view;
148  if (zeroOut) {
149  if (allowPadding) {
150  d_view = dev_view_type (view_alloc (label, AllowPadding),
151  lclNumRows, numCols);
152  }
153  else {
154  d_view = dev_view_type (view_alloc (label),
155  lclNumRows, numCols);
156  }
157  }
158  else {
159  if (allowPadding) {
160  d_view = dev_view_type (view_alloc (label,
161  WithoutInitializing,
162  AllowPadding),
163  lclNumRows, numCols);
164  }
165  else {
166  d_view = dev_view_type (view_alloc (label, WithoutInitializing),
167  lclNumRows, numCols);
168  }
169  if (debug) {
170  // Filling with NaN is a cheap and effective way to tell if
171  // downstream code is trying to use a MultiVector's data
172  // without them having been initialized. ArithTraits lets us
173  // call nan() even if the scalar type doesn't define it; it
174  // just returns some undefined value in the latter case. This
175  // won't hurt anything because by setting zeroOut=false, users
176  // already agreed that they don't care about the contents of
177  // the MultiVector.
178  const ST nan = Kokkos::ArithTraits<ST>::nan ();
179  KokkosBlas::fill (d_view, nan);
180  }
181  }
182  if (debug) {
183  TEUCHOS_TEST_FOR_EXCEPTION
184  (static_cast<size_t> (d_view.extent (0)) != lclNumRows ||
185  static_cast<size_t> (d_view.extent (1)) != numCols, std::logic_error,
186  "allocDualView: d_view's dimensions actual dimensions do not match "
187  "requested dimensions. d_view is " << d_view.extent (0) << " x " <<
188  d_view.extent (1) << "; requested " << lclNumRows << " x " << numCols
189  << ". Please report this bug to the Tpetra developers.");
190  }
191 
192  return wrapped_dual_view_type(d_view);
193  }
194 
195  // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
196  //
197  // T: The type of the entries of the View.
198  // ExecSpace: The Kokkos execution space.
199  template<class T, class ExecSpace>
200  struct MakeUnmanagedView {
201  // The 'false' part of the branch carefully ensures that this
202  // won't attempt to use a host execution space that hasn't been
203  // initialized. For example, if Kokkos::OpenMP is disabled and
204  // Kokkos::Threads is enabled, the latter is always the default
205  // execution space of Kokkos::HostSpace, even when ExecSpace is
206  // Kokkos::Serial. That's why we go through the trouble of asking
207  // Kokkos::DualView what _its_ space is. That seems to work
208  // around this default execution space issue.
209  //
210  typedef typename std::conditional<
211  Kokkos::SpaceAccessibility<
212  typename ExecSpace::memory_space,
213  Kokkos::HostSpace>::accessible,
214  typename ExecSpace::device_type,
215  typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
216  typedef Kokkos::LayoutLeft array_layout;
217  typedef Kokkos::View<T*, array_layout, host_exec_space,
218  Kokkos::MemoryUnmanaged> view_type;
219 
220  static view_type getView (const Teuchos::ArrayView<T>& x_in)
221  {
222  const size_t numEnt = static_cast<size_t> (x_in.size ());
223  if (numEnt == 0) {
224  return view_type ();
225  } else {
226  return view_type (x_in.getRawPtr (), numEnt);
227  }
228  }
229  };
230 
231 
232  template<class WrappedDualViewType>
233  WrappedDualViewType
234  takeSubview (const WrappedDualViewType& X,
235  const std::pair<size_t, size_t>& rowRng,
236  const Kokkos::ALL_t& colRng)
237 
238  {
239  // The bug we saw below should be harder to trigger here.
240  return WrappedDualViewType(X,rowRng,colRng);
241  }
242 
243  template<class WrappedDualViewType>
244  WrappedDualViewType
245  takeSubview (const WrappedDualViewType& X,
246  const Kokkos::ALL_t& rowRng,
247  const std::pair<size_t, size_t>& colRng)
248  {
249  using DualViewType = typename WrappedDualViewType::DVT;
250  // Look carefullly at the comment in the below version of this function.
251  // The comment applies here as well.
252  if (X.extent (0) == 0 && X.extent (1) != 0) {
253  return WrappedDualViewType(DualViewType ("MV::DualView", 0, colRng.second - colRng.first));
254  }
255  else {
256  return WrappedDualViewType(X,rowRng,colRng);
257  }
258  }
259 
260  template<class WrappedDualViewType>
261  WrappedDualViewType
262  takeSubview (const WrappedDualViewType& X,
263  const std::pair<size_t, size_t>& rowRng,
264  const std::pair<size_t, size_t>& colRng)
265  {
266  using DualViewType = typename WrappedDualViewType::DVT;
267  // If you take a subview of a view with zero rows Kokkos::subview()
268  // always returns a DualView with the same data pointers. This will break
269  // pointer equality testing in between two subviews of the same 2D View if
270  // it has zero row extent. While the one (known) case where this was actually used
271  // has been fixed, that sort of check could very easily be reintroduced in the future,
272  // hence I've added this if check here.
273  //
274  // This is not a bug in Kokkos::subview(), just some very subtle behavior which
275  // future developers should be wary of.
276  if (X.extent (0) == 0 && X.extent (1) != 0) {
277  return WrappedDualViewType(DualViewType ("MV::DualView", 0, colRng.second - colRng.first));
278  }
279  else {
280  return WrappedDualViewType(X,rowRng,colRng);
281  }
282  }
283 
284  template<class WrappedOrNotDualViewType>
285  size_t
286  getDualViewStride (const WrappedOrNotDualViewType& dv)
287  {
288  // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
289  // method yet, but its Views do.
290  // NOTE: dv.stride() returns a vector of length one
291  // more than its rank
292  size_t strides[WrappedOrNotDualViewType::t_dev::rank+1];
293  dv.stride(strides);
294  const size_t LDA = strides[1];
295  const size_t numRows = dv.extent (0);
296 
297  if (LDA == 0) {
298  return (numRows == 0) ? size_t (1) : numRows;
299  }
300  else {
301  return LDA;
302  }
303  }
304 
305  template<class ViewType>
306  size_t
307  getViewStride (const ViewType& view)
308  {
309  const size_t LDA = view.stride (1);
310  const size_t numRows = view.extent (0);
311 
312  if (LDA == 0) {
313  return (numRows == 0) ? size_t (1) : numRows;
314  }
315  else {
316  return LDA;
317  }
318  }
319 
320  template <class impl_scalar_type, class buffer_device_type>
321  bool
322  runKernelOnHost (
323  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports
324  )
325  {
326  if (! imports.need_sync_device ()) {
327  return false; // most up-to-date on device
328  }
329  else { // most up-to-date on host,
330  // but if large enough, worth running on device anyway
331  size_t localLengthThreshold =
333  return imports.extent(0) <= localLengthThreshold;
334  }
335  }
336 
337 
338  template <class SC, class LO, class GO, class NT>
339  bool
340  runKernelOnHost (const ::Tpetra::MultiVector<SC, LO, GO, NT>& X)
341  {
342  if (! X.need_sync_device ()) {
343  return false; // most up-to-date on device
344  }
345  else { // most up-to-date on host
346  // but if large enough, worth running on device anyway
347  size_t localLengthThreshold =
349  return X.getLocalLength () <= localLengthThreshold;
350  }
351  }
352 
353  template <class SC, class LO, class GO, class NT>
354  void
355  multiVectorNormImpl (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
358  {
359  using namespace Tpetra;
362  using val_type = typename MV::impl_scalar_type;
363  using mag_type = typename MV::mag_type;
364  using dual_view_type = typename MV::dual_view_type;
365 
366  auto map = X.getMap ();
367  auto comm = map.is_null () ? nullptr : map->getComm ().getRawPtr ();
368  auto whichVecs = getMultiVectorWhichVectors (X);
369  const bool isConstantStride = X.isConstantStride ();
370  const bool isDistributed = X.isDistributed ();
371 
372  const bool runOnHost = runKernelOnHost (X);
373  if (runOnHost) {
374  using view_type = typename dual_view_type::t_host;
375  using array_layout = typename view_type::array_layout;
376  using device_type = typename view_type::device_type;
377 
378  auto X_lcl = X.getLocalViewHost(Access::ReadOnly);
379  normImpl<val_type, array_layout, device_type,
380  mag_type> (norms, X_lcl, whichNorm, whichVecs,
381  isConstantStride, isDistributed, comm);
382  }
383  else {
384  using view_type = typename dual_view_type::t_dev;
385  using array_layout = typename view_type::array_layout;
386  using device_type = typename view_type::device_type;
387 
388  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
389  normImpl<val_type, array_layout, device_type,
390  mag_type> (norms, X_lcl, whichNorm, whichVecs,
391  isConstantStride, isDistributed, comm);
392  }
393  }
394 } // namespace (anonymous)
395 
396 
397 namespace Tpetra {
398 
399  namespace Details {
400  template <typename DstView, typename SrcView>
401  struct AddAssignFunctor {
402  // This functor would be better as a lambda, but CUDA cannot compile
403  // lambdas in protected functions. It compiles fine with the functor.
404  AddAssignFunctor(DstView &tgt_, SrcView &src_) : tgt(tgt_), src(src_) {}
405 
406  KOKKOS_INLINE_FUNCTION void
407  operator () (const size_t k) const { tgt(k) += src(k); }
408 
409  DstView tgt;
410  SrcView src;
411  };
412  }
413 
414  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
415  bool
417  vectorIndexOutOfRange (const size_t VectorIndex) const {
418  return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
419  }
420 
421  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
424  base_type (Teuchos::rcp (new map_type ()))
425  {}
426 
427  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
429  MultiVector (const Teuchos::RCP<const map_type>& map,
430  const size_t numVecs,
431  const bool zeroOut) : /* default is true */
432  base_type (map)
433  {
434  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,numVecs,zeroOut)");
435 
436  const size_t lclNumRows = this->getLocalLength ();
437  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
438  }
439 
440  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
443  const Teuchos::DataAccess copyOrView) :
444  base_type (source),
445  view_ (source.view_),
446  whichVectors_ (source.whichVectors_)
447  {
449  const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
450  "const Teuchos::DataAccess): ";
451 
452  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV 2-arg \"copy\" ctor");
453 
454  if (copyOrView == Teuchos::Copy) {
455  // Reuse the conveniently already existing function that creates
456  // a deep copy.
457  MV cpy = createCopy (source);
458  this->view_ = cpy.view_;
459  this->whichVectors_ = cpy.whichVectors_;
460  }
461  else if (copyOrView == Teuchos::View) {
462  }
463  else {
464  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
465  true, std::invalid_argument, "Second argument 'copyOrView' has an "
466  "invalid value " << copyOrView << ". Valid values include "
467  "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
468  << Teuchos::View << ".");
469  }
470  }
471 
472  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
474  MultiVector (const Teuchos::RCP<const map_type>& map,
475  const dual_view_type& view) :
476  base_type (map),
477  view_ (wrapped_dual_view_type(view))
478  {
479  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
480  const size_t lclNumRows_map = map.is_null () ? size_t (0) :
481  map->getLocalNumElements ();
482  const size_t lclNumRows_view = view.extent (0);
483  const size_t LDA = getDualViewStride (view_);
484 
485  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
486  (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
487  std::invalid_argument, "Kokkos::DualView does not match Map. "
488  "map->getLocalNumElements() = " << lclNumRows_map
489  << ", view.extent(0) = " << lclNumRows_view
490  << ", and getStride() = " << LDA << ".");
491 
492  using ::Tpetra::Details::Behavior;
493  const bool debug = Behavior::debug ();
494  if (debug) {
495  using ::Tpetra::Details::checkGlobalDualViewValidity;
496  std::ostringstream gblErrStrm;
497  const bool verbose = Behavior::verbose ();
498  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
499  const bool gblValid =
500  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
501  comm.getRawPtr ());
502  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
503  (! gblValid, std::runtime_error, gblErrStrm.str ());
504  }
505  }
506 
507 
508  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
510  MultiVector (const Teuchos::RCP<const map_type>& map,
511  const wrapped_dual_view_type& view) :
512  base_type (map),
513  view_ (view)
514  {
515  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
516  const size_t lclNumRows_map = map.is_null () ? size_t (0) :
517  map->getLocalNumElements ();
518  const size_t lclNumRows_view = view.extent (0);
519  const size_t LDA = getDualViewStride (view);
520 
521  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
522  (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
523  std::invalid_argument, "Kokkos::DualView does not match Map. "
524  "map->getLocalNumElements() = " << lclNumRows_map
525  << ", view.extent(0) = " << lclNumRows_view
526  << ", and getStride() = " << LDA << ".");
527 
528  using ::Tpetra::Details::Behavior;
529  const bool debug = Behavior::debug ();
530  if (debug) {
531  using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
532  std::ostringstream gblErrStrm;
533  const bool verbose = Behavior::verbose ();
534  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
535  const bool gblValid =
536  checkGlobalWrappedDualViewValidity (&gblErrStrm, view, verbose,
537  comm.getRawPtr ());
538  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
539  (! gblValid, std::runtime_error, gblErrStrm.str ());
540  }
541  }
542 
543 
544 
545  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
547  MultiVector (const Teuchos::RCP<const map_type>& map,
548  const typename dual_view_type::t_dev& d_view) :
549  base_type (map)
550  {
551  using Teuchos::ArrayRCP;
552  using Teuchos::RCP;
553  const char tfecfFuncName[] = "MultiVector(map,d_view): ";
554 
555  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,d_view)");
556 
557  const size_t LDA = getViewStride (d_view);
558  const size_t lclNumRows = map->getLocalNumElements ();
559  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
560  (LDA < lclNumRows, std::invalid_argument, "Map does not match "
561  "Kokkos::View. map->getLocalNumElements() = " << lclNumRows
562  << ", View's column stride = " << LDA
563  << ", and View's extent(0) = " << d_view.extent (0) << ".");
564 
565  auto h_view = Kokkos::create_mirror_view (d_view);
566  auto dual_view = dual_view_type (d_view, h_view);
567  view_ = wrapped_dual_view_type(dual_view);
568 
569  using ::Tpetra::Details::Behavior;
570  const bool debug = Behavior::debug ();
571  if (debug) {
572  using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
573  std::ostringstream gblErrStrm;
574  const bool verbose = Behavior::verbose ();
575  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
576  const bool gblValid =
577  checkGlobalWrappedDualViewValidity (&gblErrStrm, view_, verbose,
578  comm.getRawPtr ());
579  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
580  (! gblValid, std::runtime_error, gblErrStrm.str ());
581  }
582  // The user gave us a device view. In order to respect its
583  // initial contents, we mark the DualView as "modified on device."
584  // That way, the next sync will synchronize it with the host view.
585  dual_view.modify_device ();
586  }
587 
588  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
590  MultiVector (const Teuchos::RCP<const map_type>& map,
591  const dual_view_type& view,
592  const dual_view_type& origView) :
593  base_type (map),
594  view_ (wrapped_dual_view_type(view,origView))
595  {
596  const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
597 
598  const size_t LDA = getDualViewStride (origView);
599  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
600  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
601  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
602  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
603  << ". This may also mean that the input origView's first dimension (number "
604  "of rows = " << origView.extent (0) << ") does not not match the number "
605  "of entries in the Map on the calling process.");
606 
607  using ::Tpetra::Details::Behavior;
608  const bool debug = Behavior::debug ();
609  if (debug) {
610  using ::Tpetra::Details::checkGlobalDualViewValidity;
611  std::ostringstream gblErrStrm;
612  const bool verbose = Behavior::verbose ();
613  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
614  const bool gblValid_0 =
615  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
616  comm.getRawPtr ());
617  const bool gblValid_1 =
618  checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
619  comm.getRawPtr ());
620  const bool gblValid = gblValid_0 && gblValid_1;
621  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
622  (! gblValid, std::runtime_error, gblErrStrm.str ());
623  }
624  }
625 
626 
627  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
629  MultiVector (const Teuchos::RCP<const map_type>& map,
630  const dual_view_type& view,
631  const Teuchos::ArrayView<const size_t>& whichVectors) :
632  base_type (map),
633  view_ (view),
634  whichVectors_ (whichVectors.begin (), whichVectors.end ())
635  {
636  using Kokkos::ALL;
637  using Kokkos::subview;
638  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
639 
640  using ::Tpetra::Details::Behavior;
641  const bool debug = Behavior::debug ();
642  if (debug) {
643  using ::Tpetra::Details::checkGlobalDualViewValidity;
644  std::ostringstream gblErrStrm;
645  const bool verbose = Behavior::verbose ();
646  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
647  const bool gblValid =
648  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
649  comm.getRawPtr ());
650  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
651  (! gblValid, std::runtime_error, gblErrStrm.str ());
652  }
653 
654  const size_t lclNumRows = map.is_null () ? size_t (0) :
655  map->getLocalNumElements ();
656  // Check dimensions of the input DualView. We accept that Kokkos
657  // might not allow construction of a 0 x m (Dual)View with m > 0,
658  // so we only require the number of rows to match if the
659  // (Dual)View has more than zero columns. Likewise, we only
660  // require the number of columns to match if the (Dual)View has
661  // more than zero rows.
662  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
663  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
664  std::invalid_argument, "view.extent(0) = " << view.extent (0)
665  << " < map->getLocalNumElements() = " << lclNumRows << ".");
666  if (whichVectors.size () != 0) {
667  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
668  view.extent (1) != 0 && view.extent (1) == 0,
669  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
670  " = " << whichVectors.size () << " > 0.");
671  size_t maxColInd = 0;
672  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
673  for (size_type k = 0; k < whichVectors.size (); ++k) {
674  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
675  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
676  std::invalid_argument, "whichVectors[" << k << "] = "
677  "Teuchos::OrdinalTraits<size_t>::invalid().");
678  maxColInd = std::max (maxColInd, whichVectors[k]);
679  }
680  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
681  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
682  std::invalid_argument, "view.extent(1) = " << view.extent (1)
683  << " <= max(whichVectors) = " << maxColInd << ".");
684  }
685 
686  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
687  // zero strides, so modify in that case.
688  const size_t LDA = getDualViewStride (view);
689  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
690  (LDA < lclNumRows, std::invalid_argument,
691  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
692 
693  if (whichVectors.size () == 1) {
694  // If whichVectors has only one entry, we don't need to bother
695  // with nonconstant stride. Just shift the view over so it
696  // points to the desired column.
697  //
698  // NOTE (mfh 10 May 2014) This is a special case where we set
699  // origView_ just to view that one column, not all of the
700  // original columns. This ensures that the use of origView_ in
701  // offsetView works correctly.
702  //
703  const std::pair<size_t, size_t> colRng (whichVectors[0],
704  whichVectors[0] + 1);
705  view_ = takeSubview (view_, ALL (), colRng);
706  // whichVectors_.size() == 0 means "constant stride."
707  whichVectors_.clear ();
708  }
709  }
710 
711  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
713  MultiVector (const Teuchos::RCP<const map_type>& map,
714  const wrapped_dual_view_type& view,
715  const Teuchos::ArrayView<const size_t>& whichVectors) :
716  base_type (map),
717  view_ (view),
718  whichVectors_ (whichVectors.begin (), whichVectors.end ())
719  {
720  using Kokkos::ALL;
721  using Kokkos::subview;
722  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
723 
724  using ::Tpetra::Details::Behavior;
725  const bool debug = Behavior::debug ();
726  if (debug) {
727  using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
728  std::ostringstream gblErrStrm;
729  const bool verbose = Behavior::verbose ();
730  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
731  const bool gblValid =
732  checkGlobalWrappedDualViewValidity (&gblErrStrm, view, verbose,
733  comm.getRawPtr ());
734  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
735  (! gblValid, std::runtime_error, gblErrStrm.str ());
736  }
737 
738  const size_t lclNumRows = map.is_null () ? size_t (0) :
739  map->getLocalNumElements ();
740  // Check dimensions of the input DualView. We accept that Kokkos
741  // might not allow construction of a 0 x m (Dual)View with m > 0,
742  // so we only require the number of rows to match if the
743  // (Dual)View has more than zero columns. Likewise, we only
744  // require the number of columns to match if the (Dual)View has
745  // more than zero rows.
746  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
747  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
748  std::invalid_argument, "view.extent(0) = " << view.extent (0)
749  << " < map->getLocalNumElements() = " << lclNumRows << ".");
750  if (whichVectors.size () != 0) {
751  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
752  view.extent (1) != 0 && view.extent (1) == 0,
753  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
754  " = " << whichVectors.size () << " > 0.");
755  size_t maxColInd = 0;
756  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
757  for (size_type k = 0; k < whichVectors.size (); ++k) {
758  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
759  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
760  std::invalid_argument, "whichVectors[" << k << "] = "
761  "Teuchos::OrdinalTraits<size_t>::invalid().");
762  maxColInd = std::max (maxColInd, whichVectors[k]);
763  }
764  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
765  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
766  std::invalid_argument, "view.extent(1) = " << view.extent (1)
767  << " <= max(whichVectors) = " << maxColInd << ".");
768  }
769 
770  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
771  // zero strides, so modify in that case.
772  const size_t LDA = getDualViewStride (view);
773  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
774  (LDA < lclNumRows, std::invalid_argument,
775  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
776 
777  if (whichVectors.size () == 1) {
778  // If whichVectors has only one entry, we don't need to bother
779  // with nonconstant stride. Just shift the view over so it
780  // points to the desired column.
781  //
782  // NOTE (mfh 10 May 2014) This is a special case where we set
783  // origView_ just to view that one column, not all of the
784  // original columns. This ensures that the use of origView_ in
785  // offsetView works correctly.
786  //
787  const std::pair<size_t, size_t> colRng (whichVectors[0],
788  whichVectors[0] + 1);
789  view_ = takeSubview (view_, ALL (), colRng);
790  // whichVectors_.size() == 0 means "constant stride."
791  whichVectors_.clear ();
792  }
793  }
794 
795 
796  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
798  MultiVector (const Teuchos::RCP<const map_type>& map,
799  const dual_view_type& view,
800  const dual_view_type& origView,
801  const Teuchos::ArrayView<const size_t>& whichVectors) :
802  base_type (map),
803  view_ (wrapped_dual_view_type(view,origView)),
804  whichVectors_ (whichVectors.begin (), whichVectors.end ())
805  {
806  using Kokkos::ALL;
807  using Kokkos::subview;
808  const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
809 
810  using ::Tpetra::Details::Behavior;
811  const bool debug = Behavior::debug ();
812  if (debug) {
813  using ::Tpetra::Details::checkGlobalDualViewValidity;
814  std::ostringstream gblErrStrm;
815  const bool verbose = Behavior::verbose ();
816  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
817  const bool gblValid_0 =
818  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
819  comm.getRawPtr ());
820  const bool gblValid_1 =
821  checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
822  comm.getRawPtr ());
823  const bool gblValid = gblValid_0 && gblValid_1;
824  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
825  (! gblValid, std::runtime_error, gblErrStrm.str ());
826  }
827 
828  const size_t lclNumRows = this->getLocalLength ();
829  // Check dimensions of the input DualView. We accept that Kokkos
830  // might not allow construction of a 0 x m (Dual)View with m > 0,
831  // so we only require the number of rows to match if the
832  // (Dual)View has more than zero columns. Likewise, we only
833  // require the number of columns to match if the (Dual)View has
834  // more than zero rows.
835  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
836  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
837  std::invalid_argument, "view.extent(0) = " << view.extent (0)
838  << " < map->getLocalNumElements() = " << lclNumRows << ".");
839  if (whichVectors.size () != 0) {
840  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
841  view.extent (1) != 0 && view.extent (1) == 0,
842  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
843  " = " << whichVectors.size () << " > 0.");
844  size_t maxColInd = 0;
845  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
846  for (size_type k = 0; k < whichVectors.size (); ++k) {
847  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
848  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
849  std::invalid_argument, "whichVectors[" << k << "] = "
850  "Teuchos::OrdinalTraits<size_t>::invalid().");
851  maxColInd = std::max (maxColInd, whichVectors[k]);
852  }
853  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
854  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
855  std::invalid_argument, "view.extent(1) = " << view.extent (1)
856  << " <= max(whichVectors) = " << maxColInd << ".");
857  }
858 
859  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
860  // zero strides, so modify in that case.
861  const size_t LDA = getDualViewStride (origView);
862  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
863  (LDA < lclNumRows, std::invalid_argument, "Map and DualView origView "
864  "do not match. LDA = " << LDA << " < this->getLocalLength() = " <<
865  lclNumRows << ". origView.extent(0) = " << origView.extent(0)
866  << ", origView.stride(1) = " << origView.d_view.stride(1) << ".");
867 
868  if (whichVectors.size () == 1) {
869  // If whichVectors has only one entry, we don't need to bother
870  // with nonconstant stride. Just shift the view over so it
871  // points to the desired column.
872  //
873  // NOTE (mfh 10 May 2014) This is a special case where we set
874  // origView_ just to view that one column, not all of the
875  // original columns. This ensures that the use of origView_ in
876  // offsetView works correctly.
877  const std::pair<size_t, size_t> colRng (whichVectors[0],
878  whichVectors[0] + 1);
879  view_ = takeSubview (view_, ALL (), colRng);
880  // origView_ = takeSubview (origView_, ALL (), colRng);
881  // whichVectors_.size() == 0 means "constant stride."
882  whichVectors_.clear ();
883  }
884  }
885 
886  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
888  MultiVector (const Teuchos::RCP<const map_type>& map,
889  const Teuchos::ArrayView<const Scalar>& data,
890  const size_t LDA,
891  const size_t numVecs) :
892  base_type (map)
893  {
894  typedef LocalOrdinal LO;
895  typedef GlobalOrdinal GO;
896  const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
897  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
898 
899  // Deep copy constructor, constant stride (NO whichVectors_).
900  // There is no need for a deep copy constructor with nonconstant stride.
901 
902  const size_t lclNumRows =
903  map.is_null () ? size_t (0) : map->getLocalNumElements ();
904  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
905  (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
906  "map->getLocalNumElements() = " << lclNumRows << ".");
907  if (numVecs != 0) {
908  const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
909  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
910  (static_cast<size_t> (data.size ()) < minNumEntries,
911  std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
912  "entries, given the input Map and number of vectors in the MultiVector."
913  " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
914  "map->getLocalNumElements () = " << minNumEntries << ".");
915  }
916 
917  this->view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
918  //Note: X_out will be completely overwritten
919  auto X_out = this->getLocalViewDevice(Access::OverwriteAll);
920 
921  // Make an unmanaged host Kokkos::View of the input data. First
922  // create a View (X_in_orig) with the original stride. Then,
923  // create a subview (X_in) with the right number of columns.
924  const impl_scalar_type* const X_in_raw =
925  reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
926  Kokkos::View<const impl_scalar_type**,
927  Kokkos::LayoutLeft,
928  Kokkos::HostSpace,
929  Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
930  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
931  auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
932 
933  // If LDA != X_out's column stride, then we need to copy one
934  // column at a time; Kokkos::deep_copy refuses to work in that
935  // case.
936  const size_t outStride =
937  X_out.extent (1) == 0 ? size_t (1) : X_out.stride (1);
938  if (LDA == outStride) { // strides are the same; deep_copy once
939  // This only works because MultiVector uses LayoutLeft.
940  // We would need a custom copy functor otherwise.
941  // DEEP_COPY REVIEW - HOST-TO-DEVICE
942  Kokkos::deep_copy (execution_space(), X_out, X_in);
943  }
944  else { // strides differ; copy one column at a time
945  typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
946  out_col_view_type;
947  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
948  in_col_view_type;
949  for (size_t j = 0; j < numVecs; ++j) {
950  out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
951  in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
952  // DEEP_COPY REVIEW - HOST-TO-DEVICE
953  Kokkos::deep_copy (execution_space(), X_out_j, X_in_j);
954  }
955  }
956  }
957 
958  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
960  MultiVector (const Teuchos::RCP<const map_type>& map,
961  const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
962  const size_t numVecs) :
963  base_type (map)
964  {
965  typedef impl_scalar_type IST;
966  typedef LocalOrdinal LO;
967  typedef GlobalOrdinal GO;
968  const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
969  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
970 
971  const size_t lclNumRows =
972  map.is_null () ? size_t (0) : map->getLocalNumElements ();
973  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
974  (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
975  std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
976  "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
977  for (size_t j = 0; j < numVecs; ++j) {
978  Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
979  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
980  static_cast<size_t> (X_j_av.size ()) < lclNumRows,
981  std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
982  << X_j_av.size () << " < map->getLocalNumElements() = " << lclNumRows
983  << ".");
984  }
985 
986  view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
987  auto X_out = this->getLocalViewDevice(Access::ReadWrite);
988 
989  // Make sure that the type of a single input column has the same
990  // array layout as each output column does, so we can deep_copy.
991  using array_layout = typename decltype (X_out)::array_layout;
992  using input_col_view_type = typename Kokkos::View<const IST*,
993  array_layout,
994  Kokkos::HostSpace,
995  Kokkos::MemoryUnmanaged>;
996 
997  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
998  for (size_t j = 0; j < numVecs; ++j) {
999  Teuchos::ArrayView<const IST> X_j_av =
1000  Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
1001  input_col_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
1002  auto X_j_out = Kokkos::subview (X_out, rowRng, j);
1003  // DEEP_COPY REVIEW - HOST-TO-DEVICE
1004  Kokkos::deep_copy (execution_space(), X_j_out, X_j_in);
1005  }
1006  }
1007 
1008  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1011  return whichVectors_.empty ();
1012  }
1013 
1014  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1015  size_t
1018  {
1019  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
1020  return static_cast<size_t> (0);
1021  } else {
1022  return this->getMap ()->getLocalNumElements ();
1023  }
1024  }
1025 
1026  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1030  {
1031  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
1032  return static_cast<size_t> (0);
1033  } else {
1034  return this->getMap ()->getGlobalNumElements ();
1035  }
1036  }
1037 
1038  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1039  size_t
1041  getStride () const
1042  {
1043  return isConstantStride () ? getDualViewStride (view_) : size_t (0);
1044  }
1045 
1046  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1047  bool
1050  {
1051  //Don't actually get a view, just get pointers.
1052  auto thisData = view_.getDualView().h_view.data();
1053  auto otherData = other.view_.getDualView().h_view.data();
1054  size_t thisSpan = view_.getDualView().h_view.span();
1055  size_t otherSpan = other.view_.getDualView().h_view.span();
1056  return (otherData <= thisData && thisData < otherData + otherSpan)
1057  || (thisData <= otherData && otherData < thisData + thisSpan);
1058  }
1059 
1060  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1061  bool
1063  checkSizes (const SrcDistObject& sourceObj)
1064  {
1065  // Check whether the source object is a MultiVector. If not, then
1066  // we can't even compare sizes, so it's definitely not OK to
1067  // Import or Export from it.
1069  const MV* src = dynamic_cast<const MV*> (&sourceObj);
1070  if (src == nullptr) {
1071  return false;
1072  }
1073  else {
1074  // The target of the Import or Export calls checkSizes() in
1075  // DistObject::doTransfer(). By that point, we've already
1076  // constructed an Import or Export object using the two
1077  // multivectors' Maps, which means that (hopefully) we've
1078  // already checked other attributes of the multivectors. Thus,
1079  // all we need to do here is check the number of columns.
1080  return src->getNumVectors () == this->getNumVectors ();
1081  }
1082  }
1083 
1084  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1085  size_t
1088  return this->getNumVectors ();
1089  }
1090 
1091  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1092  void
1095  (const SrcDistObject& sourceObj,
1096  const size_t numSameIDs,
1097  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1098  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1099  const CombineMode CM,
1100  const execution_space &space)
1101  {
1102  using ::Tpetra::Details::Behavior;
1104  using ::Tpetra::Details::ProfilingRegion;
1105  using std::endl;
1106  using KokkosRefactor::Details::permute_array_multi_column;
1107  using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
1108  using Kokkos::Compat::create_const_view;
1110 
1111  // We've already called checkSizes(), so this cast must succeed.
1112  MV& sourceMV = const_cast<MV &>(dynamic_cast<const MV&> (sourceObj));
1113  const bool copyOnHost = runKernelOnHost(sourceMV);
1114  const char longFuncNameHost[] = "Tpetra::MultiVector::copyAndPermute[Host]";
1115  const char longFuncNameDevice[] = "Tpetra::MultiVector::copyAndPermute[Device]";
1116  const char tfecfFuncName[] = "copyAndPermute: ";
1117  ProfilingRegion regionCAP (copyOnHost ? longFuncNameHost : longFuncNameDevice);
1118 
1119  const bool verbose = Behavior::verbose ();
1120  std::unique_ptr<std::string> prefix;
1121  if (verbose) {
1122  auto map = this->getMap ();
1123  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1124  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1125  std::ostringstream os;
1126  os << "Proc " << myRank << ": MV::copyAndPermute: ";
1127  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1128  os << "Start" << endl;
1129  std::cerr << os.str ();
1130  }
1131 
1132  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1133  (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0),
1134  std::logic_error, "permuteToLIDs.extent(0) = "
1135  << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = "
1136  << permuteFromLIDs.extent (0) << ".");
1137  const size_t numCols = this->getNumVectors ();
1138 
1139  // sourceMV doesn't belong to us, so we can't sync it. Do the
1140  // copying where it's currently sync'd.
1141  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1142  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1143  std::logic_error, "Input MultiVector needs sync to both host "
1144  "and device.");
1145  if (verbose) {
1146  std::ostringstream os;
1147  os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1148  std::cerr << os.str ();
1149  }
1150 
1151  if (verbose) {
1152  std::ostringstream os;
1153  os << *prefix << "Copy" << endl;
1154  std::cerr << os.str ();
1155  }
1156 
1157  // TODO (mfh 15 Sep 2013) When we replace
1158  // KokkosClassic::MultiVector with a Kokkos::View, there are two
1159  // ways to copy the data:
1160  //
1161  // 1. Get a (sub)view of each column and call deep_copy on that.
1162  // 2. Write a custom kernel to copy the data.
1163  //
1164  // The first is easier, but the second might be more performant in
1165  // case we decide to use layouts other than LayoutLeft. It might
1166  // even make sense to hide whichVectors_ in an entirely new layout
1167  // for Kokkos Views.
1168 
1169  // Copy rows [0, numSameIDs-1] of the local multivectors.
1170  //
1171  // Note (ETP 2 Jul 2014) We need to always copy one column at a
1172  // time, even when both multivectors are constant-stride, since
1173  // deep_copy between strided subviews with more than one column
1174  // doesn't currently work.
1175 
1176  // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1177  // both source and target are constant stride and have multiple
1178  // columns.
1179  if (numSameIDs > 0) {
1180  const std::pair<size_t, size_t> rows (0, numSameIDs);
1181  if (copyOnHost) {
1182  auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1183  auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1184 
1185  for (size_t j = 0; j < numCols; ++j) {
1186  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1187  const size_t srcCol =
1188  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1189 
1190  auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
1191  auto src_j = Kokkos::subview (src_h, rows, srcCol);
1192  if (CM == ADD_ASSIGN) {
1193  // Sum src_j into tgt_j
1194  using range_t =
1195  Kokkos::RangePolicy<execution_space, size_t>;
1196  range_t rp(space, 0,numSameIDs);
1197  Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1198  aaf(tgt_j, src_j);
1199  Kokkos::parallel_for(rp, aaf);
1200  }
1201  else {
1202  // Copy src_j into tgt_j
1203  // DEEP_COPY REVIEW - HOSTMIRROR-TO-HOSTMIRROR
1204  Kokkos::deep_copy (space, tgt_j, src_j);
1205  space.fence();
1206  }
1207  }
1208  }
1209  else { // copy on device
1210  auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1211  auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1212 
1213  for (size_t j = 0; j < numCols; ++j) {
1214  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1215  const size_t srcCol =
1216  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1217 
1218  auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
1219  auto src_j = Kokkos::subview (src_d, rows, srcCol);
1220  if (CM == ADD_ASSIGN) {
1221  // Sum src_j into tgt_j
1222  using range_t =
1223  Kokkos::RangePolicy<execution_space, size_t>;
1224  range_t rp(space, 0,numSameIDs);
1225  Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1226  aaf(tgt_j, src_j);
1227  Kokkos::parallel_for(rp, aaf);
1228  }
1229  else {
1230  // Copy src_j into tgt_j
1231  // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
1232  Kokkos::deep_copy (space, tgt_j, src_j);
1233  space.fence();
1234  }
1235  }
1236  }
1237  }
1238 
1239 
1240  // For the remaining GIDs, execute the permutations. This may
1241  // involve noncontiguous access of both source and destination
1242  // vectors, depending on the LID lists.
1243  //
1244  // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1245  // the same process, this merges their values by replacement of
1246  // the last encountered GID, not by the specified merge rule
1247  // (such as ADD).
1248 
1249  // If there are no permutations, we are done
1250  if (permuteFromLIDs.extent (0) == 0 ||
1251  permuteToLIDs.extent (0) == 0) {
1252  if (verbose) {
1253  std::ostringstream os;
1254  os << *prefix << "No permutations. Done!" << endl;
1255  std::cerr << os.str ();
1256  }
1257  return;
1258  }
1259 
1260  if (verbose) {
1261  std::ostringstream os;
1262  os << *prefix << "Permute" << endl;
1263  std::cerr << os.str ();
1264  }
1265 
1266  // We could in theory optimize for the case where exactly one of
1267  // them is constant stride, but we don't currently do that.
1268  const bool nonConstStride =
1269  ! this->isConstantStride () || ! sourceMV.isConstantStride ();
1270 
1271  if (verbose) {
1272  std::ostringstream os;
1273  os << *prefix << "nonConstStride="
1274  << (nonConstStride ? "true" : "false") << endl;
1275  std::cerr << os.str ();
1276  }
1277 
1278  // We only need the "which vectors" arrays if either the source or
1279  // target MV is not constant stride. Since we only have one
1280  // kernel that must do double-duty, we have to create a "which
1281  // vectors" array for the MV that _is_ constant stride.
1282  Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1283  Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1284  if (nonConstStride) {
1285  if (this->whichVectors_.size () == 0) {
1286  Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
1287  tmpTgt.modify_host ();
1288  for (size_t j = 0; j < numCols; ++j) {
1289  tmpTgt.h_view(j) = j;
1290  }
1291  if (! copyOnHost) {
1292  tmpTgt.sync_device ();
1293  }
1294  tgtWhichVecs = tmpTgt;
1295  }
1296  else {
1297  Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
1298  tgtWhichVecs =
1299  getDualViewCopyFromArrayView<size_t, device_type> (tgtWhichVecsT,
1300  "tgtWhichVecs",
1301  copyOnHost);
1302  }
1303  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1304  (static_cast<size_t> (tgtWhichVecs.extent (0)) !=
1305  this->getNumVectors (),
1306  std::logic_error, "tgtWhichVecs.extent(0) = " <<
1307  tgtWhichVecs.extent (0) << " != this->getNumVectors() = " <<
1308  this->getNumVectors () << ".");
1309 
1310  if (sourceMV.whichVectors_.size () == 0) {
1311  Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
1312  tmpSrc.modify_host ();
1313  for (size_t j = 0; j < numCols; ++j) {
1314  tmpSrc.h_view(j) = j;
1315  }
1316  if (! copyOnHost) {
1317  tmpSrc.sync_device ();
1318  }
1319  srcWhichVecs = tmpSrc;
1320  }
1321  else {
1322  Teuchos::ArrayView<const size_t> srcWhichVecsT =
1323  sourceMV.whichVectors_ ();
1324  srcWhichVecs =
1325  getDualViewCopyFromArrayView<size_t, device_type> (srcWhichVecsT,
1326  "srcWhichVecs",
1327  copyOnHost);
1328  }
1329  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1330  (static_cast<size_t> (srcWhichVecs.extent (0)) !=
1331  sourceMV.getNumVectors (), std::logic_error,
1332  "srcWhichVecs.extent(0) = " << srcWhichVecs.extent (0)
1333  << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
1334  << ".");
1335  }
1336 
1337  if (copyOnHost) { // permute on host too
1338  if (verbose) {
1339  std::ostringstream os;
1340  os << *prefix << "Get permute LIDs on host" << std::endl;
1341  std::cerr << os.str ();
1342  }
1343  auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1344  auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1345 
1346  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
1347  auto permuteToLIDs_h = create_const_view (permuteToLIDs.view_host ());
1348  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
1349  auto permuteFromLIDs_h =
1350  create_const_view (permuteFromLIDs.view_host ());
1351 
1352  if (verbose) {
1353  std::ostringstream os;
1354  os << *prefix << "Permute on host" << endl;
1355  std::cerr << os.str ();
1356  }
1357  if (nonConstStride) {
1358  // No need to sync first, because copyOnHost argument to
1359  // getDualViewCopyFromArrayView puts them in the right place.
1360  auto tgtWhichVecs_h =
1361  create_const_view (tgtWhichVecs.view_host ());
1362  auto srcWhichVecs_h =
1363  create_const_view (srcWhichVecs.view_host ());
1364  if (CM == ADD_ASSIGN) {
1365  using op_type = KokkosRefactor::Details::AddOp;
1366  permute_array_multi_column_variable_stride (tgt_h, src_h,
1367  permuteToLIDs_h,
1368  permuteFromLIDs_h,
1369  tgtWhichVecs_h,
1370  srcWhichVecs_h, numCols,
1371  op_type());
1372  }
1373  else {
1374  using op_type = KokkosRefactor::Details::InsertOp;
1375  permute_array_multi_column_variable_stride (tgt_h, src_h,
1376  permuteToLIDs_h,
1377  permuteFromLIDs_h,
1378  tgtWhichVecs_h,
1379  srcWhichVecs_h, numCols,
1380  op_type());
1381  }
1382  }
1383  else {
1384  if (CM == ADD_ASSIGN) {
1385  using op_type = KokkosRefactor::Details::AddOp;
1386  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1387  permuteFromLIDs_h, numCols, op_type());
1388  }
1389  else {
1390  using op_type = KokkosRefactor::Details::InsertOp;
1391  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1392  permuteFromLIDs_h, numCols, op_type());
1393  }
1394  }
1395  }
1396  else { // permute on device
1397  if (verbose) {
1398  std::ostringstream os;
1399  os << *prefix << "Get permute LIDs on device" << endl;
1400  std::cerr << os.str ();
1401  }
1402  auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1403  auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1404 
1405  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () );
1406  auto permuteToLIDs_d = create_const_view (permuteToLIDs.view_device ());
1407  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () );
1408  auto permuteFromLIDs_d =
1409  create_const_view (permuteFromLIDs.view_device ());
1410 
1411  if (verbose) {
1412  std::ostringstream os;
1413  os << *prefix << "Permute on device" << endl;
1414  std::cerr << os.str ();
1415  }
1416  if (nonConstStride) {
1417  // No need to sync first, because copyOnHost argument to
1418  // getDualViewCopyFromArrayView puts them in the right place.
1419  auto tgtWhichVecs_d = create_const_view (tgtWhichVecs.view_device ());
1420  auto srcWhichVecs_d = create_const_view (srcWhichVecs.view_device ());
1421  if (CM == ADD_ASSIGN) {
1422  using op_type = KokkosRefactor::Details::AddOp;
1423  permute_array_multi_column_variable_stride (space, tgt_d, src_d,
1424  permuteToLIDs_d,
1425  permuteFromLIDs_d,
1426  tgtWhichVecs_d,
1427  srcWhichVecs_d, numCols,
1428  op_type());
1429  }
1430  else {
1431  using op_type = KokkosRefactor::Details::InsertOp;
1432  permute_array_multi_column_variable_stride (space, tgt_d, src_d,
1433  permuteToLIDs_d,
1434  permuteFromLIDs_d,
1435  tgtWhichVecs_d,
1436  srcWhichVecs_d, numCols,
1437  op_type());
1438  }
1439  }
1440  else {
1441  if (CM == ADD_ASSIGN) {
1442  using op_type = KokkosRefactor::Details::AddOp;
1443  permute_array_multi_column (space, tgt_d, src_d, permuteToLIDs_d,
1444  permuteFromLIDs_d, numCols, op_type());
1445  }
1446  else {
1447  using op_type = KokkosRefactor::Details::InsertOp;
1448  permute_array_multi_column (space, tgt_d, src_d, permuteToLIDs_d,
1449  permuteFromLIDs_d, numCols, op_type());
1450  }
1451  }
1452  }
1453 
1454  if (verbose) {
1455  std::ostringstream os;
1456  os << *prefix << "Done!" << endl;
1457  std::cerr << os.str ();
1458  }
1459  }
1460 
1461 // clang-format on
1462 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1464  const SrcDistObject &sourceObj, const size_t numSameIDs,
1465  const Kokkos::DualView<const local_ordinal_type *, buffer_device_type>
1466  &permuteToLIDs,
1467  const Kokkos::DualView<const local_ordinal_type *, buffer_device_type>
1468  &permuteFromLIDs,
1469  const CombineMode CM) {
1470  copyAndPermute(sourceObj, numSameIDs, permuteToLIDs, permuteFromLIDs, CM,
1471  execution_space());
1472 }
1473 // clang-format off
1474 
1475  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1476  void
1479  (const SrcDistObject& sourceObj,
1480  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1481  Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1482  Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1483  size_t& constantNumPackets,
1484  const execution_space &space)
1485  {
1486  using ::Tpetra::Details::Behavior;
1487  using ::Tpetra::Details::ProfilingRegion;
1489  using Kokkos::Compat::create_const_view;
1490  using Kokkos::Compat::getKokkosViewDeepCopy;
1491  using std::endl;
1493 
1494  // We've already called checkSizes(), so this cast must succeed.
1495  MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&> (sourceObj));
1496 
1497  const bool packOnHost = runKernelOnHost(sourceMV);
1498  const char longFuncNameHost[] = "Tpetra::MultiVector::packAndPrepare[Host]";
1499  const char longFuncNameDevice[] = "Tpetra::MultiVector::packAndPrepare[Device]";
1500  const char tfecfFuncName[] = "packAndPrepare: ";
1501  ProfilingRegion regionPAP (packOnHost ? longFuncNameHost : longFuncNameDevice);
1502 
1503  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1504  // have the option to check indices. We do so when Tpetra is in
1505  // debug mode. It is in debug mode by default in a debug build,
1506  // but you may control this at run time, before launching the
1507  // executable, by setting the TPETRA_DEBUG environment variable to
1508  // "1" (or "TRUE").
1509  const bool debugCheckIndices = Behavior::debug ();
1510  // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1511  // environment variable to "1" (or "TRUE") for copious debug
1512  // output to std::cerr on every MPI process. This is unwise for
1513  // runs with large numbers of MPI processes.
1514  const bool printDebugOutput = Behavior::verbose ();
1515 
1516  std::unique_ptr<std::string> prefix;
1517  if (printDebugOutput) {
1518  auto map = this->getMap ();
1519  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1520  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1521  std::ostringstream os;
1522  os << "Proc " << myRank << ": MV::packAndPrepare: ";
1523  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1524  os << "Start" << endl;
1525  std::cerr << os.str ();
1526  }
1527 
1528 
1529  const size_t numCols = sourceMV.getNumVectors ();
1530 
1531  // This spares us from needing to fill numExportPacketsPerLID.
1532  // Setting constantNumPackets to a nonzero value signals that
1533  // all packets have the same number of entries.
1534  constantNumPackets = numCols;
1535 
1536  // If we have no exports, there is nothing to do. Make sure this
1537  // goes _after_ setting constantNumPackets correctly.
1538  if (exportLIDs.extent (0) == 0) {
1539  if (printDebugOutput) {
1540  std::ostringstream os;
1541  os << *prefix << "No exports on this proc, DONE" << std::endl;
1542  std::cerr << os.str ();
1543  }
1544  return;
1545  }
1546 
1547  /* The layout in the export for MultiVectors is as follows:
1548  exports = { all of the data from row exportLIDs.front() ;
1549  ....
1550  all of the data from row exportLIDs.back() }
1551  This doesn't have the best locality, but is necessary because
1552  the data for a Packet (all data associated with an LID) is
1553  required to be contiguous. */
1554 
1555  // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1556  // packing scheme in the above comment? The data going to a
1557  // particular process must be contiguous, of course, but those
1558  // data could include entries from multiple LIDs. DistObject just
1559  // needs to know how to index into that data. Kokkos is good at
1560  // decoupling storage intent from data layout choice.
1561 
1562  const size_t numExportLIDs = exportLIDs.extent (0);
1563  const size_t newExportsSize = numCols * numExportLIDs;
1564  if (printDebugOutput) {
1565  std::ostringstream os;
1566  os << *prefix << "realloc: "
1567  << "numExportLIDs: " << numExportLIDs
1568  << ", exports.extent(0): " << exports.extent (0)
1569  << ", newExportsSize: " << newExportsSize << std::endl;
1570  std::cerr << os.str ();
1571  }
1572  reallocDualViewIfNeeded (exports, newExportsSize, "exports");
1573 
1574  // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1575  // sync it. Pack it where it's currently sync'd.
1576  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1577  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1578  std::logic_error, "Input MultiVector needs sync to both host "
1579  "and device.");
1580  if (printDebugOutput) {
1581  std::ostringstream os;
1582  os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1583  std::cerr << os.str ();
1584  }
1585 
1586  // Mark 'exports' here, since we might have resized it above.
1587  // Resizing currently requires calling the constructor, which
1588  // clears out the 'modified' flags.
1589  if (packOnHost) {
1590  // nde 06 Feb 2020: If 'exports' does not require resize
1591  // when reallocDualViewIfNeeded is called, the modified flags
1592  // are not cleared out. This can result in host and device views
1593  // being out-of-sync, resuling in an error in exports.modify_* calls.
1594  // Clearing the sync flags prevents this possible case.
1595  exports.clear_sync_state ();
1596  exports.modify_host ();
1597  }
1598  else {
1599  // nde 06 Feb 2020: If 'exports' does not require resize
1600  // when reallocDualViewIfNeeded is called, the modified flags
1601  // are not cleared out. This can result in host and device views
1602  // being out-of-sync, resuling in an error in exports.modify_* calls.
1603  // Clearing the sync flags prevents this possible case.
1604  exports.clear_sync_state ();
1605  exports.modify_device ();
1606  }
1607 
1608  if (numCols == 1) { // special case for one column only
1609  // MultiVector always represents a single column with constant
1610  // stride, but it doesn't hurt to implement both cases anyway.
1611  //
1612  // ETP: I'm not sure I agree with the above statement. Can't a single-
1613  // column multivector be a subview of another multi-vector, in which case
1614  // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1615  // separately.
1616  //
1617  // mfh 18 Jan 2016: In answer to ETP's comment above:
1618  // MultiVector treats single-column MultiVectors created using a
1619  // "nonconstant stride constructor" as a special case, and makes
1620  // them constant stride (by making whichVectors_ have length 0).
1621  if (sourceMV.isConstantStride ()) {
1622  using KokkosRefactor::Details::pack_array_single_column;
1623  if (printDebugOutput) {
1624  std::ostringstream os;
1625  os << *prefix << "Pack numCols=1 const stride" << endl;
1626  std::cerr << os.str ();
1627  }
1628  if (packOnHost) {
1629  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1630  pack_array_single_column (exports.view_host (),
1631  src_host,
1632  exportLIDs.view_host (),
1633  0,
1634  debugCheckIndices);
1635  }
1636  else { // pack on device
1637  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1638  pack_array_single_column (exports.view_device (),
1639  src_dev,
1640  exportLIDs.view_device (),
1641  0,
1642  debugCheckIndices,
1643  space);
1644  }
1645  }
1646  else {
1647  using KokkosRefactor::Details::pack_array_single_column;
1648  if (printDebugOutput) {
1649  std::ostringstream os;
1650  os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1651  std::cerr << os.str ();
1652  }
1653  if (packOnHost) {
1654  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1655  pack_array_single_column (exports.view_host (),
1656  src_host,
1657  exportLIDs.view_host (),
1658  sourceMV.whichVectors_[0],
1659  debugCheckIndices);
1660  }
1661  else { // pack on device
1662  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1663  pack_array_single_column (exports.view_device (),
1664  src_dev,
1665  exportLIDs.view_device (),
1666  sourceMV.whichVectors_[0],
1667  debugCheckIndices,
1668  space);
1669  }
1670  }
1671  }
1672  else { // the source MultiVector has multiple columns
1673  if (sourceMV.isConstantStride ()) {
1674  using KokkosRefactor::Details::pack_array_multi_column;
1675  if (printDebugOutput) {
1676  std::ostringstream os;
1677  os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1678  std::cerr << os.str ();
1679  }
1680  if (packOnHost) {
1681  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1682  pack_array_multi_column (exports.view_host (),
1683  src_host,
1684  exportLIDs.view_host (),
1685  numCols,
1686  debugCheckIndices);
1687  }
1688  else { // pack on device
1689  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1690  pack_array_multi_column (exports.view_device (),
1691  src_dev,
1692  exportLIDs.view_device (),
1693  numCols,
1694  debugCheckIndices,
1695  space);
1696  }
1697  }
1698  else {
1699  using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1700  if (printDebugOutput) {
1701  std::ostringstream os;
1702  os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1703  << endl;
1704  std::cerr << os.str ();
1705  }
1706  // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1707  // whichVectors_ can be expensive, but pack and unpack for
1708  // nonconstant-stride MultiVectors is slower anyway.
1709  using IST = impl_scalar_type;
1710  using DV = Kokkos::DualView<IST*, device_type>;
1711  using HES = typename DV::t_host::execution_space;
1712  using DES = typename DV::t_dev::execution_space;
1713  Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_ ();
1714  if (packOnHost) {
1715  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1716  pack_array_multi_column_variable_stride
1717  (exports.view_host (),
1718  src_host,
1719  exportLIDs.view_host (),
1720  getKokkosViewDeepCopy<HES> (whichVecs),
1721  numCols,
1722  debugCheckIndices);
1723  }
1724  else { // pack on device
1725  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1726  pack_array_multi_column_variable_stride
1727  (exports.view_device (),
1728  src_dev,
1729  exportLIDs.view_device (),
1730  getKokkosViewDeepCopy<DES> (whichVecs),
1731  numCols,
1732  debugCheckIndices, space);
1733  }
1734  }
1735  }
1736 
1737  if (printDebugOutput) {
1738  std::ostringstream os;
1739  os << *prefix << "Done!" << endl;
1740  std::cerr << os.str ();
1741  }
1742 
1743  }
1744 
1745 // clang-format on
1746  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1747  void
1750  (const SrcDistObject& sourceObj,
1751  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1752  Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1753  Kokkos::DualView<size_t*, buffer_device_type> numExportPacketsPerLID,
1754  size_t& constantNumPackets) {
1755  packAndPrepare(sourceObj, exportLIDs, exports, numExportPacketsPerLID, constantNumPackets, execution_space());
1756  }
1757 // clang-format off
1758 
1759  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1760  template <class NO>
1761  typename std::enable_if<std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1762  typename NO::device_type::memory_space>::value,
1763  bool>::type
1765  reallocImportsIfNeededImpl (const size_t newSize,
1766  const bool verbose,
1767  const std::string* prefix,
1768  const bool areRemoteLIDsContiguous,
1769  const CombineMode CM)
1770  {
1771  // This implementation of reallocImportsIfNeeded is an
1772  // optimization that is specific to MultiVector. We check if the
1773  // imports_ view can be aliased to the end of the data view_. If
1774  // that is the case, we can skip the unpackAndCombine call.
1775 
1776  if (verbose) {
1777  std::ostringstream os;
1778  os << *prefix << "Realloc (if needed) imports_ from "
1779  << this->imports_.extent (0) << " to " << newSize << std::endl;
1780  std::cerr << os.str ();
1781  }
1782 
1783  bool reallocated = false;
1784 
1785  using IST = impl_scalar_type;
1786  using DV = Kokkos::DualView<IST*, Kokkos::LayoutLeft, buffer_device_type>;
1787 
1788  // Conditions for aliasing memory:
1789  // - When both sides of the dual view are in the same memory
1790  // space, we do not need to worry about syncing things.
1791  // - When both memory spaces are different, we only alias if this
1792  // does not incur additional sync'ing.
1793  // - The remote LIDs need to be contiguous, so that we do not need
1794  // to reorder received information.
1795  // - CombineMode needs to be INSERT.
1796  // - The number of vectors needs to be 1, otherwise we need to
1797  // reorder the received data.
1798  if ((std::is_same_v<typename dual_view_type::t_dev::device_type, typename dual_view_type::t_host::device_type> ||
1799  (Details::Behavior::assumeMpiIsGPUAware () && !this->need_sync_device()) ||
1800  (!Details::Behavior::assumeMpiIsGPUAware () && !this->need_sync_host())) &&
1801  areRemoteLIDsContiguous &&
1802  (CM == INSERT || CM == REPLACE) &&
1803  (getNumVectors() == 1) &&
1804  (newSize > 0)) {
1805 
1806  size_t offset = getLocalLength () - newSize;
1807  reallocated = this->imports_.d_view.data() != view_.getDualView().d_view.data() + offset;
1808  if (reallocated) {
1809  typedef std::pair<size_t, size_t> range_type;
1810  this->imports_ = DV(view_.getDualView(),
1811  range_type (offset, getLocalLength () ),
1812  0);
1813 
1814  if (verbose) {
1815  std::ostringstream os;
1816  os << *prefix << "Aliased imports_ to MV.view_" << std::endl;
1817  std::cerr << os.str ();
1818  }
1819  }
1820  return reallocated;
1821  }
1822  {
1824  reallocated =
1825  reallocDualViewIfNeeded (this->unaliased_imports_, newSize, "imports");
1826  if (verbose) {
1827  std::ostringstream os;
1828  os << *prefix << "Finished realloc'ing unaliased_imports_" << std::endl;
1829  std::cerr << os.str ();
1830  }
1831  this->imports_ = this->unaliased_imports_;
1832  }
1833  return reallocated;
1834  }
1835 
1836  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1837  template <class NO>
1838  typename std::enable_if<!std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1839  typename NO::device_type::memory_space>::value,
1840  bool>::type
1842  reallocImportsIfNeededImpl (const size_t newSize,
1843  const bool verbose,
1844  const std::string* prefix,
1845  const bool areRemoteLIDsContiguous,
1846  const CombineMode CM)
1847  {
1848  return DistObject<Scalar, LocalOrdinal, GlobalOrdinal, Node>::reallocImportsIfNeeded(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1849  }
1850 
1851  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1852  bool
1854  reallocImportsIfNeeded(const size_t newSize,
1855  const bool verbose,
1856  const std::string* prefix,
1857  const bool areRemoteLIDsContiguous,
1858  const CombineMode CM) {
1860  return reallocImportsIfNeededImpl<Node>(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1861  }
1862 
1863  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1864  bool
1867  return (this->imports_.d_view.data() + this->imports_.d_view.extent(0) ==
1868  view_.getDualView().d_view.data() + view_.getDualView().d_view.extent(0));
1869  }
1870 
1871 
1872  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1873  void
1876  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1877  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1878  Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1879  const size_t constantNumPackets,
1880  const CombineMode CM,
1881  const execution_space &space)
1882  {
1883  using ::Tpetra::Details::Behavior;
1884  using ::Tpetra::Details::ProfilingRegion;
1885  using KokkosRefactor::Details::unpack_array_multi_column;
1886  using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1887  using Kokkos::Compat::getKokkosViewDeepCopy;
1888  using std::endl;
1889 
1890  const bool unpackOnHost = runKernelOnHost(imports);
1891 
1892  const char longFuncNameHost[] = "Tpetra::MultiVector::unpackAndCombine[Host]";
1893  const char longFuncNameDevice[] = "Tpetra::MultiVector::unpackAndCombine[Device]";
1894  const char * longFuncName = unpackOnHost ? longFuncNameHost : longFuncNameDevice;
1895  const char tfecfFuncName[] = "unpackAndCombine: ";
1896  ProfilingRegion regionUAC (longFuncName);
1897 
1898  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1899  // have the option to check indices. We do so when Tpetra is in
1900  // debug mode. It is in debug mode by default in a debug build,
1901  // but you may control this at run time, before launching the
1902  // executable, by setting the TPETRA_DEBUG environment variable to
1903  // "1" (or "TRUE").
1904  const bool debugCheckIndices = Behavior::debug ();
1905 
1906  const bool printDebugOutput = Behavior::verbose ();
1907  std::unique_ptr<std::string> prefix;
1908  if (printDebugOutput) {
1909  auto map = this->getMap ();
1910  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1911  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1912  std::ostringstream os;
1913  os << "Proc " << myRank << ": " << longFuncName << ": ";
1914  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1915  os << "Start" << endl;
1916  std::cerr << os.str ();
1917  }
1918 
1919  // If we have no imports, there is nothing to do
1920  if (importLIDs.extent (0) == 0) {
1921  if (printDebugOutput) {
1922  std::ostringstream os;
1923  os << *prefix << "No imports. Done!" << endl;
1924  std::cerr << os.str ();
1925  }
1926  return;
1927  }
1928 
1929  // Check, whether imports_ is a subview of the MV view.
1930  if (importsAreAliased()) {
1931  if (printDebugOutput) {
1932  std::ostringstream os;
1933  os << *prefix << "Skipping unpack, since imports_ is aliased to MV.view_. Done!" << endl;
1934  std::cerr << os.str ();
1935  }
1936  return;
1937  }
1938 
1939 
1940  const size_t numVecs = getNumVectors ();
1941  if (debugCheckIndices) {
1942  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1943  (static_cast<size_t> (imports.extent (0)) !=
1944  numVecs * importLIDs.extent (0),
1945  std::runtime_error,
1946  "imports.extent(0) = " << imports.extent (0)
1947  << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1948  << " * " << importLIDs.extent (0) << " = "
1949  << numVecs * importLIDs.extent (0) << ".");
1950 
1951  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1952  (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1953  "constantNumPackets input argument must be nonzero.");
1954 
1955  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1956  (static_cast<size_t> (numVecs) !=
1957  static_cast<size_t> (constantNumPackets),
1958  std::runtime_error, "constantNumPackets must equal numVecs.");
1959  }
1960 
1961  // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1962  // the memory space in which the imports buffer was last modified and
1963  // the size of the imports buffer.
1964  // DistObject::doTransferNew decides where it was last modified (based on
1965  // whether communication buffers used were on host or device).
1966  if (unpackOnHost) {
1967  if (this->imports_.need_sync_host()) this->imports_.sync_host();
1968  }
1969  else {
1970  if (this->imports_.need_sync_device()) this->imports_.sync_device();
1971  }
1972 
1973  if (printDebugOutput) {
1974  std::ostringstream os;
1975  os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1976  << endl;
1977  std::cerr << os.str ();
1978  }
1979 
1980  // We have to sync before modifying, because this method may read
1981  // as well as write (depending on the CombineMode).
1982  auto imports_d = imports.view_device ();
1983  auto imports_h = imports.view_host ();
1984  auto importLIDs_d = importLIDs.view_device ();
1985  auto importLIDs_h = importLIDs.view_host ();
1986 
1987  Kokkos::DualView<size_t*, device_type> whichVecs;
1988  if (! isConstantStride ()) {
1989  Kokkos::View<const size_t*, Kokkos::HostSpace,
1990  Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
1991  numVecs);
1992  whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
1993  if (unpackOnHost) {
1994  whichVecs.modify_host ();
1995  // DEEP_COPY REVIEW - NOT TESTED FOR CUDA BUILD
1996  Kokkos::deep_copy (whichVecs.view_host (), whichVecsIn);
1997  }
1998  else {
1999  whichVecs.modify_device ();
2000  // DEEP_COPY REVIEW - HOST-TO-DEVICE
2001  Kokkos::deep_copy (whichVecs.view_device (), whichVecsIn);
2002  }
2003  }
2004  auto whichVecs_d = whichVecs.view_device ();
2005  auto whichVecs_h = whichVecs.view_host ();
2006 
2007  /* The layout in the export for MultiVectors is as follows:
2008  imports = { all of the data from row exportLIDs.front() ;
2009  ....
2010  all of the data from row exportLIDs.back() }
2011  This doesn't have the best locality, but is necessary because
2012  the data for a Packet (all data associated with an LID) is
2013  required to be contiguous. */
2014 
2015  if (numVecs > 0 && importLIDs.extent (0) > 0) {
2016  using dev_exec_space = typename dual_view_type::t_dev::execution_space;
2017  using host_exec_space = typename dual_view_type::t_host::execution_space;
2018 
2019  // This fixes GitHub Issue #4418.
2020  const bool use_atomic_updates = unpackOnHost ?
2021  host_exec_space().concurrency () != 1 :
2022  dev_exec_space().concurrency () != 1;
2023 
2024  if (printDebugOutput) {
2025  std::ostringstream os;
2026  os << *prefix << "Unpack: " << combineModeToString (CM) << endl;
2027  std::cerr << os.str ();
2028  }
2029 
2030  // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
2031  // custom combine modes, start editing here.
2032 
2033  if (CM == INSERT || CM == REPLACE) {
2034  using op_type = KokkosRefactor::Details::InsertOp;
2035  if (isConstantStride ()) {
2036  if (unpackOnHost) {
2037  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2038  unpack_array_multi_column (host_exec_space (),
2039  X_h, imports_h, importLIDs_h,
2040  op_type (), numVecs,
2041  use_atomic_updates,
2042  debugCheckIndices);
2043 
2044  }
2045  else { // unpack on device
2046  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2047  unpack_array_multi_column (space,
2048  X_d, imports_d, importLIDs_d,
2049  op_type (), numVecs,
2050  use_atomic_updates,
2051  debugCheckIndices);
2052  }
2053  }
2054  else { // not constant stride
2055  if (unpackOnHost) {
2056  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2057  unpack_array_multi_column_variable_stride (host_exec_space (),
2058  X_h, imports_h,
2059  importLIDs_h,
2060  whichVecs_h,
2061  op_type (),
2062  numVecs,
2063  use_atomic_updates,
2064  debugCheckIndices);
2065  }
2066  else { // unpack on device
2067  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2068  unpack_array_multi_column_variable_stride (space,
2069  X_d, imports_d,
2070  importLIDs_d,
2071  whichVecs_d,
2072  op_type (),
2073  numVecs,
2074  use_atomic_updates,
2075  debugCheckIndices);
2076  }
2077  }
2078  }
2079  else if (CM == ADD || CM == ADD_ASSIGN) {
2080  using op_type = KokkosRefactor::Details::AddOp;
2081  if (isConstantStride ()) {
2082  if (unpackOnHost) {
2083  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2084  unpack_array_multi_column (host_exec_space (),
2085  X_h, imports_h, importLIDs_h,
2086  op_type (), numVecs,
2087  use_atomic_updates,
2088  debugCheckIndices);
2089  }
2090  else { // unpack on device
2091  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2092  unpack_array_multi_column (space,
2093  X_d, imports_d, importLIDs_d,
2094  op_type (), numVecs,
2095  use_atomic_updates,
2096  debugCheckIndices);
2097  }
2098  }
2099  else { // not constant stride
2100  if (unpackOnHost) {
2101  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2102  unpack_array_multi_column_variable_stride (host_exec_space (),
2103  X_h, imports_h,
2104  importLIDs_h,
2105  whichVecs_h,
2106  op_type (),
2107  numVecs,
2108  use_atomic_updates,
2109  debugCheckIndices);
2110  }
2111  else { // unpack on device
2112  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2113  unpack_array_multi_column_variable_stride (space,
2114  X_d, imports_d,
2115  importLIDs_d,
2116  whichVecs_d,
2117  op_type (),
2118  numVecs,
2119  use_atomic_updates,
2120  debugCheckIndices);
2121  }
2122  }
2123  }
2124  else if (CM == ABSMAX) {
2125  using op_type = KokkosRefactor::Details::AbsMaxOp;
2126  if (isConstantStride ()) {
2127  if (unpackOnHost) {
2128  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2129  unpack_array_multi_column (host_exec_space (),
2130  X_h, imports_h, importLIDs_h,
2131  op_type (), numVecs,
2132  use_atomic_updates,
2133  debugCheckIndices);
2134  }
2135  else { // unpack on device
2136  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2137  unpack_array_multi_column (space,
2138  X_d, imports_d, importLIDs_d,
2139  op_type (), numVecs,
2140  use_atomic_updates,
2141  debugCheckIndices);
2142  }
2143  }
2144  else {
2145  if (unpackOnHost) {
2146  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2147  unpack_array_multi_column_variable_stride (host_exec_space (),
2148  X_h, imports_h,
2149  importLIDs_h,
2150  whichVecs_h,
2151  op_type (),
2152  numVecs,
2153  use_atomic_updates,
2154  debugCheckIndices);
2155  }
2156  else { // unpack on device
2157  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2158  unpack_array_multi_column_variable_stride (space,
2159  X_d, imports_d,
2160  importLIDs_d,
2161  whichVecs_d,
2162  op_type (),
2163  numVecs,
2164  use_atomic_updates,
2165  debugCheckIndices);
2166  }
2167  }
2168  }
2169  else {
2170  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2171  (true, std::logic_error, "Invalid CombineMode");
2172  }
2173  }
2174  else {
2175  if (printDebugOutput) {
2176  std::ostringstream os;
2177  os << *prefix << "Nothing to unpack" << endl;
2178  std::cerr << os.str ();
2179  }
2180  }
2181 
2182  if (printDebugOutput) {
2183  std::ostringstream os;
2184  os << *prefix << "Done!" << endl;
2185  std::cerr << os.str ();
2186  }
2187  }
2188 
2189  // clang-format on
2190  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2191  void
2194  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
2195  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
2196  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
2197  const size_t constantNumPackets,
2198  const CombineMode CM) {
2199  unpackAndCombine(importLIDs, imports, numPacketsPerLID, constantNumPackets, CM, execution_space());
2200  }
2201  // clang-format off
2202 
2203  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2204  size_t
2207  {
2208  if (isConstantStride ()) {
2209  return static_cast<size_t> (view_.extent (1));
2210  } else {
2211  return static_cast<size_t> (whichVectors_.size ());
2212  }
2213  }
2214 
2215  namespace { // (anonymous)
2216 
2217  template<class RV>
2218  void
2219  gblDotImpl (const RV& dotsOut,
2220  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
2221  const bool distributed)
2222  {
2223  using Teuchos::REDUCE_MAX;
2224  using Teuchos::REDUCE_SUM;
2225  using Teuchos::reduceAll;
2226  typedef typename RV::non_const_value_type dot_type;
2227 
2228  const size_t numVecs = dotsOut.extent (0);
2229 
2230  // If the MultiVector is distributed over multiple processes, do
2231  // the distributed (interprocess) part of the dot product. We
2232  // assume that the MPI implementation can read from and write to
2233  // device memory.
2234  //
2235  // replaceMap() may have removed some processes. Those
2236  // processes have a null Map. They must not participate in any
2237  // collective operations. We ask first whether the Map is null,
2238  // because isDistributed() defers that question to the Map. We
2239  // still compute and return local dots for processes not
2240  // participating in collective operations; those probably don't
2241  // make any sense, but it doesn't hurt to do them, since it's
2242  // illegal to call dot() on those processes anyway.
2243  if (distributed && ! comm.is_null ()) {
2244  // The calling process only participates in the collective if
2245  // both the Map and its Comm on that process are nonnull.
2246  const int nv = static_cast<int> (numVecs);
2247  const bool commIsInterComm = ::Tpetra::Details::isInterComm (*comm);
2248 
2249  if (commIsInterComm) {
2250  // If comm is an intercomm, then we may not alias input and
2251  // output buffers, so we have to make a copy of the local
2252  // sum.
2253  typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
2254  // DEEP_COPY REVIEW - NOT TESTED
2255  Kokkos::deep_copy (lclDots, dotsOut);
2256  const dot_type* const lclSum = lclDots.data ();
2257  dot_type* const gblSum = dotsOut.data ();
2258  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
2259  }
2260  else {
2261  dot_type* const inout = dotsOut.data ();
2262  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, inout, inout);
2263  }
2264  }
2265  }
2266  } // namespace (anonymous)
2267 
2268  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2269  void
2272  const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const
2273  {
2274  using ::Tpetra::Details::Behavior;
2275  using Kokkos::subview;
2276  using Teuchos::Comm;
2277  using Teuchos::null;
2278  using Teuchos::RCP;
2279  // View of all the dot product results.
2280  typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
2281  typedef typename dual_view_type::t_dev_const XMV;
2282  const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
2283 
2284  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Kokkos::View)");
2285 
2286  const size_t numVecs = this->getNumVectors ();
2287  if (numVecs == 0) {
2288  return; // nothing to do
2289  }
2290  const size_t lclNumRows = this->getLocalLength ();
2291  const size_t numDots = static_cast<size_t> (dots.extent (0));
2292  const bool debug = Behavior::debug ();
2293 
2294  if (debug) {
2295  const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
2296  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2297  (! compat, std::invalid_argument, "'*this' MultiVector is not "
2298  "compatible with the input MultiVector A. We only test for this "
2299  "in debug mode.");
2300  }
2301 
2302  // FIXME (mfh 11 Jul 2014) These exception tests may not
2303  // necessarily be thrown on all processes consistently. We should
2304  // instead pass along error state with the inner product. We
2305  // could do this by setting an extra slot to
2306  // Kokkos::ArithTraits<dot_type>::one() on error. The
2307  // final sum should be
2308  // Kokkos::ArithTraits<dot_type>::zero() if not error.
2309  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2310  lclNumRows != A.getLocalLength (), std::runtime_error,
2311  "MultiVectors do not have the same local length. "
2312  "this->getLocalLength() = " << lclNumRows << " != "
2313  "A.getLocalLength() = " << A.getLocalLength () << ".");
2314  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2315  numVecs != A.getNumVectors (), std::runtime_error,
2316  "MultiVectors must have the same number of columns (vectors). "
2317  "this->getNumVectors() = " << numVecs << " != "
2318  "A.getNumVectors() = " << A.getNumVectors () << ".");
2319  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2320  numDots != numVecs, std::runtime_error,
2321  "The output array 'dots' must have the same number of entries as the "
2322  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2323  numDots << " != this->getNumVectors() = " << numVecs << ".");
2324 
2325  const std::pair<size_t, size_t> colRng (0, numVecs);
2326  RV dotsOut = subview (dots, colRng);
2327  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
2328  this->getMap ()->getComm ();
2329 
2330  auto thisView = this->getLocalViewDevice(Access::ReadOnly);
2331  auto A_view = A.getLocalViewDevice(Access::ReadOnly);
2332 
2333  ::Tpetra::Details::lclDot<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
2334  this->whichVectors_.getRawPtr (),
2335  A.whichVectors_.getRawPtr (),
2336  this->isConstantStride (), A.isConstantStride ());
2337 
2338  // lbv 15 mar 2023: Kokkos Kernels provides non-blocking BLAS
2339  // functions unless they explicitely return a value to Host.
2340  // Here while the lclDot are on host, they are not a return
2341  // value, therefore they might be avaible to us immediately.
2342  // Adding a frnce here guarantees that we will have the lclDot
2343  // ahead of the MPI reduction.
2344  execution_space exec_space_instance = execution_space();
2345  exec_space_instance.fence();
2346 
2347  gblDotImpl (dotsOut, comm, this->isDistributed ());
2348  }
2349 
2350  namespace { // (anonymous)
2351  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2353  multiVectorSingleColumnDot (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& x,
2355  {
2356  using ::Tpetra::Details::ProfilingRegion;
2358  using dot_type = typename MV::dot_type;
2359  ProfilingRegion region ("Tpetra::multiVectorSingleColumnDot");
2360 
2361  auto map = x.getMap ();
2362  Teuchos::RCP<const Teuchos::Comm<int> > comm =
2363  map.is_null () ? Teuchos::null : map->getComm ();
2364  if (comm.is_null ()) {
2365  return Kokkos::ArithTraits<dot_type>::zero ();
2366  }
2367  else {
2368  using LO = LocalOrdinal;
2369  // The min just ensures that we don't overwrite memory that
2370  // doesn't belong to us, in the erroneous input case where x
2371  // and y have different numbers of rows.
2372  const LO lclNumRows = static_cast<LO> (std::min (x.getLocalLength (),
2373  y.getLocalLength ()));
2374  const Kokkos::pair<LO, LO> rowRng (0, lclNumRows);
2375  dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero ();
2376  dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();
2377 
2378  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2379  //const_cast<MV&>(x).sync_device ();
2380  //const_cast<MV&>(y).sync_device ();
2381 
2382  auto x_2d = x.getLocalViewDevice(Access::ReadOnly);
2383  auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
2384  auto y_2d = y.getLocalViewDevice(Access::ReadOnly);
2385  auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
2386  lclDot = KokkosBlas::dot (x_1d, y_1d);
2387 
2388  if (x.isDistributed ()) {
2389  using Teuchos::outArg;
2390  using Teuchos::REDUCE_SUM;
2391  using Teuchos::reduceAll;
2392  reduceAll<int, dot_type> (*comm, REDUCE_SUM, lclDot, outArg (gblDot));
2393  }
2394  else {
2395  gblDot = lclDot;
2396  }
2397  return gblDot;
2398  }
2399  }
2400  } // namespace (anonymous)
2401 
2402 
2403 
2404  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2405  void
2408  const Teuchos::ArrayView<dot_type>& dots) const
2409  {
2410  const char tfecfFuncName[] = "dot: ";
2411  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Teuchos::ArrayView)");
2412 
2413  const size_t numVecs = this->getNumVectors ();
2414  const size_t lclNumRows = this->getLocalLength ();
2415  const size_t numDots = static_cast<size_t> (dots.size ());
2416 
2417  // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
2418  // not necessarily be thrown on all processes consistently. We
2419  // keep them for now, because MultiVector's unit tests insist on
2420  // them. In the future, we should instead pass along error state
2421  // with the inner product. We could do this by setting an extra
2422  // slot to Kokkos::ArithTraits<dot_type>::one() on error.
2423  // The final sum should be
2424  // Kokkos::ArithTraits<dot_type>::zero() if not error.
2425  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2426  (lclNumRows != A.getLocalLength (), std::runtime_error,
2427  "MultiVectors do not have the same local length. "
2428  "this->getLocalLength() = " << lclNumRows << " != "
2429  "A.getLocalLength() = " << A.getLocalLength () << ".");
2430  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2431  (numVecs != A.getNumVectors (), std::runtime_error,
2432  "MultiVectors must have the same number of columns (vectors). "
2433  "this->getNumVectors() = " << numVecs << " != "
2434  "A.getNumVectors() = " << A.getNumVectors () << ".");
2435  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2436  (numDots != numVecs, std::runtime_error,
2437  "The output array 'dots' must have the same number of entries as the "
2438  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2439  numDots << " != this->getNumVectors() = " << numVecs << ".");
2440 
2441  if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
2442  const dot_type gblDot = multiVectorSingleColumnDot (*this, A);
2443  dots[0] = gblDot;
2444  }
2445  else {
2446  this->dot (A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr (), numDots));
2447  }
2448  }
2449 
2450  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2451  void
2453  norm2 (const Teuchos::ArrayView<mag_type>& norms) const
2454  {
2456  using ::Tpetra::Details::NORM_TWO;
2457  using ::Tpetra::Details::ProfilingRegion;
2458  ProfilingRegion region ("Tpetra::MV::norm2 (host output)");
2459 
2460  // The function needs to be able to sync X.
2461  MV& X = const_cast<MV&> (*this);
2462  multiVectorNormImpl (norms.getRawPtr (), X, NORM_TWO);
2463  }
2464 
2465  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2466  void
2468  norm2 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2469  {
2470  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2471  this->norm2 (norms_av);
2472  }
2473 
2474 
2475  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2476  void
2478  norm1 (const Teuchos::ArrayView<mag_type>& norms) const
2479  {
2481  using ::Tpetra::Details::NORM_ONE;
2482  using ::Tpetra::Details::ProfilingRegion;
2483  ProfilingRegion region ("Tpetra::MV::norm1 (host output)");
2484 
2485  // The function needs to be able to sync X.
2486  MV& X = const_cast<MV&> (*this);
2487  multiVectorNormImpl (norms.data (), X, NORM_ONE);
2488  }
2489 
2490  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2491  void
2493  norm1 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2494  {
2495  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2496  this->norm1 (norms_av);
2497  }
2498 
2499  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2500  void
2502  normInf (const Teuchos::ArrayView<mag_type>& norms) const
2503  {
2505  using ::Tpetra::Details::NORM_INF;
2506  using ::Tpetra::Details::ProfilingRegion;
2507  ProfilingRegion region ("Tpetra::MV::normInf (host output)");
2508 
2509  // The function needs to be able to sync X.
2510  MV& X = const_cast<MV&> (*this);
2511  multiVectorNormImpl (norms.getRawPtr (), X, NORM_INF);
2512  }
2513 
2514  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2515  void
2517  normInf (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2518  {
2519  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2520  this->normInf (norms_av);
2521  }
2522 
2523  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2524  void
2526  meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
2527  {
2528  // KR FIXME Overload this method to take a View.
2529  using Kokkos::ALL;
2530  using Kokkos::subview;
2531  using Teuchos::Comm;
2532  using Teuchos::RCP;
2533  using Teuchos::reduceAll;
2534  using Teuchos::REDUCE_SUM;
2535  typedef Kokkos::ArithTraits<impl_scalar_type> ATS;
2536 
2537  const size_t lclNumRows = this->getLocalLength ();
2538  const size_t numVecs = this->getNumVectors ();
2539  const size_t numMeans = static_cast<size_t> (means.size ());
2540 
2541  TEUCHOS_TEST_FOR_EXCEPTION(
2542  numMeans != numVecs, std::runtime_error,
2543  "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2544  << " != this->getNumVectors() = " << numVecs << ".");
2545 
2546  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2547  const std::pair<size_t, size_t> colRng (0, numVecs);
2548 
2549  // Make sure that the final output view has the same layout as the
2550  // temporary view's HostMirror. Left or Right doesn't matter for
2551  // a 1-D array anyway; this is just to placate the compiler.
2552  typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2553  typedef Kokkos::View<impl_scalar_type*,
2554  typename local_view_type::HostMirror::array_layout,
2555  Kokkos::HostSpace,
2556  Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
2557  host_local_view_type meansOut (means.getRawPtr (), numMeans);
2558 
2559  RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
2560  this->getMap ()->getComm ();
2561 
2562  // If we need sync to device, then host has the most recent version.
2563  const bool useHostVersion = this->need_sync_device ();
2564  if (useHostVersion) {
2565  // DualView was last modified on host, so run the local kernel there.
2566  auto X_lcl = subview (getLocalViewHost(Access::ReadOnly),
2567  rowRng, Kokkos::ALL ());
2568  // Compute the local sum of each column.
2569  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2570  if (isConstantStride ()) {
2571  KokkosBlas::sum (lclSums, X_lcl);
2572  }
2573  else {
2574  for (size_t j = 0; j < numVecs; ++j) {
2575  const size_t col = whichVectors_[j];
2576  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2577  }
2578  }
2579 
2580  // If there are multiple MPI processes, the all-reduce reads
2581  // from lclSums, and writes to meansOut. Otherwise, we just
2582  // copy lclSums into meansOut.
2583  if (! comm.is_null () && this->isDistributed ()) {
2584  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2585  lclSums.data (), meansOut.data ());
2586  }
2587  else {
2588  // DEEP_COPY REVIEW - NOT TESTED
2589  Kokkos::deep_copy (meansOut, lclSums);
2590  }
2591  }
2592  else {
2593  // DualView was last modified on device, so run the local kernel there.
2594  auto X_lcl = subview (this->getLocalViewDevice(Access::ReadOnly),
2595  rowRng, Kokkos::ALL ());
2596 
2597  // Compute the local sum of each column.
2598  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2599  if (isConstantStride ()) {
2600  KokkosBlas::sum (lclSums, X_lcl);
2601  }
2602  else {
2603  for (size_t j = 0; j < numVecs; ++j) {
2604  const size_t col = whichVectors_[j];
2605  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2606  }
2607  }
2608  // lbv 10 mar 2023: Kokkos Kernels provides non-blocking BLAS
2609  // functions unless they explicitly return a value to Host.
2610  // Here while the lclSums are on the host, they are not a return
2611  // value, therefore they might be available to us immediately.
2612  // Adding a fence here guarantees that we will have the lclSums
2613  // ahead of the MPI reduction.
2614  execution_space exec_space_instance = execution_space();
2615  exec_space_instance.fence();
2616 
2617  // If there are multiple MPI processes, the all-reduce reads
2618  // from lclSums, and writes to meansOut. (We assume that MPI
2619  // can read device memory.) Otherwise, we just copy lclSums
2620  // into meansOut.
2621  if (! comm.is_null () && this->isDistributed ()) {
2622  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2623  lclSums.data (), meansOut.data ());
2624  }
2625  else {
2626  // DEEP_COPY REVIEW - HOST-TO-HOST - NOT TESTED FOR MPI BUILD
2627  Kokkos::deep_copy (meansOut, lclSums);
2628  }
2629  }
2630 
2631  // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2632  // to the magnitude type, since operator/ (std::complex<T>, int)
2633  // isn't necessarily defined.
2634  const impl_scalar_type OneOverN =
2635  ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
2636  for (size_t k = 0; k < numMeans; ++k) {
2637  meansOut(k) = meansOut(k) * OneOverN;
2638  }
2639  }
2640 
2641 
2642  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2643  void
2646  {
2647  typedef impl_scalar_type IST;
2648  typedef Kokkos::ArithTraits<IST> ATS;
2649  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2650  typedef typename pool_type::generator_type generator_type;
2651 
2652  const IST max = Kokkos::rand<generator_type, IST>::max ();
2653  const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();
2654 
2655  this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
2656  }
2657 
2658 
2659  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2660  void
2662  randomize (const Scalar& minVal, const Scalar& maxVal)
2663  {
2664  typedef impl_scalar_type IST;
2665  typedef Tpetra::Details::Static_Random_XorShift64_Pool<typename device_type::execution_space> tpetra_pool_type;
2666  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2667 
2668  // Seed the pool based on the system RNG and the MPI rank, if needed
2669  if(!tpetra_pool_type::isSet())
2670  tpetra_pool_type::resetPool(this->getMap()->getComm()->getRank());
2671 
2672  pool_type & rand_pool = tpetra_pool_type::getPool();
2673  const IST max = static_cast<IST> (maxVal);
2674  const IST min = static_cast<IST> (minVal);
2675 
2676  auto thisView = this->getLocalViewDevice(Access::OverwriteAll);
2677 
2678  if (isConstantStride ()) {
2679  Kokkos::fill_random (thisView, rand_pool, min, max);
2680  }
2681  else {
2682  const size_t numVecs = getNumVectors ();
2683  for (size_t k = 0; k < numVecs; ++k) {
2684  const size_t col = whichVectors_[k];
2685  auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
2686  Kokkos::fill_random (X_k, rand_pool, min, max);
2687  }
2688  }
2689  }
2690 
2691  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2692  void
2694  putScalar (const Scalar& alpha)
2695  {
2696  using ::Tpetra::Details::ProfilingRegion;
2697  using ::Tpetra::Details::Blas::fill;
2698  using DES = typename dual_view_type::t_dev::execution_space;
2699  using HES = typename dual_view_type::t_host::execution_space;
2700  using LO = LocalOrdinal;
2701  ProfilingRegion region ("Tpetra::MultiVector::putScalar");
2702 
2703  // We need this cast for cases like Scalar = std::complex<T> but
2704  // impl_scalar_type = Kokkos::complex<T>.
2705  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2706  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
2707  const LO numVecs = static_cast<LO> (this->getNumVectors ());
2708 
2709  // Modify the most recently updated version of the data. This
2710  // avoids sync'ing, which could violate users' expectations.
2711  //
2712  // If we need sync to device, then host has the most recent version.
2713  const bool runOnHost = runKernelOnHost(*this);
2714  if (! runOnHost) {
2715  auto X = this->getLocalViewDevice(Access::OverwriteAll);
2716  if (this->isConstantStride ()) {
2717  fill (DES (), X, theAlpha, lclNumRows, numVecs);
2718  }
2719  else {
2720  fill (DES (), X, theAlpha, lclNumRows, numVecs,
2721  this->whichVectors_.getRawPtr ());
2722  }
2723  }
2724  else { // last modified in host memory, so modify data there.
2725  auto X = this->getLocalViewHost(Access::OverwriteAll);
2726  if (this->isConstantStride ()) {
2727  fill (HES (), X, theAlpha, lclNumRows, numVecs);
2728  }
2729  else {
2730  fill (HES (), X, theAlpha, lclNumRows, numVecs,
2731  this->whichVectors_.getRawPtr ());
2732  }
2733  }
2734  }
2735 
2736 
2737  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2738  void
2740  replaceMap (const Teuchos::RCP<const map_type>& newMap)
2741  {
2742  using Teuchos::ArrayRCP;
2743  using Teuchos::Comm;
2744  using Teuchos::RCP;
2745  using ST = Scalar;
2746  using LO = LocalOrdinal;
2747  using GO = GlobalOrdinal;
2748 
2749  // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2750  // it might work if the MV is a column view of another MV.
2751  // However, things might go wrong when restoring the original
2752  // Map, so we don't allow this case for now.
2753  TEUCHOS_TEST_FOR_EXCEPTION(
2754  ! this->isConstantStride (), std::logic_error,
2755  "Tpetra::MultiVector::replaceMap: This method does not currently work "
2756  "if the MultiVector is a column view of another MultiVector (that is, if "
2757  "isConstantStride() == false).");
2758 
2759  // Case 1: current Map and new Map are both nonnull on this process.
2760  // Case 2: current Map is nonnull, new Map is null.
2761  // Case 3: current Map is null, new Map is nonnull.
2762  // Case 4: both Maps are null: forbidden.
2763  //
2764  // Case 1 means that we don't have to do anything on this process,
2765  // other than assign the new Map. (We always have to do that.)
2766  // It's an error for the user to supply a Map that requires
2767  // resizing in this case.
2768  //
2769  // Case 2 means that the calling process is in the current Map's
2770  // communicator, but will be excluded from the new Map's
2771  // communicator. We don't have to do anything on the calling
2772  // process; just leave whatever data it may have alone.
2773  //
2774  // Case 3 means that the calling process is excluded from the
2775  // current Map's communicator, but will be included in the new
2776  // Map's communicator. This means we need to (re)allocate the
2777  // local DualView if it does not have the right number of rows.
2778  // If the new number of rows is nonzero, we'll fill the newly
2779  // allocated local data with zeros, as befits a projection
2780  // operation.
2781  //
2782  // The typical use case for Case 3 is that the MultiVector was
2783  // first created with the Map with more processes, then that Map
2784  // was replaced with a Map with fewer processes, and finally the
2785  // original Map was restored on this call to replaceMap.
2786 
2787  if (this->getMap ().is_null ()) { // current Map is null
2788  // If this->getMap() is null, that means that this MultiVector
2789  // has already had replaceMap happen to it. In that case, just
2790  // reallocate the DualView with the right size.
2791 
2792  TEUCHOS_TEST_FOR_EXCEPTION(
2793  newMap.is_null (), std::invalid_argument,
2794  "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2795  "This probably means that the input Map is incorrect.");
2796 
2797  // Case 3: current Map is null, new Map is nonnull.
2798  // Reallocate the DualView with the right dimensions.
2799  const size_t newNumRows = newMap->getLocalNumElements ();
2800  const size_t origNumRows = view_.extent (0);
2801  const size_t numCols = this->getNumVectors ();
2802 
2803  if (origNumRows != newNumRows || view_.extent (1) != numCols) {
2804  view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2805  }
2806  }
2807  else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
2808  // I am an excluded process. Reinitialize my data so that I
2809  // have 0 rows. Keep the number of columns as before.
2810  const size_t newNumRows = static_cast<size_t> (0);
2811  const size_t numCols = this->getNumVectors ();
2812  view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2813  }
2814 
2815  this->map_ = newMap;
2816  }
2817 
2818  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2819  void
2821  scale (const Scalar& alpha)
2822  {
2823  using Kokkos::ALL;
2824  using IST = impl_scalar_type;
2825 
2826  const IST theAlpha = static_cast<IST> (alpha);
2827  if (theAlpha == Kokkos::ArithTraits<IST>::one ()) {
2828  return; // do nothing
2829  }
2830  const size_t lclNumRows = getLocalLength ();
2831  const size_t numVecs = getNumVectors ();
2832  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2833  const std::pair<size_t, size_t> colRng (0, numVecs);
2834 
2835  // We can't substitute putScalar(0.0) for scale(0.0), because the
2836  // former will overwrite NaNs present in the MultiVector. The
2837  // semantics of this call require multiplying them by 0, which
2838  // IEEE 754 requires to be NaN.
2839 
2840  // If we need sync to device, then host has the most recent version.
2841  const bool useHostVersion = need_sync_device ();
2842  if (useHostVersion) {
2843  auto Y_lcl = Kokkos::subview (getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2844  if (isConstantStride ()) {
2845  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2846  }
2847  else {
2848  for (size_t k = 0; k < numVecs; ++k) {
2849  const size_t Y_col = whichVectors_[k];
2850  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2851  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2852  }
2853  }
2854  }
2855  else { // work on device
2856  auto Y_lcl = Kokkos::subview (getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2857  if (isConstantStride ()) {
2858  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2859  }
2860  else {
2861  for (size_t k = 0; k < numVecs; ++k) {
2862  const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2863  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2864  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2865  }
2866  }
2867  }
2868  }
2869 
2870 
2871  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2872  void
2874  scale (const Teuchos::ArrayView<const Scalar>& alphas)
2875  {
2876  const size_t numVecs = this->getNumVectors ();
2877  const size_t numAlphas = static_cast<size_t> (alphas.size ());
2878  TEUCHOS_TEST_FOR_EXCEPTION(
2879  numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2880  "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2881  << numVecs << ".");
2882 
2883  // Use a DualView to copy the scaling constants onto the device.
2884  using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2885  k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2886  k_alphas.modify_host ();
2887  for (size_t i = 0; i < numAlphas; ++i) {
2888  k_alphas.h_view(i) = static_cast<impl_scalar_type> (alphas[i]);
2889  }
2890  k_alphas.sync_device ();
2891  // Invoke the scale() overload that takes a device View of coefficients.
2892  this->scale (k_alphas.view_device ());
2893  }
2894 
2895  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2896  void
2898  scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
2899  {
2900  using Kokkos::ALL;
2901  using Kokkos::subview;
2902 
2903  const size_t lclNumRows = this->getLocalLength ();
2904  const size_t numVecs = this->getNumVectors ();
2905  TEUCHOS_TEST_FOR_EXCEPTION(
2906  static_cast<size_t> (alphas.extent (0)) != numVecs,
2907  std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2908  "alphas.extent(0) = " << alphas.extent (0)
2909  << " != this->getNumVectors () = " << numVecs << ".");
2910  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2911  const std::pair<size_t, size_t> colRng (0, numVecs);
2912 
2913  // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2914  // type of the return value of subview. This is because if we
2915  // switch the array layout from LayoutLeft to LayoutRight
2916  // (preferred for performance of block operations), the types
2917  // below won't be valid. (A view of a column of a LayoutRight
2918  // multivector has LayoutStride, not LayoutLeft.)
2919 
2920  // If we need sync to device, then host has the most recent version.
2921  const bool useHostVersion = this->need_sync_device ();
2922  if (useHostVersion) {
2923  // Work in host memory. This means we need to create a host
2924  // mirror of the input View of coefficients.
2925  auto alphas_h = Kokkos::create_mirror_view (alphas);
2926  // DEEP_COPY REVIEW - NOT TESTED
2927  Kokkos::deep_copy (alphas_h, alphas);
2928 
2929  auto Y_lcl = subview (this->getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2930  if (isConstantStride ()) {
2931  KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2932  }
2933  else {
2934  for (size_t k = 0; k < numVecs; ++k) {
2935  const size_t Y_col = this->isConstantStride () ? k :
2936  this->whichVectors_[k];
2937  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2938  // We don't have to use the entire 1-D View here; we can use
2939  // the version that takes a scalar coefficient.
2940  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2941  }
2942  }
2943  }
2944  else { // Work in device memory, using the input View 'alphas' directly.
2945  auto Y_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2946  if (isConstantStride ()) {
2947  KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2948  }
2949  else {
2950  // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2951  // as values on host, so copy them to host. Another approach
2952  // would be to fix scal() so that it takes a 0-D View as the
2953  // second argument.
2954  auto alphas_h = Kokkos::create_mirror_view (alphas);
2955  // DEEP_COPY REVIEW - NOT TESTED
2956  Kokkos::deep_copy (alphas_h, alphas);
2957 
2958  for (size_t k = 0; k < numVecs; ++k) {
2959  const size_t Y_col = this->isConstantStride () ? k :
2960  this->whichVectors_[k];
2961  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2962  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2963  }
2964  }
2965  }
2966  }
2967 
2968  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2969  void
2971  scale (const Scalar& alpha,
2973  {
2974  using Kokkos::ALL;
2975  using Kokkos::subview;
2976  const char tfecfFuncName[] = "scale: ";
2977 
2978  const size_t lclNumRows = getLocalLength ();
2979  const size_t numVecs = getNumVectors ();
2980 
2981  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2982  lclNumRows != A.getLocalLength (), std::invalid_argument,
2983  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2984  << A.getLocalLength () << ".");
2985  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2986  numVecs != A.getNumVectors (), std::invalid_argument,
2987  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2988  << A.getNumVectors () << ".");
2989 
2990  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2991  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2992  const std::pair<size_t, size_t> colRng (0, numVecs);
2993 
2994  auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2995  auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2996  auto Y_lcl = subview (Y_lcl_orig, rowRng, ALL ());
2997  auto X_lcl = subview (X_lcl_orig, rowRng, ALL ());
2998 
2999  if (isConstantStride () && A.isConstantStride ()) {
3000  KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
3001  }
3002  else {
3003  // Make sure that Kokkos only uses the local length for add.
3004  for (size_t k = 0; k < numVecs; ++k) {
3005  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
3006  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
3007  auto Y_k = subview (Y_lcl, ALL (), Y_col);
3008  auto X_k = subview (X_lcl, ALL (), X_col);
3009 
3010  KokkosBlas::scal (Y_k, theAlpha, X_k);
3011  }
3012  }
3013  }
3014 
3015 
3016 
3017  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3018  void
3021  {
3022  const char tfecfFuncName[] = "reciprocal: ";
3023 
3024  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3025  getLocalLength () != A.getLocalLength (), std::runtime_error,
3026  "MultiVectors do not have the same local length. "
3027  "this->getLocalLength() = " << getLocalLength ()
3028  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
3029  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3030  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
3031  ": MultiVectors do not have the same number of columns (vectors). "
3032  "this->getNumVectors() = " << getNumVectors ()
3033  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
3034 
3035  const size_t numVecs = getNumVectors ();
3036 
3037  auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
3038  auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
3039 
3040  if (isConstantStride () && A.isConstantStride ()) {
3041  KokkosBlas::reciprocal (this_view_dev, A_view_dev);
3042  }
3043  else {
3044  using Kokkos::ALL;
3045  using Kokkos::subview;
3046  for (size_t k = 0; k < numVecs; ++k) {
3047  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3048  auto vector_k = subview (this_view_dev, ALL (), this_col);
3049  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
3050  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
3051  KokkosBlas::reciprocal (vector_k, vector_Ak);
3052  }
3053  }
3054  }
3055 
3056  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3057  void
3060  {
3061  const char tfecfFuncName[] = "abs";
3062 
3063  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3064  getLocalLength () != A.getLocalLength (), std::runtime_error,
3065  ": MultiVectors do not have the same local length. "
3066  "this->getLocalLength() = " << getLocalLength ()
3067  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
3068  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3069  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
3070  ": MultiVectors do not have the same number of columns (vectors). "
3071  "this->getNumVectors() = " << getNumVectors ()
3072  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
3073  const size_t numVecs = getNumVectors ();
3074 
3075  auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
3076  auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
3077 
3078  if (isConstantStride () && A.isConstantStride ()) {
3079  KokkosBlas::abs (this_view_dev, A_view_dev);
3080  }
3081  else {
3082  using Kokkos::ALL;
3083  using Kokkos::subview;
3084 
3085  for (size_t k=0; k < numVecs; ++k) {
3086  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3087  auto vector_k = subview (this_view_dev, ALL (), this_col);
3088  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
3089  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
3090  KokkosBlas::abs (vector_k, vector_Ak);
3091  }
3092  }
3093  }
3094 
3095  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3096  void
3098  update (const Scalar& alpha,
3100  const Scalar& beta)
3101  {
3102  const char tfecfFuncName[] = "update: ";
3103  using Kokkos::subview;
3104  using Kokkos::ALL;
3105 
3106  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");
3107 
3108  const size_t lclNumRows = getLocalLength ();
3109  const size_t numVecs = getNumVectors ();
3110 
3112  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3113  lclNumRows != A.getLocalLength (), std::invalid_argument,
3114  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
3115  << A.getLocalLength () << ".");
3116  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3117  numVecs != A.getNumVectors (), std::invalid_argument,
3118  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
3119  << A.getNumVectors () << ".");
3120  }
3121 
3122  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3123  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3124  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3125  const std::pair<size_t, size_t> colRng (0, numVecs);
3126 
3127  auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
3128  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
3129  auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
3130  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
3131 
3132  // The device memory of *this is about to be modified
3133  if (isConstantStride () && A.isConstantStride ()) {
3134  KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
3135  }
3136  else {
3137  // Make sure that Kokkos only uses the local length for add.
3138  for (size_t k = 0; k < numVecs; ++k) {
3139  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
3140  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
3141  auto Y_k = subview (Y_lcl, ALL (), Y_col);
3142  auto X_k = subview (X_lcl, ALL (), X_col);
3143 
3144  KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
3145  }
3146  }
3147  }
3148 
3149  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3150  void
3152  update (const Scalar& alpha,
3154  const Scalar& beta,
3156  const Scalar& gamma)
3157  {
3158  using Kokkos::ALL;
3159  using Kokkos::subview;
3160 
3161  const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
3162 
3163  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)");
3164 
3165  const size_t lclNumRows = this->getLocalLength ();
3166  const size_t numVecs = getNumVectors ();
3167 
3169  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3170  lclNumRows != A.getLocalLength (), std::invalid_argument,
3171  "The input MultiVector A has " << A.getLocalLength () << " local "
3172  "row(s), but this MultiVector has " << lclNumRows << " local "
3173  "row(s).");
3174  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3175  lclNumRows != B.getLocalLength (), std::invalid_argument,
3176  "The input MultiVector B has " << B.getLocalLength () << " local "
3177  "row(s), but this MultiVector has " << lclNumRows << " local "
3178  "row(s).");
3179  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3180  A.getNumVectors () != numVecs, std::invalid_argument,
3181  "The input MultiVector A has " << A.getNumVectors () << " column(s), "
3182  "but this MultiVector has " << numVecs << " column(s).");
3183  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3184  B.getNumVectors () != numVecs, std::invalid_argument,
3185  "The input MultiVector B has " << B.getNumVectors () << " column(s), "
3186  "but this MultiVector has " << numVecs << " column(s).");
3187  }
3188 
3189  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3190  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3191  const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
3192 
3193  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3194  const std::pair<size_t, size_t> colRng (0, numVecs);
3195 
3196  // Prefer 'auto' over specifying the type explicitly. This avoids
3197  // issues with a subview possibly having a different type than the
3198  // original view.
3199  auto C_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
3200  auto A_lcl = subview (A.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3201  auto B_lcl = subview (B.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3202 
3203  if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
3204  KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
3205  }
3206  else {
3207  // Some input (or *this) is not constant stride,
3208  // so perform the update one column at a time.
3209  for (size_t k = 0; k < numVecs; ++k) {
3210  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3211  const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
3212  const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
3213  KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
3214  theBeta, subview (B_lcl, rowRng, B_col),
3215  theGamma, subview (C_lcl, rowRng, this_col));
3216  }
3217  }
3218  }
3219 
3220  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3221  Teuchos::ArrayRCP<const Scalar>
3223  getData (size_t j) const
3224  {
3225  using Kokkos::ALL;
3226  using IST = impl_scalar_type;
3227  const char tfecfFuncName[] = "getData: ";
3228 
3229  // Any MultiVector method that called the (classic) Kokkos Node's
3230  // viewBuffer or viewBufferNonConst methods always implied a
3231  // device->host synchronization. Thus, we synchronize here as
3232  // well.
3233 
3234  auto hostView = getLocalViewHost(Access::ReadOnly);
3235  const size_t col = isConstantStride () ? j : whichVectors_[j];
3236  auto hostView_j = Kokkos::subview (hostView, ALL (), col);
3237  Teuchos::ArrayRCP<const IST> dataAsArcp =
3238  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3239 
3240  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3241  (static_cast<size_t> (hostView_j.extent (0)) <
3242  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3243  "hostView_j.extent(0) = " << hostView_j.extent (0)
3244  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3245  "Please report this bug to the Tpetra developers.");
3246 
3247  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3248  }
3249 
3250  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3251  Teuchos::ArrayRCP<Scalar>
3254  {
3255  using Kokkos::ALL;
3256  using Kokkos::subview;
3257  using IST = impl_scalar_type;
3258  const char tfecfFuncName[] = "getDataNonConst: ";
3259 
3260  auto hostView = getLocalViewHost(Access::ReadWrite);
3261  const size_t col = isConstantStride () ? j : whichVectors_[j];
3262  auto hostView_j = subview (hostView, ALL (), col);
3263  Teuchos::ArrayRCP<IST> dataAsArcp =
3264  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3265 
3266  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3267  (static_cast<size_t> (hostView_j.extent (0)) <
3268  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3269  "hostView_j.extent(0) = " << hostView_j.extent (0)
3270  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3271  "Please report this bug to the Tpetra developers.");
3272 
3273  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3274  }
3275 
3276  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3277  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3279  subCopy (const Teuchos::ArrayView<const size_t>& cols) const
3280  {
3281  using Teuchos::RCP;
3283 
3284  // Check whether the index set in cols is contiguous. If it is,
3285  // use the more efficient Range1D version of subCopy.
3286  bool contiguous = true;
3287  const size_t numCopyVecs = static_cast<size_t> (cols.size ());
3288  for (size_t j = 1; j < numCopyVecs; ++j) {
3289  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3290  contiguous = false;
3291  break;
3292  }
3293  }
3294  if (contiguous && numCopyVecs > 0) {
3295  return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
3296  }
3297  else {
3298  RCP<const MV> X_sub = this->subView (cols);
3299  RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
3300  Y->assign (*X_sub);
3301  return Y;
3302  }
3303  }
3304 
3305  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3306  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3308  subCopy (const Teuchos::Range1D &colRng) const
3309  {
3310  using Teuchos::RCP;
3312 
3313  RCP<const MV> X_sub = this->subView (colRng);
3314  RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3315  Y->assign (*X_sub);
3316  return Y;
3317  }
3318 
3319  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3320  size_t
3323  return view_.origExtent(0);
3324  }
3325 
3326  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3327  size_t
3330  return view_.origExtent(1);
3331  }
3332 
3333  template <class Scalar, class LO, class GO, class Node>
3336  const Teuchos::RCP<const map_type>& subMap,
3337  const local_ordinal_type rowOffset) :
3338  base_type (subMap)
3339  {
3340  using Kokkos::ALL;
3341  using Kokkos::subview;
3342  using Teuchos::outArg;
3343  using Teuchos::RCP;
3344  using Teuchos::rcp;
3345  using Teuchos::reduceAll;
3346  using Teuchos::REDUCE_MIN;
3347  using std::endl;
3349  const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3350  const char suffix[] = "Please report this bug to the Tpetra developers.";
3351  int lclGood = 1;
3352  int gblGood = 1;
3353  std::unique_ptr<std::ostringstream> errStrm;
3354  const bool debug = ::Tpetra::Details::Behavior::debug ();
3355  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
3356 
3357  // Be careful to use the input Map's communicator, not X's. The
3358  // idea is that, on return, *this is a subview of X, using the
3359  // input Map.
3360  const auto comm = subMap->getComm ();
3361  TEUCHOS_ASSERT( ! comm.is_null () );
3362  const int myRank = comm->getRank ();
3363 
3364  const LO lclNumRowsBefore = static_cast<LO> (X.getLocalLength ());
3365  const LO numCols = static_cast<LO> (X.getNumVectors ());
3366  const LO newNumRows = static_cast<LO> (subMap->getLocalNumElements ());
3367  if (verbose) {
3368  std::ostringstream os;
3369  os << "Proc " << myRank << ": " << prefix
3370  << "X: {lclNumRows: " << lclNumRowsBefore
3371  << ", origLclNumRows: " << X.getOrigNumLocalRows ()
3372  << ", numCols: " << numCols << "}, "
3373  << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3374  std::cerr << os.str ();
3375  }
3376  // We ask for the _original_ number of rows in X, because X could
3377  // be a shorter (fewer rows) view of a longer MV. (For example, X
3378  // could be a domain Map view of a column Map MV.)
3379  const bool tooManyElts =
3380  newNumRows + rowOffset > static_cast<LO> (X.getOrigNumLocalRows ());
3381  if (tooManyElts) {
3382  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3383  *errStrm << " Proc " << myRank << ": subMap->getLocalNumElements() (="
3384  << newNumRows << ") + rowOffset (=" << rowOffset
3385  << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows ()
3386  << ")." << endl;
3387  lclGood = 0;
3388  TEUCHOS_TEST_FOR_EXCEPTION
3389  (! debug && tooManyElts, std::invalid_argument,
3390  prefix << errStrm->str () << suffix);
3391  }
3392 
3393  if (debug) {
3394  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3395  if (gblGood != 1) {
3396  std::ostringstream gblErrStrm;
3397  const std::string myErrStr =
3398  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3399  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3400  TEUCHOS_TEST_FOR_EXCEPTION
3401  (true, std::invalid_argument, gblErrStrm.str ());
3402  }
3403  }
3404 
3405  using range_type = std::pair<LO, LO>;
3406  const range_type origRowRng
3407  (rowOffset, static_cast<LO> (X.view_.origExtent (0)));
3408  const range_type rowRng
3409  (rowOffset, rowOffset + newNumRows);
3410 
3411  wrapped_dual_view_type newView = takeSubview (X.view_, rowRng, ALL ());
3412 
3413  // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3414  // handling subviews of degenerate Views quite so well. For some
3415  // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3416  // 0. We work around by creating a new empty DualView of the
3417  // desired (degenerate) dimensions.
3418  if (newView.extent (0) == 0 &&
3419  newView.extent (1) != X.view_.extent (1)) {
3420  newView =
3421  allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3422  }
3423 
3424  MV subViewMV = X.isConstantStride () ?
3425  MV (subMap, newView) :
3426  MV (subMap, newView, X.whichVectors_ ());
3427 
3428  if (debug) {
3429  const LO lclNumRowsRet = static_cast<LO> (subViewMV.getLocalLength ());
3430  const LO numColsRet = static_cast<LO> (subViewMV.getNumVectors ());
3431  if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3432  lclGood = 0;
3433  if (errStrm.get () == nullptr) {
3434  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3435  }
3436  *errStrm << " Proc " << myRank <<
3437  ": subMap.getLocalNumElements(): " << newNumRows <<
3438  ", subViewMV.getLocalLength(): " << lclNumRowsRet <<
3439  ", X.getNumVectors(): " << numCols <<
3440  ", subViewMV.getNumVectors(): " << numColsRet << endl;
3441  }
3442  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3443  if (gblGood != 1) {
3444  std::ostringstream gblErrStrm;
3445  if (myRank == 0) {
3446  gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3447  "dimensions on one or more processes:" << endl;
3448  }
3449  const std::string myErrStr =
3450  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3451  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3452  gblErrStrm << suffix << endl;
3453  TEUCHOS_TEST_FOR_EXCEPTION
3454  (true, std::invalid_argument, gblErrStrm.str ());
3455  }
3456  }
3457 
3458  if (verbose) {
3459  std::ostringstream os;
3460  os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3461  std::cerr << os.str ();
3462  }
3463 
3464  *this = subViewMV; // shallow copy
3465 
3466  if (verbose) {
3467  std::ostringstream os;
3468  os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3469  std::cerr << os.str ();
3470  }
3471  }
3472 
3473  template <class Scalar, class LO, class GO, class Node>
3476  const map_type& subMap,
3477  const size_t rowOffset) :
3478  MultiVector (X, Teuchos::RCP<const map_type> (new map_type (subMap)),
3479  static_cast<local_ordinal_type> (rowOffset))
3480  {}
3481 
3482  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3483  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3485  offsetView (const Teuchos::RCP<const map_type>& subMap,
3486  const size_t offset) const
3487  {
3489  return Teuchos::rcp (new MV (*this, *subMap, offset));
3490  }
3491 
3492  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3493  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3495  offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3496  const size_t offset)
3497  {
3499  return Teuchos::rcp (new MV (*this, *subMap, offset));
3500  }
3501 
3502  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3503  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3505  subView (const Teuchos::ArrayView<const size_t>& cols) const
3506  {
3507  using Teuchos::Array;
3508  using Teuchos::rcp;
3510 
3511  const size_t numViewCols = static_cast<size_t> (cols.size ());
3512  TEUCHOS_TEST_FOR_EXCEPTION(
3513  numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3514  "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3515  "contain at least one entry, but cols.size() = " << cols.size ()
3516  << " == 0.");
3517 
3518  // Check whether the index set in cols is contiguous. If it is,
3519  // use the more efficient Range1D version of subView.
3520  bool contiguous = true;
3521  for (size_t j = 1; j < numViewCols; ++j) {
3522  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3523  contiguous = false;
3524  break;
3525  }
3526  }
3527  if (contiguous) {
3528  if (numViewCols == 0) {
3529  // The output MV has no columns, so there is nothing to view.
3530  return rcp (new MV (this->getMap (), numViewCols));
3531  } else {
3532  // Use the more efficient contiguous-index-range version.
3533  return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3534  }
3535  }
3536 
3537  if (isConstantStride ()) {
3538  return rcp (new MV (this->getMap (), view_, cols));
3539  }
3540  else {
3541  Array<size_t> newcols (cols.size ());
3542  for (size_t j = 0; j < numViewCols; ++j) {
3543  newcols[j] = whichVectors_[cols[j]];
3544  }
3545  return rcp (new MV (this->getMap (), view_, newcols ()));
3546  }
3547  }
3548 
3549 
3550  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3551  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3553  subView (const Teuchos::Range1D& colRng) const
3554  {
3555  using ::Tpetra::Details::Behavior;
3556  using Kokkos::ALL;
3557  using Kokkos::subview;
3558  using Teuchos::Array;
3559  using Teuchos::RCP;
3560  using Teuchos::rcp;
3562  const char tfecfFuncName[] = "subView(Range1D): ";
3563 
3564  const size_t lclNumRows = this->getLocalLength ();
3565  const size_t numVecs = this->getNumVectors ();
3566  // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3567  // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3568  // "at least one vector.");
3569  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3570  static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3571  "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3572  << numVecs << ".");
3573  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3574  numVecs != 0 && colRng.size () != 0 &&
3575  (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3576  static_cast<size_t> (colRng.ubound ()) >= numVecs),
3577  std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3578  "," << colRng.ubound () << "] exceeds the valid range of column indices "
3579  "[0, " << numVecs << "].");
3580 
3581  RCP<const MV> X_ret; // the MultiVector subview to return
3582 
3583  // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3584  // broken for the case of views with zero rows. I will brutally
3585  // enforce that the subview has the correct dimensions. In
3586  // particular, in the case of zero rows, I will, if necessary,
3587  // create a new dual_view_type with zero rows and the correct
3588  // number of columns. In a debug build, I will use an all-reduce
3589  // to ensure that it has the correct dimensions on all processes.
3590 
3591  const std::pair<size_t, size_t> rows (0, lclNumRows);
3592  if (colRng.size () == 0) {
3593  const std::pair<size_t, size_t> cols (0, 0); // empty range
3594  wrapped_dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3595  X_ret = rcp (new MV (this->getMap (), X_sub));
3596  }
3597  else {
3598  // Returned MultiVector is constant stride only if *this is.
3599  if (isConstantStride ()) {
3600  const std::pair<size_t, size_t> cols (colRng.lbound (),
3601  colRng.ubound () + 1);
3602  wrapped_dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3603  X_ret = rcp (new MV (this->getMap (), X_sub));
3604  }
3605  else {
3606  if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3607  // We're only asking for one column, so the result does have
3608  // constant stride, even though this MultiVector does not.
3609  const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3610  whichVectors_[0] + colRng.ubound () + 1);
3611  wrapped_dual_view_type X_sub = takeSubview (view_, ALL (), col);
3612  X_ret = rcp (new MV (this->getMap (), X_sub));
3613  }
3614  else {
3615  Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3616  whichVectors_.begin () + colRng.ubound () + 1);
3617  X_ret = rcp (new MV (this->getMap (), view_, which));
3618  }
3619  }
3620  }
3621 
3622  const bool debug = Behavior::debug ();
3623  if (debug) {
3624  using Teuchos::Comm;
3625  using Teuchos::outArg;
3626  using Teuchos::REDUCE_MIN;
3627  using Teuchos::reduceAll;
3628 
3629  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
3630  Teuchos::null : this->getMap ()->getComm ();
3631  if (! comm.is_null ()) {
3632  int lclSuccess = 1;
3633  int gblSuccess = 1;
3634 
3635  if (X_ret.is_null ()) {
3636  lclSuccess = 0;
3637  }
3638  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3639  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3640  (lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3641  "MultiVector; the return value of this method) is null on some MPI "
3642  "process in this MultiVector's communicator. This should never "
3643  "happen. Please report this bug to the Tpetra developers.");
3644  if (! X_ret.is_null () &&
3645  X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3646  lclSuccess = 0;
3647  }
3648  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess,
3649  outArg (gblSuccess));
3650  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3651  (lclSuccess != 1, std::logic_error, "X_ret->getNumVectors() != "
3652  "colRng.size(), on at least one MPI process in this MultiVector's "
3653  "communicator. This should never happen. "
3654  "Please report this bug to the Tpetra developers.");
3655  }
3656  }
3657  return X_ret;
3658  }
3659 
3660 
3661  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3662  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3664  subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3665  {
3667  return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3668  }
3669 
3670 
3671  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3672  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3674  subViewNonConst (const Teuchos::Range1D &colRng)
3675  {
3677  return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3678  }
3679 
3680 
3681  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3684  const size_t j)
3685  : base_type (X.getMap ())
3686  {
3687  using Kokkos::subview;
3688  typedef std::pair<size_t, size_t> range_type;
3689  const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3690 
3691  const size_t numCols = X.getNumVectors ();
3692  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3693  (j >= numCols, std::invalid_argument, "Input index j (== " << j
3694  << ") exceeds valid column index range [0, " << numCols << " - 1].");
3695  const size_t jj = X.isConstantStride () ?
3696  static_cast<size_t> (j) :
3697  static_cast<size_t> (X.whichVectors_[j]);
3698  this->view_ = takeSubview (X.view_, Kokkos::ALL (), range_type (jj, jj+1));
3699 
3700  // mfh 31 Jul 2017: It would be unwise to execute concurrent
3701  // Export or Import operations with different subviews of a
3702  // MultiVector. Thus, it is safe to reuse communication buffers.
3703  // See #1560 discussion.
3704  //
3705  // We only need one column's worth of buffer for imports_ and
3706  // exports_. Taking subviews now ensures that their lengths will
3707  // be exactly what we need, so we won't have to resize them later.
3708  {
3709  const size_t newSize = X.imports_.extent (0) / numCols;
3710  const size_t offset = jj*newSize;
3711  auto newImports = X.imports_;
3712  newImports.d_view = subview (X.imports_.d_view,
3713  range_type (offset, offset+newSize));
3714  newImports.h_view = subview (X.imports_.h_view,
3715  range_type (offset, offset+newSize));
3716  this->imports_ = newImports;
3717  }
3718  {
3719  const size_t newSize = X.exports_.extent (0) / numCols;
3720  const size_t offset = jj*newSize;
3721  auto newExports = X.exports_;
3722  newExports.d_view = subview (X.exports_.d_view,
3723  range_type (offset, offset+newSize));
3724  newExports.h_view = subview (X.exports_.h_view,
3725  range_type (offset, offset+newSize));
3726  this->exports_ = newExports;
3727  }
3728  // These two DualViews already either have the right number of
3729  // entries, or zero entries. This means that we don't need to
3730  // resize them.
3733  }
3734 
3735 
3736  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3737  Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3739  getVector (const size_t j) const
3740  {
3742  return Teuchos::rcp (new V (*this, j));
3743  }
3744 
3745 
3746  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3747  Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3749  getVectorNonConst (const size_t j)
3750  {
3752  return Teuchos::rcp (new V (*this, j));
3753  }
3754 
3755 
3756  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3757  void
3759  get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3760  {
3761  using IST = impl_scalar_type;
3762  using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3763  Kokkos::HostSpace,
3764  Kokkos::MemoryUnmanaged>;
3765  const char tfecfFuncName[] = "get1dCopy: ";
3766 
3767  const size_t numRows = this->getLocalLength ();
3768  const size_t numCols = this->getNumVectors ();
3769 
3770  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3771  (LDA < numRows, std::runtime_error,
3772  "LDA = " << LDA << " < numRows = " << numRows << ".");
3773  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3774  (numRows > size_t (0) && numCols > size_t (0) &&
3775  size_t (A.size ()) < LDA * (numCols - 1) + numRows,
3776  std::runtime_error,
3777  "A.size() = " << A.size () << ", but its size must be at least "
3778  << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3779 
3780  const std::pair<size_t, size_t> rowRange (0, numRows);
3781  const std::pair<size_t, size_t> colRange (0, numCols);
3782 
3783  input_view_type A_view_orig (reinterpret_cast<IST*> (A.getRawPtr ()),
3784  LDA, numCols);
3785  auto A_view = Kokkos::subview (A_view_orig, rowRange, colRange);
3786 
3798  if (this->need_sync_host() && this->need_sync_device()) {
3801  throw std::runtime_error("Tpetra::MultiVector: A non-const view is alive outside and we cannot give a copy where host or device view can be modified outside");
3802  }
3803  else {
3804  const bool useHostView = view_.host_view_use_count() >= view_.device_view_use_count();
3805  if (this->isConstantStride ()) {
3806  if (useHostView) {
3807  auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3808  // DEEP_COPY REVIEW - NOT TESTED
3809  Kokkos::deep_copy (A_view, srcView_host);
3810  } else {
3811  auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3812  // DEEP_COPY REVIEW - NOT TESTED
3813  Kokkos::deep_copy (A_view, srcView_device);
3814  }
3815  }
3816  else {
3817  for (size_t j = 0; j < numCols; ++j) {
3818  const size_t srcCol = this->whichVectors_[j];
3819  auto dstColView = Kokkos::subview (A_view, rowRange, j);
3820 
3821  if (useHostView) {
3822  auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3823  auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
3824  // DEEP_COPY REVIEW - NOT TESTED
3825  Kokkos::deep_copy (dstColView, srcColView_host);
3826  } else {
3827  auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3828  auto srcColView_device = Kokkos::subview (srcView_device, rowRange, srcCol);
3829  // DEEP_COPY REVIEW - NOT TESTED
3830  Kokkos::deep_copy (dstColView, srcColView_device);
3831  }
3832  }
3833  }
3834  }
3835  }
3836 
3837 
3838  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3839  void
3841  get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3842  {
3844  const char tfecfFuncName[] = "get2dCopy: ";
3845  const size_t numRows = this->getLocalLength ();
3846  const size_t numCols = this->getNumVectors ();
3847 
3848  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3849  static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3850  std::runtime_error, "Input array of pointers must contain as many "
3851  "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3852  << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3853 
3854  if (numRows != 0 && numCols != 0) {
3855  // No side effects until we've validated the input.
3856  for (size_t j = 0; j < numCols; ++j) {
3857  const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3858  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3859  dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3860  "the input array of arrays is not long enough to fit all entries in "
3861  "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3862  << " < getLocalLength() = " << numRows << ".");
3863  }
3864 
3865  // We've validated the input, so it's safe to start copying.
3866  for (size_t j = 0; j < numCols; ++j) {
3867  Teuchos::RCP<const V> X_j = this->getVector (j);
3868  const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3869  X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3870  }
3871  }
3872  }
3873 
3874 
3875  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3876  Teuchos::ArrayRCP<const Scalar>
3878  get1dView () const
3879  {
3880  if (getLocalLength () == 0 || getNumVectors () == 0) {
3881  return Teuchos::null;
3882  } else {
3883  TEUCHOS_TEST_FOR_EXCEPTION(
3884  ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3885  "get1dView: This MultiVector does not have constant stride, so it is "
3886  "not possible to view its data as a single array. You may check "
3887  "whether a MultiVector has constant stride by calling "
3888  "isConstantStride().");
3889  // Since get1dView() is and was always marked const, I have to
3890  // cast away const here in order not to break backwards
3891  // compatibility.
3892  auto X_lcl = getLocalViewHost(Access::ReadOnly);
3893  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3894  Kokkos::Compat::persistingView (X_lcl);
3895  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3896  }
3897  }
3898 
3899  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3900  Teuchos::ArrayRCP<Scalar>
3903  {
3904  if (this->getLocalLength () == 0 || this->getNumVectors () == 0) {
3905  return Teuchos::null;
3906  }
3907  else {
3908  TEUCHOS_TEST_FOR_EXCEPTION
3909  (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3910  "get1dViewNonConst: This MultiVector does not have constant stride, "
3911  "so it is not possible to view its data as a single array. You may "
3912  "check whether a MultiVector has constant stride by calling "
3913  "isConstantStride().");
3914  auto X_lcl = getLocalViewHost(Access::ReadWrite);
3915  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3916  Kokkos::Compat::persistingView (X_lcl);
3917  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3918  }
3919  }
3920 
3921  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3922  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3925  {
3926  auto X_lcl = getLocalViewHost(Access::ReadWrite);
3927 
3928  // Don't use the row range here on the outside, in order to avoid
3929  // a strided return type (in case Kokkos::subview is conservative
3930  // about that). Instead, use the row range for the column views
3931  // in the loop.
3932  const size_t myNumRows = this->getLocalLength ();
3933  const size_t numCols = this->getNumVectors ();
3934  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3935 
3936  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
3937  for (size_t j = 0; j < numCols; ++j) {
3938  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3939  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3940  Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3941  Kokkos::Compat::persistingView (X_lcl_j);
3942  views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
3943  }
3944  return views;
3945  }
3946 
3947  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3950  getLocalViewHost(Access::ReadOnlyStruct s) const
3951  {
3952  return view_.getHostView(s);
3953  }
3954 
3955  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3958  getLocalViewHost(Access::ReadWriteStruct s)
3959  {
3960  return view_.getHostView(s);
3961  }
3962 
3963  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3966  getLocalViewHost(Access::OverwriteAllStruct s)
3967  {
3968  return view_.getHostView(s);
3969  }
3970 
3971  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3974  getLocalViewDevice(Access::ReadOnlyStruct s) const
3975  {
3976  return view_.getDeviceView(s);
3977  }
3978 
3979  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3982  getLocalViewDevice(Access::ReadWriteStruct s)
3983  {
3984  return view_.getDeviceView(s);
3985  }
3986 
3987  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3990  getLocalViewDevice(Access::OverwriteAllStruct s)
3991  {
3992  return view_.getDeviceView(s);
3993  }
3994 
3995  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3999  return view_;
4000  }
4001 
4002  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4003  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
4005  get2dView () const
4006  {
4007  // Since get2dView() is and was always marked const, I have to
4008  // cast away const here in order not to break backwards
4009  // compatibility.
4010  auto X_lcl = getLocalViewHost(Access::ReadOnly);
4011 
4012  // Don't use the row range here on the outside, in order to avoid
4013  // a strided return type (in case Kokkos::subview is conservative
4014  // about that). Instead, use the row range for the column views
4015  // in the loop.
4016  const size_t myNumRows = this->getLocalLength ();
4017  const size_t numCols = this->getNumVectors ();
4018  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
4019 
4020  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
4021  for (size_t j = 0; j < numCols; ++j) {
4022  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
4023  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
4024  Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
4025  Kokkos::Compat::persistingView (X_lcl_j);
4026  views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
4027  }
4028  return views;
4029  }
4030 
4031  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4032  void
4034  multiply (Teuchos::ETransp transA,
4035  Teuchos::ETransp transB,
4036  const Scalar& alpha,
4039  const Scalar& beta)
4040  {
4041  using ::Tpetra::Details::ProfilingRegion;
4042  using Teuchos::CONJ_TRANS;
4043  using Teuchos::NO_TRANS;
4044  using Teuchos::TRANS;
4045  using Teuchos::RCP;
4046  using Teuchos::rcp;
4047  using Teuchos::rcpFromRef;
4048  using std::endl;
4049  using ATS = Kokkos::ArithTraits<impl_scalar_type>;
4050  using LO = local_ordinal_type;
4051  using STS = Teuchos::ScalarTraits<Scalar>;
4053  const char tfecfFuncName[] = "multiply: ";
4054  ProfilingRegion region ("Tpetra::MV::multiply");
4055 
4056  // This routine performs a variety of matrix-matrix multiply
4057  // operations, interpreting the MultiVector (this-aka C , A and B)
4058  // as 2D matrices. Variations are due to the fact that A, B and C
4059  // can be locally replicated or globally distributed MultiVectors
4060  // and that we may or may not operate with the transpose of A and
4061  // B. Possible cases are:
4062  //
4063  // Operations # Cases Notes
4064  // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
4065  // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
4066  // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
4067  //
4068  // The following operations are not meaningful for 1-D
4069  // distributions:
4070  //
4071  // u1) C(local) = A^T(distr) * B^T(distr) 1
4072  // u2) C(local) = A (distr) * B^X(distr) 2
4073  // u3) C(distr) = A^X(local) * B^X(local) 4
4074  // u4) C(distr) = A^X(local) * B^X(distr) 4
4075  // u5) C(distr) = A^T(distr) * B^X(local) 2
4076  // u6) C(local) = A^X(distr) * B^X(local) 4
4077  // u7) C(distr) = A^X(distr) * B^X(local) 4
4078  // u8) C(local) = A^X(local) * B^X(distr) 4
4079  //
4080  // Total number of cases: 32 (= 2^5).
4081 
4082  impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
4083 
4084  const bool A_is_local = ! A.isDistributed ();
4085  const bool B_is_local = ! B.isDistributed ();
4086  const bool C_is_local = ! this->isDistributed ();
4087 
4088  // In debug mode, check compatibility of local dimensions. We
4089  // only do this in debug mode, since it requires an all-reduce
4090  // to ensure correctness on all processses. It's entirely
4091  // possible that only some processes may have incompatible local
4092  // dimensions. Throwing an exception only on those processes
4093  // could cause this method to hang.
4094  const bool debug = ::Tpetra::Details::Behavior::debug ();
4095  if (debug) {
4096  auto myMap = this->getMap ();
4097  if (! myMap.is_null () && ! myMap->getComm ().is_null ()) {
4098  using Teuchos::REDUCE_MIN;
4099  using Teuchos::reduceAll;
4100  using Teuchos::outArg;
4101 
4102  auto comm = myMap->getComm ();
4103  const size_t A_nrows =
4104  (transA != NO_TRANS) ? A.getNumVectors () : A.getLocalLength ();
4105  const size_t A_ncols =
4106  (transA != NO_TRANS) ? A.getLocalLength () : A.getNumVectors ();
4107  const size_t B_nrows =
4108  (transB != NO_TRANS) ? B.getNumVectors () : B.getLocalLength ();
4109  const size_t B_ncols =
4110  (transB != NO_TRANS) ? B.getLocalLength () : B.getNumVectors ();
4111 
4112  const bool lclBad = this->getLocalLength () != A_nrows ||
4113  this->getNumVectors () != B_ncols || A_ncols != B_nrows;
4114 
4115  const int myRank = comm->getRank ();
4116  std::ostringstream errStrm;
4117  if (this->getLocalLength () != A_nrows) {
4118  errStrm << "Proc " << myRank << ": this->getLocalLength()="
4119  << this->getLocalLength () << " != A_nrows=" << A_nrows
4120  << "." << std::endl;
4121  }
4122  if (this->getNumVectors () != B_ncols) {
4123  errStrm << "Proc " << myRank << ": this->getNumVectors()="
4124  << this->getNumVectors () << " != B_ncols=" << B_ncols
4125  << "." << std::endl;
4126  }
4127  if (A_ncols != B_nrows) {
4128  errStrm << "Proc " << myRank << ": A_ncols="
4129  << A_ncols << " != B_nrows=" << B_nrows
4130  << "." << std::endl;
4131  }
4132 
4133  const int lclGood = lclBad ? 0 : 1;
4134  int gblGood = 0;
4135  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
4136  if (gblGood != 1) {
4137  std::ostringstream os;
4138  ::Tpetra::Details::gathervPrint (os, errStrm.str (), *comm);
4139 
4140  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4141  (true, std::runtime_error, "Inconsistent local dimensions on at "
4142  "least one process in this object's communicator." << std::endl
4143  << "Operation: "
4144  << "C(" << (C_is_local ? "local" : "distr") << ") = "
4145  << alpha << "*A"
4146  << (transA == Teuchos::TRANS ? "^T" :
4147  (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
4148  << "(" << (A_is_local ? "local" : "distr") << ") * "
4149  << beta << "*B"
4150  << (transA == Teuchos::TRANS ? "^T" :
4151  (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
4152  << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
4153  << "Global dimensions: C(" << this->getGlobalLength () << ", "
4154  << this->getNumVectors () << "), A(" << A.getGlobalLength ()
4155  << ", " << A.getNumVectors () << "), B(" << B.getGlobalLength ()
4156  << ", " << B.getNumVectors () << ")." << std::endl
4157  << os.str ());
4158  }
4159  }
4160  }
4161 
4162  // Case 1: C(local) = A^X(local) * B^X(local)
4163  const bool Case1 = C_is_local && A_is_local && B_is_local;
4164  // Case 2: C(local) = A^T(distr) * B (distr)
4165  const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
4166  transA != NO_TRANS &&
4167  transB == NO_TRANS;
4168  // Case 3: C(distr) = A (distr) * B^X(local)
4169  const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
4170  transA == NO_TRANS;
4171 
4172  // Test that we are considering a meaningful case
4173  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4174  (! Case1 && ! Case2 && ! Case3, std::runtime_error,
4175  "Multiplication of op(A) and op(B) into *this is not a "
4176  "supported use case.");
4177 
4178  if (beta != STS::zero () && Case2) {
4179  // If Case2, then C is local and contributions must be summed
4180  // across all processes. However, if beta != 0, then accumulate
4181  // beta*C into the sum. When summing across all processes, we
4182  // only want to accumulate this once, so set beta == 0 on all
4183  // processes except Process 0.
4184  const int myRank = this->getMap ()->getComm ()->getRank ();
4185  if (myRank != 0) {
4186  beta_local = ATS::zero ();
4187  }
4188  }
4189 
4190  // We only know how to do matrix-matrix multiplies if all the
4191  // MultiVectors have constant stride. If not, we have to make
4192  // temporary copies of those MultiVectors (including possibly
4193  // *this) that don't have constant stride.
4194  RCP<MV> C_tmp;
4195  if (! isConstantStride ()) {
4196  C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
4197  } else {
4198  C_tmp = rcp (this, false);
4199  }
4200 
4201  RCP<const MV> A_tmp;
4202  if (! A.isConstantStride ()) {
4203  A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
4204  } else {
4205  A_tmp = rcpFromRef (A);
4206  }
4207 
4208  RCP<const MV> B_tmp;
4209  if (! B.isConstantStride ()) {
4210  B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
4211  } else {
4212  B_tmp = rcpFromRef (B);
4213  }
4214 
4215  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4216  (! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
4217  ! A_tmp->isConstantStride (), std::logic_error,
4218  "Failed to make temporary constant-stride copies of MultiVectors.");
4219 
4220  {
4221  const LO A_lclNumRows = A_tmp->getLocalLength ();
4222  const LO A_numVecs = A_tmp->getNumVectors ();
4223  auto A_lcl = A_tmp->getLocalViewDevice(Access::ReadOnly);
4224  auto A_sub = Kokkos::subview (A_lcl,
4225  std::make_pair (LO (0), A_lclNumRows),
4226  std::make_pair (LO (0), A_numVecs));
4227 
4228 
4229  const LO B_lclNumRows = B_tmp->getLocalLength ();
4230  const LO B_numVecs = B_tmp->getNumVectors ();
4231  auto B_lcl = B_tmp->getLocalViewDevice(Access::ReadOnly);
4232  auto B_sub = Kokkos::subview (B_lcl,
4233  std::make_pair (LO (0), B_lclNumRows),
4234  std::make_pair (LO (0), B_numVecs));
4235 
4236  const LO C_lclNumRows = C_tmp->getLocalLength ();
4237  const LO C_numVecs = C_tmp->getNumVectors ();
4238 
4239  auto C_lcl = C_tmp->getLocalViewDevice(Access::ReadWrite);
4240  auto C_sub = Kokkos::subview (C_lcl,
4241  std::make_pair (LO (0), C_lclNumRows),
4242  std::make_pair (LO (0), C_numVecs));
4243  const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' :
4244  (transA == Teuchos::TRANS ? 'T' : 'C'));
4245  const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' :
4246  (transB == Teuchos::TRANS ? 'T' : 'C'));
4247  const impl_scalar_type alpha_IST (alpha);
4248 
4249  ProfilingRegion regionGemm ("Tpetra::MV::multiply-call-gemm");
4250 
4251  KokkosBlas::gemm (&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
4252  beta_local, C_sub);
4253  }
4254 
4255  if (! isConstantStride ()) {
4256  ::Tpetra::deep_copy (*this, *C_tmp); // Copy the result back into *this.
4257  }
4258 
4259  // Dispose of (possibly) extra copies of A and B.
4260  A_tmp = Teuchos::null;
4261  B_tmp = Teuchos::null;
4262 
4263  // If Case 2 then sum up *this and distribute it to all processes.
4264  if (Case2) {
4265  this->reduce ();
4266  }
4267  }
4268 
4269  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4270  void
4272  elementWiseMultiply (Scalar scalarAB,
4275  Scalar scalarThis)
4276  {
4277  using Kokkos::ALL;
4278  using Kokkos::subview;
4279  const char tfecfFuncName[] = "elementWiseMultiply: ";
4280 
4281  const size_t lclNumRows = this->getLocalLength ();
4282  const size_t numVecs = this->getNumVectors ();
4283 
4284  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4285  (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
4286  std::runtime_error, "MultiVectors do not have the same local length.");
4287  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4288  numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
4289  "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
4290  << ".");
4291 
4292  auto this_view = this->getLocalViewDevice(Access::ReadWrite);
4293  auto A_view = A.getLocalViewDevice(Access::ReadOnly);
4294  auto B_view = B.getLocalViewDevice(Access::ReadOnly);
4295 
4296  if (isConstantStride () && B.isConstantStride ()) {
4297  // A is just a Vector; it only has one column, so it always has
4298  // constant stride.
4299  //
4300  // If both *this and B have constant stride, we can do an
4301  // element-wise multiply on all columns at once.
4302  KokkosBlas::mult (scalarThis,
4303  this_view,
4304  scalarAB,
4305  subview (A_view, ALL (), 0),
4306  B_view);
4307  }
4308  else {
4309  for (size_t j = 0; j < numVecs; ++j) {
4310  const size_t C_col = isConstantStride () ? j : whichVectors_[j];
4311  const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
4312  KokkosBlas::mult (scalarThis,
4313  subview (this_view, ALL (), C_col),
4314  scalarAB,
4315  subview (A_view, ALL (), 0),
4316  subview (B_view, ALL (), B_col));
4317  }
4318  }
4319  }
4320 
4321  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4322  void
4325  {
4327  using ::Tpetra::Details::ProfilingRegion;
4328  ProfilingRegion region ("Tpetra::MV::reduce");
4329 
4330  const auto map = this->getMap ();
4331  if (map.get () == nullptr) {
4332  return;
4333  }
4334  const auto comm = map->getComm ();
4335  if (comm.get () == nullptr) {
4336  return;
4337  }
4338 
4339  // Avoid giving device buffers to MPI. Even if MPI handles them
4340  // correctly, doing so may not perform well.
4341  const bool changed_on_device = this->need_sync_host ();
4342  if (changed_on_device) {
4343  // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4344  // and host will be separate allocations. In that case, it may
4345  // pay to do the all-reduce from device to host.
4346  Kokkos::fence("MultiVector::reduce"); // for UVM getLocalViewDevice is UVM which can be read as host by allReduceView, so we must not read until device is fenced
4347  auto X_lcl = this->getLocalViewDevice(Access::ReadWrite);
4348  allReduceView (X_lcl, X_lcl, *comm);
4349  }
4350  else {
4351  auto X_lcl = this->getLocalViewHost(Access::ReadWrite);
4352  allReduceView (X_lcl, X_lcl, *comm);
4353  }
4354  }
4355 
4356  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4357  void
4359  replaceLocalValue (const LocalOrdinal lclRow,
4360  const size_t col,
4361  const impl_scalar_type& ScalarValue)
4362  {
4364  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4365  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4366  TEUCHOS_TEST_FOR_EXCEPTION(
4367  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4368  std::runtime_error,
4369  "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4370  << " is invalid. The range of valid row indices on this process "
4371  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4372  << ", " << maxLocalIndex << "].");
4373  TEUCHOS_TEST_FOR_EXCEPTION(
4374  vectorIndexOutOfRange(col),
4375  std::runtime_error,
4376  "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4377  << " of the multivector is invalid.");
4378  }
4379 
4380  auto view = this->getLocalViewHost(Access::ReadWrite);
4381  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4382  view (lclRow, colInd) = ScalarValue;
4383  }
4384 
4385 
4386  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4387  void
4389  sumIntoLocalValue (const LocalOrdinal lclRow,
4390  const size_t col,
4391  const impl_scalar_type& value,
4392  const bool atomic)
4393  {
4395  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4396  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4397  TEUCHOS_TEST_FOR_EXCEPTION(
4398  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4399  std::runtime_error,
4400  "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4401  << " is invalid. The range of valid row indices on this process "
4402  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4403  << ", " << maxLocalIndex << "].");
4404  TEUCHOS_TEST_FOR_EXCEPTION(
4405  vectorIndexOutOfRange(col),
4406  std::runtime_error,
4407  "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4408  << " of the multivector is invalid.");
4409  }
4410 
4411  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4412 
4413  auto view = this->getLocalViewHost(Access::ReadWrite);
4414  if (atomic) {
4415  Kokkos::atomic_add (& (view(lclRow, colInd)), value);
4416  }
4417  else {
4418  view(lclRow, colInd) += value;
4419  }
4420  }
4421 
4422 
4423  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4424  void
4426  replaceGlobalValue (const GlobalOrdinal gblRow,
4427  const size_t col,
4428  const impl_scalar_type& ScalarValue)
4429  {
4430  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4431  // touches the RCP's reference count, which isn't thread safe.
4432  const LocalOrdinal lclRow = this->map_->getLocalElement (gblRow);
4433 
4435  const char tfecfFuncName[] = "replaceGlobalValue: ";
4436  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4437  (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4438  std::runtime_error,
4439  "Global row index " << gblRow << " is not present on this process "
4440  << this->getMap ()->getComm ()->getRank () << ".");
4441  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4442  (this->vectorIndexOutOfRange (col), std::runtime_error,
4443  "Vector index " << col << " of the MultiVector is invalid.");
4444  }
4445 
4446  this->replaceLocalValue (lclRow, col, ScalarValue);
4447  }
4448 
4449  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4450  void
4452  sumIntoGlobalValue (const GlobalOrdinal globalRow,
4453  const size_t col,
4454  const impl_scalar_type& value,
4455  const bool atomic)
4456  {
4457  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4458  // touches the RCP's reference count, which isn't thread safe.
4459  const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4460 
4462  TEUCHOS_TEST_FOR_EXCEPTION(
4463  lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4464  std::runtime_error,
4465  "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4466  << " is not present on this process "
4467  << this->getMap ()->getComm ()->getRank () << ".");
4468  TEUCHOS_TEST_FOR_EXCEPTION(
4469  vectorIndexOutOfRange(col),
4470  std::runtime_error,
4471  "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4472  << " of the multivector is invalid.");
4473  }
4474 
4475  this->sumIntoLocalValue (lclRow, col, value, atomic);
4476  }
4477 
4478  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4479  template <class T>
4480  Teuchos::ArrayRCP<T>
4482  getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
4483  size_t j) const
4484  {
4485  typedef Kokkos::DualView<impl_scalar_type*,
4486  typename dual_view_type::array_layout,
4487  execution_space> col_dual_view_type;
4488  const size_t col = isConstantStride () ? j : whichVectors_[j];
4489  col_dual_view_type X_col =
4490  Kokkos::subview (view_, Kokkos::ALL (), col);
4491  return Kokkos::Compat::persistingView (X_col.d_view);
4492  }
4493 
4494  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4495  bool
4498  return view_.need_sync_host ();
4499  }
4500 
4501  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4502  bool
4505  return view_.need_sync_device ();
4506  }
4507 
4508  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4509  std::string
4511  descriptionImpl (const std::string& className) const
4512  {
4513  using Teuchos::TypeNameTraits;
4514 
4515  std::ostringstream out;
4516  out << "\"" << className << "\": {";
4517  out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
4518  << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
4519  << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
4520  << ", Node" << Node::name ()
4521  << "}, ";
4522  if (this->getObjectLabel () != "") {
4523  out << "Label: \"" << this->getObjectLabel () << "\", ";
4524  }
4525  out << ", numRows: " << this->getGlobalLength ();
4526  if (className != "Tpetra::Vector") {
4527  out << ", numCols: " << this->getNumVectors ()
4528  << ", isConstantStride: " << this->isConstantStride ();
4529  }
4530  if (this->isConstantStride ()) {
4531  out << ", columnStride: " << this->getStride ();
4532  }
4533  out << "}";
4534 
4535  return out.str ();
4536  }
4537 
4538  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4539  std::string
4542  {
4543  return this->descriptionImpl ("Tpetra::MultiVector");
4544  }
4545 
4546  namespace Details
4547  {
4548  template<typename ViewType>
4549  void print_vector(Teuchos::FancyOStream & out, const char* prefix, const ViewType& v)
4550  {
4551  using std::endl;
4552  static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename ViewType::memory_space>::accessible,
4553  "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a host-accessible view.");
4554  static_assert(ViewType::rank == 2,
4555  "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a rank-2 view.");
4556  // The square braces [] and their contents are in Matlab
4557  // format, so users may copy and paste directly into Matlab.
4558  out << "Values("<<prefix<<"): " << std::endl
4559  << "[";
4560  const size_t numRows = v.extent(0);
4561  const size_t numCols = v.extent(1);
4562  if (numCols == 1) {
4563  for (size_t i = 0; i < numRows; ++i) {
4564  out << v(i,0);
4565  if (i + 1 < numRows) {
4566  out << "; ";
4567  }
4568  }
4569  }
4570  else {
4571  for (size_t i = 0; i < numRows; ++i) {
4572  for (size_t j = 0; j < numCols; ++j) {
4573  out << v(i,j);
4574  if (j + 1 < numCols) {
4575  out << ", ";
4576  }
4577  }
4578  if (i + 1 < numRows) {
4579  out << "; ";
4580  }
4581  }
4582  }
4583  out << "]" << endl;
4584  }
4585  }
4586 
4587  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4588  std::string
4590  localDescribeToString (const Teuchos::EVerbosityLevel vl) const
4591  {
4592  typedef LocalOrdinal LO;
4593  using std::endl;
4594 
4595  if (vl <= Teuchos::VERB_LOW) {
4596  return std::string ();
4597  }
4598  auto map = this->getMap ();
4599  if (map.is_null ()) {
4600  return std::string ();
4601  }
4602  auto outStringP = Teuchos::rcp (new std::ostringstream ());
4603  auto outp = Teuchos::getFancyOStream (outStringP);
4604  Teuchos::FancyOStream& out = *outp;
4605  auto comm = map->getComm ();
4606  const int myRank = comm->getRank ();
4607  const int numProcs = comm->getSize ();
4608 
4609  out << "Process " << myRank << " of " << numProcs << ":" << endl;
4610  Teuchos::OSTab tab1 (out);
4611 
4612  // VERB_MEDIUM and higher prints getLocalLength()
4613  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
4614  out << "Local number of rows: " << lclNumRows << endl;
4615 
4616  if (vl > Teuchos::VERB_MEDIUM) {
4617  // VERB_HIGH and higher prints isConstantStride() and getStride().
4618  // The first is only relevant if the Vector has multiple columns.
4619  if (this->getNumVectors () != static_cast<size_t> (1)) {
4620  out << "isConstantStride: " << this->isConstantStride () << endl;
4621  }
4622  if (this->isConstantStride ()) {
4623  out << "Column stride: " << this->getStride () << endl;
4624  }
4625 
4626  if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4627  // VERB_EXTREME prints values. If we're not doing univied memory, we'll
4629  // so we can't use our regular accessor functins
4630 
4631  // NOTE: This is an occasion where we do *not* want the auto-sync stuff
4632  // to trigger (since this function is conceptually const). Thus, we
4633  // get *copies* of the view's data instead.
4634  auto X_dev = view_.getDeviceCopy();
4635  auto X_host = view_.getHostCopy();
4636 
4637  if(X_dev.data() == X_host.data()) {
4638  // One single allocation
4639  Details::print_vector(out,"unified",X_host);
4640  }
4641  else {
4642  Details::print_vector(out,"host",X_host);
4643  Details::print_vector(out,"dev",X_dev);
4644  }
4645  }
4646  }
4647  out.flush (); // make sure the ostringstream got everything
4648  return outStringP->str ();
4649  }
4650 
4651  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4652  void
4654  describeImpl (Teuchos::FancyOStream& out,
4655  const std::string& className,
4656  const Teuchos::EVerbosityLevel verbLevel) const
4657  {
4658  using Teuchos::TypeNameTraits;
4659  using Teuchos::VERB_DEFAULT;
4660  using Teuchos::VERB_NONE;
4661  using Teuchos::VERB_LOW;
4662  using std::endl;
4663  const Teuchos::EVerbosityLevel vl =
4664  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4665 
4666  if (vl == VERB_NONE) {
4667  return; // don't print anything
4668  }
4669  // If this Vector's Comm is null, then the Vector does not
4670  // participate in collective operations with the other processes.
4671  // In that case, it is not even legal to call this method. The
4672  // reasonable thing to do in that case is nothing.
4673  auto map = this->getMap ();
4674  if (map.is_null ()) {
4675  return;
4676  }
4677  auto comm = map->getComm ();
4678  if (comm.is_null ()) {
4679  return;
4680  }
4681 
4682  const int myRank = comm->getRank ();
4683 
4684  // Only Process 0 should touch the output stream, but this method
4685  // in general may need to do communication. Thus, we may need to
4686  // preserve the current tab level across multiple "if (myRank ==
4687  // 0) { ... }" inner scopes. This is why we sometimes create
4688  // OSTab instances by pointer, instead of by value. We only need
4689  // to create them by pointer if the tab level must persist through
4690  // multiple inner scopes.
4691  Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4692 
4693  // VERB_LOW and higher prints the equivalent of description().
4694  if (myRank == 0) {
4695  tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4696  out << "\"" << className << "\":" << endl;
4697  tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4698  {
4699  out << "Template parameters:" << endl;
4700  Teuchos::OSTab tab2 (out);
4701  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4702  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4703  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4704  << "Node: " << Node::name () << endl;
4705  }
4706  if (this->getObjectLabel () != "") {
4707  out << "Label: \"" << this->getObjectLabel () << "\", ";
4708  }
4709  out << "Global number of rows: " << this->getGlobalLength () << endl;
4710  if (className != "Tpetra::Vector") {
4711  out << "Number of columns: " << this->getNumVectors () << endl;
4712  }
4713  // getStride() may differ on different processes, so it (and
4714  // isConstantStride()) properly belong to per-process data.
4715  }
4716 
4717  // This is collective over the Map's communicator.
4718  if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4719  const std::string lclStr = this->localDescribeToString (vl);
4720  ::Tpetra::Details::gathervPrint (out, lclStr, *comm);
4721  }
4722  }
4723 
4724  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4725  void
4727  describe (Teuchos::FancyOStream &out,
4728  const Teuchos::EVerbosityLevel verbLevel) const
4729  {
4730  this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4731  }
4732 
4733  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4734  void
4736  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4737  {
4738  replaceMap (newMap);
4739  }
4740 
4741  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4742  void
4745  {
4746  using ::Tpetra::Details::localDeepCopy;
4747  const char prefix[] = "Tpetra::MultiVector::assign: ";
4748 
4749  TEUCHOS_TEST_FOR_EXCEPTION
4750  (this->getGlobalLength () != src.getGlobalLength () ||
4751  this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
4752  prefix << "Global dimensions of the two Tpetra::MultiVector "
4753  "objects do not match. src has dimensions [" << src.getGlobalLength ()
4754  << "," << src.getNumVectors () << "], and *this has dimensions ["
4755  << this->getGlobalLength () << "," << this->getNumVectors () << "].");
4756  // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4757  TEUCHOS_TEST_FOR_EXCEPTION
4758  (this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
4759  prefix << "The local row counts of the two Tpetra::MultiVector "
4760  "objects do not match. src has " << src.getLocalLength () << " row(s) "
4761  << " and *this has " << this->getLocalLength () << " row(s).");
4762 
4763 
4764  // See #1510. We're writing to *this, so we don't care about its
4765  // contents in either memory space, and we don't want
4766  // DualView::modify to complain about "concurrent modification" of
4767  // host and device Views.
4768 
4772 
4773  // If need sync to device, then host has most recent version.
4774  const bool src_last_updated_on_host = src.need_sync_device ();
4775 
4776  if (src_last_updated_on_host) {
4777  localDeepCopy (this->getLocalViewHost(Access::ReadWrite),
4778  src.getLocalViewHost(Access::ReadOnly),
4779  this->isConstantStride (),
4780  src.isConstantStride (),
4781  this->whichVectors_ (),
4782  src.whichVectors_ ());
4783  }
4784  else {
4785  localDeepCopy (this->getLocalViewDevice(Access::ReadWrite),
4786  src.getLocalViewDevice(Access::ReadOnly),
4787  this->isConstantStride (),
4788  src.isConstantStride (),
4789  this->whichVectors_ (),
4790  src.whichVectors_ ());
4791  }
4792  }
4793 
4794  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4795  template<class T>
4796  Teuchos::RCP<MultiVector<T, LocalOrdinal, GlobalOrdinal, Node> >
4798  convert () const
4799  {
4800  using Teuchos::RCP;
4801 
4802  auto newMV = Teuchos::rcp(new MultiVector<T,LocalOrdinal,GlobalOrdinal,Node>(this->getMap(),
4803  this->getNumVectors(),
4804  /*zeroOut=*/false));
4805  Tpetra::deep_copy(*newMV, *this);
4806  return newMV;
4807  }
4808 
4809  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4810  bool
4813  {
4814  using ::Tpetra::Details::PackTraits;
4815 
4816  const size_t l1 = this->getLocalLength();
4817  const size_t l2 = vec.getLocalLength();
4818  if ((l1!=l2) || (this->getNumVectors() != vec.getNumVectors())) {
4819  return false;
4820  }
4821 
4822  return true;
4823  }
4824 
4825  template <class ST, class LO, class GO, class NT>
4828  std::swap(mv.map_, this->map_);
4829  std::swap(mv.view_, this->view_);
4830  std::swap(mv.whichVectors_, this->whichVectors_);
4831  }
4832 
4833 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4834  template <class ST, class LO, class GO, class NT>
4835  void
4837  const Teuchos::SerialDenseMatrix<int, ST>& src)
4838  {
4839  using ::Tpetra::Details::localDeepCopy;
4840  using MV = MultiVector<ST, LO, GO, NT>;
4841  using IST = typename MV::impl_scalar_type;
4842  using input_view_type =
4843  Kokkos::View<const IST**, Kokkos::LayoutLeft,
4844  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4845  using pair_type = std::pair<LO, LO>;
4846 
4847  const LO numRows = static_cast<LO> (src.numRows ());
4848  const LO numCols = static_cast<LO> (src.numCols ());
4849 
4850  TEUCHOS_TEST_FOR_EXCEPTION
4851  (numRows != static_cast<LO> (dst.getLocalLength ()) ||
4852  numCols != static_cast<LO> (dst.getNumVectors ()),
4853  std::invalid_argument, "Tpetra::deep_copy: On Process "
4854  << dst.getMap ()->getComm ()->getRank () << ", dst is "
4855  << dst.getLocalLength () << " x " << dst.getNumVectors ()
4856  << ", but src is " << numRows << " x " << numCols << ".");
4857 
4858  const IST* src_raw = reinterpret_cast<const IST*> (src.values ());
4859  input_view_type src_orig (src_raw, src.stride (), numCols);
4860  input_view_type src_view =
4861  Kokkos::subview (src_orig, pair_type (0, numRows), Kokkos::ALL ());
4862 
4863  constexpr bool src_isConstantStride = true;
4864  Teuchos::ArrayView<const size_t> srcWhichVectors (nullptr, 0);
4865  localDeepCopy (dst.getLocalViewHost(Access::ReadWrite),
4866  src_view,
4867  dst.isConstantStride (),
4868  src_isConstantStride,
4869  getMultiVectorWhichVectors (dst),
4870  srcWhichVectors);
4871  }
4872 
4873  template <class ST, class LO, class GO, class NT>
4874  void
4875  deep_copy (Teuchos::SerialDenseMatrix<int, ST>& dst,
4876  const MultiVector<ST, LO, GO, NT>& src)
4877  {
4878  using ::Tpetra::Details::localDeepCopy;
4879  using MV = MultiVector<ST, LO, GO, NT>;
4880  using IST = typename MV::impl_scalar_type;
4881  using output_view_type =
4882  Kokkos::View<IST**, Kokkos::LayoutLeft,
4883  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4884  using pair_type = std::pair<LO, LO>;
4885 
4886  const LO numRows = static_cast<LO> (dst.numRows ());
4887  const LO numCols = static_cast<LO> (dst.numCols ());
4888 
4889  TEUCHOS_TEST_FOR_EXCEPTION
4890  (numRows != static_cast<LO> (src.getLocalLength ()) ||
4891  numCols != static_cast<LO> (src.getNumVectors ()),
4892  std::invalid_argument, "Tpetra::deep_copy: On Process "
4893  << src.getMap ()->getComm ()->getRank () << ", src is "
4894  << src.getLocalLength () << " x " << src.getNumVectors ()
4895  << ", but dst is " << numRows << " x " << numCols << ".");
4896 
4897  IST* dst_raw = reinterpret_cast<IST*> (dst.values ());
4898  output_view_type dst_orig (dst_raw, dst.stride (), numCols);
4899  auto dst_view =
4900  Kokkos::subview (dst_orig, pair_type (0, numRows), Kokkos::ALL ());
4901 
4902  constexpr bool dst_isConstantStride = true;
4903  Teuchos::ArrayView<const size_t> dstWhichVectors (nullptr, 0);
4904 
4905  // Prefer the host version of src's data.
4906  localDeepCopy (dst_view,
4907  src.getLocalViewHost(Access::ReadOnly),
4908  dst_isConstantStride,
4909  src.isConstantStride (),
4910  dstWhichVectors,
4911  getMultiVectorWhichVectors (src));
4912  }
4913 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4914 
4915  template <class Scalar, class LO, class GO, class NT>
4916  Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4917  createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4918  size_t numVectors)
4919  {
4921  return Teuchos::rcp (new MV (map, numVectors));
4922  }
4923 
4924  template <class ST, class LO, class GO, class NT>
4927  {
4928  typedef MultiVector<ST, LO, GO, NT> MV;
4929  MV cpy (src.getMap (), src.getNumVectors (), false);
4930  cpy.assign (src);
4931  return cpy;
4932  }
4933 
4934 } // namespace Tpetra
4935 
4936 //
4937 // Explicit instantiation macro
4938 //
4939 // Must be expanded from within the Tpetra namespace!
4940 //
4941 
4942 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4943 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4944  template class MultiVector< SCALAR , LO , GO , NODE >; \
4945  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
4946  template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4947  template void deep_copy (MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4948  template void deep_copy (Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4949 
4950 #else
4951 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4952  template class MultiVector< SCALAR , LO , GO , NODE >; \
4953  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src);
4954 
4955 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4956 
4957 
4958 #define TPETRA_MULTIVECTOR_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
4959  \
4960  template Teuchos::RCP< MultiVector< SO , LO , GO , NODE > > \
4961  MultiVector< SI , LO , GO , NODE >::convert< SO > () const;
4962 
4963 
4964 #endif // TPETRA_MULTIVECTOR_DEF_HPP
typename map_type::local_ordinal_type local_ordinal_type
The type of local indices that this class uses.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
typename device_type::execution_space execution_space
Type of the (new) Kokkos execution space.
Kokkos::DualView< size_t *, buffer_device_type > numExportPacketsPerLID_
Number of packets to send for each send operation.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetView(const Teuchos::RCP< const map_type > &subMap, const size_t offset) const
Return a const view of a subset of rows.
Declaration and definition of Tpetra::Details::Blas::fill, an implementation detail of Tpetra::MultiV...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
size_t getNumVectors() const
Number of columns in the multivector.
size_t getLocalLength() const
Local number of rows on the calling process.
Declaration of a function that prints strings from each process.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
One or more distributed dense vectors.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t multivectorKernelLocationThreshold()
the threshold for transitioning from device to host
bool isDistributed() const
Whether this is a globally distributed object.
void removeEmptyProcessesInPlace(Teuchos::RCP< DistObjectType > &input, const Teuchos::RCP< const Map< typename DistObjectType::local_ordinal_type, typename DistObjectType::global_ordinal_type, typename DistObjectType::node_type > > &newMap)
Remove processes which contain no elements in this object&#39;s Map.
static bool debug()
Whether Tpetra is in debug mode.
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.
Declaration of functions for checking whether a given pointer is accessible from a given Kokkos execu...
int local_ordinal_type
Default value of Scalar template parameter.
bool need_sync_device() const
Whether this MultiVector needs synchronization to the device.
typename Kokkos::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
size_t global_size_t
Global size_t object.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector&#39;s local data on host. This requires that ther...
Kokkos::DualView< size_t *, buffer_device_type > numImportPacketsPerLID_
Number of packets to receive for each receive operation.
Insert new values that don&#39;t currently exist.
Kokkos::DualView< packet_type *, buffer_device_type > imports_
Buffer into which packed data are imported (received from other processes).
Kokkos::DualView< packet_type *, buffer_device_type > exports_
Buffer from which packed data are exported (sent to other processes).
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
wrapped_dual_view_type view_
The Kokkos::DualView containing the MultiVector&#39;s data.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &src)
Copy the contents of src into *this (deep copy).
static void allReduceView(const OutputViewType &output, const InputViewType &input, const Teuchos::Comm< int > &comm)
All-reduce from input Kokkos::View to output Kokkos::View.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector &quot;constructor&quot;: Create a MultiVector from a given Map.
Functions for initializing and finalizing Tpetra.
static bool verbose()
Whether Tpetra is in verbose mode.
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
Teuchos::RCP< const map_type > map_
The Map over which this object is distributed.
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
Declaration of Tpetra::Details::isInterComm.
bool need_sync_host() const
Whether this MultiVector needs synchronization to the host.
Replace old value with maximum of magnitudes of old and new values.
Abstract base class for objects that can be the source of an Import or Export operation.
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, device_type > dual_view_type
Kokkos::DualView specialization used by this class.
Replace existing values with new values.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector&#39;s local data on device. This requires that th...
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
Teuchos::RCP< const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVector(const size_t j) const
Return a Vector which is a const view of column j.
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
void start()
Start the deep_copy counter.
A parallel distribution of indices over processes.
A distributed dense vector.
Stand-alone utility functions and macros.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Declaration and definition of Tpetra::Details::normImpl, which is an implementation detail of Tpetra:...
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
Base class for distributed Tpetra objects that support data redistribution.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
EWhichNorm
Input argument for normImpl() (which see).
Accumulate new values into existing values (may not be supported in all classes)
size_t getOrigNumLocalRows() const
&quot;Original&quot; number of rows in the (local) data.
All-reduce a 1-D or 2-D Kokkos::View.
Declaration and definition of Tpetra::Details::lclDot, an implementation detail of Tpetra::MultiVecto...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.