Tpetra_MultiVector_def.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 // clang-format off
11 #ifndef TPETRA_MULTIVECTOR_DEF_HPP
12 #define TPETRA_MULTIVECTOR_DEF_HPP
13 
21 
22 #include "Tpetra_Core.hpp"
23 #include "Tpetra_Util.hpp"
24 #include "Tpetra_Vector.hpp"
28 #include "Tpetra_Details_fill.hpp"
36 #include "Tpetra_Details_Random.hpp"
37 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
38 # include "Teuchos_SerialDenseMatrix.hpp"
39 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
40 #include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
41 #include "KokkosCompat_View.hpp"
42 #include "KokkosBlas.hpp"
43 #include "KokkosKernels_Utils.hpp"
44 #include "Kokkos_Random.hpp"
45 #include "Kokkos_ArithTraits.hpp"
46 #include <memory>
47 #include <sstream>
48 
49 #ifdef HAVE_TPETRA_INST_FLOAT128
50 namespace Kokkos {
51  // FIXME (mfh 04 Sep 2015) Just a stub for now!
52  template<class Generator>
53  struct rand<Generator, __float128> {
54  static KOKKOS_INLINE_FUNCTION __float128 max ()
55  {
56  return static_cast<__float128> (1.0);
57  }
58  static KOKKOS_INLINE_FUNCTION __float128
59  draw (Generator& gen)
60  {
61  // Half the smallest normalized double is the scaling factor of
62  // the lower-order term in the double-double representation.
63  const __float128 scalingFactor =
64  static_cast<__float128> (std::numeric_limits<double>::min ()) /
65  static_cast<__float128> (2.0);
66  const __float128 higherOrderTerm = static_cast<__float128> (gen.drand ());
67  const __float128 lowerOrderTerm =
68  static_cast<__float128> (gen.drand ()) * scalingFactor;
69  return higherOrderTerm + lowerOrderTerm;
70  }
71  static KOKKOS_INLINE_FUNCTION __float128
72  draw (Generator& gen, const __float128& range)
73  {
74  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
75  const __float128 scalingFactor =
76  static_cast<__float128> (std::numeric_limits<double>::min ()) /
77  static_cast<__float128> (2.0);
78  const __float128 higherOrderTerm =
79  static_cast<__float128> (gen.drand (range));
80  const __float128 lowerOrderTerm =
81  static_cast<__float128> (gen.drand (range)) * scalingFactor;
82  return higherOrderTerm + lowerOrderTerm;
83  }
84  static KOKKOS_INLINE_FUNCTION __float128
85  draw (Generator& gen, const __float128& start, const __float128& end)
86  {
87  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
88  const __float128 scalingFactor =
89  static_cast<__float128> (std::numeric_limits<double>::min ()) /
90  static_cast<__float128> (2.0);
91  const __float128 higherOrderTerm =
92  static_cast<__float128> (gen.drand (start, end));
93  const __float128 lowerOrderTerm =
94  static_cast<__float128> (gen.drand (start, end)) * scalingFactor;
95  return higherOrderTerm + lowerOrderTerm;
96  }
97  };
98 } // namespace Kokkos
99 #endif // HAVE_TPETRA_INST_FLOAT128
100 
101 namespace { // (anonymous)
102 
117  template<class ST, class LO, class GO, class NT>
118  typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type
119  allocDualView (const size_t lclNumRows,
120  const size_t numCols,
121  const bool zeroOut = true,
122  const bool allowPadding = false)
123  {
124  using ::Tpetra::Details::Behavior;
125  using Kokkos::AllowPadding;
126  using Kokkos::view_alloc;
127  using Kokkos::WithoutInitializing;
128  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
129  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type wrapped_dual_view_type;
130  typedef typename dual_view_type::t_dev dev_view_type;
131 
132  // This needs to be a string and not a char*, if given as an
133  // argument to Kokkos::view_alloc. This is because view_alloc
134  // also allows a raw pointer as its first argument. See
135  // https://github.com/kokkos/kokkos/issues/434.
136  const std::string label ("MV::DualView");
137  const bool debug = Behavior::debug ();
138 
139  // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
140  // creation of the DualView's Views works around
141  // Kokkos::DualView's current inability to accept an
142  // AllocationProperties initial argument (as Kokkos::View does).
143  // However, the work-around is harmless, since it does what the
144  // (currently nonexistent) equivalent DualView constructor would
145  // have done anyway.
146 
147  dev_view_type d_view;
148  if (zeroOut) {
149  if (allowPadding) {
150  d_view = dev_view_type (view_alloc (label, AllowPadding),
151  lclNumRows, numCols);
152  }
153  else {
154  d_view = dev_view_type (view_alloc (label),
155  lclNumRows, numCols);
156  }
157  }
158  else {
159  if (allowPadding) {
160  d_view = dev_view_type (view_alloc (label,
161  WithoutInitializing,
162  AllowPadding),
163  lclNumRows, numCols);
164  }
165  else {
166  d_view = dev_view_type (view_alloc (label, WithoutInitializing),
167  lclNumRows, numCols);
168  }
169  if (debug) {
170  // Filling with NaN is a cheap and effective way to tell if
171  // downstream code is trying to use a MultiVector's data
172  // without them having been initialized. ArithTraits lets us
173  // call nan() even if the scalar type doesn't define it; it
174  // just returns some undefined value in the latter case. This
175  // won't hurt anything because by setting zeroOut=false, users
176  // already agreed that they don't care about the contents of
177  // the MultiVector.
178  const ST nan = Kokkos::ArithTraits<ST>::nan ();
179  KokkosBlas::fill (d_view, nan);
180  }
181  }
182  if (debug) {
183  TEUCHOS_TEST_FOR_EXCEPTION
184  (static_cast<size_t> (d_view.extent (0)) != lclNumRows ||
185  static_cast<size_t> (d_view.extent (1)) != numCols, std::logic_error,
186  "allocDualView: d_view's actual dimensions do not match "
187  "requested dimensions. d_view is " << d_view.extent (0) << " x " <<
188  d_view.extent (1) << "; requested " << lclNumRows << " x " << numCols
189  << ". Please report this bug to the Tpetra developers.");
190  }
191 
192  return wrapped_dual_view_type(d_view);
193  }
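// Illustrative sketch (comment only, not part of the original file): how the
// helper above is typically invoked from MultiVector's constructors.  The
// template arguments below are placeholders, not Tpetra's actual defaults.
//
//   // 100 local rows, 3 columns, zero-initialized:
//   auto zeroed = allocDualView<double, int, long long, NT> (100, 3, true);
//   // Same shape, entries left uninitialized (debug builds fill with NaN):
//   auto raw    = allocDualView<double, int, long long, NT> (100, 3, false);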
194 
195  // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
196  //
197  // T: The type of the entries of the View.
198  // ExecSpace: The Kokkos execution space.
199  template<class T, class ExecSpace>
200  struct MakeUnmanagedView {
201  // The 'false' part of the branch carefully ensures that this
202  // won't attempt to use a host execution space that hasn't been
203  // initialized. For example, if Kokkos::OpenMP is disabled and
204  // Kokkos::Threads is enabled, the latter is always the default
205  // execution space of Kokkos::HostSpace, even when ExecSpace is
206  // Kokkos::Serial. That's why we go through the trouble of asking
207  // Kokkos::DualView what _its_ space is. That seems to work
208  // around this default execution space issue.
209  //
210  typedef typename std::conditional<
211  Kokkos::SpaceAccessibility<
212  typename ExecSpace::memory_space,
213  Kokkos::HostSpace>::accessible,
214  typename ExecSpace::device_type,
215  typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
216  typedef Kokkos::LayoutLeft array_layout;
217  typedef Kokkos::View<T*, array_layout, host_exec_space,
218  Kokkos::MemoryUnmanaged> view_type;
219 
220  static view_type getView (const Teuchos::ArrayView<T>& x_in)
221  {
222  const size_t numEnt = static_cast<size_t> (x_in.size ());
223  if (numEnt == 0) {
224  return view_type ();
225  } else {
226  return view_type (x_in.getRawPtr (), numEnt);
227  }
228  }
229  };
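// Illustrative sketch (comment only): wrapping caller-owned host data in an
// unmanaged View through the helper above.  The buffer name and execution
// space below are hypothetical.
//
//   std::vector<double> buf (42, 0.0);
//   Teuchos::ArrayView<double> av (buf.data (), buf.size ());
//   auto x_host =
//     MakeUnmanagedView<double, Kokkos::DefaultHostExecutionSpace>::getView (av);
//   // x_host aliases buf; no allocation or copy happens here.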
230 
231 
232  template<class WrappedDualViewType>
233  WrappedDualViewType
234  takeSubview (const WrappedDualViewType& X,
235  const std::pair<size_t, size_t>& rowRng,
236  const Kokkos::ALL_t& colRng)
237 
238  {
239  // The bug we saw below should be harder to trigger here.
240  return WrappedDualViewType(X,rowRng,colRng);
241  }
242 
243  template<class WrappedDualViewType>
244  WrappedDualViewType
245  takeSubview (const WrappedDualViewType& X,
246  const Kokkos::ALL_t& rowRng,
247  const std::pair<size_t, size_t>& colRng)
248  {
249  using DualViewType = typename WrappedDualViewType::DVT;
250  // Look carefully at the comment in the version of this function below.
251  // The comment applies here as well.
252  if (X.extent (0) == 0 && X.extent (1) != 0) {
253  return WrappedDualViewType(DualViewType ("MV::DualView", 0, colRng.second - colRng.first));
254  }
255  else {
256  return WrappedDualViewType(X,rowRng,colRng);
257  }
258  }
259 
260  template<class WrappedDualViewType>
261  WrappedDualViewType
262  takeSubview (const WrappedDualViewType& X,
263  const std::pair<size_t, size_t>& rowRng,
264  const std::pair<size_t, size_t>& colRng)
265  {
266  using DualViewType = typename WrappedDualViewType::DVT;
267  // If you take a subview of a view with zero rows, Kokkos::subview()
268  // always returns a DualView with the same data pointers. This will break
269  // pointer equality testing between two subviews of the same 2D View if
270  // it has zero row extent. While the one (known) case where this was actually used
271  // has been fixed, that sort of check could very easily be reintroduced in the future,
272  // hence I've added this if check here.
273  //
274  // This is not a bug in Kokkos::subview(), just some very subtle behavior which
275  // future developers should be wary of.
276  if (X.extent (0) == 0 && X.extent (1) != 0) {
277  return WrappedDualViewType(DualViewType ("MV::DualView", 0, colRng.second - colRng.first));
278  }
279  else {
280  return WrappedDualViewType(X,rowRng,colRng);
281  }
282  }
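// Worked example (comment only) of the corner case guarded above: if X has
// 0 rows and 5 columns, then without the guard
//
//   auto Y1 = takeSubview (X, std::make_pair (size_t (0), size_t (0)),
//                          std::make_pair (size_t (0), size_t (2)));
//   auto Y2 = takeSubview (X, std::make_pair (size_t (0), size_t (0)),
//                          std::make_pair (size_t (2), size_t (5)));
//
// could yield views that report the same data pointer, defeating pointer-based
// aliasing checks; the guard gives each a fresh (empty) allocation instead.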
283 
284  template<class WrappedOrNotDualViewType>
285  size_t
286  getDualViewStride (const WrappedOrNotDualViewType& dv)
287  {
288  // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
289  // method yet, but its Views do.
290  // NOTE: dv.stride() fills a caller-provided array of length
291  // one more than the view's rank
292  size_t strides[WrappedOrNotDualViewType::t_dev::rank+1];
293  dv.stride(strides);
294  const size_t LDA = strides[1];
295  const size_t numRows = dv.extent (0);
296 
297  if (LDA == 0) {
298  return (numRows == 0) ? size_t (1) : numRows;
299  }
300  else {
301  return LDA;
302  }
303  }
304 
305  template<class ViewType>
306  size_t
307  getViewStride (const ViewType& view)
308  {
309  const size_t LDA = view.stride (1);
310  const size_t numRows = view.extent (0);
311 
312  if (LDA == 0) {
313  return (numRows == 0) ? size_t (1) : numRows;
314  }
315  else {
316  return LDA;
317  }
318  }
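// Worked example (comment only): for a LayoutLeft (column-major) View with
// 100 rows, 4 columns, and no padding, stride(1) == 100, so LDA == 100 and
// entry (i,j) lives at offset i + j*LDA.  If the View has 0 rows, stride(1)
// may be 0; the helpers above then report 1 so that BLAS never sees a zero
// leading dimension.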
319 
320  template <class impl_scalar_type, class buffer_device_type>
321  bool
322  runKernelOnHost (
323  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports
324  )
325  {
326  if (! imports.need_sync_device ()) {
327  return false; // most up-to-date on device
328  }
329  else { // most up-to-date on host,
330  // but if large enough, worth running on device anyway
331  size_t localLengthThreshold =
332  ::Tpetra::Details::Behavior::multivectorKernelLocationThreshold ();
333  return imports.extent(0) <= localLengthThreshold;
334  }
335  }
336 
337 
338  template <class SC, class LO, class GO, class NT>
339  bool
340  runKernelOnHost (const ::Tpetra::MultiVector<SC, LO, GO, NT>& X)
341  {
342  if (! X.need_sync_device ()) {
343  return false; // most up-to-date on device
344  }
345  else { // most up-to-date on host
346  // but if large enough, worth running on device anyway
347  size_t localLengthThreshold =
348  ::Tpetra::Details::Behavior::multivectorKernelLocationThreshold ();
349  return X.getLocalLength () <= localLengthThreshold;
350  }
351  }
352 
353  template <class SC, class LO, class GO, class NT>
354  void
355  multiVectorNormImpl (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
356  const ::Tpetra::MultiVector<SC, LO, GO, NT>& X,
357  const ::Tpetra::Details::EWhichNorm whichNorm)
358  {
359  using namespace Tpetra;
360  using ::Tpetra::Details::normImpl;
361  using MV = MultiVector<SC, LO, GO, NT>;
362  using val_type = typename MV::impl_scalar_type;
363  using mag_type = typename MV::mag_type;
364  using dual_view_type = typename MV::dual_view_type;
365 
366  auto map = X.getMap ();
367  auto comm = map.is_null () ? nullptr : map->getComm ().getRawPtr ();
368  auto whichVecs = getMultiVectorWhichVectors (X);
369  const bool isConstantStride = X.isConstantStride ();
370  const bool isDistributed = X.isDistributed ();
371 
372  const bool runOnHost = runKernelOnHost (X);
373  if (runOnHost) {
374  using view_type = typename dual_view_type::t_host;
375  using array_layout = typename view_type::array_layout;
376  using device_type = typename view_type::device_type;
377 
378  auto X_lcl = X.getLocalViewHost(Access::ReadOnly);
379  normImpl<val_type, array_layout, device_type,
380  mag_type> (norms, X_lcl, whichNorm, whichVecs,
381  isConstantStride, isDistributed, comm);
382  }
383  else {
384  using view_type = typename dual_view_type::t_dev;
385  using array_layout = typename view_type::array_layout;
386  using device_type = typename view_type::device_type;
387 
388  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
389  normImpl<val_type, array_layout, device_type,
390  mag_type> (norms, X_lcl, whichNorm, whichVecs,
391  isConstantStride, isDistributed, comm);
392  }
393  }
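// Illustrative sketch (comment only): the public norm methods reduce to
// multiVectorNormImpl above.  Types are abbreviated and "map" is an assumed
// Teuchos::RCP<const Tpetra::Map<>>.
//
//   Tpetra::MultiVector<> X (map, 3);
//   Teuchos::Array<Tpetra::MultiVector<>::mag_type> norms (3);
//   X.norm2 (norms ());  // one 2-norm per column, reduced over X's communicator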
394 } // namespace (anonymous)
395 
396 
397 namespace Tpetra {
398 
399  namespace Details {
400  template <typename DstView, typename SrcView>
401  struct AddAssignFunctor {
402  // This functor would be better as a lambda, but CUDA cannot compile
403  // lambdas in protected functions. It compiles fine with the functor.
404  AddAssignFunctor(DstView &tgt_, SrcView &src_) : tgt(tgt_), src(src_) {}
405 
406  KOKKOS_INLINE_FUNCTION void
407  operator () (const size_t k) const { tgt(k) += src(k); }
408 
409  DstView tgt;
410  SrcView src;
411  };
412  }
413 
414  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
415  bool
416  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
417  vectorIndexOutOfRange (const size_t VectorIndex) const {
418  return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
419  }
420 
421  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
422  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
423  MultiVector () :
424  base_type (Teuchos::rcp (new map_type ()))
425  {}
426 
427  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
428  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
429  MultiVector (const Teuchos::RCP<const map_type>& map,
430  const size_t numVecs,
431  const bool zeroOut) : /* default is true */
432  base_type (map)
433  {
434  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,numVecs,zeroOut)");
435 
436  const size_t lclNumRows = this->getLocalLength ();
437  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
438  }
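  // Illustrative usage (comment only); "comm" is an assumed
  // Teuchos::RCP<const Teuchos::Comm<int>>.
  //
  //   auto map = Teuchos::rcp (new Tpetra::Map<> (1000, 0, comm));
  //   Tpetra::MultiVector<> X (map, 3);         // 3 columns, zero-initialized
  //   Tpetra::MultiVector<> Y (map, 3, false);  // 3 columns, uninitialized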
439 
440  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
441  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
442  MultiVector (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& source,
443  const Teuchos::DataAccess copyOrView) :
444  base_type (source),
445  view_ (source.view_),
446  whichVectors_ (source.whichVectors_)
447  {
448  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
449  const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
450  "const Teuchos::DataAccess): ";
451 
452  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV 2-arg \"copy\" ctor");
453 
454  if (copyOrView == Teuchos::Copy) {
455  // Reuse the conveniently already existing function that creates
456  // a deep copy.
457  MV cpy = createCopy (source);
458  this->view_ = cpy.view_;
459  this->whichVectors_ = cpy.whichVectors_;
460  }
461  else if (copyOrView == Teuchos::View) {
462  }
463  else {
464  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
465  true, std::invalid_argument, "Second argument 'copyOrView' has an "
466  "invalid value " << copyOrView << ". Valid values include "
467  "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
468  << Teuchos::View << ".");
469  }
470  }
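  // Illustrative usage (comment only):
  //
  //   Tpetra::MultiVector<> A (map, 2);
  //   Tpetra::MultiVector<> B (A, Teuchos::Copy);  // deep copy: B gets its own storage
  //   Tpetra::MultiVector<> C (A, Teuchos::View);  // shallow copy: C aliases A's data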
471 
472  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
473  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
474  MultiVector (const Teuchos::RCP<const map_type>& map,
475  const dual_view_type& view) :
476  base_type (map),
477  view_ (wrapped_dual_view_type(view))
478  {
479  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
480  const size_t lclNumRows_map = map.is_null () ? size_t (0) :
481  map->getLocalNumElements ();
482  const size_t lclNumRows_view = view.extent (0);
483  const size_t LDA = getDualViewStride (view_);
484 
485  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
486  (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
487  std::invalid_argument, "Kokkos::DualView does not match Map. "
488  "map->getLocalNumElements() = " << lclNumRows_map
489  << ", view.extent(0) = " << lclNumRows_view
490  << ", and getStride() = " << LDA << ".");
491 
492  using ::Tpetra::Details::Behavior;
493  const bool debug = Behavior::debug ();
494  if (debug) {
495  using ::Tpetra::Details::checkGlobalDualViewValidity;
496  std::ostringstream gblErrStrm;
497  const bool verbose = Behavior::verbose ();
498  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
499  const bool gblValid =
500  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
501  comm.getRawPtr ());
502  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
503  (! gblValid, std::runtime_error, gblErrStrm.str ());
504  }
505  }
506 
507 
508  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
509  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
510  MultiVector (const Teuchos::RCP<const map_type>& map,
511  const wrapped_dual_view_type& view) :
512  base_type (map),
513  view_ (view)
514  {
515  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
516  const size_t lclNumRows_map = map.is_null () ? size_t (0) :
517  map->getLocalNumElements ();
518  const size_t lclNumRows_view = view.extent (0);
519  const size_t LDA = getDualViewStride (view);
520 
521  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
522  (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
523  std::invalid_argument, "Kokkos::DualView does not match Map. "
524  "map->getLocalNumElements() = " << lclNumRows_map
525  << ", view.extent(0) = " << lclNumRows_view
526  << ", and getStride() = " << LDA << ".");
527 
528  using ::Tpetra::Details::Behavior;
529  const bool debug = Behavior::debug ();
530  if (debug) {
531  using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
532  std::ostringstream gblErrStrm;
533  const bool verbose = Behavior::verbose ();
534  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
535  const bool gblValid =
536  checkGlobalWrappedDualViewValidity (&gblErrStrm, view, verbose,
537  comm.getRawPtr ());
538  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
539  (! gblValid, std::runtime_error, gblErrStrm.str ());
540  }
541  }
542 
543 
544 
545  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
546  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
547  MultiVector (const Teuchos::RCP<const map_type>& map,
548  const typename dual_view_type::t_dev& d_view) :
549  base_type (map)
550  {
551  using Teuchos::ArrayRCP;
552  using Teuchos::RCP;
553  const char tfecfFuncName[] = "MultiVector(map,d_view): ";
554 
555  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,d_view)");
556 
557  const size_t LDA = getViewStride (d_view);
558  const size_t lclNumRows = map->getLocalNumElements ();
559  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
560  (LDA < lclNumRows, std::invalid_argument, "Map does not match "
561  "Kokkos::View. map->getLocalNumElements() = " << lclNumRows
562  << ", View's column stride = " << LDA
563  << ", and View's extent(0) = " << d_view.extent (0) << ".");
564 
565  auto h_view = Kokkos::create_mirror_view (d_view);
566  auto dual_view = dual_view_type (d_view, h_view);
567  view_ = wrapped_dual_view_type(dual_view);
568 
569  using ::Tpetra::Details::Behavior;
570  const bool debug = Behavior::debug ();
571  if (debug) {
572  using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
573  std::ostringstream gblErrStrm;
574  const bool verbose = Behavior::verbose ();
575  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
576  const bool gblValid =
577  checkGlobalWrappedDualViewValidity (&gblErrStrm, view_, verbose,
578  comm.getRawPtr ());
579  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
580  (! gblValid, std::runtime_error, gblErrStrm.str ());
581  }
582  // The user gave us a device view. In order to respect its
583  // initial contents, we mark the DualView as "modified on device."
584  // That way, the next sync will synchronize it with the host view.
585  dual_view.modify_device ();
586  }
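  // Illustrative sketch (comment only), assuming lclNumRows matches
  // map->getLocalNumElements():
  //
  //   using mv_type       = Tpetra::MultiVector<>;
  //   using dev_view_type = mv_type::dual_view_type::t_dev;
  //   dev_view_type d ("user data", lclNumRows, 2);
  //   // ... fill d on device ...
  //   mv_type X (map, d);  // X views d and is marked as modified on device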
587 
588  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
589  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
590  MultiVector (const Teuchos::RCP<const map_type>& map,
591  const dual_view_type& view,
592  const dual_view_type& origView) :
593  base_type (map),
594  view_ (wrapped_dual_view_type(view,origView))
595  {
596  const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
597 
598  const size_t LDA = getDualViewStride (origView);
599  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
600  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
601  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
602  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
603  << ". This may also mean that the input origView's first dimension (number "
604  "of rows = " << origView.extent (0) << ") does not match the number "
605  "of entries in the Map on the calling process.");
606 
607  using ::Tpetra::Details::Behavior;
608  const bool debug = Behavior::debug ();
609  if (debug) {
610  using ::Tpetra::Details::checkGlobalDualViewValidity;
611  std::ostringstream gblErrStrm;
612  const bool verbose = Behavior::verbose ();
613  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
614  const bool gblValid_0 =
615  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
616  comm.getRawPtr ());
617  const bool gblValid_1 =
618  checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
619  comm.getRawPtr ());
620  const bool gblValid = gblValid_0 && gblValid_1;
621  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
622  (! gblValid, std::runtime_error, gblErrStrm.str ());
623  }
624  }
625 
626 
627  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
628  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
629  MultiVector (const Teuchos::RCP<const map_type>& map,
630  const dual_view_type& view,
631  const Teuchos::ArrayView<const size_t>& whichVectors) :
632  base_type (map),
633  view_ (view),
634  whichVectors_ (whichVectors.begin (), whichVectors.end ())
635  {
636  using Kokkos::ALL;
637  using Kokkos::subview;
638  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
639 
640  using ::Tpetra::Details::Behavior;
641  const bool debug = Behavior::debug ();
642  if (debug) {
643  using ::Tpetra::Details::checkGlobalDualViewValidity;
644  std::ostringstream gblErrStrm;
645  const bool verbose = Behavior::verbose ();
646  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
647  const bool gblValid =
648  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
649  comm.getRawPtr ());
650  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
651  (! gblValid, std::runtime_error, gblErrStrm.str ());
652  }
653 
654  const size_t lclNumRows = map.is_null () ? size_t (0) :
655  map->getLocalNumElements ();
656  // Check dimensions of the input DualView. We accept that Kokkos
657  // might not allow construction of a 0 x m (Dual)View with m > 0,
658  // so we only require the number of rows to match if the
659  // (Dual)View has more than zero columns. Likewise, we only
660  // require the number of columns to match if the (Dual)View has
661  // more than zero rows.
662  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
663  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
664  std::invalid_argument, "view.extent(0) = " << view.extent (0)
665  << " < map->getLocalNumElements() = " << lclNumRows << ".");
666  if (whichVectors.size () != 0) {
667  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
668  view.extent (1) == 0,
669  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
670  " = " << whichVectors.size () << " > 0.");
671  size_t maxColInd = 0;
672  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
673  for (size_type k = 0; k < whichVectors.size (); ++k) {
674  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
675  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
676  std::invalid_argument, "whichVectors[" << k << "] = "
677  "Teuchos::OrdinalTraits<size_t>::invalid().");
678  maxColInd = std::max (maxColInd, whichVectors[k]);
679  }
680  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
681  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
682  std::invalid_argument, "view.extent(1) = " << view.extent (1)
683  << " <= max(whichVectors) = " << maxColInd << ".");
684  }
685 
686  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
687  // zero strides, so modify in that case.
688  const size_t LDA = getDualViewStride (view);
689  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
690  (LDA < lclNumRows, std::invalid_argument,
691  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
692 
693  if (whichVectors.size () == 1) {
694  // If whichVectors has only one entry, we don't need to bother
695  // with nonconstant stride. Just shift the view over so it
696  // points to the desired column.
697  //
698  // NOTE (mfh 10 May 2014) This is a special case where we set
699  // origView_ just to view that one column, not all of the
700  // original columns. This ensures that the use of origView_ in
701  // offsetView works correctly.
702  //
703  const std::pair<size_t, size_t> colRng (whichVectors[0],
704  whichVectors[0] + 1);
705  view_ = takeSubview (view_, ALL (), colRng);
706  // whichVectors_.size() == 0 means "constant stride."
707  whichVectors_.clear ();
708  }
709  }
710 
711  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
712  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
713  MultiVector (const Teuchos::RCP<const map_type>& map,
714  const wrapped_dual_view_type& view,
715  const Teuchos::ArrayView<const size_t>& whichVectors) :
716  base_type (map),
717  view_ (view),
718  whichVectors_ (whichVectors.begin (), whichVectors.end ())
719  {
720  using Kokkos::ALL;
721  using Kokkos::subview;
722  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
723 
724  using ::Tpetra::Details::Behavior;
725  const bool debug = Behavior::debug ();
726  if (debug) {
727  using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
728  std::ostringstream gblErrStrm;
729  const bool verbose = Behavior::verbose ();
730  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
731  const bool gblValid =
732  checkGlobalWrappedDualViewValidity (&gblErrStrm, view, verbose,
733  comm.getRawPtr ());
734  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
735  (! gblValid, std::runtime_error, gblErrStrm.str ());
736  }
737 
738  const size_t lclNumRows = map.is_null () ? size_t (0) :
739  map->getLocalNumElements ();
740  // Check dimensions of the input DualView. We accept that Kokkos
741  // might not allow construction of a 0 x m (Dual)View with m > 0,
742  // so we only require the number of rows to match if the
743  // (Dual)View has more than zero columns. Likewise, we only
744  // require the number of columns to match if the (Dual)View has
745  // more than zero rows.
746  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
747  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
748  std::invalid_argument, "view.extent(0) = " << view.extent (0)
749  << " < map->getLocalNumElements() = " << lclNumRows << ".");
750  if (whichVectors.size () != 0) {
751  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
752  view.extent (1) == 0,
753  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
754  " = " << whichVectors.size () << " > 0.");
755  size_t maxColInd = 0;
756  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
757  for (size_type k = 0; k < whichVectors.size (); ++k) {
758  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
759  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
760  std::invalid_argument, "whichVectors[" << k << "] = "
761  "Teuchos::OrdinalTraits<size_t>::invalid().");
762  maxColInd = std::max (maxColInd, whichVectors[k]);
763  }
764  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
765  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
766  std::invalid_argument, "view.extent(1) = " << view.extent (1)
767  << " <= max(whichVectors) = " << maxColInd << ".");
768  }
769 
770  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
771  // zero strides, so modify in that case.
772  const size_t LDA = getDualViewStride (view);
773  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
774  (LDA < lclNumRows, std::invalid_argument,
775  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
776 
777  if (whichVectors.size () == 1) {
778  // If whichVectors has only one entry, we don't need to bother
779  // with nonconstant stride. Just shift the view over so it
780  // points to the desired column.
781  //
782  // NOTE (mfh 10 May 2014) This is a special case where we set
783  // origView_ just to view that one column, not all of the
784  // original columns. This ensures that the use of origView_ in
785  // offsetView works correctly.
786  //
787  const std::pair<size_t, size_t> colRng (whichVectors[0],
788  whichVectors[0] + 1);
789  view_ = takeSubview (view_, ALL (), colRng);
790  // whichVectors_.size() == 0 means "constant stride."
791  whichVectors_.clear ();
792  }
793  }
794 
795 
796  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
797  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
798  MultiVector (const Teuchos::RCP<const map_type>& map,
799  const dual_view_type& view,
800  const dual_view_type& origView,
801  const Teuchos::ArrayView<const size_t>& whichVectors) :
802  base_type (map),
803  view_ (wrapped_dual_view_type(view,origView)),
804  whichVectors_ (whichVectors.begin (), whichVectors.end ())
805  {
806  using Kokkos::ALL;
807  using Kokkos::subview;
808  const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
809 
810  using ::Tpetra::Details::Behavior;
811  const bool debug = Behavior::debug ();
812  if (debug) {
813  using ::Tpetra::Details::checkGlobalDualViewValidity;
814  std::ostringstream gblErrStrm;
815  const bool verbose = Behavior::verbose ();
816  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
817  const bool gblValid_0 =
818  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
819  comm.getRawPtr ());
820  const bool gblValid_1 =
821  checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
822  comm.getRawPtr ());
823  const bool gblValid = gblValid_0 && gblValid_1;
824  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
825  (! gblValid, std::runtime_error, gblErrStrm.str ());
826  }
827 
828  const size_t lclNumRows = this->getLocalLength ();
829  // Check dimensions of the input DualView. We accept that Kokkos
830  // might not allow construction of a 0 x m (Dual)View with m > 0,
831  // so we only require the number of rows to match if the
832  // (Dual)View has more than zero columns. Likewise, we only
833  // require the number of columns to match if the (Dual)View has
834  // more than zero rows.
835  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
836  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
837  std::invalid_argument, "view.extent(0) = " << view.extent (0)
838  << " < map->getLocalNumElements() = " << lclNumRows << ".");
839  if (whichVectors.size () != 0) {
840  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
841  view.extent (1) == 0,
842  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
843  " = " << whichVectors.size () << " > 0.");
844  size_t maxColInd = 0;
845  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
846  for (size_type k = 0; k < whichVectors.size (); ++k) {
847  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
848  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
849  std::invalid_argument, "whichVectors[" << k << "] = "
850  "Teuchos::OrdinalTraits<size_t>::invalid().");
851  maxColInd = std::max (maxColInd, whichVectors[k]);
852  }
853  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
854  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
855  std::invalid_argument, "view.extent(1) = " << view.extent (1)
856  << " <= max(whichVectors) = " << maxColInd << ".");
857  }
858 
859  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
860  // zero strides, so modify in that case.
861  const size_t LDA = getDualViewStride (origView);
862  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
863  (LDA < lclNumRows, std::invalid_argument, "Map and DualView origView "
864  "do not match. LDA = " << LDA << " < this->getLocalLength() = " <<
865  lclNumRows << ". origView.extent(0) = " << origView.extent(0)
866  << ", origView.stride(1) = " << origView.view_device().stride(1) << ".");
867 
868  if (whichVectors.size () == 1) {
869  // If whichVectors has only one entry, we don't need to bother
870  // with nonconstant stride. Just shift the view over so it
871  // points to the desired column.
872  //
873  // NOTE (mfh 10 May 2014) This is a special case where we set
874  // origView_ just to view that one column, not all of the
875  // original columns. This ensures that the use of origView_ in
876  // offsetView works correctly.
877  const std::pair<size_t, size_t> colRng (whichVectors[0],
878  whichVectors[0] + 1);
879  view_ = takeSubview (view_, ALL (), colRng);
880  // origView_ = takeSubview (origView_, ALL (), colRng);
881  // whichVectors_.size() == 0 means "constant stride."
882  whichVectors_.clear ();
883  }
884  }
885 
886  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
887  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
888  MultiVector (const Teuchos::RCP<const map_type>& map,
889  const Teuchos::ArrayView<const Scalar>& data,
890  const size_t LDA,
891  const size_t numVecs) :
892  base_type (map)
893  {
894  typedef LocalOrdinal LO;
895  typedef GlobalOrdinal GO;
896  const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
897  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
898 
899  // Deep copy constructor, constant stride (NO whichVectors_).
900  // There is no need for a deep copy constructor with nonconstant stride.
901 
902  const size_t lclNumRows =
903  map.is_null () ? size_t (0) : map->getLocalNumElements ();
904  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
905  (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
906  "map->getLocalNumElements() = " << lclNumRows << ".");
907  if (numVecs != 0) {
908  const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
909  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
910  (static_cast<size_t> (data.size ()) < minNumEntries,
911  std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
912  "entries, given the input Map and number of vectors in the MultiVector."
913  " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
914  "map->getLocalNumElements () = " << minNumEntries << ".");
915  }
916 
917  this->view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
918  //Note: X_out will be completely overwritten
919  auto X_out = this->getLocalViewDevice(Access::OverwriteAll);
920 
921  // Make an unmanaged host Kokkos::View of the input data. First
922  // create a View (X_in_orig) with the original stride. Then,
923  // create a subview (X_in) with the right number of columns.
924  const impl_scalar_type* const X_in_raw =
925  reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
926  Kokkos::View<const impl_scalar_type**,
927  Kokkos::LayoutLeft,
928  Kokkos::HostSpace,
929  Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
930  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
931  auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
932 
933  // If LDA != X_out's column stride, then we need to copy one
934  // column at a time; Kokkos::deep_copy refuses to work in that
935  // case.
936  const size_t outStride =
937  X_out.extent (1) == 0 ? size_t (1) : X_out.stride (1);
938  if (LDA == outStride) { // strides are the same; deep_copy once
939  // This only works because MultiVector uses LayoutLeft.
940  // We would need a custom copy functor otherwise.
941  // DEEP_COPY REVIEW - HOST-TO-DEVICE
942  Kokkos::deep_copy (execution_space(), X_out, X_in);
943  }
944  else { // strides differ; copy one column at a time
945  typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
946  out_col_view_type;
947  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
948  in_col_view_type;
949  for (size_t j = 0; j < numVecs; ++j) {
950  out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
951  in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
952  // DEEP_COPY REVIEW - HOST-TO-DEVICE
953  Kokkos::deep_copy (execution_space(), X_out_j, X_in_j);
954  }
955  }
956  }
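  // Worked example (comment only): with lclNumRows = 3, LDA = 4, and
  // numVecs = 2, the input must hold at least LDA*(numVecs-1) + lclNumRows
  // = 7 entries, stored column by column:
  //
  //   data = { x00, x10, x20, pad, x01, x11, x21 }
  //
  // The padding entry is skipped when copying into the MultiVector.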
957 
958  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
959  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
960  MultiVector (const Teuchos::RCP<const map_type>& map,
961  const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
962  const size_t numVecs) :
963  base_type (map)
964  {
965  typedef impl_scalar_type IST;
966  typedef LocalOrdinal LO;
967  typedef GlobalOrdinal GO;
968  const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
969  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
970 
971  const size_t lclNumRows =
972  map.is_null () ? size_t (0) : map->getLocalNumElements ();
973  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
974  (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
975  std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
976  "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
977  for (size_t j = 0; j < numVecs; ++j) {
978  Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
979  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
980  static_cast<size_t> (X_j_av.size ()) < lclNumRows,
981  std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
982  << X_j_av.size () << " < map->getLocalNumElements() = " << lclNumRows
983  << ".");
984  }
985 
986  view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
987  auto X_out = this->getLocalViewDevice(Access::ReadWrite);
988 
989  // Make sure that the type of a single input column has the same
990  // array layout as each output column does, so we can deep_copy.
991  using array_layout = typename decltype (X_out)::array_layout;
992  using input_col_view_type = typename Kokkos::View<const IST*,
993  array_layout,
994  Kokkos::HostSpace,
995  Kokkos::MemoryUnmanaged>;
996 
997  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
998  for (size_t j = 0; j < numVecs; ++j) {
999  Teuchos::ArrayView<const IST> X_j_av =
1000  Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
1001  input_col_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
1002  auto X_j_out = Kokkos::subview (X_out, rowRng, j);
1003  // DEEP_COPY REVIEW - HOST-TO-DEVICE
1004  Kokkos::deep_copy (execution_space(), X_j_out, X_j_in);
1005  }
1006  }
1007 
1008  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1009  bool
1010  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::isConstantStride () const {
1011  return whichVectors_.empty ();
1012  }
1013 
1014  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1015  size_t
1016  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1017  getLocalLength () const
1018  {
1019  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
1020  return static_cast<size_t> (0);
1021  } else {
1022  return this->getMap ()->getLocalNumElements ();
1023  }
1024  }
1025 
1026  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1027  global_size_t
1028  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1029  getGlobalLength () const
1030  {
1031  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
1032  return static_cast<size_t> (0);
1033  } else {
1034  return this->getMap ()->getGlobalNumElements ();
1035  }
1036  }
1037 
1038  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1039  size_t
1040  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1041  getStride () const
1042  {
1043  return isConstantStride () ? getDualViewStride (view_) : size_t (0);
1044  }
1045 
1046  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1047  bool
1048  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1049  aliases (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& other) const
1050  {
1051  //Don't actually get a view, just get pointers.
1052  auto thisData = view_.getDualView().view_host().data();
1053  auto otherData = other.view_.getDualView().view_host().data();
1054  size_t thisSpan = view_.getDualView().view_host().span();
1055  size_t otherSpan = other.view_.getDualView().view_host().span();
1056  return (otherData <= thisData && thisData < otherData + otherSpan)
1057  || (thisData <= otherData && otherData < thisData + thisSpan);
1058  }
1059 
1060  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1061  bool
1062  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1063  checkSizes (const SrcDistObject& sourceObj)
1064  {
1065  // Check whether the source object is a MultiVector. If not, then
1066  // we can't even compare sizes, so it's definitely not OK to
1067  // Import or Export from it.
1068  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
1069  const MV* src = dynamic_cast<const MV*> (&sourceObj);
1070  if (src == nullptr) {
1071  return false;
1072  }
1073  else {
1074  // The target of the Import or Export calls checkSizes() in
1075  // DistObject::doTransfer(). By that point, we've already
1076  // constructed an Import or Export object using the two
1077  // multivectors' Maps, which means that (hopefully) we've
1078  // already checked other attributes of the multivectors. Thus,
1079  // all we need to do here is check the number of columns.
1080  return src->getNumVectors () == this->getNumVectors ();
1081  }
1082  }
1083 
1084  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1085  size_t
1086  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1087  constantNumberOfPackets () const {
1088  return this->getNumVectors ();
1089  }
1090 
1091  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1092  void
1093  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1094  copyAndPermute
1095  (const SrcDistObject& sourceObj,
1096  const size_t numSameIDs,
1097  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1098  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1099  const CombineMode CM,
1100  const execution_space &space)
1101  {
1102  using ::Tpetra::Details::Behavior;
1103  using ::Tpetra::Details::getDualViewCopyFromArrayView;
1104  using ::Tpetra::Details::ProfilingRegion;
1105  using std::endl;
1106  using KokkosRefactor::Details::permute_array_multi_column;
1107  using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
1108  using Kokkos::Compat::create_const_view;
1109  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
1110 
1111  // We've already called checkSizes(), so this cast must succeed.
1112  MV& sourceMV = const_cast<MV &>(dynamic_cast<const MV&> (sourceObj));
1113 
1114  bool copyOnHost;
1115  if (importsAreAliased () && (this->constantNumberOfPackets () != 0) && Behavior::enableGranularTransfers()) {
1116  // imports are aliased to the target. We already posted Irecvs
1117  // into imports using a memory space that depends on GPU
1118  // awareness. Therefore we want to modify target in the same
1119  // memory space. See comments in DistObject::beginTransfer.
1120  copyOnHost = ! Behavior::assumeMpiIsGPUAware ();
1121  } else {
1122  // We are free to choose the better memory space for copyAndPermute.
1123  copyOnHost = runKernelOnHost(sourceMV);
1124  }
1125  const char longFuncNameHost[] = "Tpetra::MultiVector::copyAndPermute[Host]";
1126  const char longFuncNameDevice[] = "Tpetra::MultiVector::copyAndPermute[Device]";
1127  const char tfecfFuncName[] = "copyAndPermute: ";
1128  ProfilingRegion regionCAP (copyOnHost ? longFuncNameHost : longFuncNameDevice);
1129 
1130  const bool verbose = Behavior::verbose ();
1131  std::unique_ptr<std::string> prefix;
1132  if (verbose) {
1133  auto map = this->getMap ();
1134  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1135  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1136  std::ostringstream os;
1137  os << "Proc " << myRank << ": MV::copyAndPermute: ";
1138  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1139  os << "Start" << endl;
1140  std::cerr << os.str ();
1141  }
1142 
1143  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1144  (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0),
1145  std::logic_error, "permuteToLIDs.extent(0) = "
1146  << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = "
1147  << permuteFromLIDs.extent (0) << ".");
1148  const size_t numCols = this->getNumVectors ();
1149 
1150  // sourceMV doesn't belong to us, so we can't sync it. Do the
1151  // copying where it's currently sync'd.
1152  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1153  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1154  std::logic_error, "Input MultiVector needs sync to both host "
1155  "and device.");
1156  if (verbose) {
1157  std::ostringstream os;
1158  os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1159  std::cerr << os.str ();
1160  }
1161 
1162  if (verbose) {
1163  std::ostringstream os;
1164  os << *prefix << "Copy first " << numSameIDs << " elements" << endl;
1165  std::cerr << os.str ();
1166  }
1167 
1168  // TODO (mfh 15 Sep 2013) When we replace
1169  // KokkosClassic::MultiVector with a Kokkos::View, there are two
1170  // ways to copy the data:
1171  //
1172  // 1. Get a (sub)view of each column and call deep_copy on that.
1173  // 2. Write a custom kernel to copy the data.
1174  //
1175  // The first is easier, but the second might be more performant in
1176  // case we decide to use layouts other than LayoutLeft. It might
1177  // even make sense to hide whichVectors_ in an entirely new layout
1178  // for Kokkos Views.
1179 
1180  // Copy rows [0, numSameIDs-1] of the local multivectors.
1181  //
1182  // Note (ETP 2 Jul 2014) We need to always copy one column at a
1183  // time, even when both multivectors are constant-stride, since
1184  // deep_copy between strided subviews with more than one column
1185  // doesn't currently work.
1186 
1187  // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1188  // both source and target are constant stride and have multiple
1189  // columns.
1190  if (numSameIDs > 0) {
1191  const std::pair<size_t, size_t> rows (0, numSameIDs);
1192  if (copyOnHost) {
1193  ProfilingRegion regionC ("Tpetra::MultiVector::copy[Host]");
1194  auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1195  auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1196 
1197  for (size_t j = 0; j < numCols; ++j) {
1198  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1199  const size_t srcCol =
1200  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1201 
1202  auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
1203  auto src_j = Kokkos::subview (src_h, rows, srcCol);
1204  if (CM == ADD_ASSIGN) {
1205  // Sum src_j into tgt_j
1206  using range_t =
1207  Kokkos::RangePolicy<execution_space, size_t>;
1208  range_t rp(space, 0,numSameIDs);
1209  Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1210  aaf(tgt_j, src_j);
1211  Kokkos::parallel_for(rp, aaf);
1212  }
1213  else {
1214  // Copy src_j into tgt_j
1215  // DEEP_COPY REVIEW - HOSTMIRROR-TO-HOSTMIRROR
1216  Kokkos::deep_copy (space, tgt_j, src_j);
1217  space.fence();
1218  }
1219  }
1220  }
1221  else { // copy on device
1222  ProfilingRegion regionC ("Tpetra::MultiVector::copy[Device]");
1223  auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1224  auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1225 
1226  for (size_t j = 0; j < numCols; ++j) {
1227  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1228  const size_t srcCol =
1229  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1230 
1231  auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
1232  auto src_j = Kokkos::subview (src_d, rows, srcCol);
1233  if (CM == ADD_ASSIGN) {
1234  // Sum src_j into tgt_j
1235  using range_t =
1236  Kokkos::RangePolicy<execution_space, size_t>;
1237  range_t rp(space, 0,numSameIDs);
1238  Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1239  aaf(tgt_j, src_j);
1240  Kokkos::parallel_for(rp, aaf);
1241  }
1242  else {
1243  // Copy src_j into tgt_j
1244  // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
1245  Kokkos::deep_copy (space, tgt_j, src_j);
1246  space.fence();
1247  }
1248  }
1249  }
1250  }
1251 
1252 
1253  // For the remaining GIDs, execute the permutations. This may
1254  // involve noncontiguous access of both source and destination
1255  // vectors, depending on the LID lists.
1256  //
1257  // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1258  // the same process, this merges their values by replacement of
1259  // the last encountered GID, not by the specified merge rule
1260  // (such as ADD).
1261 
1262  // If there are no permutations, we are done
1263  if (permuteFromLIDs.extent (0) == 0 ||
1264  permuteToLIDs.extent (0) == 0) {
1265  if (verbose) {
1266  std::ostringstream os;
1267  os << *prefix << "No permutations. Done!" << endl;
1268  std::cerr << os.str ();
1269  }
1270  return;
1271  }
1272  ProfilingRegion regionP ("Tpetra::MultiVector::permute");
1273 
1274  if (verbose) {
1275  std::ostringstream os;
1276  os << *prefix << "Permute" << endl;
1277  std::cerr << os.str ();
1278  }
1279 
1280  // We could in theory optimize for the case where exactly one of
1281  // them is constant stride, but we don't currently do that.
1282  const bool nonConstStride =
1283  ! this->isConstantStride () || ! sourceMV.isConstantStride ();
1284 
1285  if (verbose) {
1286  std::ostringstream os;
1287  os << *prefix << "nonConstStride="
1288  << (nonConstStride ? "true" : "false") << endl;
1289  std::cerr << os.str ();
1290  }
1291 
1292  // We only need the "which vectors" arrays if either the source or
1293  // target MV is not constant stride. Since we only have one
1294  // kernel that must do double-duty, we have to create a "which
1295  // vectors" array for the MV that _is_ constant stride.
1296  Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1297  Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1298  if (nonConstStride) {
1299  if (this->whichVectors_.size () == 0) {
1300  Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
1301  tmpTgt.modify_host ();
1302  for (size_t j = 0; j < numCols; ++j) {
1303  tmpTgt.view_host()(j) = j;
1304  }
1305  if (! copyOnHost) {
1306  tmpTgt.sync_device ();
1307  }
1308  tgtWhichVecs = tmpTgt;
1309  }
1310  else {
1311  Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
1312  tgtWhichVecs =
1313  getDualViewCopyFromArrayView<size_t, device_type> (tgtWhichVecsT,
1314  "tgtWhichVecs",
1315  copyOnHost);
1316  }
1317  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1318  (static_cast<size_t> (tgtWhichVecs.extent (0)) !=
1319  this->getNumVectors (),
1320  std::logic_error, "tgtWhichVecs.extent(0) = " <<
1321  tgtWhichVecs.extent (0) << " != this->getNumVectors() = " <<
1322  this->getNumVectors () << ".");
1323 
1324  if (sourceMV.whichVectors_.size () == 0) {
1325  Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
1326  tmpSrc.modify_host ();
1327  for (size_t j = 0; j < numCols; ++j) {
1328  tmpSrc.view_host()(j) = j;
1329  }
1330  if (! copyOnHost) {
1331  tmpSrc.sync_device ();
1332  }
1333  srcWhichVecs = tmpSrc;
1334  }
1335  else {
1336  Teuchos::ArrayView<const size_t> srcWhichVecsT =
1337  sourceMV.whichVectors_ ();
1338  srcWhichVecs =
1339  getDualViewCopyFromArrayView<size_t, device_type> (srcWhichVecsT,
1340  "srcWhichVecs",
1341  copyOnHost);
1342  }
1343  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1344  (static_cast<size_t> (srcWhichVecs.extent (0)) !=
1345  sourceMV.getNumVectors (), std::logic_error,
1346  "srcWhichVecs.extent(0) = " << srcWhichVecs.extent (0)
1347  << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
1348  << ".");
1349  }
1350 
1351  if (copyOnHost) { // permute on host too
1352  if (verbose) {
1353  std::ostringstream os;
1354  os << *prefix << "Get permute LIDs on host" << std::endl;
1355  std::cerr << os.str ();
1356  }
1357  auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1358  auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1359 
1360  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
1361  auto permuteToLIDs_h = create_const_view (permuteToLIDs.view_host ());
1362  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
1363  auto permuteFromLIDs_h =
1364  create_const_view (permuteFromLIDs.view_host ());
1365 
1366  if (verbose) {
1367  std::ostringstream os;
1368  os << *prefix << "Permute on host" << endl;
1369  std::cerr << os.str ();
1370  }
1371  if (nonConstStride) {
1372  // No need to sync first, because copyOnHost argument to
1373  // getDualViewCopyFromArrayView puts them in the right place.
1374  auto tgtWhichVecs_h =
1375  create_const_view (tgtWhichVecs.view_host ());
1376  auto srcWhichVecs_h =
1377  create_const_view (srcWhichVecs.view_host ());
1378  if (CM == ADD_ASSIGN) {
1379  using op_type = KokkosRefactor::Details::AddOp;
1380  permute_array_multi_column_variable_stride (tgt_h, src_h,
1381  permuteToLIDs_h,
1382  permuteFromLIDs_h,
1383  tgtWhichVecs_h,
1384  srcWhichVecs_h, numCols,
1385  op_type());
1386  }
1387  else {
1388  using op_type = KokkosRefactor::Details::InsertOp;
1389  permute_array_multi_column_variable_stride (tgt_h, src_h,
1390  permuteToLIDs_h,
1391  permuteFromLIDs_h,
1392  tgtWhichVecs_h,
1393  srcWhichVecs_h, numCols,
1394  op_type());
1395  }
1396  }
1397  else {
1398  if (CM == ADD_ASSIGN) {
1399  using op_type = KokkosRefactor::Details::AddOp;
1400  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1401  permuteFromLIDs_h, numCols, op_type());
1402  }
1403  else {
1404  using op_type = KokkosRefactor::Details::InsertOp;
1405  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1406  permuteFromLIDs_h, numCols, op_type());
1407  }
1408  }
1409  }
1410  else { // permute on device
1411  if (verbose) {
1412  std::ostringstream os;
1413  os << *prefix << "Get permute LIDs on device" << endl;
1414  std::cerr << os.str ();
1415  }
1416  auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1417  auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1418 
1419  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () );
1420  auto permuteToLIDs_d = create_const_view (permuteToLIDs.view_device ());
1421  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () );
1422  auto permuteFromLIDs_d =
1423  create_const_view (permuteFromLIDs.view_device ());
1424 
1425  if (verbose) {
1426  std::ostringstream os;
1427  os << *prefix << "Permute on device" << endl;
1428  std::cerr << os.str ();
1429  }
1430  if (nonConstStride) {
1431  // No need to sync first, because copyOnHost argument to
1432  // getDualViewCopyFromArrayView puts them in the right place.
1433  auto tgtWhichVecs_d = create_const_view (tgtWhichVecs.view_device ());
1434  auto srcWhichVecs_d = create_const_view (srcWhichVecs.view_device ());
1435  if (CM == ADD_ASSIGN) {
1436  using op_type = KokkosRefactor::Details::AddOp;
1437  permute_array_multi_column_variable_stride (space, tgt_d, src_d,
1438  permuteToLIDs_d,
1439  permuteFromLIDs_d,
1440  tgtWhichVecs_d,
1441  srcWhichVecs_d, numCols,
1442  op_type());
1443  }
1444  else {
1445  using op_type = KokkosRefactor::Details::InsertOp;
1446  permute_array_multi_column_variable_stride (space, tgt_d, src_d,
1447  permuteToLIDs_d,
1448  permuteFromLIDs_d,
1449  tgtWhichVecs_d,
1450  srcWhichVecs_d, numCols,
1451  op_type());
1452  }
1453  }
1454  else {
1455  if (CM == ADD_ASSIGN) {
1456  using op_type = KokkosRefactor::Details::AddOp;
1457  permute_array_multi_column (space, tgt_d, src_d, permuteToLIDs_d,
1458  permuteFromLIDs_d, numCols, op_type());
1459  }
1460  else {
1461  using op_type = KokkosRefactor::Details::InsertOp;
1462  permute_array_multi_column (space, tgt_d, src_d, permuteToLIDs_d,
1463  permuteFromLIDs_d, numCols, op_type());
1464  }
1465  }
1466  }
1467 
1468  if (verbose) {
1469  std::ostringstream os;
1470  os << *prefix << "Done!" << endl;
1471  std::cerr << os.str ();
1472  }
1473  }
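  // Illustrative sketch (comment only): copyAndPermute runs inside an Import
  // or Export; callers do not invoke it directly.  "srcMap" and "tgtMap" are
  // assumed compatible Maps.
  //
  //   Tpetra::Import<> importer (srcMap, tgtMap);
  //   Tpetra::MultiVector<> X (srcMap, 3), Y (tgtMap, 3);
  //   Y.doImport (X, importer, Tpetra::INSERT);  // drives copyAndPermute,
  //                                              // packAndPrepare, unpack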
1474 
1475 // clang-format on
1476 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1477 void MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::copyAndPermute(
1478  const SrcDistObject &sourceObj, const size_t numSameIDs,
1479  const Kokkos::DualView<const local_ordinal_type *, buffer_device_type>
1480  &permuteToLIDs,
1481  const Kokkos::DualView<const local_ordinal_type *, buffer_device_type>
1482  &permuteFromLIDs,
1483  const CombineMode CM) {
1484  copyAndPermute(sourceObj, numSameIDs, permuteToLIDs, permuteFromLIDs, CM,
1485  execution_space());
1486 }
1487 // clang-format off
1488 
1489  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 1490  void
 1491  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1492  packAndPrepare
 1493  (const SrcDistObject& sourceObj,
1494  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1495  Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1496  Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1497  size_t& constantNumPackets,
1498  const execution_space &space)
1499  {
1500  using ::Tpetra::Details::Behavior;
1501  using ::Tpetra::Details::ProfilingRegion;
 1502  using ::Tpetra::Details::reallocDualViewIfNeeded;
 1503  using Kokkos::Compat::create_const_view;
1504  using Kokkos::Compat::getKokkosViewDeepCopy;
1505  using std::endl;
 1506  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
 1507 
1508  // We've already called checkSizes(), so this cast must succeed.
1509  MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&> (sourceObj));
1510 
1511  const bool packOnHost = runKernelOnHost(sourceMV);
1512  const char longFuncNameHost[] = "Tpetra::MultiVector::packAndPrepare[Host]";
1513  const char longFuncNameDevice[] = "Tpetra::MultiVector::packAndPrepare[Device]";
1514  const char tfecfFuncName[] = "packAndPrepare: ";
1515  ProfilingRegion regionPAP (packOnHost ? longFuncNameHost : longFuncNameDevice);
1516 
1517  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1518  // have the option to check indices. We do so when Tpetra is in
1519  // debug mode. It is in debug mode by default in a debug build,
1520  // but you may control this at run time, before launching the
1521  // executable, by setting the TPETRA_DEBUG environment variable to
1522  // "1" (or "TRUE").
1523  const bool debugCheckIndices = Behavior::debug ();
1524  // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1525  // environment variable to "1" (or "TRUE") for copious debug
1526  // output to std::cerr on every MPI process. This is unwise for
1527  // runs with large numbers of MPI processes.
1528  const bool printDebugOutput = Behavior::verbose ();
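  // Editorial usage note (the shell commands below are illustrative, not
  // part of Tpetra; the application name is hypothetical):
  //
  //   export TPETRA_DEBUG=1    # enable the index checks described above
  //   export TPETRA_VERBOSE=1  # per-process debug output to std::cerr
  //   mpirun -np 4 ./myTpetraApp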
1529 
1530  std::unique_ptr<std::string> prefix;
1531  if (printDebugOutput) {
1532  auto map = this->getMap ();
1533  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1534  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1535  std::ostringstream os;
1536  os << "Proc " << myRank << ": MV::packAndPrepare: ";
1537  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1538  os << "Start" << endl;
1539  std::cerr << os.str ();
1540  }
1541 
1542 
1543  const size_t numCols = sourceMV.getNumVectors ();
1544 
1545  // This spares us from needing to fill numExportPacketsPerLID.
1546  // Setting constantNumPackets to a nonzero value signals that
1547  // all packets have the same number of entries.
1548  constantNumPackets = numCols;
1549 
1550  // If we have no exports, there is nothing to do. Make sure this
1551  // goes _after_ setting constantNumPackets correctly.
1552  if (exportLIDs.extent (0) == 0) {
1553  if (printDebugOutput) {
1554  std::ostringstream os;
1555  os << *prefix << "No exports on this proc, DONE" << std::endl;
1556  std::cerr << os.str ();
1557  }
1558  return;
1559  }
1560 
1561  /* The layout in the export for MultiVectors is as follows:
1562  exports = { all of the data from row exportLIDs.front() ;
1563  ....
1564  all of the data from row exportLIDs.back() }
1565  This doesn't have the best locality, but is necessary because
1566  the data for a Packet (all data associated with an LID) is
1567  required to be contiguous. */
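  /* Editorial worked example (values made up): with numCols = 3 and
     exportLIDs = {7, 2}, the exports buffer is laid out as
       { X(7,0), X(7,1), X(7,2), X(2,0), X(2,1), X(2,2) },
     i.e. all columns of a given exported row sit next to each other. */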
1568 
1569  // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1570  // packing scheme in the above comment? The data going to a
1571  // particular process must be contiguous, of course, but those
1572  // data could include entries from multiple LIDs. DistObject just
1573  // needs to know how to index into that data. Kokkos is good at
1574  // decoupling storage intent from data layout choice.
1575 
1576  const size_t numExportLIDs = exportLIDs.extent (0);
1577  const size_t newExportsSize = numCols * numExportLIDs;
1578  if (printDebugOutput) {
1579  std::ostringstream os;
1580  os << *prefix << "realloc: "
1581  << "numExportLIDs: " << numExportLIDs
1582  << ", exports.extent(0): " << exports.extent (0)
1583  << ", newExportsSize: " << newExportsSize << std::endl;
1584  std::cerr << os.str ();
1585  }
1586  reallocDualViewIfNeeded (exports, newExportsSize, "exports");
1587 
1588  // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1589  // sync it. Pack it where it's currently sync'd.
1590  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1591  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1592  std::logic_error, "Input MultiVector needs sync to both host "
1593  "and device.");
1594  if (printDebugOutput) {
1595  std::ostringstream os;
1596  os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1597  std::cerr << os.str ();
1598  }
1599 
1600  // Mark 'exports' here, since we might have resized it above.
1601  // Resizing currently requires calling the constructor, which
1602  // clears out the 'modified' flags.
1603  if (packOnHost) {
1604  // nde 06 Feb 2020: If 'exports' does not require resize
1605  // when reallocDualViewIfNeeded is called, the modified flags
1606  // are not cleared out. This can result in host and device views
 1607  // being out-of-sync, resulting in an error in exports.modify_* calls.
1608  // Clearing the sync flags prevents this possible case.
1609  exports.clear_sync_state ();
1610  exports.modify_host ();
1611  }
1612  else {
1613  // nde 06 Feb 2020: If 'exports' does not require resize
1614  // when reallocDualViewIfNeeded is called, the modified flags
1615  // are not cleared out. This can result in host and device views
 1616  // being out-of-sync, resulting in an error in exports.modify_* calls.
1617  // Clearing the sync flags prevents this possible case.
1618  exports.clear_sync_state ();
1619  exports.modify_device ();
1620  }
1621 
1622  if (numCols == 1) { // special case for one column only
1623  // MultiVector always represents a single column with constant
1624  // stride, but it doesn't hurt to implement both cases anyway.
1625  //
1626  // ETP: I'm not sure I agree with the above statement. Can't a single-
1627  // column multivector be a subview of another multi-vector, in which case
1628  // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1629  // separately.
1630  //
1631  // mfh 18 Jan 2016: In answer to ETP's comment above:
1632  // MultiVector treats single-column MultiVectors created using a
1633  // "nonconstant stride constructor" as a special case, and makes
1634  // them constant stride (by making whichVectors_ have length 0).
1635  if (sourceMV.isConstantStride ()) {
1636  using KokkosRefactor::Details::pack_array_single_column;
1637  if (printDebugOutput) {
1638  std::ostringstream os;
1639  os << *prefix << "Pack numCols=1 const stride" << endl;
1640  std::cerr << os.str ();
1641  }
1642  if (packOnHost) {
1643  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1644  pack_array_single_column (exports.view_host (),
1645  src_host,
1646  exportLIDs.view_host (),
1647  0,
1648  debugCheckIndices);
1649  }
1650  else { // pack on device
1651  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1652  pack_array_single_column (exports.view_device (),
1653  src_dev,
1654  exportLIDs.view_device (),
1655  0,
1656  debugCheckIndices,
1657  space);
1658  }
1659  }
1660  else {
1661  using KokkosRefactor::Details::pack_array_single_column;
1662  if (printDebugOutput) {
1663  std::ostringstream os;
1664  os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1665  std::cerr << os.str ();
1666  }
1667  if (packOnHost) {
1668  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1669  pack_array_single_column (exports.view_host (),
1670  src_host,
1671  exportLIDs.view_host (),
1672  sourceMV.whichVectors_[0],
1673  debugCheckIndices);
1674  }
1675  else { // pack on device
1676  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1677  pack_array_single_column (exports.view_device (),
1678  src_dev,
1679  exportLIDs.view_device (),
1680  sourceMV.whichVectors_[0],
1681  debugCheckIndices,
1682  space);
1683  }
1684  }
1685  }
1686  else { // the source MultiVector has multiple columns
1687  if (sourceMV.isConstantStride ()) {
1688  using KokkosRefactor::Details::pack_array_multi_column;
1689  if (printDebugOutput) {
1690  std::ostringstream os;
1691  os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1692  std::cerr << os.str ();
1693  }
1694  if (packOnHost) {
1695  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1696  pack_array_multi_column (exports.view_host (),
1697  src_host,
1698  exportLIDs.view_host (),
1699  numCols,
1700  debugCheckIndices);
1701  }
1702  else { // pack on device
1703  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1704  pack_array_multi_column (exports.view_device (),
1705  src_dev,
1706  exportLIDs.view_device (),
1707  numCols,
1708  debugCheckIndices,
1709  space);
1710  }
1711  }
1712  else {
1713  using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1714  if (printDebugOutput) {
1715  std::ostringstream os;
1716  os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1717  << endl;
1718  std::cerr << os.str ();
1719  }
1720  // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1721  // whichVectors_ can be expensive, but pack and unpack for
1722  // nonconstant-stride MultiVectors is slower anyway.
1723  using IST = impl_scalar_type;
1724  using DV = Kokkos::DualView<IST*, device_type>;
1725  using HES = typename DV::t_host::execution_space;
1726  using DES = typename DV::t_dev::execution_space;
1727  Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_ ();
1728  if (packOnHost) {
1729  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1730  pack_array_multi_column_variable_stride
1731  (exports.view_host (),
1732  src_host,
1733  exportLIDs.view_host (),
1734  getKokkosViewDeepCopy<HES> (whichVecs),
1735  numCols,
1736  debugCheckIndices);
1737  }
1738  else { // pack on device
1739  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1740  pack_array_multi_column_variable_stride
1741  (exports.view_device (),
1742  src_dev,
1743  exportLIDs.view_device (),
1744  getKokkosViewDeepCopy<DES> (whichVecs),
1745  numCols,
1746  debugCheckIndices, space);
1747  }
1748  }
1749  }
1750 
1751  if (printDebugOutput) {
1752  std::ostringstream os;
1753  os << *prefix << "Done!" << endl;
1754  std::cerr << os.str ();
1755  }
1756 
1757  }
1758 
1759 // clang-format on
1760  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 1761  void
 1762  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1763  packAndPrepare
 1764  (const SrcDistObject& sourceObj,
1765  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1766  Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1767  Kokkos::DualView<size_t*, buffer_device_type> numExportPacketsPerLID,
1768  size_t& constantNumPackets) {
1769  packAndPrepare(sourceObj, exportLIDs, exports, numExportPacketsPerLID, constantNumPackets, execution_space());
1770  }
1771 // clang-format off
1772 
1773  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1774  template <class NO>
1775  typename std::enable_if<std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1776  typename NO::device_type::memory_space>::value,
1777  bool>::type
 1778  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1779  reallocImportsIfNeededImpl (const size_t newSize,
1780  const bool verbose,
1781  const std::string* prefix,
1782  const bool areRemoteLIDsContiguous,
1783  const CombineMode CM)
1784  {
1785  // This implementation of reallocImportsIfNeeded is an
1786  // optimization that is specific to MultiVector. We check if the
1787  // imports_ view can be aliased to the end of the data view_. If
1788  // that is the case, we can skip the unpackAndCombine call.
1789 
1790  if (verbose) {
1791  std::ostringstream os;
1792  os << *prefix << "Realloc (if needed) imports_ from "
1793  << this->imports_.extent (0) << " to " << newSize << std::endl;
1794  std::cerr << os.str ();
1795  }
1796 
1797  bool reallocated = false;
1798 
1799  using IST = impl_scalar_type;
1800  using DV = Kokkos::DualView<IST*, Kokkos::LayoutLeft, buffer_device_type>;
1801 
1802  // Conditions for aliasing memory:
1803  // - When both sides of the dual view are in the same memory
1804  // space, we do not need to worry about syncing things.
1805  // - When both memory spaces are different, we only alias if this
1806  // does not incur additional sync'ing.
1807  // - The remote LIDs need to be contiguous, so that we do not need
1808  // to reorder received information.
1809  // - CombineMode needs to be INSERT.
1810  // - The number of vectors needs to be 1, otherwise we need to
1811  // reorder the received data.
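  // Editorial sketch of the aliased case (numbers made up): a single-column
  // target with getLocalLength() == 10 that receives newSize == 3 contiguous
  // remote entries with CM == INSERT can point imports_ at rows [7, 10) of
  // view_, so the received values land in place and unpackAndCombine may
  // skip its copy (see importsAreAliased() below).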
1812  if ((std::is_same_v<typename dual_view_type::t_dev::device_type, typename dual_view_type::t_host::device_type> ||
1813  (Details::Behavior::assumeMpiIsGPUAware () && !this->need_sync_device()) ||
1814  (!Details::Behavior::assumeMpiIsGPUAware () && !this->need_sync_host())) &&
1815  areRemoteLIDsContiguous &&
1816  (CM == INSERT || CM == REPLACE) &&
1817  (getNumVectors() == 1) &&
1818  (newSize > 0)) {
1819 
1820  size_t offset = getLocalLength () - newSize;
1821  reallocated = this->imports_.view_device().data() != view_.getDualView().view_device().data() + offset;
1822  if (reallocated) {
1823  typedef std::pair<size_t, size_t> range_type;
1824  this->imports_ = DV(view_.getDualView(),
1825  range_type (offset, getLocalLength () ),
1826  0);
1827 
1828  if (verbose) {
1829  std::ostringstream os;
1830  os << *prefix << "Aliased imports_ to MV.view_" << std::endl;
1831  std::cerr << os.str ();
1832  }
1833  }
1834  return reallocated;
1835  }
1836  {
1838  reallocated =
1839  reallocDualViewIfNeeded (this->unaliased_imports_, newSize, "imports");
1840  if (verbose) {
1841  std::ostringstream os;
1842  os << *prefix << "Finished realloc'ing unaliased_imports_" << std::endl;
1843  std::cerr << os.str ();
1844  }
1845  this->imports_ = this->unaliased_imports_;
1846  }
1847  return reallocated;
1848  }
1849 
1850  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1851  template <class NO>
1852  typename std::enable_if<!std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1853  typename NO::device_type::memory_space>::value,
1854  bool>::type
 1855  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1856  reallocImportsIfNeededImpl (const size_t newSize,
1857  const bool verbose,
1858  const std::string* prefix,
1859  const bool areRemoteLIDsContiguous,
1860  const CombineMode CM)
1861  {
1862  return DistObject<Scalar, LocalOrdinal, GlobalOrdinal, Node>::reallocImportsIfNeeded(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1863  }
1864 
1865  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1866  bool
 1867  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1868  reallocImportsIfNeeded(const size_t newSize,
1869  const bool verbose,
1870  const std::string* prefix,
1871  const bool areRemoteLIDsContiguous,
1872  const CombineMode CM) {
1874  return reallocImportsIfNeededImpl<Node>(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1875  }
1876 
1877  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 1878  bool
 1879  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1880  importsAreAliased() {
 1881  return (this->imports_.view_device().data() + this->imports_.view_device().extent(0) ==
1882  view_.getDualView().view_device().data() + view_.getDualView().view_device().extent(0));
1883  }
1884 
1885 
1886  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 1887  void
 1888  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1889  unpackAndCombine
 1890  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1891  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1892  Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1893  const size_t constantNumPackets,
1894  const CombineMode CM,
1895  const execution_space &space)
1896  {
1897  using ::Tpetra::Details::Behavior;
1898  using ::Tpetra::Details::ProfilingRegion;
1899  using KokkosRefactor::Details::unpack_array_multi_column;
1900  using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1901  using Kokkos::Compat::getKokkosViewDeepCopy;
1902  using std::endl;
1903 
1904  const bool unpackOnHost = runKernelOnHost(imports);
1905 
1906  const char longFuncNameHost[] = "Tpetra::MultiVector::unpackAndCombine[Host]";
1907  const char longFuncNameDevice[] = "Tpetra::MultiVector::unpackAndCombine[Device]";
1908  const char * longFuncName = unpackOnHost ? longFuncNameHost : longFuncNameDevice;
1909  const char tfecfFuncName[] = "unpackAndCombine: ";
1910 
1911  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1912  // have the option to check indices. We do so when Tpetra is in
1913  // debug mode. It is in debug mode by default in a debug build,
1914  // but you may control this at run time, before launching the
1915  // executable, by setting the TPETRA_DEBUG environment variable to
1916  // "1" (or "TRUE").
1917  const bool debugCheckIndices = Behavior::debug ();
1918 
1919  const bool printDebugOutput = Behavior::verbose ();
1920  std::unique_ptr<std::string> prefix;
1921  if (printDebugOutput) {
1922  auto map = this->getMap ();
1923  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1924  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1925  std::ostringstream os;
1926  os << "Proc " << myRank << ": " << longFuncName << ": ";
1927  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1928  os << "Start" << endl;
1929  std::cerr << os.str ();
1930  }
1931 
1932  // If we have no imports, there is nothing to do
1933  if (importLIDs.extent (0) == 0) {
1934  if (printDebugOutput) {
1935  std::ostringstream os;
1936  os << *prefix << "No imports. Done!" << endl;
1937  std::cerr << os.str ();
1938  }
1939  return;
1940  }
1941 
 1942  // Check whether imports_ is a subview of the MV view.
1943  if (importsAreAliased()) {
1944  if (printDebugOutput) {
1945  std::ostringstream os;
1946  os << *prefix << "Skipping unpack, since imports_ is aliased to MV.view_. Done!" << endl;
1947  std::cerr << os.str ();
1948  }
1949  return;
1950  }
1951 
1952  ProfilingRegion regionUAC (longFuncName);
1953 
1954  const size_t numVecs = getNumVectors ();
1955  if (debugCheckIndices) {
1956  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1957  (static_cast<size_t> (imports.extent (0)) !=
1958  numVecs * importLIDs.extent (0),
1959  std::runtime_error,
1960  "imports.extent(0) = " << imports.extent (0)
1961  << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1962  << " * " << importLIDs.extent (0) << " = "
1963  << numVecs * importLIDs.extent (0) << ".");
1964 
1965  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1966  (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1967  "constantNumPackets input argument must be nonzero.");
1968 
1969  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1970  (static_cast<size_t> (numVecs) !=
1971  static_cast<size_t> (constantNumPackets),
1972  std::runtime_error, "constantNumPackets must equal numVecs.");
1973  }
1974 
1975  // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1976  // the memory space in which the imports buffer was last modified and
1977  // the size of the imports buffer.
1978  // DistObject::doTransferNew decides where it was last modified (based on
1979  // whether communication buffers used were on host or device).
1980  if (unpackOnHost) {
1981  if (this->imports_.need_sync_host()) this->imports_.sync_host();
1982  }
1983  else {
1984  if (this->imports_.need_sync_device()) this->imports_.sync_device();
1985  }
1986 
1987  if (printDebugOutput) {
1988  std::ostringstream os;
1989  os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1990  << endl;
1991  std::cerr << os.str ();
1992  }
1993 
1994  // We have to sync before modifying, because this method may read
1995  // as well as write (depending on the CombineMode).
1996  auto imports_d = imports.view_device ();
1997  auto imports_h = imports.view_host ();
1998  auto importLIDs_d = importLIDs.view_device ();
1999  auto importLIDs_h = importLIDs.view_host ();
2000 
2001  Kokkos::DualView<size_t*, device_type> whichVecs;
2002  if (! isConstantStride ()) {
2003  Kokkos::View<const size_t*, Kokkos::HostSpace,
2004  Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
2005  numVecs);
2006  whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
2007  if (unpackOnHost) {
2008  whichVecs.modify_host ();
2009  // DEEP_COPY REVIEW - NOT TESTED FOR CUDA BUILD
2010  Kokkos::deep_copy (whichVecs.view_host (), whichVecsIn);
2011  }
2012  else {
2013  whichVecs.modify_device ();
2014  // DEEP_COPY REVIEW - HOST-TO-DEVICE
2015  Kokkos::deep_copy (whichVecs.view_device (), whichVecsIn);
2016  }
2017  }
2018  auto whichVecs_d = whichVecs.view_device ();
2019  auto whichVecs_h = whichVecs.view_host ();
2020 
2021  /* The layout in the export for MultiVectors is as follows:
2022  imports = { all of the data from row exportLIDs.front() ;
2023  ....
2024  all of the data from row exportLIDs.back() }
2025  This doesn't have the best locality, but is necessary because
2026  the data for a Packet (all data associated with an LID) is
2027  required to be contiguous. */
2028 
2029  if (numVecs > 0 && importLIDs.extent (0) > 0) {
2030  using dev_exec_space = typename dual_view_type::t_dev::execution_space;
2031  using host_exec_space = typename dual_view_type::t_host::execution_space;
2032 
2033  // This fixes GitHub Issue #4418.
2034  const bool use_atomic_updates = unpackOnHost ?
2035  host_exec_space().concurrency () != 1 :
2036  dev_exec_space().concurrency () != 1;
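  // Editorial note: the atomic path is taken only when the execution space
  // can run more than one thread concurrently; a purely serial backend uses
  // the cheaper non-atomic updates. Atomics matter when the same local row
  // may be touched by more than one unpacked entry (e.g. under ADD).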
2037 
2038  if (printDebugOutput) {
2039  std::ostringstream os;
2040  os << *prefix << "Unpack: " << combineModeToString (CM) << endl;
2041  std::cerr << os.str ();
2042  }
2043 
2044  // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
2045  // custom combine modes, start editing here.
2046 
2047  if (CM == INSERT || CM == REPLACE) {
2048  using op_type = KokkosRefactor::Details::InsertOp;
2049  if (isConstantStride ()) {
2050  if (unpackOnHost) {
2051  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2052  unpack_array_multi_column (host_exec_space (),
2053  X_h, imports_h, importLIDs_h,
2054  op_type (), numVecs,
2055  use_atomic_updates,
2056  debugCheckIndices);
2057 
2058  }
2059  else { // unpack on device
2060  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2061  unpack_array_multi_column (space,
2062  X_d, imports_d, importLIDs_d,
2063  op_type (), numVecs,
2064  use_atomic_updates,
2065  debugCheckIndices);
2066  }
2067  }
2068  else { // not constant stride
2069  if (unpackOnHost) {
2070  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2071  unpack_array_multi_column_variable_stride (host_exec_space (),
2072  X_h, imports_h,
2073  importLIDs_h,
2074  whichVecs_h,
2075  op_type (),
2076  numVecs,
2077  use_atomic_updates,
2078  debugCheckIndices);
2079  }
2080  else { // unpack on device
2081  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2082  unpack_array_multi_column_variable_stride (space,
2083  X_d, imports_d,
2084  importLIDs_d,
2085  whichVecs_d,
2086  op_type (),
2087  numVecs,
2088  use_atomic_updates,
2089  debugCheckIndices);
2090  }
2091  }
2092  }
2093  else if (CM == ADD || CM == ADD_ASSIGN) {
2094  using op_type = KokkosRefactor::Details::AddOp;
2095  if (isConstantStride ()) {
2096  if (unpackOnHost) {
2097  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2098  unpack_array_multi_column (host_exec_space (),
2099  X_h, imports_h, importLIDs_h,
2100  op_type (), numVecs,
2101  use_atomic_updates,
2102  debugCheckIndices);
2103  }
2104  else { // unpack on device
2105  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2106  unpack_array_multi_column (space,
2107  X_d, imports_d, importLIDs_d,
2108  op_type (), numVecs,
2109  use_atomic_updates,
2110  debugCheckIndices);
2111  }
2112  }
2113  else { // not constant stride
2114  if (unpackOnHost) {
2115  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2116  unpack_array_multi_column_variable_stride (host_exec_space (),
2117  X_h, imports_h,
2118  importLIDs_h,
2119  whichVecs_h,
2120  op_type (),
2121  numVecs,
2122  use_atomic_updates,
2123  debugCheckIndices);
2124  }
2125  else { // unpack on device
2126  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2127  unpack_array_multi_column_variable_stride (space,
2128  X_d, imports_d,
2129  importLIDs_d,
2130  whichVecs_d,
2131  op_type (),
2132  numVecs,
2133  use_atomic_updates,
2134  debugCheckIndices);
2135  }
2136  }
2137  }
2138  else if (CM == ABSMAX) {
2139  using op_type = KokkosRefactor::Details::AbsMaxOp;
2140  if (isConstantStride ()) {
2141  if (unpackOnHost) {
2142  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2143  unpack_array_multi_column (host_exec_space (),
2144  X_h, imports_h, importLIDs_h,
2145  op_type (), numVecs,
2146  use_atomic_updates,
2147  debugCheckIndices);
2148  }
2149  else { // unpack on device
2150  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2151  unpack_array_multi_column (space,
2152  X_d, imports_d, importLIDs_d,
2153  op_type (), numVecs,
2154  use_atomic_updates,
2155  debugCheckIndices);
2156  }
2157  }
2158  else {
2159  if (unpackOnHost) {
2160  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2161  unpack_array_multi_column_variable_stride (host_exec_space (),
2162  X_h, imports_h,
2163  importLIDs_h,
2164  whichVecs_h,
2165  op_type (),
2166  numVecs,
2167  use_atomic_updates,
2168  debugCheckIndices);
2169  }
2170  else { // unpack on device
2171  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2172  unpack_array_multi_column_variable_stride (space,
2173  X_d, imports_d,
2174  importLIDs_d,
2175  whichVecs_d,
2176  op_type (),
2177  numVecs,
2178  use_atomic_updates,
2179  debugCheckIndices);
2180  }
2181  }
2182  }
2183  else {
2184  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2185  (true, std::logic_error, "Invalid CombineMode");
2186  }
2187  }
2188  else {
2189  if (printDebugOutput) {
2190  std::ostringstream os;
2191  os << *prefix << "Nothing to unpack" << endl;
2192  std::cerr << os.str ();
2193  }
2194  }
2195 
2196  if (printDebugOutput) {
2197  std::ostringstream os;
2198  os << *prefix << "Done!" << endl;
2199  std::cerr << os.str ();
2200  }
2201  }
2202 
2203  // clang-format on
2204  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2205  void
 2206  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2207  unpackAndCombine
 2208  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
2209  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
2210  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
2211  const size_t constantNumPackets,
2212  const CombineMode CM) {
2213  unpackAndCombine(importLIDs, imports, numPacketsPerLID, constantNumPackets, CM, execution_space());
2214  }
2215  // clang-format off
2216 
2217  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2218  size_t
 2219  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2220  getNumVectors () const
 2221  {
2222  if (isConstantStride ()) {
2223  return static_cast<size_t> (view_.extent (1));
2224  } else {
2225  return static_cast<size_t> (whichVectors_.size ());
2226  }
2227  }
2228 
2229  namespace { // (anonymous)
2230 
2231  template<class RV>
2232  void
2233  gblDotImpl (const RV& dotsOut,
2234  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
2235  const bool distributed)
2236  {
2237  using Teuchos::REDUCE_MAX;
2238  using Teuchos::REDUCE_SUM;
2239  using Teuchos::reduceAll;
2240  typedef typename RV::non_const_value_type dot_type;
2241 
2242  const size_t numVecs = dotsOut.extent (0);
2243 
2244  // If the MultiVector is distributed over multiple processes, do
2245  // the distributed (interprocess) part of the dot product. We
2246  // assume that the MPI implementation can read from and write to
2247  // device memory.
2248  //
2249  // replaceMap() may have removed some processes. Those
2250  // processes have a null Map. They must not participate in any
2251  // collective operations. We ask first whether the Map is null,
2252  // because isDistributed() defers that question to the Map. We
2253  // still compute and return local dots for processes not
2254  // participating in collective operations; those probably don't
2255  // make any sense, but it doesn't hurt to do them, since it's
2256  // illegal to call dot() on those processes anyway.
2257  if (distributed && ! comm.is_null ()) {
2258  // The calling process only participates in the collective if
2259  // both the Map and its Comm on that process are nonnull.
2260  const int nv = static_cast<int> (numVecs);
2261  const bool commIsInterComm = ::Tpetra::Details::isInterComm (*comm);
2262 
2263  if (commIsInterComm) {
2264  // If comm is an intercomm, then we may not alias input and
2265  // output buffers, so we have to make a copy of the local
2266  // sum.
2267  typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
2268  // DEEP_COPY REVIEW - NOT TESTED
2269  Kokkos::deep_copy (lclDots, dotsOut);
2270  const dot_type* const lclSum = lclDots.data ();
2271  dot_type* const gblSum = dotsOut.data ();
2272  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
2273  }
2274  else {
2275  dot_type* const inout = dotsOut.data ();
2276  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, inout, inout);
2277  }
2278  }
2279  }
2280  } // namespace (anonymous)
2281 
2282  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2283  void
 2284  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2285  dot (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
 2286  const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const
2287  {
2288  using ::Tpetra::Details::Behavior;
2289  using Kokkos::subview;
2290  using Teuchos::Comm;
2291  using Teuchos::null;
2292  using Teuchos::RCP;
2293  // View of all the dot product results.
2294  typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
2295  typedef typename dual_view_type::t_dev_const XMV;
2296  const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
2297 
2298  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Kokkos::View)");
2299 
2300  const size_t numVecs = this->getNumVectors ();
2301  if (numVecs == 0) {
2302  return; // nothing to do
2303  }
2304  const size_t lclNumRows = this->getLocalLength ();
2305  const size_t numDots = static_cast<size_t> (dots.extent (0));
2306  const bool debug = Behavior::debug ();
2307 
2308  if (debug) {
2309  const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
2310  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2311  (! compat, std::invalid_argument, "'*this' MultiVector is not "
2312  "compatible with the input MultiVector A. We only test for this "
2313  "in debug mode.");
2314  }
2315 
2316  // FIXME (mfh 11 Jul 2014) These exception tests may not
2317  // necessarily be thrown on all processes consistently. We should
2318  // instead pass along error state with the inner product. We
2319  // could do this by setting an extra slot to
2320  // Kokkos::ArithTraits<dot_type>::one() on error. The
2321  // final sum should be
2322  // Kokkos::ArithTraits<dot_type>::zero() if not error.
2323  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2324  lclNumRows != A.getLocalLength (), std::runtime_error,
2325  "MultiVectors do not have the same local length. "
2326  "this->getLocalLength() = " << lclNumRows << " != "
2327  "A.getLocalLength() = " << A.getLocalLength () << ".");
2328  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2329  numVecs != A.getNumVectors (), std::runtime_error,
2330  "MultiVectors must have the same number of columns (vectors). "
2331  "this->getNumVectors() = " << numVecs << " != "
2332  "A.getNumVectors() = " << A.getNumVectors () << ".");
2333  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2334  numDots != numVecs, std::runtime_error,
2335  "The output array 'dots' must have the same number of entries as the "
2336  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2337  numDots << " != this->getNumVectors() = " << numVecs << ".");
2338 
2339  const std::pair<size_t, size_t> colRng (0, numVecs);
2340  RV dotsOut = subview (dots, colRng);
2341  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
2342  this->getMap ()->getComm ();
2343 
2344  auto thisView = this->getLocalViewDevice(Access::ReadOnly);
2345  auto A_view = A.getLocalViewDevice(Access::ReadOnly);
2346 
2347  ::Tpetra::Details::lclDot<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
2348  this->whichVectors_.getRawPtr (),
2349  A.whichVectors_.getRawPtr (),
2350  this->isConstantStride (), A.isConstantStride ());
2351 
 2352  // lbv 15 mar 2023: Kokkos Kernels provides non-blocking BLAS
 2353  // functions unless they explicitly return a value to Host.
 2354  // Here, while the lclDot results are on host, they are not a return
 2355  // value, so they might not be available to us immediately.
 2356  // Adding a fence here guarantees that we will have the lclDot
 2357  // results ahead of the MPI reduction.
2358  execution_space exec_space_instance = execution_space();
2359  exec_space_instance.fence();
2360 
2361  gblDotImpl (dotsOut, comm, this->isDistributed ());
2362  }
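  // Editorial usage sketch (X, Y, and d are hypothetical names; MV stands
  // for this MultiVector specialization):
  //
  //   Kokkos::View<typename MV::dot_type*, Kokkos::HostSpace>
  //     d ("dots", X.getNumVectors ());
  //   X.dot (Y, d); // on return, d(j) is the global dot product of column j
  //                 // of X with column j of Y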
2363 
2364  namespace { // (anonymous)
2365  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2366  typename MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::dot_type
 2367  multiVectorSingleColumnDot (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& x,
 2368  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& y)
 2369  {
2370  using ::Tpetra::Details::ProfilingRegion;
 2371  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
 2372  using dot_type = typename MV::dot_type;
2373  ProfilingRegion region ("Tpetra::multiVectorSingleColumnDot");
2374 
2375  auto map = x.getMap ();
2376  Teuchos::RCP<const Teuchos::Comm<int> > comm =
2377  map.is_null () ? Teuchos::null : map->getComm ();
2378  if (comm.is_null ()) {
2379  return Kokkos::ArithTraits<dot_type>::zero ();
2380  }
2381  else {
2382  using LO = LocalOrdinal;
2383  // The min just ensures that we don't overwrite memory that
2384  // doesn't belong to us, in the erroneous input case where x
2385  // and y have different numbers of rows.
2386  const LO lclNumRows = static_cast<LO> (std::min (x.getLocalLength (),
2387  y.getLocalLength ()));
2388  const Kokkos::pair<LO, LO> rowRng (0, lclNumRows);
2389  dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero ();
2390  dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();
2391 
2392  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2393  //const_cast<MV&>(x).sync_device ();
2394  //const_cast<MV&>(y).sync_device ();
2395 
2396  auto x_2d = x.getLocalViewDevice(Access::ReadOnly);
2397  auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
2398  auto y_2d = y.getLocalViewDevice(Access::ReadOnly);
2399  auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
2400  lclDot = KokkosBlas::dot (x_1d, y_1d);
2401 
2402  if (x.isDistributed ()) {
2403  using Teuchos::outArg;
2404  using Teuchos::REDUCE_SUM;
2405  using Teuchos::reduceAll;
2406  reduceAll<int, dot_type> (*comm, REDUCE_SUM, lclDot, outArg (gblDot));
2407  }
2408  else {
2409  gblDot = lclDot;
2410  }
2411  return gblDot;
2412  }
2413  }
2414  } // namespace (anonymous)
2415 
2416 
2417 
2418  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2419  void
 2420  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2421  dot (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
 2422  const Teuchos::ArrayView<dot_type>& dots) const
2423  {
2424  const char tfecfFuncName[] = "dot: ";
2425  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Teuchos::ArrayView)");
2426 
2427  const size_t numVecs = this->getNumVectors ();
2428  const size_t lclNumRows = this->getLocalLength ();
2429  const size_t numDots = static_cast<size_t> (dots.size ());
2430 
2431  // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
2432  // not necessarily be thrown on all processes consistently. We
2433  // keep them for now, because MultiVector's unit tests insist on
2434  // them. In the future, we should instead pass along error state
2435  // with the inner product. We could do this by setting an extra
2436  // slot to Kokkos::ArithTraits<dot_type>::one() on error.
2437  // The final sum should be
2438  // Kokkos::ArithTraits<dot_type>::zero() if not error.
2439  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2440  (lclNumRows != A.getLocalLength (), std::runtime_error,
2441  "MultiVectors do not have the same local length. "
2442  "this->getLocalLength() = " << lclNumRows << " != "
2443  "A.getLocalLength() = " << A.getLocalLength () << ".");
2444  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2445  (numVecs != A.getNumVectors (), std::runtime_error,
2446  "MultiVectors must have the same number of columns (vectors). "
2447  "this->getNumVectors() = " << numVecs << " != "
2448  "A.getNumVectors() = " << A.getNumVectors () << ".");
2449  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2450  (numDots != numVecs, std::runtime_error,
2451  "The output array 'dots' must have the same number of entries as the "
2452  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2453  numDots << " != this->getNumVectors() = " << numVecs << ".");
2454 
2455  if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
2456  const dot_type gblDot = multiVectorSingleColumnDot (*this, A);
2457  dots[0] = gblDot;
2458  }
2459  else {
2460  this->dot (A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr (), numDots));
2461  }
2462  }
2463 
2464  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2465  void
 2466  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2467  norm2 (const Teuchos::ArrayView<mag_type>& norms) const
2468  {
 2469  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
 2470  using ::Tpetra::Details::NORM_TWO;
2471  using ::Tpetra::Details::ProfilingRegion;
2472  ProfilingRegion region ("Tpetra::MV::norm2 (host output)");
2473 
2474  // The function needs to be able to sync X.
2475  MV& X = const_cast<MV&> (*this);
2476  multiVectorNormImpl (norms.getRawPtr (), X, NORM_TWO);
2477  }
2478 
2479  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2480  void
 2481  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2482  norm2 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2483  {
2484  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2485  this->norm2 (norms_av);
2486  }
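  // Editorial usage sketch (X and norms are hypothetical names; MV stands
  // for this MultiVector specialization):
  //
  //   Teuchos::Array<typename MV::mag_type> norms (X.getNumVectors ());
  //   X.norm2 (norms ()); // norms[j] = global two-norm of column j of X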
2487 
2488 
2489  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2490  void
 2491  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2492  norm1 (const Teuchos::ArrayView<mag_type>& norms) const
2493  {
 2494  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
 2495  using ::Tpetra::Details::NORM_ONE;
2496  using ::Tpetra::Details::ProfilingRegion;
2497  ProfilingRegion region ("Tpetra::MV::norm1 (host output)");
2498 
2499  // The function needs to be able to sync X.
2500  MV& X = const_cast<MV&> (*this);
2501  multiVectorNormImpl (norms.data (), X, NORM_ONE);
2502  }
2503 
2504  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2505  void
 2506  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2507  norm1 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2508  {
2509  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2510  this->norm1 (norms_av);
2511  }
2512 
2513  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2514  void
 2515  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2516  normInf (const Teuchos::ArrayView<mag_type>& norms) const
2517  {
 2518  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
 2519  using ::Tpetra::Details::NORM_INF;
2520  using ::Tpetra::Details::ProfilingRegion;
2521  ProfilingRegion region ("Tpetra::MV::normInf (host output)");
2522 
2523  // The function needs to be able to sync X.
2524  MV& X = const_cast<MV&> (*this);
2525  multiVectorNormImpl (norms.getRawPtr (), X, NORM_INF);
2526  }
2527 
2528  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2529  void
 2530  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2531  normInf (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2532  {
2533  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2534  this->normInf (norms_av);
2535  }
2536 
2537  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2538  void
 2539  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2540  meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
2541  {
2542  // KR FIXME Overload this method to take a View.
2543  using Kokkos::ALL;
2544  using Kokkos::subview;
2545  using Teuchos::Comm;
2546  using Teuchos::RCP;
2547  using Teuchos::reduceAll;
2548  using Teuchos::REDUCE_SUM;
2549  typedef Kokkos::ArithTraits<impl_scalar_type> ATS;
2550 
2551  const size_t lclNumRows = this->getLocalLength ();
2552  const size_t numVecs = this->getNumVectors ();
2553  const size_t numMeans = static_cast<size_t> (means.size ());
2554 
2555  TEUCHOS_TEST_FOR_EXCEPTION(
2556  numMeans != numVecs, std::runtime_error,
2557  "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2558  << " != this->getNumVectors() = " << numVecs << ".");
2559 
2560  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2561  const std::pair<size_t, size_t> colRng (0, numVecs);
2562 
2563  // Make sure that the final output view has the same layout as the
2564  // temporary view's HostMirror. Left or Right doesn't matter for
2565  // a 1-D array anyway; this is just to placate the compiler.
2566  typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2567  typedef Kokkos::View<impl_scalar_type*,
2568  typename local_view_type::HostMirror::array_layout,
2569  Kokkos::HostSpace,
2570  Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
2571  host_local_view_type meansOut (means.getRawPtr (), numMeans);
2572 
2573  RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
2574  this->getMap ()->getComm ();
2575 
2576  // If we need sync to device, then host has the most recent version.
2577  const bool useHostVersion = this->need_sync_device ();
2578  if (useHostVersion) {
2579  // DualView was last modified on host, so run the local kernel there.
2580  auto X_lcl = subview (getLocalViewHost(Access::ReadOnly),
2581  rowRng, Kokkos::ALL ());
2582  // Compute the local sum of each column.
2583  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2584  if (isConstantStride ()) {
2585  KokkosBlas::sum (lclSums, X_lcl);
2586  }
2587  else {
2588  for (size_t j = 0; j < numVecs; ++j) {
2589  const size_t col = whichVectors_[j];
2590  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2591  }
2592  }
2593 
2594  // If there are multiple MPI processes, the all-reduce reads
2595  // from lclSums, and writes to meansOut. Otherwise, we just
2596  // copy lclSums into meansOut.
2597  if (! comm.is_null () && this->isDistributed ()) {
2598  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2599  lclSums.data (), meansOut.data ());
2600  }
2601  else {
2602  // DEEP_COPY REVIEW - NOT TESTED
2603  Kokkos::deep_copy (meansOut, lclSums);
2604  }
2605  }
2606  else {
2607  // DualView was last modified on device, so run the local kernel there.
2608  auto X_lcl = subview (this->getLocalViewDevice(Access::ReadOnly),
2609  rowRng, Kokkos::ALL ());
2610 
2611  // Compute the local sum of each column.
2612  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2613  if (isConstantStride ()) {
2614  KokkosBlas::sum (lclSums, X_lcl);
2615  }
2616  else {
2617  for (size_t j = 0; j < numVecs; ++j) {
2618  const size_t col = whichVectors_[j];
2619  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2620  }
2621  }
 2622  // lbv 10 mar 2023: Kokkos Kernels provides non-blocking BLAS
 2623  // functions unless they explicitly return a value to Host.
 2624  // Here, while the lclSums are on the host, they are not a return
 2625  // value, so they might not be available to us immediately.
 2626  // Adding a fence here guarantees that we will have the lclSums
 2627  // ahead of the MPI reduction.
2628  execution_space exec_space_instance = execution_space();
2629  exec_space_instance.fence();
2630 
2631  // If there are multiple MPI processes, the all-reduce reads
2632  // from lclSums, and writes to meansOut. (We assume that MPI
2633  // can read device memory.) Otherwise, we just copy lclSums
2634  // into meansOut.
2635  if (! comm.is_null () && this->isDistributed ()) {
2636  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2637  lclSums.data (), meansOut.data ());
2638  }
2639  else {
2640  // DEEP_COPY REVIEW - HOST-TO-HOST - NOT TESTED FOR MPI BUILD
2641  Kokkos::deep_copy (meansOut, lclSums);
2642  }
2643  }
2644 
2645  // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2646  // to the magnitude type, since operator/ (std::complex<T>, int)
2647  // isn't necessarily defined.
2648  const impl_scalar_type OneOverN =
2649  ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
2650  for (size_t k = 0; k < numMeans; ++k) {
2651  meansOut(k) = meansOut(k) * OneOverN;
2652  }
2653  }
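  // Editorial note: on return, means[j] = (sum over all global rows i of
  // X(i,j)) / getGlobalLength(); the all-reduce above combines per-process
  // partial sums before the final scaling by 1/N.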
2654 
2655 
2656  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2657  void
 2658  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2659  randomize ()
 2660  {
2661  typedef impl_scalar_type IST;
2662  typedef Kokkos::ArithTraits<IST> ATS;
2663  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2664  typedef typename pool_type::generator_type generator_type;
2665 
2666  const IST max = Kokkos::rand<generator_type, IST>::max ();
2667  const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();
2668 
2669  this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
2670  }
2671 
2672 
2673  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2674  void
 2675  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2676  randomize (const Scalar& minVal, const Scalar& maxVal)
2677  {
2678  typedef impl_scalar_type IST;
2679  typedef Tpetra::Details::Static_Random_XorShift64_Pool<typename device_type::execution_space> tpetra_pool_type;
2680  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2681 
2682  // Seed the pool based on the system RNG and the MPI rank, if needed
2683  if(!tpetra_pool_type::isSet())
2684  tpetra_pool_type::resetPool(this->getMap()->getComm()->getRank());
2685 
2686  pool_type & rand_pool = tpetra_pool_type::getPool();
2687  const IST max = static_cast<IST> (maxVal);
2688  const IST min = static_cast<IST> (minVal);
2689 
2690  auto thisView = this->getLocalViewDevice(Access::OverwriteAll);
2691 
2692  if (isConstantStride ()) {
2693  Kokkos::fill_random (thisView, rand_pool, min, max);
2694  }
2695  else {
2696  const size_t numVecs = getNumVectors ();
2697  for (size_t k = 0; k < numVecs; ++k) {
2698  const size_t col = whichVectors_[k];
2699  auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
2700  Kokkos::fill_random (X_k, rand_pool, min, max);
2701  }
2702  }
2703  }
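  // Editorial usage sketch (X is a hypothetical MultiVector):
  //
  //   X.randomize ();                        // signed Scalar: entries in [-max, max]
  //   X.randomize (Scalar (0), Scalar (1));  // entries drawn from [0, 1]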
2704 
2705  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2706  void
 2707  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2708  putScalar (const Scalar& alpha)
2709  {
2710  using ::Tpetra::Details::ProfilingRegion;
2711  using ::Tpetra::Details::Blas::fill;
2712  using DES = typename dual_view_type::t_dev::execution_space;
2713  using HES = typename dual_view_type::t_host::execution_space;
2714  using LO = LocalOrdinal;
2715  ProfilingRegion region ("Tpetra::MultiVector::putScalar");
2716 
2717  // We need this cast for cases like Scalar = std::complex<T> but
2718  // impl_scalar_type = Kokkos::complex<T>.
2719  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2720  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
2721  const LO numVecs = static_cast<LO> (this->getNumVectors ());
2722 
2723  // Modify the most recently updated version of the data. This
2724  // avoids sync'ing, which could violate users' expectations.
2725  //
2726  // If we need sync to device, then host has the most recent version.
2727  const bool runOnHost = runKernelOnHost(*this);
2728  if (! runOnHost) {
2729  auto X = this->getLocalViewDevice(Access::OverwriteAll);
2730  if (this->isConstantStride ()) {
2731  fill (DES (), X, theAlpha, lclNumRows, numVecs);
2732  }
2733  else {
2734  fill (DES (), X, theAlpha, lclNumRows, numVecs,
2735  this->whichVectors_.getRawPtr ());
2736  }
2737  }
2738  else { // last modified in host memory, so modify data there.
2739  auto X = this->getLocalViewHost(Access::OverwriteAll);
2740  if (this->isConstantStride ()) {
2741  fill (HES (), X, theAlpha, lclNumRows, numVecs);
2742  }
2743  else {
2744  fill (HES (), X, theAlpha, lclNumRows, numVecs,
2745  this->whichVectors_.getRawPtr ());
2746  }
2747  }
2748  }
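  // Editorial usage sketch (X is a hypothetical MultiVector):
  //
  //   X.putScalar (Teuchos::ScalarTraits<Scalar>::zero ()); // set every entry to zero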
2749 
2750 
2751  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2752  void
 2753  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2754  replaceMap (const Teuchos::RCP<const map_type>& newMap)
2755  {
2756  using Teuchos::ArrayRCP;
2757  using Teuchos::Comm;
2758  using Teuchos::RCP;
2759  using ST = Scalar;
2760  using LO = LocalOrdinal;
2761  using GO = GlobalOrdinal;
2762 
2763  // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2764  // it might work if the MV is a column view of another MV.
2765  // However, things might go wrong when restoring the original
2766  // Map, so we don't allow this case for now.
2767  TEUCHOS_TEST_FOR_EXCEPTION(
2768  ! this->isConstantStride (), std::logic_error,
2769  "Tpetra::MultiVector::replaceMap: This method does not currently work "
2770  "if the MultiVector is a column view of another MultiVector (that is, if "
2771  "isConstantStride() == false).");
2772 
2773  // Case 1: current Map and new Map are both nonnull on this process.
2774  // Case 2: current Map is nonnull, new Map is null.
2775  // Case 3: current Map is null, new Map is nonnull.
2776  // Case 4: both Maps are null: forbidden.
2777  //
2778  // Case 1 means that we don't have to do anything on this process,
2779  // other than assign the new Map. (We always have to do that.)
2780  // It's an error for the user to supply a Map that requires
2781  // resizing in this case.
2782  //
2783  // Case 2 means that the calling process is in the current Map's
2784  // communicator, but will be excluded from the new Map's
2785  // communicator. We don't have to do anything on the calling
2786  // process; just leave whatever data it may have alone.
2787  //
2788  // Case 3 means that the calling process is excluded from the
2789  // current Map's communicator, but will be included in the new
2790  // Map's communicator. This means we need to (re)allocate the
2791  // local DualView if it does not have the right number of rows.
2792  // If the new number of rows is nonzero, we'll fill the newly
2793  // allocated local data with zeros, as befits a projection
2794  // operation.
2795  //
2796  // The typical use case for Case 3 is that the MultiVector was
2797  // first created with the Map with more processes, then that Map
2798  // was replaced with a Map with fewer processes, and finally the
2799  // original Map was restored on this call to replaceMap.
2800 
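    // Editorial sketch of the round trip described above (smallerMap and
    // originalMap are hypothetical names):
    //
    //   X.replaceMap (smallerMap);  // smallerMap is Teuchos::null on the
    //                               // excluded processes (Case 2 there)
    //   // ... work only on the processes that remain in smallerMap ...
    //   X.replaceMap (originalMap); // excluded processes reallocate and zero
    //                               // their local data (Case 3)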
2801  if (this->getMap ().is_null ()) { // current Map is null
2802  // If this->getMap() is null, that means that this MultiVector
2803  // has already had replaceMap happen to it. In that case, just
2804  // reallocate the DualView with the right size.
2805 
2806  TEUCHOS_TEST_FOR_EXCEPTION(
2807  newMap.is_null (), std::invalid_argument,
2808  "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2809  "This probably means that the input Map is incorrect.");
2810 
2811  // Case 3: current Map is null, new Map is nonnull.
2812  // Reallocate the DualView with the right dimensions.
2813  const size_t newNumRows = newMap->getLocalNumElements ();
2814  const size_t origNumRows = view_.extent (0);
2815  const size_t numCols = this->getNumVectors ();
2816 
2817  if (origNumRows != newNumRows || view_.extent (1) != numCols) {
2818  view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2819  }
2820  }
2821  else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
2822  // I am an excluded process. Reinitialize my data so that I
2823  // have 0 rows. Keep the number of columns as before.
2824  const size_t newNumRows = static_cast<size_t> (0);
2825  const size_t numCols = this->getNumVectors ();
2826  view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2827  }
2828 
2829  this->map_ = newMap;
2830  }
2831 
2832  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2833  void
 2834  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2835  scale (const Scalar& alpha)
2836  {
2837  using Kokkos::ALL;
2838  using IST = impl_scalar_type;
2839 
2840  const IST theAlpha = static_cast<IST> (alpha);
2841  if (theAlpha == Kokkos::ArithTraits<IST>::one ()) {
2842  return; // do nothing
2843  }
2844  const size_t lclNumRows = getLocalLength ();
2845  const size_t numVecs = getNumVectors ();
2846  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2847  const std::pair<size_t, size_t> colRng (0, numVecs);
2848 
2849  // We can't substitute putScalar(0.0) for scale(0.0), because the
2850  // former will overwrite NaNs present in the MultiVector. The
2851  // semantics of this call require multiplying them by 0, which
2852  // IEEE 754 requires to be NaN.
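  // For example, if an entry of the MultiVector is NaN, scale (0.0) must
  // leave it NaN (0 * NaN == NaN), whereas putScalar (0.0) would overwrite
  // it with zero.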
2853 
2854  // If we need sync to device, then host has the most recent version.
2855  const bool useHostVersion = need_sync_device ();
2856  if (useHostVersion) {
2857  auto Y_lcl = Kokkos::subview (getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2858  if (isConstantStride ()) {
2859  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2860  }
2861  else {
2862  for (size_t k = 0; k < numVecs; ++k) {
2863  const size_t Y_col = whichVectors_[k];
2864  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2865  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2866  }
2867  }
2868  }
2869  else { // work on device
2870  auto Y_lcl = Kokkos::subview (getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2871  if (isConstantStride ()) {
2872  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2873  }
2874  else {
2875  for (size_t k = 0; k < numVecs; ++k) {
2876  const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2877  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2878  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2879  }
2880  }
2881  }
2882  }
2883 
2884 
2885  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2886  void
 2887  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2888  scale (const Teuchos::ArrayView<const Scalar>& alphas)
2889  {
2890  const size_t numVecs = this->getNumVectors ();
2891  const size_t numAlphas = static_cast<size_t> (alphas.size ());
2892  TEUCHOS_TEST_FOR_EXCEPTION(
2893  numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2894  "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2895  << numVecs << ".");
2896 
2897  // Use a DualView to copy the scaling constants onto the device.
2898  using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2899  k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2900  k_alphas.modify_host ();
2901  for (size_t i = 0; i < numAlphas; ++i) {
2902  k_alphas.view_host()(i) = static_cast<impl_scalar_type> (alphas[i]);
2903  }
2904  k_alphas.sync_device ();
2905  // Invoke the scale() overload that takes a device View of coefficients.
2906  this->scale (k_alphas.view_device ());
2907  }
2908 
2909  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2910  void
 2911  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2912  scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
2913  {
2914  using Kokkos::ALL;
2915  using Kokkos::subview;
2916 
2917  const size_t lclNumRows = this->getLocalLength ();
2918  const size_t numVecs = this->getNumVectors ();
2919  TEUCHOS_TEST_FOR_EXCEPTION(
2920  static_cast<size_t> (alphas.extent (0)) != numVecs,
2921  std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2922  "alphas.extent(0) = " << alphas.extent (0)
2923  << " != this->getNumVectors () = " << numVecs << ".");
2924  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2925  const std::pair<size_t, size_t> colRng (0, numVecs);
2926 
2927  // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2928  // type of the return value of subview. This is because if we
2929  // switch the array layout from LayoutLeft to LayoutRight
2930  // (preferred for performance of block operations), the types
2931  // below won't be valid. (A view of a column of a LayoutRight
2932  // multivector has LayoutStride, not LayoutLeft.)
2933 
2934  // If we need sync to device, then host has the most recent version.
2935  const bool useHostVersion = this->need_sync_device ();
2936  if (useHostVersion) {
2937  // Work in host memory. This means we need to create a host
2938  // mirror of the input View of coefficients.
2939  auto alphas_h = Kokkos::create_mirror_view (alphas);
2940  // DEEP_COPY REVIEW - NOT TESTED
2941  Kokkos::deep_copy (alphas_h, alphas);
2942 
2943  auto Y_lcl = subview (this->getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2944  if (isConstantStride ()) {
2945  KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2946  }
2947  else {
2948  for (size_t k = 0; k < numVecs; ++k) {
2949  const size_t Y_col = this->isConstantStride () ? k :
2950  this->whichVectors_[k];
2951  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2952  // We don't have to use the entire 1-D View here; we can use
2953  // the version that takes a scalar coefficient.
2954  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2955  }
2956  }
2957  }
2958  else { // Work in device memory, using the input View 'alphas' directly.
2959  auto Y_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2960  if (isConstantStride ()) {
2961  KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2962  }
2963  else {
2964  // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2965  // as values on host, so copy them to host. Another approach
2966  // would be to fix scal() so that it takes a 0-D View as the
2967  // second argument.
2968  auto alphas_h = Kokkos::create_mirror_view (alphas);
2969  // DEEP_COPY REVIEW - NOT TESTED
2970  Kokkos::deep_copy (alphas_h, alphas);
2971 
2972  for (size_t k = 0; k < numVecs; ++k) {
2973  const size_t Y_col = this->isConstantStride () ? k :
2974  this->whichVectors_[k];
2975  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2976  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2977  }
2978  }
2979  }
2980  }
2981 
2982  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2983  void
2985  scale (const Scalar& alpha,
2986  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A)
2987  {
2988  using Kokkos::ALL;
2989  using Kokkos::subview;
2990  const char tfecfFuncName[] = "scale: ";
2991 
2992  const size_t lclNumRows = getLocalLength ();
2993  const size_t numVecs = getNumVectors ();
2994 
2995  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2996  lclNumRows != A.getLocalLength (), std::invalid_argument,
2997  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2998  << A.getLocalLength () << ".");
2999  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3000  numVecs != A.getNumVectors (), std::invalid_argument,
3001  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
3002  << A.getNumVectors () << ".");
3003 
3004  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3005  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3006  const std::pair<size_t, size_t> colRng (0, numVecs);
3007 
3008  auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
3009  auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
3010  auto Y_lcl = subview (Y_lcl_orig, rowRng, ALL ());
3011  auto X_lcl = subview (X_lcl_orig, rowRng, ALL ());
3012 
3013  if (isConstantStride () && A.isConstantStride ()) {
3014  KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
3015  }
3016  else {
3017  // Make sure that Kokkos only uses the local length for scal.
3018  for (size_t k = 0; k < numVecs; ++k) {
3019  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
3020  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
3021  auto Y_k = subview (Y_lcl, ALL (), Y_col);
3022  auto X_k = subview (X_lcl, ALL (), X_col);
3023 
3024  KokkosBlas::scal (Y_k, theAlpha, X_k);
3025  }
3026  }
3027  }
3028 
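// A minimal sketch of the two-argument scale() overload above, which
// assigns this = alpha * A column by column. The MultiVectors X and Y
// and their Map are illustrative assumptions only.
//
//   Tpetra::MultiVector<double> X (map, 2), Y (map, 2);
//   X.putScalar (3.0);
//   Y.scale (2.0, X);                    // every entry of Y becomes 6.0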
3029 
3030 
3031  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3032  void
3033  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3034  reciprocal (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A)
3035  {
3036  const char tfecfFuncName[] = "reciprocal: ";
3037 
3038  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3039  getLocalLength () != A.getLocalLength (), std::runtime_error,
3040  "MultiVectors do not have the same local length. "
3041  "this->getLocalLength() = " << getLocalLength ()
3042  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
3043  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3044  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
3045  "MultiVectors do not have the same number of columns (vectors). "
3046  "this->getNumVectors() = " << getNumVectors ()
3047  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
3048 
3049  const size_t numVecs = getNumVectors ();
3050 
3051  auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
3052  auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
3053 
3054  if (isConstantStride () && A.isConstantStride ()) {
3055  KokkosBlas::reciprocal (this_view_dev, A_view_dev);
3056  }
3057  else {
3058  using Kokkos::ALL;
3059  using Kokkos::subview;
3060  for (size_t k = 0; k < numVecs; ++k) {
3061  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3062  auto vector_k = subview (this_view_dev, ALL (), this_col);
3063  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
3064  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
3065  KokkosBlas::reciprocal (vector_k, vector_Ak);
3066  }
3067  }
3068  }
3069 
3070  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3071  void
3072  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3073  abs (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A)
3074  {
3075  const char tfecfFuncName[] = "abs";
3076 
3077  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3078  getLocalLength () != A.getLocalLength (), std::runtime_error,
3079  ": MultiVectors do not have the same local length. "
3080  "this->getLocalLength() = " << getLocalLength ()
3081  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
3082  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3083  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
3084  ": MultiVectors do not have the same number of columns (vectors). "
3085  "this->getNumVectors() = " << getNumVectors ()
3086  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
3087  const size_t numVecs = getNumVectors ();
3088 
3089  auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
3090  auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
3091 
3092  if (isConstantStride () && A.isConstantStride ()) {
3093  KokkosBlas::abs (this_view_dev, A_view_dev);
3094  }
3095  else {
3096  using Kokkos::ALL;
3097  using Kokkos::subview;
3098 
3099  for (size_t k=0; k < numVecs; ++k) {
3100  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3101  auto vector_k = subview (this_view_dev, ALL (), this_col);
3102  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
3103  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
3104  KokkosBlas::abs (vector_k, vector_Ak);
3105  }
3106  }
3107  }
3108 
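// A short sketch of reciprocal() and abs() above; each overwrites *this
// entry by entry from A. The MultiVectors X and Y are assumed to exist
// with matching Maps and column counts (illustrative only).
//
//   Y.reciprocal (X);                    // Y(i,j) = 1 / X(i,j)
//   Y.abs (X);                           // Y(i,j) = |X(i,j)|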
3109  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3110  void
3112  update (const Scalar& alpha,
3113  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3114  const Scalar& beta)
3115  {
3116  const char tfecfFuncName[] = "update: ";
3117  using Kokkos::subview;
3118  using Kokkos::ALL;
3119 
3120  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");
3121 
3122  const size_t lclNumRows = getLocalLength ();
3123  const size_t numVecs = getNumVectors ();
3124 
3125  if (::Tpetra::Details::Behavior::debug ()) {
3126  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3127  lclNumRows != A.getLocalLength (), std::invalid_argument,
3128  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
3129  << A.getLocalLength () << ".");
3130  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3131  numVecs != A.getNumVectors (), std::invalid_argument,
3132  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
3133  << A.getNumVectors () << ".");
3134  }
3135 
3136  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3137  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3138  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3139  const std::pair<size_t, size_t> colRng (0, numVecs);
3140 
3141  auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
3142  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
3143  auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
3144  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
3145 
3146  // The device memory of *this is about to be modified
3147  if (isConstantStride () && A.isConstantStride ()) {
3148  KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
3149  }
3150  else {
3151  // Make sure that Kokkos only uses the local length for add.
3152  for (size_t k = 0; k < numVecs; ++k) {
3153  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
3154  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
3155  auto Y_k = subview (Y_lcl, ALL (), Y_col);
3156  auto X_k = subview (X_lcl, ALL (), X_col);
3157 
3158  KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
3159  }
3160  }
3161  }
3162 
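// A minimal sketch of the two-term update() above, which computes
// this = alpha*A + beta*(*this), i.e. a per-column axpby. X and Y are
// illustrative assumptions with matching Maps and column counts.
//
//   Y.update (2.0, X, 1.0);              // Y := 2*X + Y
//   Y.update (1.0, X, 0.0);              // Y := X (beta = 0 overwrites Y)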
3163  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3164  void
3166  update (const Scalar& alpha,
3167  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3168  const Scalar& beta,
3169  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
3170  const Scalar& gamma)
3171  {
3172  using Kokkos::ALL;
3173  using Kokkos::subview;
3174 
3175  const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
3176 
3177  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)");
3178 
3179  const size_t lclNumRows = this->getLocalLength ();
3180  const size_t numVecs = getNumVectors ();
3181 
3182  if (::Tpetra::Details::Behavior::debug ()) {
3183  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3184  lclNumRows != A.getLocalLength (), std::invalid_argument,
3185  "The input MultiVector A has " << A.getLocalLength () << " local "
3186  "row(s), but this MultiVector has " << lclNumRows << " local "
3187  "row(s).");
3188  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3189  lclNumRows != B.getLocalLength (), std::invalid_argument,
3190  "The input MultiVector B has " << B.getLocalLength () << " local "
3191  "row(s), but this MultiVector has " << lclNumRows << " local "
3192  "row(s).");
3193  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3194  A.getNumVectors () != numVecs, std::invalid_argument,
3195  "The input MultiVector A has " << A.getNumVectors () << " column(s), "
3196  "but this MultiVector has " << numVecs << " column(s).");
3197  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3198  B.getNumVectors () != numVecs, std::invalid_argument,
3199  "The input MultiVector B has " << B.getNumVectors () << " column(s), "
3200  "but this MultiVector has " << numVecs << " column(s).");
3201  }
3202 
3203  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3204  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3205  const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
3206 
3207  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3208  const std::pair<size_t, size_t> colRng (0, numVecs);
3209 
3210  // Prefer 'auto' over specifying the type explicitly. This avoids
3211  // issues with a subview possibly having a different type than the
3212  // original view.
3213  auto C_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
3214  auto A_lcl = subview (A.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3215  auto B_lcl = subview (B.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3216 
3217  if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
3218  KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
3219  }
3220  else {
3221  // Some input (or *this) is not constant stride,
3222  // so perform the update one column at a time.
3223  for (size_t k = 0; k < numVecs; ++k) {
3224  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3225  const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
3226  const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
3227  KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
3228  theBeta, subview (B_lcl, rowRng, B_col),
3229  theGamma, subview (C_lcl, rowRng, this_col));
3230  }
3231  }
3232  }
3233 
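// A minimal sketch of the three-term update() above, which computes
// this = alpha*A + beta*B + gamma*(*this). A typical use is forming a
// residual; R, B, and AX are illustrative assumptions only.
//
//   R.update (1.0, B, -1.0, AX, 0.0);    // R := B - AX, old R discarded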
3234  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3235  Teuchos::ArrayRCP<const Scalar>
3237  getData (size_t j) const
3238  {
3239  using Kokkos::ALL;
3240  using IST = impl_scalar_type;
3241  const char tfecfFuncName[] = "getData: ";
3242 
3243  // Any MultiVector method that called the (classic) Kokkos Node's
3244  // viewBuffer or viewBufferNonConst methods always implied a
3245  // device->host synchronization. Thus, we synchronize here as
3246  // well.
3247 
3248  auto hostView = getLocalViewHost(Access::ReadOnly);
3249  const size_t col = isConstantStride () ? j : whichVectors_[j];
3250  auto hostView_j = Kokkos::subview (hostView, ALL (), col);
3251  Teuchos::ArrayRCP<const IST> dataAsArcp =
3252  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3253 
3254  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3255  (static_cast<size_t> (hostView_j.extent (0)) <
3256  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3257  "hostView_j.extent(0) = " << hostView_j.extent (0)
3258  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3259  "Please report this bug to the Tpetra developers.");
3260 
3261  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3262  }
3263 
3264  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3265  Teuchos::ArrayRCP<Scalar>
3266  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3267  getDataNonConst (size_t j)
3268  {
3269  using Kokkos::ALL;
3270  using Kokkos::subview;
3271  using IST = impl_scalar_type;
3272  const char tfecfFuncName[] = "getDataNonConst: ";
3273 
3274  auto hostView = getLocalViewHost(Access::ReadWrite);
3275  const size_t col = isConstantStride () ? j : whichVectors_[j];
3276  auto hostView_j = subview (hostView, ALL (), col);
3277  Teuchos::ArrayRCP<IST> dataAsArcp =
3278  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3279 
3280  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3281  (static_cast<size_t> (hostView_j.extent (0)) <
3282  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3283  "hostView_j.extent(0) = " << hostView_j.extent (0)
3284  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3285  "Please report this bug to the Tpetra developers.");
3286 
3287  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3288  }
3289 
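// A short sketch of getData() / getDataNonConst() above. Both return a
// host ArrayRCP of column j; the nonconst version marks the host data
// as modified. X is an assumed, illustrative MultiVector<double>.
//
//   Teuchos::ArrayRCP<const double> col0 = X.getData (0);
//   const double firstEntry = col0[0];           // read-only access
//   Teuchos::ArrayRCP<double> col1 = X.getDataNonConst (1);
//   col1[0] = 42.0;                              // modifies column 1 of X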
3290  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3291  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3293  subCopy (const Teuchos::ArrayView<const size_t>& cols) const
3294  {
3295  using Teuchos::RCP;
3296  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
3297 
3298  // Check whether the index set in cols is contiguous. If it is,
3299  // use the more efficient Range1D version of subCopy.
3300  bool contiguous = true;
3301  const size_t numCopyVecs = static_cast<size_t> (cols.size ());
3302  for (size_t j = 1; j < numCopyVecs; ++j) {
3303  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3304  contiguous = false;
3305  break;
3306  }
3307  }
3308  if (contiguous && numCopyVecs > 0) {
3309  return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
3310  }
3311  else {
3312  RCP<const MV> X_sub = this->subView (cols);
3313  RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
3314  Y->assign (*X_sub);
3315  return Y;
3316  }
3317  }
3318 
3319  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3320  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3322  subCopy (const Teuchos::Range1D &colRng) const
3323  {
3324  using Teuchos::RCP;
3325  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
3326 
3327  RCP<const MV> X_sub = this->subView (colRng);
3328  RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3329  Y->assign (*X_sub);
3330  return Y;
3331  }
3332 
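// A minimal sketch of subCopy() above. Unlike subView(), it returns a
// deep copy, so later changes to X do not affect the result. X and the
// column indices are illustrative assumptions only.
//
//   Teuchos::Array<size_t> cols (2);
//   cols[0] = 0; cols[1] = 2;
//   auto Xcopy = X.subCopy (cols ());    // owns its own data
//   X.putScalar (0.0);                   // does not change *Xcopy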
3333  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3334  size_t
3335  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3336  getOrigNumLocalRows () const {
3337  return view_.origExtent(0);
3338  }
3339 
3340  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3341  size_t
3342  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3343  getOrigNumLocalCols () const {
3344  return view_.origExtent(1);
3345  }
3346 
3347  template <class Scalar, class LO, class GO, class Node>
3348  MultiVector<Scalar, LO, GO, Node>::
3349  MultiVector (const MultiVector<Scalar, LO, GO, Node>& X,
3350  const Teuchos::RCP<const map_type>& subMap,
3351  const local_ordinal_type rowOffset) :
3352  base_type (subMap)
3353  {
3354  using Kokkos::ALL;
3355  using Kokkos::subview;
3356  using Teuchos::outArg;
3357  using Teuchos::RCP;
3358  using Teuchos::rcp;
3359  using Teuchos::reduceAll;
3360  using Teuchos::REDUCE_MIN;
3361  using std::endl;
3362  using MV = MultiVector<Scalar, LO, GO, Node>;
3363  const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3364  const char suffix[] = "Please report this bug to the Tpetra developers.";
3365  int lclGood = 1;
3366  int gblGood = 1;
3367  std::unique_ptr<std::ostringstream> errStrm;
3368  const bool debug = ::Tpetra::Details::Behavior::debug ();
3369  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
3370 
3371  // Be careful to use the input Map's communicator, not X's. The
3372  // idea is that, on return, *this is a subview of X, using the
3373  // input Map.
3374  const auto comm = subMap->getComm ();
3375  TEUCHOS_ASSERT( ! comm.is_null () );
3376  const int myRank = comm->getRank ();
3377 
3378  const LO lclNumRowsBefore = static_cast<LO> (X.getLocalLength ());
3379  const LO numCols = static_cast<LO> (X.getNumVectors ());
3380  const LO newNumRows = static_cast<LO> (subMap->getLocalNumElements ());
3381  if (verbose) {
3382  std::ostringstream os;
3383  os << "Proc " << myRank << ": " << prefix
3384  << "X: {lclNumRows: " << lclNumRowsBefore
3385  << ", origLclNumRows: " << X.getOrigNumLocalRows ()
3386  << ", numCols: " << numCols << "}, "
3387  << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3388  std::cerr << os.str ();
3389  }
3390  // We ask for the _original_ number of rows in X, because X could
3391  // be a shorter (fewer rows) view of a longer MV. (For example, X
3392  // could be a domain Map view of a column Map MV.)
3393  const bool tooManyElts =
3394  newNumRows + rowOffset > static_cast<LO> (X.getOrigNumLocalRows ());
3395  if (tooManyElts) {
3396  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3397  *errStrm << " Proc " << myRank << ": subMap->getLocalNumElements() (="
3398  << newNumRows << ") + rowOffset (=" << rowOffset
3399  << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows ()
3400  << ")." << endl;
3401  lclGood = 0;
3402  TEUCHOS_TEST_FOR_EXCEPTION
3403  (! debug && tooManyElts, std::invalid_argument,
3404  prefix << errStrm->str () << suffix);
3405  }
3406 
3407  if (debug) {
3408  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3409  if (gblGood != 1) {
3410  std::ostringstream gblErrStrm;
3411  const std::string myErrStr =
3412  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3413  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3414  TEUCHOS_TEST_FOR_EXCEPTION
3415  (true, std::invalid_argument, gblErrStrm.str ());
3416  }
3417  }
3418 
3419  using range_type = std::pair<LO, LO>;
3420  const range_type origRowRng
3421  (rowOffset, static_cast<LO> (X.view_.origExtent (0)));
3422  const range_type rowRng
3423  (rowOffset, rowOffset + newNumRows);
3424 
3425  wrapped_dual_view_type newView = takeSubview (X.view_, rowRng, ALL ());
3426 
3427  // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3428  // handling subviews of degenerate Views quite so well. For some
3429  // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3430  // 0. We work around by creating a new empty DualView of the
3431  // desired (degenerate) dimensions.
3432  if (newView.extent (0) == 0 &&
3433  newView.extent (1) != X.view_.extent (1)) {
3434  newView =
3435  allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3436  }
3437 
3438  MV subViewMV = X.isConstantStride () ?
3439  MV (subMap, newView) :
3440  MV (subMap, newView, X.whichVectors_ ());
3441 
3442  if (debug) {
3443  const LO lclNumRowsRet = static_cast<LO> (subViewMV.getLocalLength ());
3444  const LO numColsRet = static_cast<LO> (subViewMV.getNumVectors ());
3445  if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3446  lclGood = 0;
3447  if (errStrm.get () == nullptr) {
3448  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3449  }
3450  *errStrm << " Proc " << myRank <<
3451  ": subMap.getLocalNumElements(): " << newNumRows <<
3452  ", subViewMV.getLocalLength(): " << lclNumRowsRet <<
3453  ", X.getNumVectors(): " << numCols <<
3454  ", subViewMV.getNumVectors(): " << numColsRet << endl;
3455  }
3456  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3457  if (gblGood != 1) {
3458  std::ostringstream gblErrStrm;
3459  if (myRank == 0) {
3460  gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3461  "dimensions on one or more processes:" << endl;
3462  }
3463  const std::string myErrStr =
3464  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3465  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3466  gblErrStrm << suffix << endl;
3467  TEUCHOS_TEST_FOR_EXCEPTION
3468  (true, std::invalid_argument, gblErrStrm.str ());
3469  }
3470  }
3471 
3472  if (verbose) {
3473  std::ostringstream os;
3474  os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3475  std::cerr << os.str ();
3476  }
3477 
3478  *this = subViewMV; // shallow copy
3479 
3480  if (verbose) {
3481  std::ostringstream os;
3482  os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3483  std::cerr << os.str ();
3484  }
3485  }
3486 
3487  template <class Scalar, class LO, class GO, class Node>
3488  MultiVector<Scalar, LO, GO, Node>::
3489  MultiVector (const MultiVector<Scalar, LO, GO, Node>& X,
3490  const map_type& subMap,
3491  const size_t rowOffset) :
3492  MultiVector (X, Teuchos::RCP<const map_type> (new map_type (subMap)),
3493  static_cast<local_ordinal_type> (rowOffset))
3494  {}
3495 
3496  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3497  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3499  offsetView (const Teuchos::RCP<const map_type>& subMap,
3500  const size_t offset) const
3501  {
3502  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
3503  return Teuchos::rcp (new MV (*this, *subMap, offset));
3504  }
3505 
3506  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3507  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3509  offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3510  const size_t offset)
3511  {
3512  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
3513  return Teuchos::rcp (new MV (*this, *subMap, offset));
3514  }
3515 
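// A minimal sketch of offsetView() / offsetViewNonConst() above. They
// return a shallow view of a contiguous block of local rows described
// by a sub-Map; "subMap" and the offset 100 are illustrative
// assumptions only.
//
//   // View local rows [100, 100 + subMap->getLocalNumElements()) of X:
//   auto X_tail = X.offsetViewNonConst (subMap, 100);
//   X_tail->putScalar (0.0);             // writes through to X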
3516  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3517  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3519  subView (const Teuchos::ArrayView<const size_t>& cols) const
3520  {
3521  using Teuchos::Array;
3522  using Teuchos::rcp;
3523  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
3524 
3525  const size_t numViewCols = static_cast<size_t> (cols.size ());
3526  TEUCHOS_TEST_FOR_EXCEPTION(
3527  numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3528  "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3529  "contain at least one entry, but cols.size() = " << cols.size ()
3530  << " == 0.");
3531 
3532  // Check whether the index set in cols is contiguous. If it is,
3533  // use the more efficient Range1D version of subView.
3534  bool contiguous = true;
3535  for (size_t j = 1; j < numViewCols; ++j) {
3536  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3537  contiguous = false;
3538  break;
3539  }
3540  }
3541  if (contiguous) {
3542  if (numViewCols == 0) {
3543  // The output MV has no columns, so there is nothing to view.
3544  return rcp (new MV (this->getMap (), numViewCols));
3545  } else {
3546  // Use the more efficient contiguous-index-range version.
3547  return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3548  }
3549  }
3550 
3551  if (isConstantStride ()) {
3552  return rcp (new MV (this->getMap (), view_, cols));
3553  }
3554  else {
3555  Array<size_t> newcols (cols.size ());
3556  for (size_t j = 0; j < numViewCols; ++j) {
3557  newcols[j] = whichVectors_[cols[j]];
3558  }
3559  return rcp (new MV (this->getMap (), view_, newcols ()));
3560  }
3561  }
3562 
3563 
3564  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3565  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3567  subView (const Teuchos::Range1D& colRng) const
3568  {
3569  using ::Tpetra::Details::Behavior;
3570  using Kokkos::ALL;
3571  using Kokkos::subview;
3572  using Teuchos::Array;
3573  using Teuchos::RCP;
3574  using Teuchos::rcp;
3575  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
3576  const char tfecfFuncName[] = "subView(Range1D): ";
3577 
3578  const size_t lclNumRows = this->getLocalLength ();
3579  const size_t numVecs = this->getNumVectors ();
3580  // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3581  // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3582  // "at least one vector.");
3583  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3584  static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3585  "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3586  << numVecs << ".");
3587  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3588  numVecs != 0 && colRng.size () != 0 &&
3589  (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3590  static_cast<size_t> (colRng.ubound ()) >= numVecs),
3591  std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3592  "," << colRng.ubound () << "] exceeds the valid range of column indices "
3593  "[0, " << numVecs << "].");
3594 
3595  RCP<const MV> X_ret; // the MultiVector subview to return
3596 
3597  // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3598  // broken for the case of views with zero rows. I will brutally
3599  // enforce that the subview has the correct dimensions. In
3600  // particular, in the case of zero rows, I will, if necessary,
3601  // create a new dual_view_type with zero rows and the correct
3602  // number of columns. In a debug build, I will use an all-reduce
3603  // to ensure that it has the correct dimensions on all processes.
3604 
3605  const std::pair<size_t, size_t> rows (0, lclNumRows);
3606  if (colRng.size () == 0) {
3607  const std::pair<size_t, size_t> cols (0, 0); // empty range
3608  wrapped_dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3609  X_ret = rcp (new MV (this->getMap (), X_sub));
3610  }
3611  else {
3612  // Returned MultiVector is constant stride only if *this is.
3613  if (isConstantStride ()) {
3614  const std::pair<size_t, size_t> cols (colRng.lbound (),
3615  colRng.ubound () + 1);
3616  wrapped_dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3617  X_ret = rcp (new MV (this->getMap (), X_sub));
3618  }
3619  else {
3620  if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3621  // We're only asking for one column, so the result does have
3622  // constant stride, even though this MultiVector does not.
3623  const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3624  whichVectors_[0] + colRng.ubound () + 1);
3625  wrapped_dual_view_type X_sub = takeSubview (view_, ALL (), col);
3626  X_ret = rcp (new MV (this->getMap (), X_sub));
3627  }
3628  else {
3629  Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3630  whichVectors_.begin () + colRng.ubound () + 1);
3631  X_ret = rcp (new MV (this->getMap (), view_, which));
3632  }
3633  }
3634  }
3635 
3636  const bool debug = Behavior::debug ();
3637  if (debug) {
3638  using Teuchos::Comm;
3639  using Teuchos::outArg;
3640  using Teuchos::REDUCE_MIN;
3641  using Teuchos::reduceAll;
3642 
3643  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
3644  Teuchos::null : this->getMap ()->getComm ();
3645  if (! comm.is_null ()) {
3646  int lclSuccess = 1;
3647  int gblSuccess = 1;
3648 
3649  if (X_ret.is_null ()) {
3650  lclSuccess = 0;
3651  }
3652  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3653  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3654  (lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3655  "MultiVector; the return value of this method) is null on some MPI "
3656  "process in this MultiVector's communicator. This should never "
3657  "happen. Please report this bug to the Tpetra developers.");
3658  if (! X_ret.is_null () &&
3659  X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3660  lclSuccess = 0;
3661  }
3662  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess,
3663  outArg (gblSuccess));
3664  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3665  (lclSuccess != 1, std::logic_error, "X_ret->getNumVectors() != "
3666  "colRng.size(), on at least one MPI process in this MultiVector's "
3667  "communicator. This should never happen. "
3668  "Please report this bug to the Tpetra developers.");
3669  }
3670  }
3671  return X_ret;
3672  }
3673 
3674 
3675  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3676  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3678  subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3679  {
3680  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
3681  return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3682  }
3683 
3684 
3685  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3686  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3688  subViewNonConst (const Teuchos::Range1D &colRng)
3689  {
3690  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
3691  return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3692  }
3693 
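// A minimal sketch of subView() / subViewNonConst() above; both return
// a shallow (non-owning) view of the selected columns. X is an assumed,
// illustrative MultiVector with at least four columns.
//
//   auto X_first3 = X.subView (Teuchos::Range1D (0, 2));        // cols 0..2
//   auto X_col3   = X.subViewNonConst (Teuchos::Range1D (3, 3));
//   X_col3->putScalar (1.0);             // modifies column 3 of X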
3694 
3695  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3696  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3697  MultiVector (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& X,
3698  const size_t j)
3699  : base_type (X.getMap ())
3700  {
3701  using Kokkos::subview;
3702  typedef std::pair<size_t, size_t> range_type;
3703  const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3704 
3705  const size_t numCols = X.getNumVectors ();
3706  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3707  (j >= numCols, std::invalid_argument, "Input index j (== " << j
3708  << ") exceeds valid column index range [0, " << numCols << " - 1].");
3709  const size_t jj = X.isConstantStride () ?
3710  static_cast<size_t> (j) :
3711  static_cast<size_t> (X.whichVectors_[j]);
3712  this->view_ = takeSubview (X.view_, Kokkos::ALL (), range_type (jj, jj+1));
3713 
3714  // mfh 31 Jul 2017: It would be unwise to execute concurrent
3715  // Export or Import operations with different subviews of a
3716  // MultiVector. Thus, it is safe to reuse communication buffers.
3717  // See #1560 discussion.
3718  //
3719  // We only need one column's worth of buffer for imports_ and
3720  // exports_. Taking subviews now ensures that their lengths will
3721  // be exactly what we need, so we won't have to resize them later.
3722  {
3723  const size_t newSize = X.imports_.extent (0) / numCols;
3724  const size_t offset = jj*newSize;
3725  auto device_view = subview (X.imports_.view_device(),
3726  range_type (offset, offset+newSize));
3727  auto host_view = subview (X.imports_.view_host(),
3728  range_type (offset, offset+newSize));
3729  this->imports_ = decltype(X.imports_)(device_view, host_view);
3730  }
3731  {
3732  const size_t newSize = X.exports_.extent (0) / numCols;
3733  const size_t offset = jj*newSize;
3734  auto device_view = subview (X.exports_.view_device(),
3735  range_type (offset, offset+newSize));
3736  auto host_view = subview (X.exports_.view_host(),
3737  range_type (offset, offset+newSize));
3738  this->exports_ = decltype(X.exports_)(device_view, host_view);
3739  }
3740  // These two DualViews already either have the right number of
3741  // entries, or zero entries. This means that we don't need to
3742  // resize them.
3745  }
3746 
3747 
3748  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3749  Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3751  getVector (const size_t j) const
3752  {
3753  using V = Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
3754  return Teuchos::rcp (new V (*this, j));
3755  }
3756 
3757 
3758  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3759  Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3761  getVectorNonConst (const size_t j)
3762  {
3763  using V = Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
3764  return Teuchos::rcp (new V (*this, j));
3765  }
3766 
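// A short sketch of getVector() / getVectorNonConst() above, which view
// a single column as a Tpetra::Vector. X is an assumed, illustrative
// MultiVector<double>.
//
//   Teuchos::RCP<const Tpetra::Vector<double> > x0 = X.getVector (0);
//   const double nrm = x0->norm2 ();     // 2-norm of column 0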
3767 
3768  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3769  void
3771  get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3772  {
3773  using IST = impl_scalar_type;
3774  using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3775  Kokkos::HostSpace,
3776  Kokkos::MemoryUnmanaged>;
3777  const char tfecfFuncName[] = "get1dCopy: ";
3778 
3779  const size_t numRows = this->getLocalLength ();
3780  const size_t numCols = this->getNumVectors ();
3781 
3782  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3783  (LDA < numRows, std::runtime_error,
3784  "LDA = " << LDA << " < numRows = " << numRows << ".");
3785  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3786  (numRows > size_t (0) && numCols > size_t (0) &&
3787  size_t (A.size ()) < LDA * (numCols - 1) + numRows,
3788  std::runtime_error,
3789  "A.size() = " << A.size () << ", but its size must be at least "
3790  << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3791 
3792  const std::pair<size_t, size_t> rowRange (0, numRows);
3793  const std::pair<size_t, size_t> colRange (0, numCols);
3794 
3795  input_view_type A_view_orig (reinterpret_cast<IST*> (A.getRawPtr ()),
3796  LDA, numCols);
3797  auto A_view = Kokkos::subview (A_view_orig, rowRange, colRange);
3798 
3810  if (this->need_sync_host() && this->need_sync_device()) {
3813  throw std::runtime_error("Tpetra::MultiVector::get1dCopy: a non-const view of this MultiVector is still alive outside, so we cannot safely make a copy while its host or device data may be modified externally.");
3814  }
3815  else {
3816  const bool useHostView = view_.host_view_use_count() >= view_.device_view_use_count();
3817  if (this->isConstantStride ()) {
3818  if (useHostView) {
3819  auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3820  // DEEP_COPY REVIEW - NOT TESTED
3821  Kokkos::deep_copy (A_view, srcView_host);
3822  } else {
3823  auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3824  // DEEP_COPY REVIEW - NOT TESTED
3825  Kokkos::deep_copy (A_view, srcView_device);
3826  }
3827  }
3828  else {
3829  for (size_t j = 0; j < numCols; ++j) {
3830  const size_t srcCol = this->whichVectors_[j];
3831  auto dstColView = Kokkos::subview (A_view, rowRange, j);
3832 
3833  if (useHostView) {
3834  auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3835  auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
3836  // DEEP_COPY REVIEW - NOT TESTED
3837  Kokkos::deep_copy (dstColView, srcColView_host);
3838  } else {
3839  auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3840  auto srcColView_device = Kokkos::subview (srcView_device, rowRange, srcCol);
3841  // DEEP_COPY REVIEW - NOT TESTED
3842  Kokkos::deep_copy (dstColView, srcColView_device);
3843  }
3844  }
3845  }
3846  }
3847  }
3848 
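// A minimal sketch of get1dCopy() above: it copies the local values into
// caller-owned, column-major storage with leading dimension LDA. X is an
// assumed, illustrative MultiVector<double>.
//
//   const size_t LDA = X.getLocalLength ();
//   Teuchos::Array<double> buf (LDA * X.getNumVectors ());
//   X.get1dCopy (buf (), LDA);           // buf[i + j*LDA] = X(i,j)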
3849 
3850  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3851  void
3853  get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3854  {
3855  using V = Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
3856  const char tfecfFuncName[] = "get2dCopy: ";
3857  const size_t numRows = this->getLocalLength ();
3858  const size_t numCols = this->getNumVectors ();
3859 
3860  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3861  static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3862  std::runtime_error, "Input array of pointers must contain as many "
3863  "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3864  << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3865 
3866  if (numRows != 0 && numCols != 0) {
3867  // No side effects until we've validated the input.
3868  for (size_t j = 0; j < numCols; ++j) {
3869  const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3870  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3871  dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3872  "the input array of arrays is not long enough to fit all entries in "
3873  "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3874  << " < getLocalLength() = " << numRows << ".");
3875  }
3876 
3877  // We've validated the input, so it's safe to start copying.
3878  for (size_t j = 0; j < numCols; ++j) {
3879  Teuchos::RCP<const V> X_j = this->getVector (j);
3880  const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3881  X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3882  }
3883  }
3884  }
3885 
3886 
3887  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3888  Teuchos::ArrayRCP<const Scalar>
3890  get1dView () const
3891  {
3892  if (getLocalLength () == 0 || getNumVectors () == 0) {
3893  return Teuchos::null;
3894  } else {
3895  TEUCHOS_TEST_FOR_EXCEPTION(
3896  ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3897  "get1dView: This MultiVector does not have constant stride, so it is "
3898  "not possible to view its data as a single array. You may check "
3899  "whether a MultiVector has constant stride by calling "
3900  "isConstantStride().");
3901  // Since get1dView() is and was always marked const, I have to
3902  // cast away const here in order not to break backwards
3903  // compatibility.
3904  auto X_lcl = getLocalViewHost(Access::ReadOnly);
3905  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3906  Kokkos::Compat::persistingView (X_lcl);
3907  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3908  }
3909  }
3910 
3911  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3912  Teuchos::ArrayRCP<Scalar>
3913  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3914  get1dViewNonConst ()
3915  {
3916  if (this->getLocalLength () == 0 || this->getNumVectors () == 0) {
3917  return Teuchos::null;
3918  }
3919  else {
3920  TEUCHOS_TEST_FOR_EXCEPTION
3921  (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3922  "get1dViewNonConst: This MultiVector does not have constant stride, "
3923  "so it is not possible to view its data as a single array. You may "
3924  "check whether a MultiVector has constant stride by calling "
3925  "isConstantStride().");
3926  auto X_lcl = getLocalViewHost(Access::ReadWrite);
3927  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3928  Kokkos::Compat::persistingView (X_lcl);
3929  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3930  }
3931  }
3932 
3933  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3934  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3935  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3936  get2dViewNonConst ()
3937  {
3938  auto X_lcl = getLocalViewHost(Access::ReadWrite);
3939 
3940  // Don't use the row range here on the outside, in order to avoid
3941  // a strided return type (in case Kokkos::subview is conservative
3942  // about that). Instead, use the row range for the column views
3943  // in the loop.
3944  const size_t myNumRows = this->getLocalLength ();
3945  const size_t numCols = this->getNumVectors ();
3946  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3947 
3948  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
3949  for (size_t j = 0; j < numCols; ++j) {
3950  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3951  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3952  Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3953  Kokkos::Compat::persistingView (X_lcl_j);
3954  views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
3955  }
3956  return views;
3957  }
3958 
3959  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3962  getLocalViewHost(Access::ReadOnlyStruct s) const
3963  {
3964  return view_.getHostView(s);
3965  }
3966 
3967  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3970  getLocalViewHost(Access::ReadWriteStruct s)
3971  {
3972  return view_.getHostView(s);
3973  }
3974 
3975  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3978  getLocalViewHost(Access::OverwriteAllStruct s)
3979  {
3980  return view_.getHostView(s);
3981  }
3982 
3983  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3986  getLocalViewDevice(Access::ReadOnlyStruct s) const
3987  {
3988  return view_.getDeviceView(s);
3989  }
3990 
3991  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3994  getLocalViewDevice(Access::ReadWriteStruct s)
3995  {
3996  return view_.getDeviceView(s);
3997  }
3998 
3999  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4002  getLocalViewDevice(Access::OverwriteAllStruct s)
4003  {
4004  return view_.getDeviceView(s);
4005  }
4006 
4007  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4011  return view_;
4012  }
4013 
4014  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4015  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
4017  get2dView () const
4018  {
4019  // Since get2dView() is and was always marked const, I have to
4020  // cast away const here in order not to break backwards
4021  // compatibility.
4022  auto X_lcl = getLocalViewHost(Access::ReadOnly);
4023 
4024  // Don't use the row range here on the outside, in order to avoid
4025  // a strided return type (in case Kokkos::subview is conservative
4026  // about that). Instead, use the row range for the column views
4027  // in the loop.
4028  const size_t myNumRows = this->getLocalLength ();
4029  const size_t numCols = this->getNumVectors ();
4030  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
4031 
4032  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
4033  for (size_t j = 0; j < numCols; ++j) {
4034  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
4035  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
4036  Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
4037  Kokkos::Compat::persistingView (X_lcl_j);
4038  views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
4039  }
4040  return views;
4041  }
4042 
4043  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4044  void
4046  multiply (Teuchos::ETransp transA,
4047  Teuchos::ETransp transB,
4048  const Scalar& alpha,
4049  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
4050  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
4051  const Scalar& beta)
4052  {
4053  using ::Tpetra::Details::ProfilingRegion;
4054  using Teuchos::CONJ_TRANS;
4055  using Teuchos::NO_TRANS;
4056  using Teuchos::TRANS;
4057  using Teuchos::RCP;
4058  using Teuchos::rcp;
4059  using Teuchos::rcpFromRef;
4060  using std::endl;
4061  using ATS = Kokkos::ArithTraits<impl_scalar_type>;
4062  using LO = local_ordinal_type;
4063  using STS = Teuchos::ScalarTraits<Scalar>;
4064  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
4065  const char tfecfFuncName[] = "multiply: ";
4066  ProfilingRegion region ("Tpetra::MV::multiply");
4067 
4068  // This routine performs a variety of matrix-matrix multiply
4069  // operations, interpreting the MultiVector (this-aka C , A and B)
4070  // as 2D matrices. Variations are due to the fact that A, B and C
4071  // can be locally replicated or globally distributed MultiVectors
4072  // and that we may or may not operate with the transpose of A and
4073  // B. Possible cases are:
4074  //
4075  // Operations # Cases Notes
4076  // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
4077  // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
4078  // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
4079  //
4080  // The following operations are not meaningful for 1-D
4081  // distributions:
4082  //
4083  // u1) C(local) = A^T(distr) * B^T(distr) 1
4084  // u2) C(local) = A (distr) * B^X(distr) 2
4085  // u3) C(distr) = A^X(local) * B^X(local) 4
4086  // u4) C(distr) = A^X(local) * B^X(distr) 4
4087  // u5) C(distr) = A^T(distr) * B^X(local) 2
4088  // u6) C(local) = A^X(distr) * B^X(local) 4
4089  // u7) C(distr) = A^X(distr) * B^X(local) 4
4090  // u8) C(local) = A^X(local) * B^X(distr) 4
4091  //
4092  // Total number of cases: 32 (= 2^5).
4093 
4094  impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
4095 
4096  const bool A_is_local = ! A.isDistributed ();
4097  const bool B_is_local = ! B.isDistributed ();
4098  const bool C_is_local = ! this->isDistributed ();
4099 
4100  // In debug mode, check compatibility of local dimensions. We
4101  // only do this in debug mode, since it requires an all-reduce
4102  // to ensure correctness on all processes. It's entirely
4103  // possible that only some processes may have incompatible local
4104  // dimensions. Throwing an exception only on those processes
4105  // could cause this method to hang.
4106  const bool debug = ::Tpetra::Details::Behavior::debug ();
4107  if (debug) {
4108  auto myMap = this->getMap ();
4109  if (! myMap.is_null () && ! myMap->getComm ().is_null ()) {
4110  using Teuchos::REDUCE_MIN;
4111  using Teuchos::reduceAll;
4112  using Teuchos::outArg;
4113 
4114  auto comm = myMap->getComm ();
4115  const size_t A_nrows =
4116  (transA != NO_TRANS) ? A.getNumVectors () : A.getLocalLength ();
4117  const size_t A_ncols =
4118  (transA != NO_TRANS) ? A.getLocalLength () : A.getNumVectors ();
4119  const size_t B_nrows =
4120  (transB != NO_TRANS) ? B.getNumVectors () : B.getLocalLength ();
4121  const size_t B_ncols =
4122  (transB != NO_TRANS) ? B.getLocalLength () : B.getNumVectors ();
4123 
4124  const bool lclBad = this->getLocalLength () != A_nrows ||
4125  this->getNumVectors () != B_ncols || A_ncols != B_nrows;
4126 
4127  const int myRank = comm->getRank ();
4128  std::ostringstream errStrm;
4129  if (this->getLocalLength () != A_nrows) {
4130  errStrm << "Proc " << myRank << ": this->getLocalLength()="
4131  << this->getLocalLength () << " != A_nrows=" << A_nrows
4132  << "." << std::endl;
4133  }
4134  if (this->getNumVectors () != B_ncols) {
4135  errStrm << "Proc " << myRank << ": this->getNumVectors()="
4136  << this->getNumVectors () << " != B_ncols=" << B_ncols
4137  << "." << std::endl;
4138  }
4139  if (A_ncols != B_nrows) {
4140  errStrm << "Proc " << myRank << ": A_ncols="
4141  << A_ncols << " != B_nrows=" << B_nrows
4142  << "." << std::endl;
4143  }
4144 
4145  const int lclGood = lclBad ? 0 : 1;
4146  int gblGood = 0;
4147  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
4148  if (gblGood != 1) {
4149  std::ostringstream os;
4150  ::Tpetra::Details::gathervPrint (os, errStrm.str (), *comm);
4151 
4152  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4153  (true, std::runtime_error, "Inconsistent local dimensions on at "
4154  "least one process in this object's communicator." << std::endl
4155  << "Operation: "
4156  << "C(" << (C_is_local ? "local" : "distr") << ") = "
4157  << alpha << "*A"
4158  << (transA == Teuchos::TRANS ? "^T" :
4159  (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
4160  << "(" << (A_is_local ? "local" : "distr") << ") * "
4161  << beta << "*B"
4162  << (transB == Teuchos::TRANS ? "^T" :
4163  (transB == Teuchos::CONJ_TRANS ? "^H" : ""))
4164  << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
4165  << "Global dimensions: C(" << this->getGlobalLength () << ", "
4166  << this->getNumVectors () << "), A(" << A.getGlobalLength ()
4167  << ", " << A.getNumVectors () << "), B(" << B.getGlobalLength ()
4168  << ", " << B.getNumVectors () << ")." << std::endl
4169  << os.str ());
4170  }
4171  }
4172  }
4173 
4174  // Case 1: C(local) = A^X(local) * B^X(local)
4175  const bool Case1 = C_is_local && A_is_local && B_is_local;
4176  // Case 2: C(local) = A^T(distr) * B (distr)
4177  const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
4178  transA != NO_TRANS &&
4179  transB == NO_TRANS;
4180  // Case 3: C(distr) = A (distr) * B^X(local)
4181  const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
4182  transA == NO_TRANS;
4183 
4184  // Test that we are considering a meaningful case
4185  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4186  (! Case1 && ! Case2 && ! Case3, std::runtime_error,
4187  "Multiplication of op(A) and op(B) into *this is not a "
4188  "supported use case.");
4189 
4190  if (beta != STS::zero () && Case2) {
4191  // If Case2, then C is local and contributions must be summed
4192  // across all processes. However, if beta != 0, then accumulate
4193  // beta*C into the sum. When summing across all processes, we
4194  // only want to accumulate this once, so set beta == 0 on all
4195  // processes except Process 0.
4196  const int myRank = this->getMap ()->getComm ()->getRank ();
4197  if (myRank != 0) {
4198  beta_local = ATS::zero ();
4199  }
4200  }
4201 
4202  // We only know how to do matrix-matrix multiplies if all the
4203  // MultiVectors have constant stride. If not, we have to make
4204  // temporary copies of those MultiVectors (including possibly
4205  // *this) that don't have constant stride.
4206  RCP<MV> C_tmp;
4207  if (! isConstantStride ()) {
4208  C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
4209  } else {
4210  C_tmp = rcp (this, false);
4211  }
4212 
4213  RCP<const MV> A_tmp;
4214  if (! A.isConstantStride ()) {
4215  A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
4216  } else {
4217  A_tmp = rcpFromRef (A);
4218  }
4219 
4220  RCP<const MV> B_tmp;
4221  if (! B.isConstantStride ()) {
4222  B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
4223  } else {
4224  B_tmp = rcpFromRef (B);
4225  }
4226 
4227  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4228  (! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
4229  ! A_tmp->isConstantStride (), std::logic_error,
4230  "Failed to make temporary constant-stride copies of MultiVectors.");
4231 
4232  {
4233  const LO A_lclNumRows = A_tmp->getLocalLength ();
4234  const LO A_numVecs = A_tmp->getNumVectors ();
4235  auto A_lcl = A_tmp->getLocalViewDevice(Access::ReadOnly);
4236  auto A_sub = Kokkos::subview (A_lcl,
4237  std::make_pair (LO (0), A_lclNumRows),
4238  std::make_pair (LO (0), A_numVecs));
4239 
4240 
4241  const LO B_lclNumRows = B_tmp->getLocalLength ();
4242  const LO B_numVecs = B_tmp->getNumVectors ();
4243  auto B_lcl = B_tmp->getLocalViewDevice(Access::ReadOnly);
4244  auto B_sub = Kokkos::subview (B_lcl,
4245  std::make_pair (LO (0), B_lclNumRows),
4246  std::make_pair (LO (0), B_numVecs));
4247 
4248  const LO C_lclNumRows = C_tmp->getLocalLength ();
4249  const LO C_numVecs = C_tmp->getNumVectors ();
4250 
4251  auto C_lcl = C_tmp->getLocalViewDevice(Access::ReadWrite);
4252  auto C_sub = Kokkos::subview (C_lcl,
4253  std::make_pair (LO (0), C_lclNumRows),
4254  std::make_pair (LO (0), C_numVecs));
4255  const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' :
4256  (transA == Teuchos::TRANS ? 'T' : 'C'));
4257  const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' :
4258  (transB == Teuchos::TRANS ? 'T' : 'C'));
4259  const impl_scalar_type alpha_IST (alpha);
4260 
4261  ProfilingRegion regionGemm ("Tpetra::MV::multiply-call-gemm");
4262 
4263  KokkosBlas::gemm (&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
4264  beta_local, C_sub);
4265  }
4266 
4267  if (! isConstantStride ()) {
4268  ::Tpetra::deep_copy (*this, *C_tmp); // Copy the result back into *this.
4269  }
4270 
4271  // Dispose of (possibly) extra copies of A and B.
4272  A_tmp = Teuchos::null;
4273  B_tmp = Teuchos::null;
4274 
4275  // If Case 2 then sum up *this and distribute it to all processes.
4276  if (Case2) {
4277  this->reduce ();
4278  }
4279  }
4280 
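// A minimal sketch of multiply() above for "Case 2" of the comment
// block: C(local) = A^T(distr) * B(distr), a block dot product whose
// result ends up replicated on every process. A, B, and the locally
// replicated Map "lclMap" are illustrative assumptions only.
//
//   Tpetra::MultiVector<double> C (lclMap, B.getNumVectors ());
//   C.multiply (Teuchos::CONJ_TRANS, Teuchos::NO_TRANS,
//               1.0, A, B, 0.0);         // C := A^H * B, summed over ranks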
4281  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4282  void
4284  elementWiseMultiply (Scalar scalarAB,
4285  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
4286  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
4287  Scalar scalarThis)
4288  {
4289  using Kokkos::ALL;
4290  using Kokkos::subview;
4291  const char tfecfFuncName[] = "elementWiseMultiply: ";
4292 
4293  const size_t lclNumRows = this->getLocalLength ();
4294  const size_t numVecs = this->getNumVectors ();
4295 
4296  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4297  (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
4298  std::runtime_error, "MultiVectors do not have the same local length.");
4299  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4300  numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
4301  "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
4302  << ".");
4303 
4304  auto this_view = this->getLocalViewDevice(Access::ReadWrite);
4305  auto A_view = A.getLocalViewDevice(Access::ReadOnly);
4306  auto B_view = B.getLocalViewDevice(Access::ReadOnly);
4307 
4308  if (isConstantStride () && B.isConstantStride ()) {
4309  // A is just a Vector; it only has one column, so it always has
4310  // constant stride.
4311  //
4312  // If both *this and B have constant stride, we can do an
4313  // element-wise multiply on all columns at once.
4314  KokkosBlas::mult (scalarThis,
4315  this_view,
4316  scalarAB,
4317  subview (A_view, ALL (), 0),
4318  B_view);
4319  }
4320  else {
4321  for (size_t j = 0; j < numVecs; ++j) {
4322  const size_t C_col = isConstantStride () ? j : whichVectors_[j];
4323  const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
4324  KokkosBlas::mult (scalarThis,
4325  subview (this_view, ALL (), C_col),
4326  scalarAB,
4327  subview (A_view, ALL (), 0),
4328  subview (B_view, ALL (), B_col));
4329  }
4330  }
4331  }
4332 
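// A minimal sketch of elementWiseMultiply() above, e.g. for diagonal
// scaling: this = scalarThis*(*this) + scalarAB * A(i) * B(i,j), where
// A is a single Vector. D (the diagonal), X, and Y are illustrative
// assumptions only.
//
//   Y.elementWiseMultiply (1.0, D, X, 0.0);   // Y(i,j) := D(i) * X(i,j)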
4333  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4334  void
4335  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4336  reduce ()
4337  {
4338  using ::Tpetra::Details::allReduceView;
4339  using ::Tpetra::Details::ProfilingRegion;
4340  ProfilingRegion region ("Tpetra::MV::reduce");
4341 
4342  const auto map = this->getMap ();
4343  if (map.get () == nullptr) {
4344  return;
4345  }
4346  const auto comm = map->getComm ();
4347  if (comm.get () == nullptr) {
4348  return;
4349  }
4350 
4351  // Avoid giving device buffers to MPI. Even if MPI handles them
4352  // correctly, doing so may not perform well.
4353  const bool changed_on_device = this->need_sync_host ();
4354  if (changed_on_device) {
4355  // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4356  // and host will be separate allocations. In that case, it may
4357  // pay to do the all-reduce from device to host.
4358  Kokkos::fence("MultiVector::reduce"); // for UVM getLocalViewDevice is UVM which can be read as host by allReduceView, so we must not read until device is fenced
4359  auto X_lcl = this->getLocalViewDevice(Access::ReadWrite);
4360  allReduceView (X_lcl, X_lcl, *comm);
4361  }
4362  else {
4363  auto X_lcl = this->getLocalViewHost(Access::ReadWrite);
4364  allReduceView (X_lcl, X_lcl, *comm);
4365  }
4366  }
4367 
4368  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4369  void
4371  replaceLocalValue (const LocalOrdinal lclRow,
4372  const size_t col,
4373  const impl_scalar_type& ScalarValue)
4374  {
4375  if (::Tpetra::Details::Behavior::debug ()) {
4376  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4377  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4378  TEUCHOS_TEST_FOR_EXCEPTION(
4379  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4380  std::runtime_error,
4381  "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4382  << " is invalid. The range of valid row indices on this process "
4383  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4384  << ", " << maxLocalIndex << "].");
4385  TEUCHOS_TEST_FOR_EXCEPTION(
4386  vectorIndexOutOfRange(col),
4387  std::runtime_error,
4388  "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4389  << " of the multivector is invalid.");
4390  }
4391 
4392  auto view = this->getLocalViewHost(Access::ReadWrite);
4393  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4394  view (lclRow, colInd) = ScalarValue;
4395  }
4396 
4397 
4398  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4399  void
4401  sumIntoLocalValue (const LocalOrdinal lclRow,
4402  const size_t col,
4403  const impl_scalar_type& value,
4404  const bool atomic)
4405  {
4406  if (::Tpetra::Details::Behavior::debug ()) {
4407  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4408  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4409  TEUCHOS_TEST_FOR_EXCEPTION(
4410  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4411  std::runtime_error,
4412  "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4413  << " is invalid. The range of valid row indices on this process "
4414  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4415  << ", " << maxLocalIndex << "].");
4416  TEUCHOS_TEST_FOR_EXCEPTION(
4417  vectorIndexOutOfRange(col),
4418  std::runtime_error,
4419  "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4420  << " of the multivector is invalid.");
4421  }
4422 
4423  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4424 
4425  auto view = this->getLocalViewHost(Access::ReadWrite);
4426  if (atomic) {
4427  Kokkos::atomic_add (& (view(lclRow, colInd)), value);
4428  }
4429  else {
4430  view(lclRow, colInd) += value;
4431  }
4432  }
4433 
4434 
4435  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4436  void
4438  replaceGlobalValue (const GlobalOrdinal gblRow,
4439  const size_t col,
4440  const impl_scalar_type& ScalarValue)
4441  {
4442  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4443  // touches the RCP's reference count, which isn't thread safe.
4444  const LocalOrdinal lclRow = this->map_->getLocalElement (gblRow);
4445 
4446  if (::Tpetra::Details::Behavior::debug ()) {
4447  const char tfecfFuncName[] = "replaceGlobalValue: ";
4448  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4449  (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4450  std::runtime_error,
4451  "Global row index " << gblRow << " is not present on this process "
4452  << this->getMap ()->getComm ()->getRank () << ".");
4453  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4454  (this->vectorIndexOutOfRange (col), std::runtime_error,
4455  "Vector index " << col << " of the MultiVector is invalid.");
4456  }
4457 
4458  this->replaceLocalValue (lclRow, col, ScalarValue);
4459  }
4460 
4461  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4462  void
4463  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4464  sumIntoGlobalValue (const GlobalOrdinal globalRow,
4465  const size_t col,
4466  const impl_scalar_type& value,
4467  const bool atomic)
4468  {
4469  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4470  // touches the RCP's reference count, which isn't thread safe.
4471  const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4472 
4473  if (::Tpetra::Details::Behavior::debug ()) {
4474  TEUCHOS_TEST_FOR_EXCEPTION(
4475  lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4476  std::runtime_error,
4477  "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4478  << " is not present on this process "
4479  << this->getMap ()->getComm ()->getRank () << ".");
4480  TEUCHOS_TEST_FOR_EXCEPTION(
4481  vectorIndexOutOfRange(col),
4482  std::runtime_error,
4483  "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4484  << " of the multivector is invalid.");
4485  }
4486 
4487  this->sumIntoLocalValue (lclRow, col, value, atomic);
4488  }
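 // Usage sketch for the global-index variants: the global row must be
 // owned by the calling process, since these methods only translate the
 // global index and forward to the local-index methods above.  X is an
 // assumed MultiVector as in the previous sketch.
 //
 //   const auto gblRow = X.getMap ()->getGlobalElement (0);
 //   X.replaceGlobalValue (gblRow, 0, 3.0);
 //   X.sumIntoGlobalValue (gblRow, 0, 1.0, false);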
4489 
4490  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4491  template <class T>
4492  Teuchos::ArrayRCP<T>
4493  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4494  getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
4495  size_t j) const
4496  {
4497  typedef Kokkos::DualView<impl_scalar_type*,
4498  typename dual_view_type::array_layout,
4499  execution_space> col_dual_view_type;
4500  const size_t col = isConstantStride () ? j : whichVectors_[j];
4501  col_dual_view_type X_col =
4502  Kokkos::subview (view_, Kokkos::ALL (), col);
4503  return Kokkos::Compat::persistingView (X_col.view_device());
4504  }
4505 
4506  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4507  bool
4508  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4509  need_sync_host () const {
4510  return view_.need_sync_host ();
4511  }
4512 
4513  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4514  bool
4515  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4516  need_sync_device () const {
4517  return view_.need_sync_device ();
4518  }
4519 
4520  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4521  std::string
4522  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4523  descriptionImpl (const std::string& className) const
4524  {
4525  using Teuchos::TypeNameTraits;
4526 
4527  std::ostringstream out;
4528  out << "\"" << className << "\": {";
4529  out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
4530  << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
4531  << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
4532  << ", Node: " << Node::name ()
4533  << "}, ";
4534  if (this->getObjectLabel () != "") {
4535  out << "Label: \"" << this->getObjectLabel () << "\", ";
4536  }
4537  out << "numRows: " << this->getGlobalLength ();
4538  if (className != "Tpetra::Vector") {
4539  out << ", numCols: " << this->getNumVectors ()
4540  << ", isConstantStride: " << this->isConstantStride ();
4541  }
4542  if (this->isConstantStride ()) {
4543  out << ", columnStride: " << this->getStride ();
4544  }
4545  out << "}";
4546 
4547  return out.str ();
4548  }
4549 
4550  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4551  std::string
4552  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4553  description () const
4554  {
4555  return this->descriptionImpl ("Tpetra::MultiVector");
4556  }
4557 
4558  namespace Details
4559  {
4560  template<typename ViewType>
4561  void print_vector(Teuchos::FancyOStream & out, const char* prefix, const ViewType& v)
4562  {
4563  using std::endl;
4564  static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename ViewType::memory_space>::accessible,
4565  "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a host-accessible view.");
4566  static_assert(ViewType::rank == 2,
4567  "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a rank-2 view.");
4568  // The square brackets [] and their contents are in Matlab
4569  // format, so users may copy and paste them directly into Matlab.
4570  out << "Values("<<prefix<<"): " << std::endl
4571  << "[";
4572  const size_t numRows = v.extent(0);
4573  const size_t numCols = v.extent(1);
4574  if (numCols == 1) {
4575  for (size_t i = 0; i < numRows; ++i) {
4576  out << v(i,0);
4577  if (i + 1 < numRows) {
4578  out << "; ";
4579  }
4580  }
4581  }
4582  else {
4583  for (size_t i = 0; i < numRows; ++i) {
4584  for (size_t j = 0; j < numCols; ++j) {
4585  out << v(i,j);
4586  if (j + 1 < numCols) {
4587  out << ", ";
4588  }
4589  }
4590  if (i + 1 < numRows) {
4591  out << "; ";
4592  }
4593  }
4594  }
4595  out << "]" << endl;
4596  }
4597  }
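 // For example, a 3 x 2 host view with rows (1,2), (3,4), (5,6) would be
 // printed by Details::print_vector as
 //
 //   Values(host):
 //   [1, 2; 3, 4; 5, 6]
 //
 // which pastes directly into Matlab as a 3 x 2 matrix.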
4598 
4599  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4600  std::string
4601  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4602  localDescribeToString (const Teuchos::EVerbosityLevel vl) const
4603  {
4604  typedef LocalOrdinal LO;
4605  using std::endl;
4606 
4607  if (vl <= Teuchos::VERB_LOW) {
4608  return std::string ();
4609  }
4610  auto map = this->getMap ();
4611  if (map.is_null ()) {
4612  return std::string ();
4613  }
4614  auto outStringP = Teuchos::rcp (new std::ostringstream ());
4615  auto outp = Teuchos::getFancyOStream (outStringP);
4616  Teuchos::FancyOStream& out = *outp;
4617  auto comm = map->getComm ();
4618  const int myRank = comm->getRank ();
4619  const int numProcs = comm->getSize ();
4620 
4621  out << "Process " << myRank << " of " << numProcs << ":" << endl;
4622  Teuchos::OSTab tab1 (out);
4623 
4624  // VERB_MEDIUM and higher prints getLocalLength()
4625  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
4626  out << "Local number of rows: " << lclNumRows << endl;
4627 
4628  if (vl > Teuchos::VERB_MEDIUM) {
4629  // VERB_HIGH and higher prints isConstantStride() and getStride().
4630  // The first is only relevant if the Vector has multiple columns.
4631  if (this->getNumVectors () != static_cast<size_t> (1)) {
4632  out << "isConstantStride: " << this->isConstantStride () << endl;
4633  }
4634  if (this->isConstantStride ()) {
4635  out << "Column stride: " << this->getStride () << endl;
4636  }
4637 
4638  if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4639  // VERB_EXTREME prints values.  If we're not using unified memory,
4640  // the host and device allocations may hold different data,
4641  // so we can't use our regular accessor functions.
4642 
4643  // NOTE: This is an occasion where we do *not* want the auto-sync stuff
4644  // to trigger (since this function is conceptually const). Thus, we
4645  // get *copies* of the view's data instead.
4646  auto X_dev = view_.getDeviceCopy();
4647  auto X_host = view_.getHostCopy();
4648 
4649  if(X_dev.data() == X_host.data()) {
4650  // One single allocation
4651  Details::print_vector(out,"unified",X_host);
4652  }
4653  else {
4654  Details::print_vector(out,"host",X_host);
4655  Details::print_vector(out,"dev",X_dev);
4656  }
4657  }
4658  }
4659  out.flush (); // make sure the ostringstream got everything
4660  return outStringP->str ();
4661  }
4662 
4663  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4664  void
4665  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4666  describeImpl (Teuchos::FancyOStream& out,
4667  const std::string& className,
4668  const Teuchos::EVerbosityLevel verbLevel) const
4669  {
4670  using Teuchos::TypeNameTraits;
4671  using Teuchos::VERB_DEFAULT;
4672  using Teuchos::VERB_NONE;
4673  using Teuchos::VERB_LOW;
4674  using std::endl;
4675  const Teuchos::EVerbosityLevel vl =
4676  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4677 
4678  if (vl == VERB_NONE) {
4679  return; // don't print anything
4680  }
4681  // If this Vector's Comm is null, then the Vector does not
4682  // participate in collective operations with the other processes.
4683  // In that case, it is not even legal to call this method. The
4684  // reasonable thing to do in that case is nothing.
4685  auto map = this->getMap ();
4686  if (map.is_null ()) {
4687  return;
4688  }
4689  auto comm = map->getComm ();
4690  if (comm.is_null ()) {
4691  return;
4692  }
4693 
4694  const int myRank = comm->getRank ();
4695 
4696  // Only Process 0 should touch the output stream, but this method
4697  // in general may need to do communication. Thus, we may need to
4698  // preserve the current tab level across multiple "if (myRank ==
4699  // 0) { ... }" inner scopes. This is why we sometimes create
4700  // OSTab instances by pointer, instead of by value. We only need
4701  // to create them by pointer if the tab level must persist through
4702  // multiple inner scopes.
4703  Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4704 
4705  // VERB_LOW and higher prints the equivalent of description().
4706  if (myRank == 0) {
4707  tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4708  out << "\"" << className << "\":" << endl;
4709  tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4710  {
4711  out << "Template parameters:" << endl;
4712  Teuchos::OSTab tab2 (out);
4713  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4714  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4715  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4716  << "Node: " << Node::name () << endl;
4717  }
4718  if (this->getObjectLabel () != "") {
4719  out << "Label: \"" << this->getObjectLabel () << "\", ";
4720  }
4721  out << "Global number of rows: " << this->getGlobalLength () << endl;
4722  if (className != "Tpetra::Vector") {
4723  out << "Number of columns: " << this->getNumVectors () << endl;
4724  }
4725  // getStride() may differ on different processes, so it (and
4726  // isConstantStride()) properly belong to per-process data.
4727  }
4728 
4729  // This is collective over the Map's communicator.
4730  if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4731  const std::string lclStr = this->localDescribeToString (vl);
4732  ::Tpetra::Details::gathervPrint (out, lclStr, *comm);
4733  }
4734  }
4735 
4736  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4737  void
4738  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4739  describe (Teuchos::FancyOStream &out,
4740  const Teuchos::EVerbosityLevel verbLevel) const
4741  {
4742  this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4743  }
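 // Usage sketch for describe(): wrap any std::ostream in a
 // Teuchos::FancyOStream and choose a verbosity level; VERB_EXTREME also
 // prints the local values via localDescribeToString() above.  X and the
 // use of std::cout are illustrative assumptions.
 //
 //   auto fancyOut = Teuchos::getFancyOStream (Teuchos::rcpFromRef (std::cout));
 //   X.describe (*fancyOut, Teuchos::VERB_EXTREME);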
4744 
4745  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4746  void
4747  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4748  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4749  {
4750  replaceMap (newMap);
4751  }
4752 
4753  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4754  void
4755  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4756  assign (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& src)
4757  {
4758  using ::Tpetra::Details::localDeepCopy;
4759  const char prefix[] = "Tpetra::MultiVector::assign: ";
4760 
4761  TEUCHOS_TEST_FOR_EXCEPTION
4762  (this->getGlobalLength () != src.getGlobalLength () ||
4763  this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
4764  prefix << "Global dimensions of the two Tpetra::MultiVector "
4765  "objects do not match. src has dimensions [" << src.getGlobalLength ()
4766  << "," << src.getNumVectors () << "], and *this has dimensions ["
4767  << this->getGlobalLength () << "," << this->getNumVectors () << "].");
4768  // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4769  TEUCHOS_TEST_FOR_EXCEPTION
4770  (this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
4771  prefix << "The local row counts of the two Tpetra::MultiVector "
4772  "objects do not match. src has " << src.getLocalLength () << " row(s) "
4773  << "and *this has " << this->getLocalLength () << " row(s).");
4774 
4775 
4776  // See #1510. We're writing to *this, so we don't care about its
4777  // contents in either memory space, and we don't want
4778  // DualView::modify to complain about "concurrent modification" of
4779  // host and device Views.
4780 
4784 
4785  // If need sync to device, then host has most recent version.
4786  const bool src_last_updated_on_host = src.need_sync_device ();
4787 
4788  if (src_last_updated_on_host) {
4789  localDeepCopy (this->getLocalViewHost(Access::ReadWrite),
4790  src.getLocalViewHost(Access::ReadOnly),
4791  this->isConstantStride (),
4792  src.isConstantStride (),
4793  this->whichVectors_ (),
4794  src.whichVectors_ ());
4795  }
4796  else {
4797  localDeepCopy (this->getLocalViewDevice(Access::ReadWrite),
4798  src.getLocalViewDevice(Access::ReadOnly),
4799  this->isConstantStride (),
4800  src.isConstantStride (),
4801  this->whichVectors_ (),
4802  src.whichVectors_ ());
4803  }
4804  }
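 // assign() performs the local deep copy for two MultiVectors whose global
 // and local dimensions match.  A sketch of the usual call site through the
 // nonmember Tpetra::deep_copy (Y and X assumed to have the same Map and
 // number of columns):
 //
 //   Tpetra::deep_copy (Y, X);   // copy X's contents into Y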
4805 
4806  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4807  template<class T>
4808  Teuchos::RCP<MultiVector<T, LocalOrdinal, GlobalOrdinal, Node> >
4809  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4810  convert () const
4811  {
4812  using Teuchos::RCP;
4813 
4814  auto newMV = Teuchos::rcp(new MultiVector<T,LocalOrdinal,GlobalOrdinal,Node>(this->getMap(),
4815  this->getNumVectors(),
4816  /*zeroOut=*/false));
4817  Tpetra::deep_copy(*newMV, *this);
4818  return newMV;
4819  }
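 // Usage sketch for convert<T>(): creates a new MultiVector with the same
 // Map and number of columns but a different Scalar type, assuming the
 // corresponding explicit instantiation (see TPETRA_MULTIVECTOR_CONVERT_INSTANT
 // below) is enabled.  X is an assumed MultiVector of double.
 //
 //   Teuchos::RCP<Tpetra::MultiVector<float> > Xf = X.convert<float> ();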
4820 
4821  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4822  bool
4823  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4824  isSameSize (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& vec) const
4825  {
4826  using ::Tpetra::Details::PackTraits;
4827 
4828  const size_t l1 = this->getLocalLength();
4829  const size_t l2 = vec.getLocalLength();
4830  if ((l1!=l2) || (this->getNumVectors() != vec.getNumVectors())) {
4831  return false;
4832  }
4833 
4834  return true;
4835  }
4836 
4837  template <class ST, class LO, class GO, class NT>
4838  void
4839  MultiVector<ST, LO, GO, NT>::swap(MultiVector<ST, LO, GO, NT> & mv) {
4840  std::swap(mv.map_, this->map_);
4841  std::swap(mv.view_, this->view_);
4842  std::swap(mv.whichVectors_, this->whichVectors_);
4843  }
4844 
4845 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4846  template <class ST, class LO, class GO, class NT>
4847  void
4848  deep_copy (MultiVector<ST, LO, GO, NT>& dst,
4849  const Teuchos::SerialDenseMatrix<int, ST>& src)
4850  {
4851  using ::Tpetra::Details::localDeepCopy;
4852  using MV = MultiVector<ST, LO, GO, NT>;
4853  using IST = typename MV::impl_scalar_type;
4854  using input_view_type =
4855  Kokkos::View<const IST**, Kokkos::LayoutLeft,
4856  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4857  using pair_type = std::pair<LO, LO>;
4858 
4859  const LO numRows = static_cast<LO> (src.numRows ());
4860  const LO numCols = static_cast<LO> (src.numCols ());
4861 
4862  TEUCHOS_TEST_FOR_EXCEPTION
4863  (numRows != static_cast<LO> (dst.getLocalLength ()) ||
4864  numCols != static_cast<LO> (dst.getNumVectors ()),
4865  std::invalid_argument, "Tpetra::deep_copy: On Process "
4866  << dst.getMap ()->getComm ()->getRank () << ", dst is "
4867  << dst.getLocalLength () << " x " << dst.getNumVectors ()
4868  << ", but src is " << numRows << " x " << numCols << ".");
4869 
4870  const IST* src_raw = reinterpret_cast<const IST*> (src.values ());
4871  input_view_type src_orig (src_raw, src.stride (), numCols);
4872  input_view_type src_view =
4873  Kokkos::subview (src_orig, pair_type (0, numRows), Kokkos::ALL ());
4874 
4875  constexpr bool src_isConstantStride = true;
4876  Teuchos::ArrayView<const size_t> srcWhichVectors (nullptr, 0);
4877  localDeepCopy (dst.getLocalViewHost(Access::ReadWrite),
4878  src_view,
4879  dst.isConstantStride (),
4880  src_isConstantStride,
4881  getMultiVectorWhichVectors (dst),
4882  srcWhichVectors);
4883  }
4884 
4885  template <class ST, class LO, class GO, class NT>
4886  void
4887  deep_copy (Teuchos::SerialDenseMatrix<int, ST>& dst,
4888  const MultiVector<ST, LO, GO, NT>& src)
4889  {
4890  using ::Tpetra::Details::localDeepCopy;
4891  using MV = MultiVector<ST, LO, GO, NT>;
4892  using IST = typename MV::impl_scalar_type;
4893  using output_view_type =
4894  Kokkos::View<IST**, Kokkos::LayoutLeft,
4895  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4896  using pair_type = std::pair<LO, LO>;
4897 
4898  const LO numRows = static_cast<LO> (dst.numRows ());
4899  const LO numCols = static_cast<LO> (dst.numCols ());
4900 
4901  TEUCHOS_TEST_FOR_EXCEPTION
4902  (numRows != static_cast<LO> (src.getLocalLength ()) ||
4903  numCols != static_cast<LO> (src.getNumVectors ()),
4904  std::invalid_argument, "Tpetra::deep_copy: On Process "
4905  << src.getMap ()->getComm ()->getRank () << ", src is "
4906  << src.getLocalLength () << " x " << src.getNumVectors ()
4907  << ", but dst is " << numRows << " x " << numCols << ".");
4908 
4909  IST* dst_raw = reinterpret_cast<IST*> (dst.values ());
4910  output_view_type dst_orig (dst_raw, dst.stride (), numCols);
4911  auto dst_view =
4912  Kokkos::subview (dst_orig, pair_type (0, numRows), Kokkos::ALL ());
4913 
4914  constexpr bool dst_isConstantStride = true;
4915  Teuchos::ArrayView<const size_t> dstWhichVectors (nullptr, 0);
4916 
4917  // Prefer the host version of src's data.
4918  localDeepCopy (dst_view,
4919  src.getLocalViewHost(Access::ReadOnly),
4920  dst_isConstantStride,
4921  src.isConstantStride (),
4922  dstWhichVectors,
4923  getMultiVectorWhichVectors (src));
4924  }
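 // Usage sketch for the SerialDenseMatrix overloads above: both require the
 // dense matrix to have the same dimensions as the calling process' local
 // view of the MultiVector (local rows x columns).  X is an assumed
 // MultiVector of double.
 //
 //   Teuchos::SerialDenseMatrix<int, double> localMat (X.getLocalLength (),
 //                                                     X.getNumVectors ());
 //   Tpetra::deep_copy (localMat, X);   // MultiVector -> dense matrix
 //   Tpetra::deep_copy (X, localMat);   // dense matrix -> MultiVector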
4925 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4926 
4927  template <class Scalar, class LO, class GO, class NT>
4928  Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4929  createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4930  size_t numVectors)
4931  {
4932  typedef MultiVector<Scalar, LO, GO, NT> MV;
4933  return Teuchos::rcp (new MV (map, numVectors));
4934  }
4935 
4936  template <class ST, class LO, class GO, class NT>
4937  MultiVector<ST, LO, GO, NT>
4938  createCopy (const MultiVector<ST, LO, GO, NT>& src)
4939  {
4940  typedef MultiVector<ST, LO, GO, NT> MV;
4941  MV cpy (src.getMap (), src.getNumVectors (), false);
4942  cpy.assign (src);
4943  return cpy;
4944  }
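 // Usage sketch for the nonmember "constructors" above (map is an assumed
 // Teuchos::RCP<const Tpetra::Map<> >):
 //
 //   auto X = Tpetra::createMultiVector<double> (map, 3);  // 3 columns, zero-filled
 //   auto Y = Tpetra::createCopy (*X);                     // deep copy, returned by value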
4945 
4946 } // namespace Tpetra
4947 
4948 //
4949 // Explicit instantiation macro
4950 //
4951 // Must be expanded from within the Tpetra namespace!
4952 //
4953 
4954 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4955 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4956  template class MultiVector< SCALAR , LO , GO , NODE >; \
4957  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
4958  template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4959  template void deep_copy (MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4960  template void deep_copy (Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4961 
4962 #else
4963 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4964  template class MultiVector< SCALAR , LO , GO , NODE >; \
4965  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src);
4966 
4967 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4968 
4969 
4970 #define TPETRA_MULTIVECTOR_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
4971  \
4972  template Teuchos::RCP< MultiVector< SO , LO , GO , NODE > > \
4973  MultiVector< SI , LO , GO , NODE >::convert< SO > () const;
4974 
4975 
4976 #endif // TPETRA_MULTIVECTOR_DEF_HPP