Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_MULTIVECTOR_DEF_HPP
11 #define TPETRA_MULTIVECTOR_DEF_HPP
12 
20 
21 #include "Tpetra_Core.hpp"
22 #include "Tpetra_Util.hpp"
23 #include "Tpetra_Vector.hpp"
27 #include "Tpetra_Details_fill.hpp"
35 #include "Tpetra_Details_Random.hpp"
36 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
37 #include "Teuchos_SerialDenseMatrix.hpp"
38 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
39 #include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
40 #include "KokkosCompat_View.hpp"
41 #include "KokkosBlas.hpp"
42 #include "KokkosKernels_Utils.hpp"
43 #include "Kokkos_Random.hpp"
44 #if KOKKOS_VERSION >= 40799
45 #include "KokkosKernels_ArithTraits.hpp"
46 #else
47 #include "Kokkos_ArithTraits.hpp"
48 #endif
49 #include <memory>
50 #include <sstream>
51 
52 #ifdef HAVE_TPETRA_INST_FLOAT128
53 namespace Kokkos {
54 // FIXME (mfh 04 Sep 2015) Just a stub for now!
55 template <class Generator>
56 struct rand<Generator, __float128> {
57  static KOKKOS_INLINE_FUNCTION __float128 max() {
58  return static_cast<__float128>(1.0);
59  }
60  static KOKKOS_INLINE_FUNCTION __float128
61  draw(Generator& gen) {
62  // Half the smallest normalized double, is the scaling factor of
63  // the lower-order term in the double-double representation.
64  const __float128 scalingFactor =
65  static_cast<__float128>(std::numeric_limits<double>::min()) /
66  static_cast<__float128>(2.0);
67  const __float128 higherOrderTerm = static_cast<__float128>(gen.drand());
68  const __float128 lowerOrderTerm =
69  static_cast<__float128>(gen.drand()) * scalingFactor;
70  return higherOrderTerm + lowerOrderTerm;
71  }
72  static KOKKOS_INLINE_FUNCTION __float128
73  draw(Generator& gen, const __float128& range) {
74  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
75  const __float128 scalingFactor =
76  static_cast<__float128>(std::numeric_limits<double>::min()) /
77  static_cast<__float128>(2.0);
78  const __float128 higherOrderTerm =
79  static_cast<__float128>(gen.drand(range));
80  const __float128 lowerOrderTerm =
81  static_cast<__float128>(gen.drand(range)) * scalingFactor;
82  return higherOrderTerm + lowerOrderTerm;
83  }
84  static KOKKOS_INLINE_FUNCTION __float128
85  draw(Generator& gen, const __float128& start, const __float128& end) {
86  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
87  const __float128 scalingFactor =
88  static_cast<__float128>(std::numeric_limits<double>::min()) /
89  static_cast<__float128>(2.0);
90  const __float128 higherOrderTerm =
91  static_cast<__float128>(gen.drand(start, end));
92  const __float128 lowerOrderTerm =
93  static_cast<__float128>(gen.drand(start, end)) * scalingFactor;
94  return higherOrderTerm + lowerOrderTerm;
95  }
96 };
97 } // namespace Kokkos
98 #endif // HAVE_TPETRA_INST_FLOAT128
99 
100 namespace { // (anonymous)
101 
116 template <class ST, class LO, class GO, class NT>
118 allocDualView(const size_t lclNumRows,
119  const size_t numCols,
120  const bool zeroOut = true,
121  const bool allowPadding = false) {
122  using Kokkos::AllowPadding;
123  using Kokkos::view_alloc;
124  using Kokkos::WithoutInitializing;
125  using ::Tpetra::Details::Behavior;
126  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
127  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type wrapped_dual_view_type;
128  typedef typename dual_view_type::t_dev dev_view_type;
129 
130  // This needs to be a string and not a char*, if given as an
131  // argument to Kokkos::view_alloc. This is because view_alloc
132  // also allows a raw pointer as its first argument. See
133  // https://github.com/kokkos/kokkos/issues/434.
134  const std::string label("MV::DualView");
135  const bool debug = Behavior::debug();
136 
137  // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
138  // creation of the DualView's Views works around
139  // Kokkos::DualView's current inability to accept an
140  // AllocationProperties initial argument (as Kokkos::View does).
141  // However, the work-around is harmless, since it does what the
142  // (currently nonexistent) equivalent DualView constructor would
143  // have done anyway.
144 
145  dev_view_type d_view;
146  if (zeroOut) {
147  if (allowPadding) {
148  d_view = dev_view_type(view_alloc(label, AllowPadding),
149  lclNumRows, numCols);
150  } else {
151  d_view = dev_view_type(view_alloc(label),
152  lclNumRows, numCols);
153  }
154  } else {
155  if (allowPadding) {
156  d_view = dev_view_type(view_alloc(label,
157  WithoutInitializing,
158  AllowPadding),
159  lclNumRows, numCols);
160  } else {
161  d_view = dev_view_type(view_alloc(label, WithoutInitializing),
162  lclNumRows, numCols);
163  }
164  if (debug) {
165  // Filling with NaN is a cheap and effective way to tell if
166  // downstream code is trying to use a MultiVector's data
167  // without them having been initialized. ArithTraits lets us
168  // call nan() even if the scalar type doesn't define it; it
169  // just returns some undefined value in the latter case. This
170  // won't hurt anything because by setting zeroOut=false, users
171  // already agreed that they don't care about the contents of
172  // the MultiVector.
173 #if KOKKOS_VERSION >= 40799
174  const ST nan = KokkosKernels::ArithTraits<ST>::nan();
175 #else
176  const ST nan = Kokkos::ArithTraits<ST>::nan();
177 #endif
178  KokkosBlas::fill(d_view, nan);
179  }
180  }
181  if (debug) {
182  TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(d_view.extent(0)) != lclNumRows ||
183  static_cast<size_t>(d_view.extent(1)) != numCols,
184  std::logic_error,
185  "allocDualView: d_view's dimensions actual dimensions do not match "
186  "requested dimensions. d_view is "
187  << d_view.extent(0) << " x " << d_view.extent(1) << "; requested " << lclNumRows << " x " << numCols
188  << ". Please report this bug to the Tpetra developers.");
189  }
190 
191  return wrapped_dual_view_type(d_view);
192 }
193 
194 // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
195 //
196 // T: The type of the entries of the View.
197 // ExecSpace: The Kokkos execution space.
198 template <class T, class ExecSpace>
199 struct MakeUnmanagedView {
200  // The 'false' part of the branch carefully ensures that this
201  // won't attempt to use a host execution space that hasn't been
202  // initialized. For example, if Kokkos::OpenMP is disabled and
203  // Kokkos::Threads is enabled, the latter is always the default
204  // execution space of Kokkos::HostSpace, even when ExecSpace is
205  // Kokkos::Serial. That's why we go through the trouble of asking
206  // Kokkos::DualView what _its_ space is. That seems to work
207  // around this default execution space issue.
208  //
209  typedef typename std::conditional<
210  Kokkos::SpaceAccessibility<
211  typename ExecSpace::memory_space,
212  Kokkos::HostSpace>::accessible,
213  typename ExecSpace::device_type,
214  typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
215  typedef Kokkos::LayoutLeft array_layout;
216  typedef Kokkos::View<T*, array_layout, host_exec_space,
217  Kokkos::MemoryUnmanaged>
218  view_type;
219 
220  static view_type getView(const Teuchos::ArrayView<T>& x_in) {
221  const size_t numEnt = static_cast<size_t>(x_in.size());
222  if (numEnt == 0) {
223  return view_type();
224  } else {
225  return view_type(x_in.getRawPtr(), numEnt);
226  }
227  }
228 };
229 
230 template <class WrappedDualViewType>
231 WrappedDualViewType
232 takeSubview(const WrappedDualViewType& X,
233  const std::pair<size_t, size_t>& rowRng,
234  const Kokkos::ALL_t& colRng)
235 
236 {
237  // The bug we saw below should be harder to trigger here.
238  return WrappedDualViewType(X, rowRng, colRng);
239 }
240 
241 template <class WrappedDualViewType>
242 WrappedDualViewType
243 takeSubview(const WrappedDualViewType& X,
244  const Kokkos::ALL_t& rowRng,
245  const std::pair<size_t, size_t>& colRng) {
246  using DualViewType = typename WrappedDualViewType::DVT;
247  // Look carefullly at the comment in the below version of this function.
248  // The comment applies here as well.
249  if (X.extent(0) == 0 && X.extent(1) != 0) {
250  return WrappedDualViewType(DualViewType("MV::DualView", 0, colRng.second - colRng.first));
251  } else {
252  return WrappedDualViewType(X, rowRng, colRng);
253  }
254 }
255 
256 template <class WrappedDualViewType>
257 WrappedDualViewType
258 takeSubview(const WrappedDualViewType& X,
259  const std::pair<size_t, size_t>& rowRng,
260  const std::pair<size_t, size_t>& colRng) {
261  using DualViewType = typename WrappedDualViewType::DVT;
262  // If you take a subview of a view with zero rows Kokkos::subview()
263  // always returns a DualView with the same data pointers. This will break
264  // pointer equality testing in between two subviews of the same 2D View if
265  // it has zero row extent. While the one (known) case where this was actually used
266  // has been fixed, that sort of check could very easily be reintroduced in the future,
267  // hence I've added this if check here.
268  //
269  // This is not a bug in Kokkos::subview(), just some very subtle behavior which
270  // future developers should be wary of.
271  if (X.extent(0) == 0 && X.extent(1) != 0) {
272  return WrappedDualViewType(DualViewType("MV::DualView", 0, colRng.second - colRng.first));
273  } else {
274  return WrappedDualViewType(X, rowRng, colRng);
275  }
276 }
277 
278 template <class WrappedOrNotDualViewType>
279 size_t
280 getDualViewStride(const WrappedOrNotDualViewType& dv) {
281  // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
282  // method yet, but its Views do.
283  // NOTE: dv.stride() returns a vector of length one
284  // more than its rank
285  size_t strides[WrappedOrNotDualViewType::t_dev::rank + 1];
286  dv.stride(strides);
287  const size_t LDA = strides[1];
288  const size_t numRows = dv.extent(0);
289 
290  if (LDA == 0) {
291  return (numRows == 0) ? size_t(1) : numRows;
292  } else {
293  return LDA;
294  }
295 }
296 
297 template <class ViewType>
298 size_t
299 getViewStride(const ViewType& view) {
300  const size_t LDA = view.stride(1);
301  const size_t numRows = view.extent(0);
302 
303  if (LDA == 0) {
304  return (numRows == 0) ? size_t(1) : numRows;
305  } else {
306  return LDA;
307  }
308 }
309 
310 template <class impl_scalar_type, class buffer_device_type>
311 bool runKernelOnHost(
312  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports) {
313  if (!imports.need_sync_device()) {
314  return false; // most up-to-date on device
315  } else { // most up-to-date on host,
316  // but if large enough, worth running on device anyway
317  size_t localLengthThreshold =
319  return imports.extent(0) <= localLengthThreshold;
320  }
321 }
322 
323 template <class SC, class LO, class GO, class NT>
324 bool runKernelOnHost(const ::Tpetra::MultiVector<SC, LO, GO, NT>& X) {
325  if (!X.need_sync_device()) {
326  return false; // most up-to-date on device
327  } else { // most up-to-date on host
328  // but if large enough, worth running on device anyway
329  size_t localLengthThreshold =
331  return X.getLocalLength() <= localLengthThreshold;
332  }
333 }
334 
335 template <class SC, class LO, class GO, class NT>
336 void multiVectorNormImpl(typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
339  using namespace Tpetra;
342  using val_type = typename MV::impl_scalar_type;
343  using mag_type = typename MV::mag_type;
344  using dual_view_type = typename MV::dual_view_type;
345 
346  auto map = X.getMap();
347  auto comm = map.is_null() ? nullptr : map->getComm().getRawPtr();
348  auto whichVecs = getMultiVectorWhichVectors(X);
349  const bool isConstantStride = X.isConstantStride();
350  const bool isDistributed = X.isDistributed();
351 
352  const bool runOnHost = runKernelOnHost(X);
353  if (runOnHost) {
354  using view_type = typename dual_view_type::t_host;
355  using array_layout = typename view_type::array_layout;
356  using device_type = typename view_type::device_type;
357 
358  auto X_lcl = X.getLocalViewHost(Access::ReadOnly);
359  normImpl<val_type, array_layout, device_type,
360  mag_type>(norms, X_lcl, whichNorm, whichVecs,
361  isConstantStride, isDistributed, comm);
362  } else {
363  using view_type = typename dual_view_type::t_dev;
364  using array_layout = typename view_type::array_layout;
365  using device_type = typename view_type::device_type;
366 
367  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
368  normImpl<val_type, array_layout, device_type,
369  mag_type>(norms, X_lcl, whichNorm, whichVecs,
370  isConstantStride, isDistributed, comm);
371  }
372 }
373 } // namespace
374 
375 namespace Tpetra {
376 
377 namespace Details {
378 template <typename DstView, typename SrcView>
379 struct AddAssignFunctor {
380  // This functor would be better as a lambda, but CUDA cannot compile
381  // lambdas in protected functions. It compiles fine with the functor.
382  AddAssignFunctor(DstView& tgt_, SrcView& src_)
383  : tgt(tgt_)
384  , src(src_) {}
385 
386  KOKKOS_INLINE_FUNCTION void
387  operator()(const size_t k) const { tgt(k) += src(k); }
388 
389  DstView tgt;
390  SrcView src;
391 };
392 } // namespace Details
393 
394 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
396  vectorIndexOutOfRange(const size_t VectorIndex) const {
397  return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
398 }
399 
400 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
403  : base_type(Teuchos::rcp(new map_type())) {}
404 
405 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
407  MultiVector(const Teuchos::RCP<const map_type>& map,
408  const size_t numVecs,
409  const bool zeroOut)
410  : /* default is true */
411  base_type(map) {
412  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,numVecs,zeroOut)");
413 
414  const size_t lclNumRows = this->getLocalLength();
415  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node>(lclNumRows, numVecs, zeroOut);
416 }
417 
418 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
421  const Teuchos::DataAccess copyOrView)
422  : base_type(source)
423  , view_(source.view_)
424  , whichVectors_(source.whichVectors_) {
426  const char tfecfFuncName[] =
427  "MultiVector(const MultiVector&, "
428  "const Teuchos::DataAccess): ";
429 
430  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV 2-arg \"copy\" ctor");
431 
432  if (copyOrView == Teuchos::Copy) {
433  // Reuse the conveniently already existing function that creates
434  // a deep copy.
435  MV cpy = createCopy(source);
436  this->view_ = cpy.view_;
437  this->whichVectors_ = cpy.whichVectors_;
438  } else if (copyOrView == Teuchos::View) {
439  } else {
440  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
441  true, std::invalid_argument,
442  "Second argument 'copyOrView' has an "
443  "invalid value "
444  << copyOrView << ". Valid values include "
445  "Teuchos::Copy = "
446  << Teuchos::Copy << " and Teuchos::View = "
447  << Teuchos::View << ".");
448  }
449 }
450 
451 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
453  MultiVector(const Teuchos::RCP<const map_type>& map,
454  const dual_view_type& view)
455  : base_type(map)
456  , view_(wrapped_dual_view_type(view)) {
457  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
458  const size_t lclNumRows_map = map.is_null() ? size_t(0) : map->getLocalNumElements();
459  const size_t lclNumRows_view = view.extent(0);
460  const size_t LDA = getDualViewStride(view_);
461 
462  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
463  std::invalid_argument,
464  "Kokkos::DualView does not match Map. "
465  "map->getLocalNumElements() = "
466  << lclNumRows_map
467  << ", view.extent(0) = " << lclNumRows_view
468  << ", and getStride() = " << LDA << ".");
469 
470  using ::Tpetra::Details::Behavior;
471  const bool debug = Behavior::debug();
472  if (debug) {
473  using ::Tpetra::Details::checkGlobalDualViewValidity;
474  std::ostringstream gblErrStrm;
475  const bool verbose = Behavior::verbose();
476  const auto comm = map.is_null() ? Teuchos::null : map->getComm();
477  const bool gblValid =
478  checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
479  comm.getRawPtr());
480  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
481  }
482 }
483 
484 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
486  MultiVector(const Teuchos::RCP<const map_type>& map,
487  const wrapped_dual_view_type& view)
488  : base_type(map)
489  , view_(view) {
490  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
491  const size_t lclNumRows_map = map.is_null() ? size_t(0) : map->getLocalNumElements();
492  const size_t lclNumRows_view = view.extent(0);
493  const size_t LDA = getDualViewStride(view);
494 
495  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
496  std::invalid_argument,
497  "Kokkos::DualView does not match Map. "
498  "map->getLocalNumElements() = "
499  << lclNumRows_map
500  << ", view.extent(0) = " << lclNumRows_view
501  << ", and getStride() = " << LDA << ".");
502 
503  using ::Tpetra::Details::Behavior;
504  const bool debug = Behavior::debug();
505  if (debug) {
506  using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
507  std::ostringstream gblErrStrm;
508  const bool verbose = Behavior::verbose();
509  const auto comm = map.is_null() ? Teuchos::null : map->getComm();
510  const bool gblValid =
511  checkGlobalWrappedDualViewValidity(&gblErrStrm, view, verbose,
512  comm.getRawPtr());
513  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
514  }
515 }
516 
517 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
519  MultiVector(const Teuchos::RCP<const map_type>& map,
520  const typename dual_view_type::t_dev& d_view)
521  : base_type(map) {
522  using Teuchos::ArrayRCP;
523  using Teuchos::RCP;
524  const char tfecfFuncName[] = "MultiVector(map,d_view): ";
525 
526  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,d_view)");
527 
528  const size_t LDA = getViewStride(d_view);
529  const size_t lclNumRows = map->getLocalNumElements();
530  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < lclNumRows, std::invalid_argument,
531  "Map does not match "
532  "Kokkos::View. map->getLocalNumElements() = "
533  << lclNumRows
534  << ", View's column stride = " << LDA
535  << ", and View's extent(0) = " << d_view.extent(0) << ".");
536 
537  auto h_view = Kokkos::create_mirror_view(d_view);
538  auto dual_view = dual_view_type(d_view, h_view);
539  view_ = wrapped_dual_view_type(dual_view);
540 
541  using ::Tpetra::Details::Behavior;
542  const bool debug = Behavior::debug();
543  if (debug) {
544  using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
545  std::ostringstream gblErrStrm;
546  const bool verbose = Behavior::verbose();
547  const auto comm = map.is_null() ? Teuchos::null : map->getComm();
548  const bool gblValid =
549  checkGlobalWrappedDualViewValidity(&gblErrStrm, view_, verbose,
550  comm.getRawPtr());
551  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
552  }
553  // The user gave us a device view. In order to respect its
554  // initial contents, we mark the DualView as "modified on device."
555  // That way, the next sync will synchronize it with the host view.
556  dual_view.modify_device();
557 }
558 
559 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
561  MultiVector(const Teuchos::RCP<const map_type>& map,
562  const dual_view_type& view,
563  const dual_view_type& origView)
564  : base_type(map)
565  , view_(wrapped_dual_view_type(view, origView)) {
566  const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
567 
568  const size_t LDA = getDualViewStride(origView);
569  const size_t lclNumRows = this->getLocalLength(); // comes from the Map
570  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
571  LDA < lclNumRows, std::invalid_argument,
572  "The input Kokkos::DualView's "
573  "column stride LDA = "
574  << LDA << " < getLocalLength() = " << lclNumRows
575  << ". This may also mean that the input origView's first dimension (number "
576  "of rows = "
577  << origView.extent(0) << ") does not not match the number "
578  "of entries in the Map on the calling process.");
579 
580  using ::Tpetra::Details::Behavior;
581  const bool debug = Behavior::debug();
582  if (debug) {
583  using ::Tpetra::Details::checkGlobalDualViewValidity;
584  std::ostringstream gblErrStrm;
585  const bool verbose = Behavior::verbose();
586  const auto comm = map.is_null() ? Teuchos::null : map->getComm();
587  const bool gblValid_0 =
588  checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
589  comm.getRawPtr());
590  const bool gblValid_1 =
591  checkGlobalDualViewValidity(&gblErrStrm, origView, verbose,
592  comm.getRawPtr());
593  const bool gblValid = gblValid_0 && gblValid_1;
594  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
595  }
596 }
597 
598 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
600  MultiVector(const Teuchos::RCP<const map_type>& map,
601  const dual_view_type& view,
602  const Teuchos::ArrayView<const size_t>& whichVectors)
603  : base_type(map)
604  , view_(view)
605  , whichVectors_(whichVectors.begin(), whichVectors.end()) {
606  using Kokkos::ALL;
607  using Kokkos::subview;
608  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
609 
610  using ::Tpetra::Details::Behavior;
611  const bool debug = Behavior::debug();
612  if (debug) {
613  using ::Tpetra::Details::checkGlobalDualViewValidity;
614  std::ostringstream gblErrStrm;
615  const bool verbose = Behavior::verbose();
616  const auto comm = map.is_null() ? Teuchos::null : map->getComm();
617  const bool gblValid =
618  checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
619  comm.getRawPtr());
620  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
621  }
622 
623  const size_t lclNumRows = map.is_null() ? size_t(0) : map->getLocalNumElements();
624  // Check dimensions of the input DualView. We accept that Kokkos
625  // might not allow construction of a 0 x m (Dual)View with m > 0,
626  // so we only require the number of rows to match if the
627  // (Dual)View has more than zero columns. Likewise, we only
628  // require the number of columns to match if the (Dual)View has
629  // more than zero rows.
630  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
631  view.extent(1) != 0 && static_cast<size_t>(view.extent(0)) < lclNumRows,
632  std::invalid_argument, "view.extent(0) = " << view.extent(0) << " < map->getLocalNumElements() = " << lclNumRows << ".");
633  if (whichVectors.size() != 0) {
634  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
635  view.extent(1) != 0 && view.extent(1) == 0,
636  std::invalid_argument,
637  "view.extent(1) = 0, but whichVectors.size()"
638  " = "
639  << whichVectors.size() << " > 0.");
640  size_t maxColInd = 0;
641  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
642  for (size_type k = 0; k < whichVectors.size(); ++k) {
643  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
644  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid(),
645  std::invalid_argument, "whichVectors[" << k << "] = "
646  "Teuchos::OrdinalTraits<size_t>::invalid().");
647  maxColInd = std::max(maxColInd, whichVectors[k]);
648  }
649  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
650  view.extent(1) != 0 && static_cast<size_t>(view.extent(1)) <= maxColInd,
651  std::invalid_argument, "view.extent(1) = " << view.extent(1) << " <= max(whichVectors) = " << maxColInd << ".");
652  }
653 
654  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
655  // zero strides, so modify in that case.
656  const size_t LDA = getDualViewStride(view);
657  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < lclNumRows, std::invalid_argument,
658  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
659 
660  if (whichVectors.size() == 1) {
661  // If whichVectors has only one entry, we don't need to bother
662  // with nonconstant stride. Just shift the view over so it
663  // points to the desired column.
664  //
665  // NOTE (mfh 10 May 2014) This is a special case where we set
666  // origView_ just to view that one column, not all of the
667  // original columns. This ensures that the use of origView_ in
668  // offsetView works correctly.
669  //
670  const std::pair<size_t, size_t> colRng(whichVectors[0],
671  whichVectors[0] + 1);
672  view_ = takeSubview(view_, ALL(), colRng);
673  // whichVectors_.size() == 0 means "constant stride."
674  whichVectors_.clear();
675  }
676 }
677 
678 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
680  MultiVector(const Teuchos::RCP<const map_type>& map,
681  const wrapped_dual_view_type& view,
682  const Teuchos::ArrayView<const size_t>& whichVectors)
683  : base_type(map)
684  , view_(view)
685  , whichVectors_(whichVectors.begin(), whichVectors.end()) {
686  using Kokkos::ALL;
687  using Kokkos::subview;
688  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
689 
690  using ::Tpetra::Details::Behavior;
691  const bool debug = Behavior::debug();
692  if (debug) {
693  using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
694  std::ostringstream gblErrStrm;
695  const bool verbose = Behavior::verbose();
696  const auto comm = map.is_null() ? Teuchos::null : map->getComm();
697  const bool gblValid =
698  checkGlobalWrappedDualViewValidity(&gblErrStrm, view, verbose,
699  comm.getRawPtr());
700  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
701  }
702 
703  const size_t lclNumRows = map.is_null() ? size_t(0) : map->getLocalNumElements();
704  // Check dimensions of the input DualView. We accept that Kokkos
705  // might not allow construction of a 0 x m (Dual)View with m > 0,
706  // so we only require the number of rows to match if the
707  // (Dual)View has more than zero columns. Likewise, we only
708  // require the number of columns to match if the (Dual)View has
709  // more than zero rows.
710  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
711  view.extent(1) != 0 && static_cast<size_t>(view.extent(0)) < lclNumRows,
712  std::invalid_argument, "view.extent(0) = " << view.extent(0) << " < map->getLocalNumElements() = " << lclNumRows << ".");
713  if (whichVectors.size() != 0) {
714  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
715  view.extent(1) != 0 && view.extent(1) == 0,
716  std::invalid_argument,
717  "view.extent(1) = 0, but whichVectors.size()"
718  " = "
719  << whichVectors.size() << " > 0.");
720  size_t maxColInd = 0;
721  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
722  for (size_type k = 0; k < whichVectors.size(); ++k) {
723  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
724  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid(),
725  std::invalid_argument, "whichVectors[" << k << "] = "
726  "Teuchos::OrdinalTraits<size_t>::invalid().");
727  maxColInd = std::max(maxColInd, whichVectors[k]);
728  }
729  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
730  view.extent(1) != 0 && static_cast<size_t>(view.extent(1)) <= maxColInd,
731  std::invalid_argument, "view.extent(1) = " << view.extent(1) << " <= max(whichVectors) = " << maxColInd << ".");
732  }
733 
734  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
735  // zero strides, so modify in that case.
736  const size_t LDA = getDualViewStride(view);
737  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < lclNumRows, std::invalid_argument,
738  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
739 
740  if (whichVectors.size() == 1) {
741  // If whichVectors has only one entry, we don't need to bother
742  // with nonconstant stride. Just shift the view over so it
743  // points to the desired column.
744  //
745  // NOTE (mfh 10 May 2014) This is a special case where we set
746  // origView_ just to view that one column, not all of the
747  // original columns. This ensures that the use of origView_ in
748  // offsetView works correctly.
749  //
750  const std::pair<size_t, size_t> colRng(whichVectors[0],
751  whichVectors[0] + 1);
752  view_ = takeSubview(view_, ALL(), colRng);
753  // whichVectors_.size() == 0 means "constant stride."
754  whichVectors_.clear();
755  }
756 }
757 
758 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
760  MultiVector(const Teuchos::RCP<const map_type>& map,
761  const dual_view_type& view,
762  const dual_view_type& origView,
763  const Teuchos::ArrayView<const size_t>& whichVectors)
764  : base_type(map)
765  , view_(wrapped_dual_view_type(view, origView))
766  , whichVectors_(whichVectors.begin(), whichVectors.end()) {
767  using Kokkos::ALL;
768  using Kokkos::subview;
769  const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
770 
771  using ::Tpetra::Details::Behavior;
772  const bool debug = Behavior::debug();
773  if (debug) {
774  using ::Tpetra::Details::checkGlobalDualViewValidity;
775  std::ostringstream gblErrStrm;
776  const bool verbose = Behavior::verbose();
777  const auto comm = map.is_null() ? Teuchos::null : map->getComm();
778  const bool gblValid_0 =
779  checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
780  comm.getRawPtr());
781  const bool gblValid_1 =
782  checkGlobalDualViewValidity(&gblErrStrm, origView, verbose,
783  comm.getRawPtr());
784  const bool gblValid = gblValid_0 && gblValid_1;
785  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
786  }
787 
788  const size_t lclNumRows = this->getLocalLength();
789  // Check dimensions of the input DualView. We accept that Kokkos
790  // might not allow construction of a 0 x m (Dual)View with m > 0,
791  // so we only require the number of rows to match if the
792  // (Dual)View has more than zero columns. Likewise, we only
793  // require the number of columns to match if the (Dual)View has
794  // more than zero rows.
795  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
796  view.extent(1) != 0 && static_cast<size_t>(view.extent(0)) < lclNumRows,
797  std::invalid_argument, "view.extent(0) = " << view.extent(0) << " < map->getLocalNumElements() = " << lclNumRows << ".");
798  if (whichVectors.size() != 0) {
799  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
800  view.extent(1) != 0 && view.extent(1) == 0,
801  std::invalid_argument,
802  "view.extent(1) = 0, but whichVectors.size()"
803  " = "
804  << whichVectors.size() << " > 0.");
805  size_t maxColInd = 0;
806  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
807  for (size_type k = 0; k < whichVectors.size(); ++k) {
808  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
809  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid(),
810  std::invalid_argument, "whichVectors[" << k << "] = "
811  "Teuchos::OrdinalTraits<size_t>::invalid().");
812  maxColInd = std::max(maxColInd, whichVectors[k]);
813  }
814  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
815  view.extent(1) != 0 && static_cast<size_t>(view.extent(1)) <= maxColInd,
816  std::invalid_argument, "view.extent(1) = " << view.extent(1) << " <= max(whichVectors) = " << maxColInd << ".");
817  }
818 
819  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
820  // zero strides, so modify in that case.
821  const size_t LDA = getDualViewStride(origView);
822  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < lclNumRows, std::invalid_argument,
823  "Map and DualView origView "
824  "do not match. LDA = "
825  << LDA << " < this->getLocalLength() = " << lclNumRows << ". origView.extent(0) = " << origView.extent(0)
826  << ", origView.stride(1) = " << origView.view_device().stride(1) << ".");
827 
828  if (whichVectors.size() == 1) {
829  // If whichVectors has only one entry, we don't need to bother
830  // with nonconstant stride. Just shift the view over so it
831  // points to the desired column.
832  //
833  // NOTE (mfh 10 May 2014) This is a special case where we set
834  // origView_ just to view that one column, not all of the
835  // original columns. This ensures that the use of origView_ in
836  // offsetView works correctly.
837  const std::pair<size_t, size_t> colRng(whichVectors[0],
838  whichVectors[0] + 1);
839  view_ = takeSubview(view_, ALL(), colRng);
840  // origView_ = takeSubview (origView_, ALL (), colRng);
841  // whichVectors_.size() == 0 means "constant stride."
842  whichVectors_.clear();
843  }
844 }
845 
846 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
848  MultiVector(const Teuchos::RCP<const map_type>& map,
849  const Teuchos::ArrayView<const Scalar>& data,
850  const size_t LDA,
851  const size_t numVecs)
852  : base_type(map) {
853  typedef LocalOrdinal LO;
854  typedef GlobalOrdinal GO;
855  const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
856  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
857 
858  // Deep copy constructor, constant stride (NO whichVectors_).
859  // There is no need for a deep copy constructor with nonconstant stride.
860 
861  const size_t lclNumRows =
862  map.is_null() ? size_t(0) : map->getLocalNumElements();
863  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
864  "map->getLocalNumElements() = "
865  << lclNumRows << ".");
866  if (numVecs != 0) {
867  const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
868  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(data.size()) < minNumEntries,
869  std::invalid_argument,
870  "Input Teuchos::ArrayView does not have enough "
871  "entries, given the input Map and number of vectors in the MultiVector."
872  " data.size() = "
873  << data.size() << " < (LDA*(numVecs-1)) + "
874  "map->getLocalNumElements () = "
875  << minNumEntries << ".");
876  }
877 
878  this->view_ = allocDualView<Scalar, LO, GO, Node>(lclNumRows, numVecs);
879  // Note: X_out will be completely overwritten
880  auto X_out = this->getLocalViewDevice(Access::OverwriteAll);
881 
882  // Make an unmanaged host Kokkos::View of the input data. First
883  // create a View (X_in_orig) with the original stride. Then,
884  // create a subview (X_in) with the right number of columns.
885  const impl_scalar_type* const X_in_raw =
886  reinterpret_cast<const impl_scalar_type*>(data.getRawPtr());
887  Kokkos::View<const impl_scalar_type**,
888  Kokkos::LayoutLeft,
889  Kokkos::HostSpace,
890  Kokkos::MemoryUnmanaged>
891  X_in_orig(X_in_raw, LDA, numVecs);
892  const Kokkos::pair<size_t, size_t> rowRng(0, lclNumRows);
893  auto X_in = Kokkos::subview(X_in_orig, rowRng, Kokkos::ALL());
894 
895  // If LDA != X_out's column stride, then we need to copy one
896  // column at a time; Kokkos::deep_copy refuses to work in that
897  // case.
898  const size_t outStride =
899  X_out.extent(1) == 0 ? size_t(1) : X_out.stride(1);
900  if (LDA == outStride) { // strides are the same; deep_copy once
901  // This only works because MultiVector uses LayoutLeft.
902  // We would need a custom copy functor otherwise.
903  // DEEP_COPY REVIEW - HOST-TO-DEVICE
904  Kokkos::deep_copy(execution_space(), X_out, X_in);
905  } else { // strides differ; copy one column at a time
906  typedef decltype(Kokkos::subview(X_out, Kokkos::ALL(), 0))
907  out_col_view_type;
908  typedef decltype(Kokkos::subview(X_in, Kokkos::ALL(), 0))
909  in_col_view_type;
910  for (size_t j = 0; j < numVecs; ++j) {
911  out_col_view_type X_out_j = Kokkos::subview(X_out, Kokkos::ALL(), j);
912  in_col_view_type X_in_j = Kokkos::subview(X_in, Kokkos::ALL(), j);
913  // DEEP_COPY REVIEW - HOST-TO-DEVICE
914  Kokkos::deep_copy(execution_space(), X_out_j, X_in_j);
915  }
916  }
917 }
918 
919 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
921  MultiVector(const Teuchos::RCP<const map_type>& map,
922  const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
923  const size_t numVecs)
924  : base_type(map) {
925  typedef impl_scalar_type IST;
926  typedef LocalOrdinal LO;
927  typedef GlobalOrdinal GO;
928  const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
929  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
930 
931  const size_t lclNumRows =
932  map.is_null() ? size_t(0) : map->getLocalNumElements();
933  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numVecs < 1 || numVecs != static_cast<size_t>(ArrayOfPtrs.size()),
934  std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
935  "ArrayOfPtrs.size() (= "
936  << ArrayOfPtrs.size() << ") != numVecs.");
937  for (size_t j = 0; j < numVecs; ++j) {
938  Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
939  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
940  static_cast<size_t>(X_j_av.size()) < lclNumRows,
941  std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = " << X_j_av.size() << " < map->getLocalNumElements() = " << lclNumRows << ".");
942  }
943 
944  view_ = allocDualView<Scalar, LO, GO, Node>(lclNumRows, numVecs);
945  auto X_out = this->getLocalViewDevice(Access::ReadWrite);
946 
947  // Make sure that the type of a single input column has the same
948  // array layout as each output column does, so we can deep_copy.
949  using array_layout = typename decltype(X_out)::array_layout;
950  using input_col_view_type = typename Kokkos::View<const IST*,
951  array_layout,
952  Kokkos::HostSpace,
953  Kokkos::MemoryUnmanaged>;
954 
955  const std::pair<size_t, size_t> rowRng(0, lclNumRows);
956  for (size_t j = 0; j < numVecs; ++j) {
957  Teuchos::ArrayView<const IST> X_j_av =
958  Teuchos::av_reinterpret_cast<const IST>(ArrayOfPtrs[j]);
959  input_col_view_type X_j_in(X_j_av.getRawPtr(), lclNumRows);
960  auto X_j_out = Kokkos::subview(X_out, rowRng, j);
961  // DEEP_COPY REVIEW - HOST-TO-DEVICE
962  Kokkos::deep_copy(execution_space(), X_j_out, X_j_in);
963  }
964 }
965 
966 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
969  return whichVectors_.empty();
970 }
971 
972 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
973 size_t
976  if (this->getMap().is_null()) { // possible, due to replaceMap().
977  return static_cast<size_t>(0);
978  } else {
979  return this->getMap()->getLocalNumElements();
980  }
981 }
982 
983 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
987  if (this->getMap().is_null()) { // possible, due to replaceMap().
988  return static_cast<size_t>(0);
989  } else {
990  return this->getMap()->getGlobalNumElements();
991  }
992 }
993 
994 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
995 size_t
997  getStride() const {
998  return isConstantStride() ? getDualViewStride(view_) : size_t(0);
999 }
1000 
1001 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1004  // Don't actually get a view, just get pointers.
1005  auto thisData = view_.getDualView().view_host().data();
1006  auto otherData = other.view_.getDualView().view_host().data();
1007  size_t thisSpan = view_.getDualView().view_host().span();
1008  size_t otherSpan = other.view_.getDualView().view_host().span();
1009  return (otherData <= thisData && thisData < otherData + otherSpan) || (thisData <= otherData && otherData < thisData + thisSpan);
1010 }
1011 
1012 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1014  checkSizes(const SrcDistObject& sourceObj) {
1015  // Check whether the source object is a MultiVector. If not, then
1016  // we can't even compare sizes, so it's definitely not OK to
1017  // Import or Export from it.
1019  const MV* src = dynamic_cast<const MV*>(&sourceObj);
1020  if (src == nullptr) {
1021  return false;
1022  } else {
1023  // The target of the Import or Export calls checkSizes() in
1024  // DistObject::doTransfer(). By that point, we've already
1025  // constructed an Import or Export object using the two
1026  // multivectors' Maps, which means that (hopefully) we've
1027  // already checked other attributes of the multivectors. Thus,
1028  // all we need to do here is check the number of columns.
1029  return src->getNumVectors() == this->getNumVectors();
1030  }
1031 }
1032 
1033 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1034 size_t
1037  return this->getNumVectors();
1038 }
1039 
1040 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1042  copyAndPermute(const SrcDistObject& sourceObj,
1043  const size_t numSameIDs,
1044  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1045  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1046  const CombineMode CM,
1047  const execution_space& space) {
1048  using Kokkos::Compat::create_const_view;
1049  using KokkosRefactor::Details::permute_array_multi_column;
1050  using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
1051  using std::endl;
1052  using ::Tpetra::Details::Behavior;
1054  using ::Tpetra::Details::ProfilingRegion;
1056 
1057  // We've already called checkSizes(), so this cast must succeed.
1058  MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&>(sourceObj));
1059 
1060  bool copyOnHost;
1061  if (importsAreAliased() && (this->constantNumberOfPackets() != 0) && Behavior::enableGranularTransfers()) {
1062  // imports are aliased to the target. We already posted Irecvs
1063  // into imports using a memory space that depends on GPU
1064  // awareness. Therefore we want to modify target in the same
1065  // memory space. See comments in DistObject::beginTransfer.
1066  copyOnHost = !Behavior::assumeMpiIsGPUAware();
1067  } else {
1068  // We are free to choose the better memory space for copyAndPermute.
1069  copyOnHost = runKernelOnHost(sourceMV);
1070  }
1071  const char longFuncNameHost[] = "Tpetra::MultiVector::copyAndPermute[Host]";
1072  const char longFuncNameDevice[] = "Tpetra::MultiVector::copyAndPermute[Device]";
1073  const char tfecfFuncName[] = "copyAndPermute: ";
1074  ProfilingRegion regionCAP(copyOnHost ? longFuncNameHost : longFuncNameDevice);
1075 
1076  const bool verbose = Behavior::verbose();
1077  std::unique_ptr<std::string> prefix;
1078  if (verbose) {
1079  auto map = this->getMap();
1080  auto comm = map.is_null() ? Teuchos::null : map->getComm();
1081  const int myRank = comm.is_null() ? -1 : comm->getRank();
1082  std::ostringstream os;
1083  os << "Proc " << myRank << ": MV::copyAndPermute: ";
1084  prefix = std::unique_ptr<std::string>(new std::string(os.str()));
1085  os << "Start" << endl;
1086  std::cerr << os.str();
1087  }
1088 
1089  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(permuteToLIDs.extent(0) != permuteFromLIDs.extent(0),
1090  std::logic_error, "permuteToLIDs.extent(0) = " << permuteToLIDs.extent(0) << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0) << ".");
1091  const size_t numCols = this->getNumVectors();
1092 
1093  // sourceMV doesn't belong to us, so we can't sync it. Do the
1094  // copying where it's currently sync'd.
1095  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(sourceMV.need_sync_device() && sourceMV.need_sync_host(),
1096  std::logic_error,
1097  "Input MultiVector needs sync to both host "
1098  "and device.");
1099  if (verbose) {
1100  std::ostringstream os;
1101  os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1102  std::cerr << os.str();
1103  }
1104 
1105  if (verbose) {
1106  std::ostringstream os;
1107  os << *prefix << "Copy first " << numSameIDs << " elements" << endl;
1108  std::cerr << os.str();
1109  }
1110 
1111  // TODO (mfh 15 Sep 2013) When we replace
1112  // KokkosClassic::MultiVector with a Kokkos::View, there are two
1113  // ways to copy the data:
1114  //
1115  // 1. Get a (sub)view of each column and call deep_copy on that.
1116  // 2. Write a custom kernel to copy the data.
1117  //
1118  // The first is easier, but the second might be more performant in
1119  // case we decide to use layouts other than LayoutLeft. It might
1120  // even make sense to hide whichVectors_ in an entirely new layout
1121  // for Kokkos Views.
1122 
1123  // Copy rows [0, numSameIDs-1] of the local multivectors.
1124  //
1125  // Note (ETP 2 Jul 2014) We need to always copy one column at a
1126  // time, even when both multivectors are constant-stride, since
1127  // deep_copy between strided subviews with more than one column
1128  // doesn't currently work.
1129 
1130  // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1131  // both source and target are constant stride and have multiple
1132  // columns.
1133  if (numSameIDs > 0) {
1134  const std::pair<size_t, size_t> rows(0, numSameIDs);
1135  if (copyOnHost) {
1136  ProfilingRegion regionC("Tpetra::MultiVector::copy[Host]");
1137  auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1138  auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1139 
1140  for (size_t j = 0; j < numCols; ++j) {
1141  const size_t tgtCol = isConstantStride() ? j : whichVectors_[j];
1142  const size_t srcCol =
1143  sourceMV.isConstantStride() ? j : sourceMV.whichVectors_[j];
1144 
1145  auto tgt_j = Kokkos::subview(tgt_h, rows, tgtCol);
1146  auto src_j = Kokkos::subview(src_h, rows, srcCol);
1147  if (CM == ADD_ASSIGN) {
1148  // Sum src_j into tgt_j
1149  using range_t =
1150  Kokkos::RangePolicy<execution_space, size_t>;
1151  range_t rp(space, 0, numSameIDs);
1152  Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1153  aaf(tgt_j, src_j);
1154  Kokkos::parallel_for(rp, aaf);
1155  } else {
1156  // Copy src_j into tgt_j
1157  // DEEP_COPY REVIEW - HOSTMIRROR-TO-HOSTMIRROR
1158  Kokkos::deep_copy(space, tgt_j, src_j);
1159  space.fence("Tpetra::MultiVector::copyAndPermute-1");
1160  }
1161  }
1162  } else { // copy on device
1163  ProfilingRegion regionC("Tpetra::MultiVector::copy[Device]");
1164  auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1165  auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1166 
1167  for (size_t j = 0; j < numCols; ++j) {
1168  const size_t tgtCol = isConstantStride() ? j : whichVectors_[j];
1169  const size_t srcCol =
1170  sourceMV.isConstantStride() ? j : sourceMV.whichVectors_[j];
1171 
1172  auto tgt_j = Kokkos::subview(tgt_d, rows, tgtCol);
1173  auto src_j = Kokkos::subview(src_d, rows, srcCol);
1174  if (CM == ADD_ASSIGN) {
1175  // Sum src_j into tgt_j
1176  using range_t =
1177  Kokkos::RangePolicy<execution_space, size_t>;
1178  range_t rp(space, 0, numSameIDs);
1179  Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1180  aaf(tgt_j, src_j);
1181  Kokkos::parallel_for(rp, aaf);
1182  } else {
1183  // Copy src_j into tgt_j
1184  // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
1185  Kokkos::deep_copy(space, tgt_j, src_j);
1186  space.fence("Tpetra::MultiVector::copyAndPermute-2");
1187  }
1188  }
1189  }
1190  }
1191 
1192  // For the remaining GIDs, execute the permutations. This may
1193  // involve noncontiguous access of both source and destination
1194  // vectors, depending on the LID lists.
1195  //
1196  // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1197  // the same process, this merges their values by replacement of
1198  // the last encountered GID, not by the specified merge rule
1199  // (such as ADD).
1200 
1201  // If there are no permutations, we are done
1202  if (permuteFromLIDs.extent(0) == 0 ||
1203  permuteToLIDs.extent(0) == 0) {
1204  if (verbose) {
1205  std::ostringstream os;
1206  os << *prefix << "No permutations. Done!" << endl;
1207  std::cerr << os.str();
1208  }
1209  return;
1210  }
1211  ProfilingRegion regionP("Tpetra::MultiVector::permute");
1212 
1213  if (verbose) {
1214  std::ostringstream os;
1215  os << *prefix << "Permute" << endl;
1216  std::cerr << os.str();
1217  }
1218 
1219  // We could in theory optimize for the case where exactly one of
1220  // them is constant stride, but we don't currently do that.
1221  const bool nonConstStride =
1222  !this->isConstantStride() || !sourceMV.isConstantStride();
1223 
1224  if (verbose) {
1225  std::ostringstream os;
1226  os << *prefix << "nonConstStride="
1227  << (nonConstStride ? "true" : "false") << endl;
1228  std::cerr << os.str();
1229  }
1230 
1231  // We only need the "which vectors" arrays if either the source or
1232  // target MV is not constant stride. Since we only have one
1233  // kernel that must do double-duty, we have to create a "which
1234  // vectors" array for the MV that _is_ constant stride.
1235  Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1236  Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1237  if (nonConstStride) {
1238  if (this->whichVectors_.size() == 0) {
1239  Kokkos::DualView<size_t*, device_type> tmpTgt("tgtWhichVecs", numCols);
1240  tmpTgt.modify_host();
1241  for (size_t j = 0; j < numCols; ++j) {
1242  tmpTgt.view_host()(j) = j;
1243  }
1244  if (!copyOnHost) {
1245  tmpTgt.sync_device();
1246  }
1247  tgtWhichVecs = tmpTgt;
1248  } else {
1249  Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_();
1250  tgtWhichVecs =
1251  getDualViewCopyFromArrayView<size_t, device_type>(tgtWhichVecsT,
1252  "tgtWhichVecs",
1253  copyOnHost);
1254  }
1255  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(tgtWhichVecs.extent(0)) !=
1256  this->getNumVectors(),
1257  std::logic_error, "tgtWhichVecs.extent(0) = " << tgtWhichVecs.extent(0) << " != this->getNumVectors() = " << this->getNumVectors() << ".");
1258 
1259  if (sourceMV.whichVectors_.size() == 0) {
1260  Kokkos::DualView<size_t*, device_type> tmpSrc("srcWhichVecs", numCols);
1261  tmpSrc.modify_host();
1262  for (size_t j = 0; j < numCols; ++j) {
1263  tmpSrc.view_host()(j) = j;
1264  }
1265  if (!copyOnHost) {
1266  tmpSrc.sync_device();
1267  }
1268  srcWhichVecs = tmpSrc;
1269  } else {
1270  Teuchos::ArrayView<const size_t> srcWhichVecsT =
1271  sourceMV.whichVectors_();
1272  srcWhichVecs =
1273  getDualViewCopyFromArrayView<size_t, device_type>(srcWhichVecsT,
1274  "srcWhichVecs",
1275  copyOnHost);
1276  }
1277  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(srcWhichVecs.extent(0)) !=
1278  sourceMV.getNumVectors(),
1279  std::logic_error,
1280  "srcWhichVecs.extent(0) = " << srcWhichVecs.extent(0)
1281  << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors()
1282  << ".");
1283  }
1284 
1285  if (copyOnHost) { // permute on host too
1286  if (verbose) {
1287  std::ostringstream os;
1288  os << *prefix << "Get permute LIDs on host" << std::endl;
1289  std::cerr << os.str();
1290  }
1291  auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1292  auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1293 
1294  TEUCHOS_ASSERT(!permuteToLIDs.need_sync_host());
1295  auto permuteToLIDs_h = create_const_view(permuteToLIDs.view_host());
1296  TEUCHOS_ASSERT(!permuteFromLIDs.need_sync_host());
1297  auto permuteFromLIDs_h =
1298  create_const_view(permuteFromLIDs.view_host());
1299 
1300  if (verbose) {
1301  std::ostringstream os;
1302  os << *prefix << "Permute on host" << endl;
1303  std::cerr << os.str();
1304  }
1305  if (nonConstStride) {
1306  // No need to sync first, because copyOnHost argument to
1307  // getDualViewCopyFromArrayView puts them in the right place.
1308  auto tgtWhichVecs_h =
1309  create_const_view(tgtWhichVecs.view_host());
1310  auto srcWhichVecs_h =
1311  create_const_view(srcWhichVecs.view_host());
1312  if (CM == ADD_ASSIGN) {
1313  using op_type = KokkosRefactor::Details::AddOp;
1314  permute_array_multi_column_variable_stride(tgt_h, src_h,
1315  permuteToLIDs_h,
1316  permuteFromLIDs_h,
1317  tgtWhichVecs_h,
1318  srcWhichVecs_h, numCols,
1319  op_type());
1320  } else {
1321  using op_type = KokkosRefactor::Details::InsertOp;
1322  permute_array_multi_column_variable_stride(tgt_h, src_h,
1323  permuteToLIDs_h,
1324  permuteFromLIDs_h,
1325  tgtWhichVecs_h,
1326  srcWhichVecs_h, numCols,
1327  op_type());
1328  }
1329  } else {
1330  if (CM == ADD_ASSIGN) {
1331  using op_type = KokkosRefactor::Details::AddOp;
1332  permute_array_multi_column(tgt_h, src_h, permuteToLIDs_h,
1333  permuteFromLIDs_h, numCols, op_type());
1334  } else {
1335  using op_type = KokkosRefactor::Details::InsertOp;
1336  permute_array_multi_column(tgt_h, src_h, permuteToLIDs_h,
1337  permuteFromLIDs_h, numCols, op_type());
1338  }
1339  }
1340  } else { // permute on device
1341  if (verbose) {
1342  std::ostringstream os;
1343  os << *prefix << "Get permute LIDs on device" << endl;
1344  std::cerr << os.str();
1345  }
1346  auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1347  auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1348 
1349  TEUCHOS_ASSERT(!permuteToLIDs.need_sync_device());
1350  auto permuteToLIDs_d = create_const_view(permuteToLIDs.view_device());
1351  TEUCHOS_ASSERT(!permuteFromLIDs.need_sync_device());
1352  auto permuteFromLIDs_d =
1353  create_const_view(permuteFromLIDs.view_device());
1354 
1355  if (verbose) {
1356  std::ostringstream os;
1357  os << *prefix << "Permute on device" << endl;
1358  std::cerr << os.str();
1359  }
1360  if (nonConstStride) {
1361  // No need to sync first, because copyOnHost argument to
1362  // getDualViewCopyFromArrayView puts them in the right place.
1363  auto tgtWhichVecs_d = create_const_view(tgtWhichVecs.view_device());
1364  auto srcWhichVecs_d = create_const_view(srcWhichVecs.view_device());
1365  if (CM == ADD_ASSIGN) {
1366  using op_type = KokkosRefactor::Details::AddOp;
1367  permute_array_multi_column_variable_stride(space, tgt_d, src_d,
1368  permuteToLIDs_d,
1369  permuteFromLIDs_d,
1370  tgtWhichVecs_d,
1371  srcWhichVecs_d, numCols,
1372  op_type());
1373  } else {
1374  using op_type = KokkosRefactor::Details::InsertOp;
1375  permute_array_multi_column_variable_stride(space, tgt_d, src_d,
1376  permuteToLIDs_d,
1377  permuteFromLIDs_d,
1378  tgtWhichVecs_d,
1379  srcWhichVecs_d, numCols,
1380  op_type());
1381  }
1382  } else {
1383  if (CM == ADD_ASSIGN) {
1384  using op_type = KokkosRefactor::Details::AddOp;
1385  permute_array_multi_column(space, tgt_d, src_d, permuteToLIDs_d,
1386  permuteFromLIDs_d, numCols, op_type());
1387  } else {
1388  using op_type = KokkosRefactor::Details::InsertOp;
1389  permute_array_multi_column(space, tgt_d, src_d, permuteToLIDs_d,
1390  permuteFromLIDs_d, numCols, op_type());
1391  }
1392  }
1393  }
1394 
1395  if (verbose) {
1396  std::ostringstream os;
1397  os << *prefix << "Done!" << endl;
1398  std::cerr << os.str();
1399  }
1400 }
1401 
1402 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1404  const SrcDistObject& sourceObj, const size_t numSameIDs,
1405  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1406  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1407  const CombineMode CM) {
1408  copyAndPermute(sourceObj, numSameIDs, permuteToLIDs, permuteFromLIDs, CM,
1409  execution_space());
1410 }
1411 
1412 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1414  packAndPrepare(const SrcDistObject& sourceObj,
1415  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1416  Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1417  Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1418  size_t& constantNumPackets,
1419  const execution_space& space) {
1420  using Kokkos::Compat::create_const_view;
1421  using Kokkos::Compat::getKokkosViewDeepCopy;
1422  using std::endl;
1423  using ::Tpetra::Details::Behavior;
1424  using ::Tpetra::Details::ProfilingRegion;
1427 
1428  // We've already called checkSizes(), so this cast must succeed.
1429  MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&>(sourceObj));
1430 
1431  const bool packOnHost = runKernelOnHost(sourceMV);
1432  const char longFuncNameHost[] = "Tpetra::MultiVector::packAndPrepare[Host]";
1433  const char longFuncNameDevice[] = "Tpetra::MultiVector::packAndPrepare[Device]";
1434  const char tfecfFuncName[] = "packAndPrepare: ";
1435  ProfilingRegion regionPAP(packOnHost ? longFuncNameHost : longFuncNameDevice);
1436 
1437  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1438  // have the option to check indices. We do so when Tpetra is in
1439  // debug mode. It is in debug mode by default in a debug build,
1440  // but you may control this at run time, before launching the
1441  // executable, by setting the TPETRA_DEBUG environment variable to
1442  // "1" (or "TRUE").
1443  const bool debugCheckIndices = Behavior::debug();
1444  // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1445  // environment variable to "1" (or "TRUE") for copious debug
1446  // output to std::cerr on every MPI process. This is unwise for
1447  // runs with large numbers of MPI processes.
1448  const bool printDebugOutput = Behavior::verbose();
1449 
1450  std::unique_ptr<std::string> prefix;
1451  if (printDebugOutput) {
1452  auto map = this->getMap();
1453  auto comm = map.is_null() ? Teuchos::null : map->getComm();
1454  const int myRank = comm.is_null() ? -1 : comm->getRank();
1455  std::ostringstream os;
1456  os << "Proc " << myRank << ": MV::packAndPrepare: ";
1457  prefix = std::unique_ptr<std::string>(new std::string(os.str()));
1458  os << "Start" << endl;
1459  std::cerr << os.str();
1460  }
1461 
1462  const size_t numCols = sourceMV.getNumVectors();
1463 
1464  // This spares us from needing to fill numExportPacketsPerLID.
1465  // Setting constantNumPackets to a nonzero value signals that
1466  // all packets have the same number of entries.
1467  constantNumPackets = numCols;
1468 
1469  // If we have no exports, there is nothing to do. Make sure this
1470  // goes _after_ setting constantNumPackets correctly.
1471  if (exportLIDs.extent(0) == 0) {
1472  if (printDebugOutput) {
1473  std::ostringstream os;
1474  os << *prefix << "No exports on this proc, DONE" << std::endl;
1475  std::cerr << os.str();
1476  }
1477  return;
1478  }
1479 
1480  /* The layout in the export for MultiVectors is as follows:
1481  exports = { all of the data from row exportLIDs.front() ;
1482  ....
1483  all of the data from row exportLIDs.back() }
1484  This doesn't have the best locality, but is necessary because
1485  the data for a Packet (all data associated with an LID) is
1486  required to be contiguous. */
1487 
1488  // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1489  // packing scheme in the above comment? The data going to a
1490  // particular process must be contiguous, of course, but those
1491  // data could include entries from multiple LIDs. DistObject just
1492  // needs to know how to index into that data. Kokkos is good at
1493  // decoupling storage intent from data layout choice.
1494 
1495  const size_t numExportLIDs = exportLIDs.extent(0);
1496  const size_t newExportsSize = numCols * numExportLIDs;
1497  if (printDebugOutput) {
1498  std::ostringstream os;
1499  os << *prefix << "realloc: "
1500  << "numExportLIDs: " << numExportLIDs
1501  << ", exports.extent(0): " << exports.extent(0)
1502  << ", newExportsSize: " << newExportsSize << std::endl;
1503  std::cerr << os.str();
1504  }
1505  reallocDualViewIfNeeded(exports, newExportsSize, "exports");
1506 
1507  // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1508  // sync it. Pack it where it's currently sync'd.
1509  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(sourceMV.need_sync_device() && sourceMV.need_sync_host(),
1510  std::logic_error,
1511  "Input MultiVector needs sync to both host "
1512  "and device.");
1513  if (printDebugOutput) {
1514  std::ostringstream os;
1515  os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1516  std::cerr << os.str();
1517  }
1518 
1519  // Mark 'exports' here, since we might have resized it above.
1520  // Resizing currently requires calling the constructor, which
1521  // clears out the 'modified' flags.
1522  if (packOnHost) {
1523  // nde 06 Feb 2020: If 'exports' does not require resize
1524  // when reallocDualViewIfNeeded is called, the modified flags
1525  // are not cleared out. This can result in host and device views
1526  // being out-of-sync, resuling in an error in exports.modify_* calls.
1527  // Clearing the sync flags prevents this possible case.
1528  exports.clear_sync_state();
1529  exports.modify_host();
1530  } else {
1531  // nde 06 Feb 2020: If 'exports' does not require resize
1532  // when reallocDualViewIfNeeded is called, the modified flags
1533  // are not cleared out. This can result in host and device views
1534  // being out-of-sync, resuling in an error in exports.modify_* calls.
1535  // Clearing the sync flags prevents this possible case.
1536  exports.clear_sync_state();
1537  exports.modify_device();
1538  }
1539 
1540  if (numCols == 1) { // special case for one column only
1541  // MultiVector always represents a single column with constant
1542  // stride, but it doesn't hurt to implement both cases anyway.
1543  //
1544  // ETP: I'm not sure I agree with the above statement. Can't a single-
1545  // column multivector be a subview of another multi-vector, in which case
1546  // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1547  // separately.
1548  //
1549  // mfh 18 Jan 2016: In answer to ETP's comment above:
1550  // MultiVector treats single-column MultiVectors created using a
1551  // "nonconstant stride constructor" as a special case, and makes
1552  // them constant stride (by making whichVectors_ have length 0).
1553  if (sourceMV.isConstantStride()) {
1554  using KokkosRefactor::Details::pack_array_single_column;
1555  if (printDebugOutput) {
1556  std::ostringstream os;
1557  os << *prefix << "Pack numCols=1 const stride" << endl;
1558  std::cerr << os.str();
1559  }
1560  if (packOnHost) {
1561  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1562  pack_array_single_column(exports.view_host(),
1563  src_host,
1564  exportLIDs.view_host(),
1565  0,
1566  debugCheckIndices);
1567  } else { // pack on device
1568  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1569  pack_array_single_column(exports.view_device(),
1570  src_dev,
1571  exportLIDs.view_device(),
1572  0,
1573  debugCheckIndices,
1574  space);
1575  }
1576  } else {
1577  using KokkosRefactor::Details::pack_array_single_column;
1578  if (printDebugOutput) {
1579  std::ostringstream os;
1580  os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1581  std::cerr << os.str();
1582  }
1583  if (packOnHost) {
1584  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1585  pack_array_single_column(exports.view_host(),
1586  src_host,
1587  exportLIDs.view_host(),
1588  sourceMV.whichVectors_[0],
1589  debugCheckIndices);
1590  } else { // pack on device
1591  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1592  pack_array_single_column(exports.view_device(),
1593  src_dev,
1594  exportLIDs.view_device(),
1595  sourceMV.whichVectors_[0],
1596  debugCheckIndices,
1597  space);
1598  }
1599  }
1600  } else { // the source MultiVector has multiple columns
1601  if (sourceMV.isConstantStride()) {
1602  using KokkosRefactor::Details::pack_array_multi_column;
1603  if (printDebugOutput) {
1604  std::ostringstream os;
1605  os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1606  std::cerr << os.str();
1607  }
1608  if (packOnHost) {
1609  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1610  pack_array_multi_column(exports.view_host(),
1611  src_host,
1612  exportLIDs.view_host(),
1613  numCols,
1614  debugCheckIndices);
1615  } else { // pack on device
1616  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1617  pack_array_multi_column(exports.view_device(),
1618  src_dev,
1619  exportLIDs.view_device(),
1620  numCols,
1621  debugCheckIndices,
1622  space);
1623  }
1624  } else {
1625  using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1626  if (printDebugOutput) {
1627  std::ostringstream os;
1628  os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1629  << endl;
1630  std::cerr << os.str();
1631  }
1632  // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1633  // whichVectors_ can be expensive, but pack and unpack for
1634  // nonconstant-stride MultiVectors is slower anyway.
1635  using IST = impl_scalar_type;
1636  using DV = Kokkos::DualView<IST*, device_type>;
1637  using HES = typename DV::t_host::execution_space;
1638  using DES = typename DV::t_dev::execution_space;
1639  Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_();
1640  if (packOnHost) {
1641  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1642  pack_array_multi_column_variable_stride(exports.view_host(),
1643  src_host,
1644  exportLIDs.view_host(),
1645  getKokkosViewDeepCopy<HES>(whichVecs),
1646  numCols,
1647  debugCheckIndices);
1648  } else { // pack on device
1649  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1650  pack_array_multi_column_variable_stride(exports.view_device(),
1651  src_dev,
1652  exportLIDs.view_device(),
1653  getKokkosViewDeepCopy<DES>(whichVecs),
1654  numCols,
1655  debugCheckIndices, space);
1656  }
1657  }
1658  }
1659 
1660  if (printDebugOutput) {
1661  std::ostringstream os;
1662  os << *prefix << "Done!" << endl;
1663  std::cerr << os.str();
1664  }
1665 }
1666 
1667 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1669  packAndPrepare(const SrcDistObject& sourceObj,
1670  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1671  Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1672  Kokkos::DualView<size_t*, buffer_device_type> numExportPacketsPerLID,
1673  size_t& constantNumPackets) {
1674  packAndPrepare(sourceObj, exportLIDs, exports, numExportPacketsPerLID, constantNumPackets, execution_space());
1675 }
1676 
1677 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1678 template <class NO>
1679 typename std::enable_if<std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1680  typename NO::device_type::memory_space>::value,
1681  bool>::type
1683  reallocImportsIfNeededImpl(const size_t newSize,
1684  const bool verbose,
1685  const std::string* prefix,
1686  const bool areRemoteLIDsContiguous,
1687  const CombineMode CM) {
1688  // This implementation of reallocImportsIfNeeded is an
1689  // optimization that is specific to MultiVector. We check if the
1690  // imports_ view can be aliased to the end of the data view_. If
1691  // that is the case, we can skip the unpackAndCombine call.
1692 
1693  if (verbose) {
1694  std::ostringstream os;
1695  os << *prefix << "Realloc (if needed) imports_ from "
1696  << this->imports_.extent(0) << " to " << newSize << std::endl;
1697  std::cerr << os.str();
1698  }
1699 
1700  bool reallocated = false;
1701 
1702  using IST = impl_scalar_type;
1703  using DV = Kokkos::DualView<IST*, Kokkos::LayoutLeft, buffer_device_type>;
1704 
1705  // Conditions for aliasing memory:
1706  // - When both sides of the dual view are in the same memory
1707  // space, we do not need to worry about syncing things.
1708  // - When both memory spaces are different, we only alias if this
1709  // does not incur additional sync'ing.
1710  // - The remote LIDs need to be contiguous, so that we do not need
1711  // to reorder received information.
1712  // - CombineMode needs to be INSERT.
1713  // - The number of vectors needs to be 1, otherwise we need to
1714  // reorder the received data.
1715  if ((std::is_same_v<typename dual_view_type::t_dev::device_type, typename dual_view_type::t_host::device_type> ||
1716  (Details::Behavior::assumeMpiIsGPUAware() && !this->need_sync_device()) ||
1717  (!Details::Behavior::assumeMpiIsGPUAware() && !this->need_sync_host())) &&
1718  areRemoteLIDsContiguous &&
1719  (CM == INSERT || CM == REPLACE) &&
1720  (getNumVectors() == 1) &&
1721  (newSize > 0)) {
1722  size_t offset = getLocalLength() - newSize;
1723  reallocated = this->imports_.view_device().data() != view_.getDualView().view_device().data() + offset;
1724  if (reallocated) {
1725  typedef std::pair<size_t, size_t> range_type;
1726  this->imports_ = DV(view_.getDualView(),
1727  range_type(offset, getLocalLength()),
1728  0);
1729 
1730  if (verbose) {
1731  std::ostringstream os;
1732  os << *prefix << "Aliased imports_ to MV.view_" << std::endl;
1733  std::cerr << os.str();
1734  }
1735  }
1736  return reallocated;
1737  }
1738  {
1740  reallocated =
1741  reallocDualViewIfNeeded(this->unaliased_imports_, newSize, "imports");
1742  if (verbose) {
1743  std::ostringstream os;
1744  os << *prefix << "Finished realloc'ing unaliased_imports_" << std::endl;
1745  std::cerr << os.str();
1746  }
1747  this->imports_ = this->unaliased_imports_;
1748  }
1749  return reallocated;
1750 }
1751 
1752 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1753 template <class NO>
1754 typename std::enable_if<!std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1755  typename NO::device_type::memory_space>::value,
1756  bool>::type
1758  reallocImportsIfNeededImpl(const size_t newSize,
1759  const bool verbose,
1760  const std::string* prefix,
1761  const bool areRemoteLIDsContiguous,
1762  const CombineMode CM) {
1763  return DistObject<Scalar, LocalOrdinal, GlobalOrdinal, Node>::reallocImportsIfNeeded(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1764 }
1765 
1766 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1768  reallocImportsIfNeeded(const size_t newSize,
1769  const bool verbose,
1770  const std::string* prefix,
1771  const bool areRemoteLIDsContiguous,
1772  const CombineMode CM) {
1774  return reallocImportsIfNeededImpl<Node>(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1775 }
1776 
1777 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1780  return (this->imports_.view_device().data() + this->imports_.view_device().extent(0) ==
1781  view_.getDualView().view_device().data() + view_.getDualView().view_device().extent(0));
1782 }
1783 
1784 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1786  unpackAndCombine(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1787  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1788  Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1789  const size_t constantNumPackets,
1790  const CombineMode CM,
1791  const execution_space& space) {
1792  using Kokkos::Compat::getKokkosViewDeepCopy;
1793  using KokkosRefactor::Details::unpack_array_multi_column;
1794  using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1795  using std::endl;
1796  using ::Tpetra::Details::Behavior;
1797  using ::Tpetra::Details::ProfilingRegion;
1798 
1799  const bool unpackOnHost = runKernelOnHost(imports);
1800 
1801  const char longFuncNameHost[] = "Tpetra::MultiVector::unpackAndCombine[Host]";
1802  const char longFuncNameDevice[] = "Tpetra::MultiVector::unpackAndCombine[Device]";
1803  const char* longFuncName = unpackOnHost ? longFuncNameHost : longFuncNameDevice;
1804  const char tfecfFuncName[] = "unpackAndCombine: ";
1805 
1806  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1807  // have the option to check indices. We do so when Tpetra is in
1808  // debug mode. It is in debug mode by default in a debug build,
1809  // but you may control this at run time, before launching the
1810  // executable, by setting the TPETRA_DEBUG environment variable to
1811  // "1" (or "TRUE").
1812  const bool debugCheckIndices = Behavior::debug();
1813 
1814  const bool printDebugOutput = Behavior::verbose();
1815  std::unique_ptr<std::string> prefix;
1816  if (printDebugOutput) {
1817  auto map = this->getMap();
1818  auto comm = map.is_null() ? Teuchos::null : map->getComm();
1819  const int myRank = comm.is_null() ? -1 : comm->getRank();
1820  std::ostringstream os;
1821  os << "Proc " << myRank << ": " << longFuncName << ": ";
1822  prefix = std::unique_ptr<std::string>(new std::string(os.str()));
1823  os << "Start" << endl;
1824  std::cerr << os.str();
1825  }
1826 
1827  // If we have no imports, there is nothing to do
1828  if (importLIDs.extent(0) == 0) {
1829  if (printDebugOutput) {
1830  std::ostringstream os;
1831  os << *prefix << "No imports. Done!" << endl;
1832  std::cerr << os.str();
1833  }
1834  return;
1835  }
1836 
1837  // Check, whether imports_ is a subview of the MV view.
1838  if (importsAreAliased()) {
1839  if (printDebugOutput) {
1840  std::ostringstream os;
1841  os << *prefix << "Skipping unpack, since imports_ is aliased to MV.view_. Done!" << endl;
1842  std::cerr << os.str();
1843  }
1844  return;
1845  }
1846 
1847  ProfilingRegion regionUAC(longFuncName);
1848 
1849  const size_t numVecs = getNumVectors();
1850  if (debugCheckIndices) {
1851  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(imports.extent(0)) !=
1852  numVecs * importLIDs.extent(0),
1853  std::runtime_error,
1854  "imports.extent(0) = " << imports.extent(0)
1855  << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1856  << " * " << importLIDs.extent(0) << " = "
1857  << numVecs * importLIDs.extent(0) << ".");
1858 
1859  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(constantNumPackets == static_cast<size_t>(0), std::runtime_error,
1860  "constantNumPackets input argument must be nonzero.");
1861 
1862  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(numVecs) !=
1863  static_cast<size_t>(constantNumPackets),
1864  std::runtime_error, "constantNumPackets must equal numVecs.");
1865  }
1866 
1867  // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1868  // the memory space in which the imports buffer was last modified and
1869  // the size of the imports buffer.
1870  // DistObject::doTransferNew decides where it was last modified (based on
1871  // whether communication buffers used were on host or device).
1872  if (unpackOnHost) {
1873  if (this->imports_.need_sync_host()) this->imports_.sync_host();
1874  } else {
1875  if (this->imports_.need_sync_device()) this->imports_.sync_device();
1876  }
1877 
1878  if (printDebugOutput) {
1879  std::ostringstream os;
1880  os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1881  << endl;
1882  std::cerr << os.str();
1883  }
1884 
1885  // We have to sync before modifying, because this method may read
1886  // as well as write (depending on the CombineMode).
1887  auto imports_d = imports.view_device();
1888  auto imports_h = imports.view_host();
1889  auto importLIDs_d = importLIDs.view_device();
1890  auto importLIDs_h = importLIDs.view_host();
1891 
1892  Kokkos::DualView<size_t*, device_type> whichVecs;
1893  if (!isConstantStride()) {
1894  Kokkos::View<const size_t*, Kokkos::HostSpace,
1895  Kokkos::MemoryUnmanaged>
1896  whichVecsIn(whichVectors_.getRawPtr(),
1897  numVecs);
1898  whichVecs = Kokkos::DualView<size_t*, device_type>("whichVecs", numVecs);
1899  if (unpackOnHost) {
1900  whichVecs.modify_host();
1901  // DEEP_COPY REVIEW - NOT TESTED FOR CUDA BUILD
1902  Kokkos::deep_copy(whichVecs.view_host(), whichVecsIn);
1903  } else {
1904  whichVecs.modify_device();
1905  // DEEP_COPY REVIEW - HOST-TO-DEVICE
1906  Kokkos::deep_copy(whichVecs.view_device(), whichVecsIn);
1907  }
1908  }
1909  auto whichVecs_d = whichVecs.view_device();
1910  auto whichVecs_h = whichVecs.view_host();
1911 
1912  /* The layout in the export for MultiVectors is as follows:
1913  imports = { all of the data from row exportLIDs.front() ;
1914  ....
1915  all of the data from row exportLIDs.back() }
1916  This doesn't have the best locality, but is necessary because
1917  the data for a Packet (all data associated with an LID) is
1918  required to be contiguous. */
1919 
1920  if (numVecs > 0 && importLIDs.extent(0) > 0) {
1921  using dev_exec_space = typename dual_view_type::t_dev::execution_space;
1922  using host_exec_space = typename dual_view_type::t_host::execution_space;
1923 
1924  // This fixes GitHub Issue #4418.
1925  const bool use_atomic_updates = unpackOnHost ? host_exec_space().concurrency() != 1 : dev_exec_space().concurrency() != 1;
1926 
1927  if (printDebugOutput) {
1928  std::ostringstream os;
1929  os << *prefix << "Unpack: " << combineModeToString(CM) << endl;
1930  std::cerr << os.str();
1931  }
1932 
1933  // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
1934  // custom combine modes, start editing here.
1935 
1936  if (CM == INSERT || CM == REPLACE) {
1937  using op_type = KokkosRefactor::Details::InsertOp;
1938  if (isConstantStride()) {
1939  if (unpackOnHost) {
1940  auto X_h = this->getLocalViewHost(Access::ReadWrite);
1941  unpack_array_multi_column(host_exec_space(),
1942  X_h, imports_h, importLIDs_h,
1943  op_type(), numVecs,
1944  use_atomic_updates,
1945  debugCheckIndices);
1946 
1947  } else { // unpack on device
1948  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1949  unpack_array_multi_column(space,
1950  X_d, imports_d, importLIDs_d,
1951  op_type(), numVecs,
1952  use_atomic_updates,
1953  debugCheckIndices);
1954  }
1955  } else { // not constant stride
1956  if (unpackOnHost) {
1957  auto X_h = this->getLocalViewHost(Access::ReadWrite);
1958  unpack_array_multi_column_variable_stride(host_exec_space(),
1959  X_h, imports_h,
1960  importLIDs_h,
1961  whichVecs_h,
1962  op_type(),
1963  numVecs,
1964  use_atomic_updates,
1965  debugCheckIndices);
1966  } else { // unpack on device
1967  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1968  unpack_array_multi_column_variable_stride(space,
1969  X_d, imports_d,
1970  importLIDs_d,
1971  whichVecs_d,
1972  op_type(),
1973  numVecs,
1974  use_atomic_updates,
1975  debugCheckIndices);
1976  }
1977  }
1978  } else if (CM == ADD || CM == ADD_ASSIGN) {
1979  using op_type = KokkosRefactor::Details::AddOp;
1980  if (isConstantStride()) {
1981  if (unpackOnHost) {
1982  auto X_h = this->getLocalViewHost(Access::ReadWrite);
1983  unpack_array_multi_column(host_exec_space(),
1984  X_h, imports_h, importLIDs_h,
1985  op_type(), numVecs,
1986  use_atomic_updates,
1987  debugCheckIndices);
1988  } else { // unpack on device
1989  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1990  unpack_array_multi_column(space,
1991  X_d, imports_d, importLIDs_d,
1992  op_type(), numVecs,
1993  use_atomic_updates,
1994  debugCheckIndices);
1995  }
1996  } else { // not constant stride
1997  if (unpackOnHost) {
1998  auto X_h = this->getLocalViewHost(Access::ReadWrite);
1999  unpack_array_multi_column_variable_stride(host_exec_space(),
2000  X_h, imports_h,
2001  importLIDs_h,
2002  whichVecs_h,
2003  op_type(),
2004  numVecs,
2005  use_atomic_updates,
2006  debugCheckIndices);
2007  } else { // unpack on device
2008  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2009  unpack_array_multi_column_variable_stride(space,
2010  X_d, imports_d,
2011  importLIDs_d,
2012  whichVecs_d,
2013  op_type(),
2014  numVecs,
2015  use_atomic_updates,
2016  debugCheckIndices);
2017  }
2018  }
2019  } else if (CM == ABSMAX) {
2020  using op_type = KokkosRefactor::Details::AbsMaxOp;
2021  if (isConstantStride()) {
2022  if (unpackOnHost) {
2023  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2024  unpack_array_multi_column(host_exec_space(),
2025  X_h, imports_h, importLIDs_h,
2026  op_type(), numVecs,
2027  use_atomic_updates,
2028  debugCheckIndices);
2029  } else { // unpack on device
2030  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2031  unpack_array_multi_column(space,
2032  X_d, imports_d, importLIDs_d,
2033  op_type(), numVecs,
2034  use_atomic_updates,
2035  debugCheckIndices);
2036  }
2037  } else {
2038  if (unpackOnHost) {
2039  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2040  unpack_array_multi_column_variable_stride(host_exec_space(),
2041  X_h, imports_h,
2042  importLIDs_h,
2043  whichVecs_h,
2044  op_type(),
2045  numVecs,
2046  use_atomic_updates,
2047  debugCheckIndices);
2048  } else { // unpack on device
2049  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2050  unpack_array_multi_column_variable_stride(space,
2051  X_d, imports_d,
2052  importLIDs_d,
2053  whichVecs_d,
2054  op_type(),
2055  numVecs,
2056  use_atomic_updates,
2057  debugCheckIndices);
2058  }
2059  }
2060  } else {
2061  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, "Invalid CombineMode");
2062  }
2063  } else {
2064  if (printDebugOutput) {
2065  std::ostringstream os;
2066  os << *prefix << "Nothing to unpack" << endl;
2067  std::cerr << os.str();
2068  }
2069  }
2070 
2071  if (printDebugOutput) {
2072  std::ostringstream os;
2073  os << *prefix << "Done!" << endl;
2074  std::cerr << os.str();
2075  }
2076 }
2077 
2078 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2080  unpackAndCombine(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
2081  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
2082  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
2083  const size_t constantNumPackets,
2084  const CombineMode CM) {
2085  unpackAndCombine(importLIDs, imports, numPacketsPerLID, constantNumPackets, CM, execution_space());
2086 }
2087 
2088 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2089 size_t
2092  if (isConstantStride()) {
2093  return static_cast<size_t>(view_.extent(1));
2094  } else {
2095  return static_cast<size_t>(whichVectors_.size());
2096  }
2097 }
2098 
2099 namespace { // (anonymous)
2100 
2101 template <class RV>
2102 void gblDotImpl(const RV& dotsOut,
2103  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
2104  const bool distributed) {
2105  using Teuchos::REDUCE_MAX;
2106  using Teuchos::REDUCE_SUM;
2107  using Teuchos::reduceAll;
2108  typedef typename RV::non_const_value_type dot_type;
2109 
2110  const size_t numVecs = dotsOut.extent(0);
2111 
2112  // If the MultiVector is distributed over multiple processes, do
2113  // the distributed (interprocess) part of the dot product. We
2114  // assume that the MPI implementation can read from and write to
2115  // device memory.
2116  //
2117  // replaceMap() may have removed some processes. Those
2118  // processes have a null Map. They must not participate in any
2119  // collective operations. We ask first whether the Map is null,
2120  // because isDistributed() defers that question to the Map. We
2121  // still compute and return local dots for processes not
2122  // participating in collective operations; those probably don't
2123  // make any sense, but it doesn't hurt to do them, since it's
2124  // illegal to call dot() on those processes anyway.
2125  if (distributed && !comm.is_null()) {
2126  // The calling process only participates in the collective if
2127  // both the Map and its Comm on that process are nonnull.
2128  const int nv = static_cast<int>(numVecs);
2129  const bool commIsInterComm = ::Tpetra::Details::isInterComm(*comm);
2130 
2131  if (commIsInterComm) {
2132  // If comm is an intercomm, then we may not alias input and
2133  // output buffers, so we have to make a copy of the local
2134  // sum.
2135  typename RV::non_const_type lclDots(Kokkos::ViewAllocateWithoutInitializing("tmp"), numVecs);
2136  // DEEP_COPY REVIEW - NOT TESTED
2137  Kokkos::deep_copy(lclDots, dotsOut);
2138  const dot_type* const lclSum = lclDots.data();
2139  dot_type* const gblSum = dotsOut.data();
2140  reduceAll<int, dot_type>(*comm, REDUCE_SUM, nv, lclSum, gblSum);
2141  } else {
2142  dot_type* const inout = dotsOut.data();
2143  reduceAll<int, dot_type>(*comm, REDUCE_SUM, nv, inout, inout);
2144  }
2145  }
2146 }
2147 } // namespace
2148 
2149 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2152  const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const {
2153  using Kokkos::subview;
2154  using Teuchos::Comm;
2155  using Teuchos::null;
2156  using Teuchos::RCP;
2157  using ::Tpetra::Details::Behavior;
2158  // View of all the dot product results.
2159  typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
2160  typedef typename dual_view_type::t_dev_const XMV;
2161  const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
2162 
2163  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::dot (Kokkos::View)");
2164 
2165  const size_t numVecs = this->getNumVectors();
2166  if (numVecs == 0) {
2167  return; // nothing to do
2168  }
2169  const size_t lclNumRows = this->getLocalLength();
2170  const size_t numDots = static_cast<size_t>(dots.extent(0));
2171  const bool debug = Behavior::debug();
2172 
2173  if (debug) {
2174  const bool compat = this->getMap()->isCompatible(*(A.getMap()));
2175  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!compat, std::invalid_argument,
2176  "'*this' MultiVector is not "
2177  "compatible with the input MultiVector A. We only test for this "
2178  "in debug mode.");
2179  }
2180 
2181  // FIXME (mfh 11 Jul 2014) These exception tests may not
2182  // necessarily be thrown on all processes consistently. We should
2183  // instead pass along error state with the inner product. We
2184  // could do this by setting an extra slot to
2185  // Kokkos::ArithTraits<dot_type>::one() on error. The
2186  // final sum should be
2187  // Kokkos::ArithTraits<dot_type>::zero() if not error.
2188  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2189  lclNumRows != A.getLocalLength(), std::runtime_error,
2190  "MultiVectors do not have the same local length. "
2191  "this->getLocalLength() = "
2192  << lclNumRows << " != "
2193  "A.getLocalLength() = "
2194  << A.getLocalLength() << ".");
2195  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2196  numVecs != A.getNumVectors(), std::runtime_error,
2197  "MultiVectors must have the same number of columns (vectors). "
2198  "this->getNumVectors() = "
2199  << numVecs << " != "
2200  "A.getNumVectors() = "
2201  << A.getNumVectors() << ".");
2202  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2203  numDots != numVecs, std::runtime_error,
2204  "The output array 'dots' must have the same number of entries as the "
2205  "number of columns (vectors) in *this and A. dots.extent(0) = "
2206  << numDots << " != this->getNumVectors() = " << numVecs << ".");
2207 
2208  const std::pair<size_t, size_t> colRng(0, numVecs);
2209  RV dotsOut = subview(dots, colRng);
2210  RCP<const Comm<int> > comm = this->getMap().is_null() ? null : this->getMap()->getComm();
2211 
2212  auto thisView = this->getLocalViewDevice(Access::ReadOnly);
2213  auto A_view = A.getLocalViewDevice(Access::ReadOnly);
2214 
2215  ::Tpetra::Details::lclDot<RV, XMV>(dotsOut, thisView, A_view, lclNumRows, numVecs,
2216  this->whichVectors_.getRawPtr(),
2217  A.whichVectors_.getRawPtr(),
2218  this->isConstantStride(), A.isConstantStride());
2219 
2220  // lbv 15 mar 2023: Kokkos Kernels provides non-blocking BLAS
2221  // functions unless they explicitely return a value to Host.
2222  // Here while the lclDot are on host, they are not a return
2223  // value, therefore they might be avaible to us immediately.
2224  // Adding a frnce here guarantees that we will have the lclDot
2225  // ahead of the MPI reduction.
2226  execution_space exec_space_instance = execution_space();
2227  exec_space_instance.fence("Tpetra::MultiVector::dot");
2228 
2229  gblDotImpl(dotsOut, comm, this->isDistributed());
2230 }
2231 
2232 namespace { // (anonymous)
2233 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2235 multiVectorSingleColumnDot(const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& x,
2237  using ::Tpetra::Details::ProfilingRegion;
2239  using dot_type = typename MV::dot_type;
2240  ProfilingRegion region("Tpetra::multiVectorSingleColumnDot");
2241 
2242  auto map = x.getMap();
2243  Teuchos::RCP<const Teuchos::Comm<int> > comm =
2244  map.is_null() ? Teuchos::null : map->getComm();
2245  if (comm.is_null()) {
2246 #if KOKKOS_VERSION >= 40799
2247  return KokkosKernels::ArithTraits<dot_type>::zero();
2248 #else
2249  return Kokkos::ArithTraits<dot_type>::zero();
2250 #endif
2251  } else {
2252  using LO = LocalOrdinal;
2253  // The min just ensures that we don't overwrite memory that
2254  // doesn't belong to us, in the erroneous input case where x
2255  // and y have different numbers of rows.
2256  const LO lclNumRows = static_cast<LO>(std::min(x.getLocalLength(),
2257  y.getLocalLength()));
2258  const Kokkos::pair<LO, LO> rowRng(0, lclNumRows);
2259 #if KOKKOS_VERSION >= 40799
2260  dot_type lclDot = KokkosKernels::ArithTraits<dot_type>::zero();
2261 #else
2262  dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero();
2263 #endif
2264 #if KOKKOS_VERSION >= 40799
2265  dot_type gblDot = KokkosKernels::ArithTraits<dot_type>::zero();
2266 #else
2267  dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero();
2268 #endif
2269 
2270  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2271  // const_cast<MV&>(x).sync_device ();
2272  // const_cast<MV&>(y).sync_device ();
2273 
2274  auto x_2d = x.getLocalViewDevice(Access::ReadOnly);
2275  auto x_1d = Kokkos::subview(x_2d, rowRng, 0);
2276  auto y_2d = y.getLocalViewDevice(Access::ReadOnly);
2277  auto y_1d = Kokkos::subview(y_2d, rowRng, 0);
2278  lclDot = KokkosBlas::dot(x_1d, y_1d);
2279 
2280  if (x.isDistributed()) {
2281  using Teuchos::outArg;
2282  using Teuchos::REDUCE_SUM;
2283  using Teuchos::reduceAll;
2284  reduceAll<int, dot_type>(*comm, REDUCE_SUM, lclDot, outArg(gblDot));
2285  } else {
2286  gblDot = lclDot;
2287  }
2288  return gblDot;
2289  }
2290 }
2291 } // namespace
2292 
2293 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2296  const Teuchos::ArrayView<dot_type>& dots) const {
2297  const char tfecfFuncName[] = "dot: ";
2298  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::dot (Teuchos::ArrayView)");
2299 
2300  const size_t numVecs = this->getNumVectors();
2301  const size_t lclNumRows = this->getLocalLength();
2302  const size_t numDots = static_cast<size_t>(dots.size());
2303 
2304  // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
2305  // not necessarily be thrown on all processes consistently. We
2306  // keep them for now, because MultiVector's unit tests insist on
2307  // them. In the future, we should instead pass along error state
2308  // with the inner product. We could do this by setting an extra
2309  // slot to Kokkos::ArithTraits<dot_type>::one() on error.
2310  // The final sum should be
2311  // Kokkos::ArithTraits<dot_type>::zero() if not error.
2312  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclNumRows != A.getLocalLength(), std::runtime_error,
2313  "MultiVectors do not have the same local length. "
2314  "this->getLocalLength() = "
2315  << lclNumRows << " != "
2316  "A.getLocalLength() = "
2317  << A.getLocalLength() << ".");
2318  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numVecs != A.getNumVectors(), std::runtime_error,
2319  "MultiVectors must have the same number of columns (vectors). "
2320  "this->getNumVectors() = "
2321  << numVecs << " != "
2322  "A.getNumVectors() = "
2323  << A.getNumVectors() << ".");
2324  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numDots != numVecs, std::runtime_error,
2325  "The output array 'dots' must have the same number of entries as the "
2326  "number of columns (vectors) in *this and A. dots.extent(0) = "
2327  << numDots << " != this->getNumVectors() = " << numVecs << ".");
2328 
2329  if (numVecs == 1 && this->isConstantStride() && A.isConstantStride()) {
2330  const dot_type gblDot = multiVectorSingleColumnDot(*this, A);
2331  dots[0] = gblDot;
2332  } else {
2333  this->dot(A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr(), numDots));
2334  }
2335 }
2336 
2337 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2339  norm2(const Teuchos::ArrayView<mag_type>& norms) const {
2341  using ::Tpetra::Details::NORM_TWO;
2342  using ::Tpetra::Details::ProfilingRegion;
2343  ProfilingRegion region("Tpetra::MV::norm2 (host output)");
2344 
2345  // The function needs to be able to sync X.
2346  MV& X = const_cast<MV&>(*this);
2347  multiVectorNormImpl(norms.getRawPtr(), X, NORM_TWO);
2348 }
2349 
2350 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2352  norm2(const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const {
2353  Teuchos::ArrayView<mag_type> norms_av(norms.data(), norms.extent(0));
2354  this->norm2(norms_av);
2355 }
2356 
2357 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2359  norm1(const Teuchos::ArrayView<mag_type>& norms) const {
2361  using ::Tpetra::Details::NORM_ONE;
2362  using ::Tpetra::Details::ProfilingRegion;
2363  ProfilingRegion region("Tpetra::MV::norm1 (host output)");
2364 
2365  // The function needs to be able to sync X.
2366  MV& X = const_cast<MV&>(*this);
2367  multiVectorNormImpl(norms.data(), X, NORM_ONE);
2368 }
2369 
2370 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2372  norm1(const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const {
2373  Teuchos::ArrayView<mag_type> norms_av(norms.data(), norms.extent(0));
2374  this->norm1(norms_av);
2375 }
2376 
2377 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2379  normInf(const Teuchos::ArrayView<mag_type>& norms) const {
2381  using ::Tpetra::Details::NORM_INF;
2382  using ::Tpetra::Details::ProfilingRegion;
2383  ProfilingRegion region("Tpetra::MV::normInf (host output)");
2384 
2385  // The function needs to be able to sync X.
2386  MV& X = const_cast<MV&>(*this);
2387  multiVectorNormImpl(norms.getRawPtr(), X, NORM_INF);
2388 }
2389 
2390 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2392  normInf(const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const {
2393  Teuchos::ArrayView<mag_type> norms_av(norms.data(), norms.extent(0));
2394  this->normInf(norms_av);
2395 }
2396 
2397 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2399  meanValue(const Teuchos::ArrayView<impl_scalar_type>& means) const {
2400  // KR FIXME Overload this method to take a View.
2401  using Kokkos::ALL;
2402  using Kokkos::subview;
2403  using Teuchos::Comm;
2404  using Teuchos::RCP;
2405  using Teuchos::REDUCE_SUM;
2406  using Teuchos::reduceAll;
2407 #if KOKKOS_VERSION >= 40799
2408  typedef KokkosKernels::ArithTraits<impl_scalar_type> ATS;
2409 #else
2410  typedef Kokkos::ArithTraits<impl_scalar_type> ATS;
2411 #endif
2412 
2413  const size_t lclNumRows = this->getLocalLength();
2414  const size_t numVecs = this->getNumVectors();
2415  const size_t numMeans = static_cast<size_t>(means.size());
2416 
2417  TEUCHOS_TEST_FOR_EXCEPTION(
2418  numMeans != numVecs, std::runtime_error,
2419  "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2420  << " != this->getNumVectors() = " << numVecs << ".");
2421 
2422  const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2423  const std::pair<size_t, size_t> colRng(0, numVecs);
2424 
2425  // Make sure that the final output view has the same layout as the
2426  // temporary view's host_mirror_type. Left or Right doesn't matter for
2427  // a 1-D array anyway; this is just to placate the compiler.
2428  typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2429  typedef Kokkos::View<impl_scalar_type*,
2430  typename local_view_type::host_mirror_type::array_layout,
2431  Kokkos::HostSpace,
2432  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
2433  host_local_view_type;
2434  host_local_view_type meansOut(means.getRawPtr(), numMeans);
2435 
2436  RCP<const Comm<int> > comm = this->getMap().is_null() ? Teuchos::null : this->getMap()->getComm();
2437 
2438  // If we need sync to device, then host has the most recent version.
2439  const bool useHostVersion = this->need_sync_device();
2440  if (useHostVersion) {
2441  // DualView was last modified on host, so run the local kernel there.
2442  auto X_lcl = subview(getLocalViewHost(Access::ReadOnly),
2443  rowRng, Kokkos::ALL());
2444  // Compute the local sum of each column.
2445  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums("MV::meanValue tmp", numVecs);
2446  if (isConstantStride()) {
2447  KokkosBlas::sum(lclSums, X_lcl);
2448  } else {
2449  for (size_t j = 0; j < numVecs; ++j) {
2450  const size_t col = whichVectors_[j];
2451  KokkosBlas::sum(subview(lclSums, j), subview(X_lcl, ALL(), col));
2452  }
2453  }
2454 
2455  // If there are multiple MPI processes, the all-reduce reads
2456  // from lclSums, and writes to meansOut. Otherwise, we just
2457  // copy lclSums into meansOut.
2458  if (!comm.is_null() && this->isDistributed()) {
2459  reduceAll(*comm, REDUCE_SUM, static_cast<int>(numVecs),
2460  lclSums.data(), meansOut.data());
2461  } else {
2462  // DEEP_COPY REVIEW - NOT TESTED
2463  Kokkos::deep_copy(meansOut, lclSums);
2464  }
2465  } else {
2466  // DualView was last modified on device, so run the local kernel there.
2467  auto X_lcl = subview(this->getLocalViewDevice(Access::ReadOnly),
2468  rowRng, Kokkos::ALL());
2469 
2470  // Compute the local sum of each column.
2471  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums("MV::meanValue tmp", numVecs);
2472  if (isConstantStride()) {
2473  KokkosBlas::sum(lclSums, X_lcl);
2474  } else {
2475  for (size_t j = 0; j < numVecs; ++j) {
2476  const size_t col = whichVectors_[j];
2477  KokkosBlas::sum(subview(lclSums, j), subview(X_lcl, ALL(), col));
2478  }
2479  }
2480  // lbv 10 mar 2023: Kokkos Kernels provides non-blocking BLAS
2481  // functions unless they explicitly return a value to Host.
2482  // Here while the lclSums are on the host, they are not a return
2483  // value, therefore they might be available to us immediately.
2484  // Adding a fence here guarantees that we will have the lclSums
2485  // ahead of the MPI reduction.
2486  execution_space exec_space_instance = execution_space();
2487  exec_space_instance.fence("Tpetra::MultiVector::mean");
2488 
2489  // If there are multiple MPI processes, the all-reduce reads
2490  // from lclSums, and writes to meansOut. (We assume that MPI
2491  // can read device memory.) Otherwise, we just copy lclSums
2492  // into meansOut.
2493  if (!comm.is_null() && this->isDistributed()) {
2494  reduceAll(*comm, REDUCE_SUM, static_cast<int>(numVecs),
2495  lclSums.data(), meansOut.data());
2496  } else {
2497  // DEEP_COPY REVIEW - HOST-TO-HOST - NOT TESTED FOR MPI BUILD
2498  Kokkos::deep_copy(meansOut, lclSums);
2499  }
2500  }
2501 
2502  // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2503  // to the magnitude type, since operator/ (std::complex<T>, int)
2504  // isn't necessarily defined.
2505  const impl_scalar_type OneOverN =
2506  ATS::one() / static_cast<mag_type>(this->getGlobalLength());
2507  for (size_t k = 0; k < numMeans; ++k) {
2508  meansOut(k) = meansOut(k) * OneOverN;
2509  }
2510 }
2511 
2512 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2515  typedef impl_scalar_type IST;
2516 #if KOKKOS_VERSION >= 40799
2517  typedef KokkosKernels::ArithTraits<IST> ATS;
2518 #else
2519  typedef Kokkos::ArithTraits<IST> ATS;
2520 #endif
2521  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2522  typedef typename pool_type::generator_type generator_type;
2523 
2524  const IST max = Kokkos::rand<generator_type, IST>::max();
2525  const IST min = ATS::is_signed ? IST(-max) : ATS::zero();
2526 
2527  this->randomize(static_cast<Scalar>(min), static_cast<Scalar>(max));
2528 }
2529 
2530 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2532  randomize(const Scalar& minVal, const Scalar& maxVal) {
2533  typedef impl_scalar_type IST;
2534  typedef Tpetra::Details::Static_Random_XorShift64_Pool<typename device_type::execution_space> tpetra_pool_type;
2535  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2536 
2537  // Seed the pool based on the system RNG and the MPI rank, if needed
2538  if (!tpetra_pool_type::isSet())
2539  tpetra_pool_type::resetPool(this->getMap()->getComm()->getRank());
2540 
2541  pool_type& rand_pool = tpetra_pool_type::getPool();
2542  const IST max = static_cast<IST>(maxVal);
2543  const IST min = static_cast<IST>(minVal);
2544 
2545  auto thisView = this->getLocalViewDevice(Access::OverwriteAll);
2546 
2547  if (isConstantStride()) {
2548  Kokkos::fill_random(thisView, rand_pool, min, max);
2549  } else {
2550  const size_t numVecs = getNumVectors();
2551  for (size_t k = 0; k < numVecs; ++k) {
2552  const size_t col = whichVectors_[k];
2553  auto X_k = Kokkos::subview(thisView, Kokkos::ALL(), col);
2554  Kokkos::fill_random(X_k, rand_pool, min, max);
2555  }
2556  }
2557 }
2558 
2559 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2561  putScalar(const Scalar& alpha) {
2562  using ::Tpetra::Details::ProfilingRegion;
2563  using ::Tpetra::Details::Blas::fill;
2564  using DES = typename dual_view_type::t_dev::execution_space;
2565  using HES = typename dual_view_type::t_host::execution_space;
2566  using LO = LocalOrdinal;
2567  ProfilingRegion region("Tpetra::MultiVector::putScalar");
2568 
2569  // We need this cast for cases like Scalar = std::complex<T> but
2570  // impl_scalar_type = Kokkos::complex<T>.
2571  const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
2572  const LO lclNumRows = static_cast<LO>(this->getLocalLength());
2573  const LO numVecs = static_cast<LO>(this->getNumVectors());
2574 
2575  // Modify the most recently updated version of the data. This
2576  // avoids sync'ing, which could violate users' expectations.
2577  //
2578  // If we need sync to device, then host has the most recent version.
2579  const bool runOnHost = runKernelOnHost(*this);
2580  if (!runOnHost) {
2581  auto X = this->getLocalViewDevice(Access::OverwriteAll);
2582  if (this->isConstantStride()) {
2583  fill(DES(), X, theAlpha, lclNumRows, numVecs);
2584  } else {
2585  fill(DES(), X, theAlpha, lclNumRows, numVecs,
2586  this->whichVectors_.getRawPtr());
2587  }
2588  } else { // last modified in host memory, so modify data there.
2589  auto X = this->getLocalViewHost(Access::OverwriteAll);
2590  if (this->isConstantStride()) {
2591  fill(HES(), X, theAlpha, lclNumRows, numVecs);
2592  } else {
2593  fill(HES(), X, theAlpha, lclNumRows, numVecs,
2594  this->whichVectors_.getRawPtr());
2595  }
2596  }
2597 }
2598 
2599 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2601  replaceMap(const Teuchos::RCP<const map_type>& newMap) {
2602  using Teuchos::ArrayRCP;
2603  using Teuchos::Comm;
2604  using Teuchos::RCP;
2605  using ST = Scalar;
2606  using LO = LocalOrdinal;
2607  using GO = GlobalOrdinal;
2608 
2609  // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2610  // it might work if the MV is a column view of another MV.
2611  // However, things might go wrong when restoring the original
2612  // Map, so we don't allow this case for now.
2613  TEUCHOS_TEST_FOR_EXCEPTION(
2614  !this->isConstantStride(), std::logic_error,
2615  "Tpetra::MultiVector::replaceMap: This method does not currently work "
2616  "if the MultiVector is a column view of another MultiVector (that is, if "
2617  "isConstantStride() == false).");
2618 
2619  // Case 1: current Map and new Map are both nonnull on this process.
2620  // Case 2: current Map is nonnull, new Map is null.
2621  // Case 3: current Map is null, new Map is nonnull.
2622  // Case 4: both Maps are null: forbidden.
2623  //
2624  // Case 1 means that we don't have to do anything on this process,
2625  // other than assign the new Map. (We always have to do that.)
2626  // It's an error for the user to supply a Map that requires
2627  // resizing in this case.
2628  //
2629  // Case 2 means that the calling process is in the current Map's
2630  // communicator, but will be excluded from the new Map's
2631  // communicator. We don't have to do anything on the calling
2632  // process; just leave whatever data it may have alone.
2633  //
2634  // Case 3 means that the calling process is excluded from the
2635  // current Map's communicator, but will be included in the new
2636  // Map's communicator. This means we need to (re)allocate the
2637  // local DualView if it does not have the right number of rows.
2638  // If the new number of rows is nonzero, we'll fill the newly
2639  // allocated local data with zeros, as befits a projection
2640  // operation.
2641  //
2642  // The typical use case for Case 3 is that the MultiVector was
2643  // first created with the Map with more processes, then that Map
2644  // was replaced with a Map with fewer processes, and finally the
2645  // original Map was restored on this call to replaceMap.
2646 
2647  if (this->getMap().is_null()) { // current Map is null
2648  // If this->getMap() is null, that means that this MultiVector
2649  // has already had replaceMap happen to it. In that case, just
2650  // reallocate the DualView with the right size.
2651 
2652  TEUCHOS_TEST_FOR_EXCEPTION(
2653  newMap.is_null(), std::invalid_argument,
2654  "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2655  "This probably means that the input Map is incorrect.");
2656 
2657  // Case 3: current Map is null, new Map is nonnull.
2658  // Reallocate the DualView with the right dimensions.
2659  const size_t newNumRows = newMap->getLocalNumElements();
2660  const size_t origNumRows = view_.extent(0);
2661  const size_t numCols = this->getNumVectors();
2662 
2663  if (origNumRows != newNumRows || view_.extent(1) != numCols) {
2664  view_ = allocDualView<ST, LO, GO, Node>(newNumRows, numCols);
2665  }
2666  } else if (newMap.is_null()) { // Case 2: current Map is nonnull, new Map is null
2667  // I am an excluded process. Reinitialize my data so that I
2668  // have 0 rows. Keep the number of columns as before.
2669  const size_t newNumRows = static_cast<size_t>(0);
2670  const size_t numCols = this->getNumVectors();
2671  view_ = allocDualView<ST, LO, GO, Node>(newNumRows, numCols);
2672  }
2673 
2674  this->map_ = newMap;
2675 }
2676 
2677 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2679  scale(const Scalar& alpha) {
2680  using Kokkos::ALL;
2681  using IST = impl_scalar_type;
2682 
2683  const IST theAlpha = static_cast<IST>(alpha);
2684 #if KOKKOS_VERSION >= 40799
2685  if (theAlpha == KokkosKernels::ArithTraits<IST>::one()) {
2686 #else
2687  if (theAlpha == Kokkos::ArithTraits<IST>::one()) {
2688 #endif
2689  return; // do nothing
2690  }
2691  const size_t lclNumRows = getLocalLength();
2692  const size_t numVecs = getNumVectors();
2693  const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2694  const std::pair<size_t, size_t> colRng(0, numVecs);
2695 
2696  // We can't substitute putScalar(0.0) for scale(0.0), because the
2697  // former will overwrite NaNs present in the MultiVector. The
2698  // semantics of this call require multiplying them by 0, which
2699  // IEEE 754 requires to be NaN.
2700 
2701  // If we need sync to device, then host has the most recent version.
2702  const bool useHostVersion = need_sync_device();
2703  if (useHostVersion) {
2704  auto Y_lcl = Kokkos::subview(getLocalViewHost(Access::ReadWrite), rowRng, ALL());
2705  if (isConstantStride()) {
2706  KokkosBlas::scal(Y_lcl, theAlpha, Y_lcl);
2707  } else {
2708  for (size_t k = 0; k < numVecs; ++k) {
2709  const size_t Y_col = whichVectors_[k];
2710  auto Y_k = Kokkos::subview(Y_lcl, ALL(), Y_col);
2711  KokkosBlas::scal(Y_k, theAlpha, Y_k);
2712  }
2713  }
2714  } else { // work on device
2715  auto Y_lcl = Kokkos::subview(getLocalViewDevice(Access::ReadWrite), rowRng, ALL());
2716  if (isConstantStride()) {
2717  KokkosBlas::scal(Y_lcl, theAlpha, Y_lcl);
2718  } else {
2719  for (size_t k = 0; k < numVecs; ++k) {
2720  const size_t Y_col = isConstantStride() ? k : whichVectors_[k];
2721  auto Y_k = Kokkos::subview(Y_lcl, ALL(), Y_col);
2722  KokkosBlas::scal(Y_k, theAlpha, Y_k);
2723  }
2724  }
2725  }
2726 }
2727 
2728 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2730  scale(const Teuchos::ArrayView<const Scalar>& alphas) {
2731  const size_t numVecs = this->getNumVectors();
2732  const size_t numAlphas = static_cast<size_t>(alphas.size());
2733  TEUCHOS_TEST_FOR_EXCEPTION(
2734  numAlphas != numVecs, std::invalid_argument,
2735  "Tpetra::MultiVector::"
2736  "scale: alphas.size() = "
2737  << numAlphas << " != this->getNumVectors() = "
2738  << numVecs << ".");
2739 
2740  // Use a DualView to copy the scaling constants onto the device.
2741  using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2742  k_alphas_type k_alphas("alphas::tmp", numAlphas);
2743  k_alphas.modify_host();
2744  for (size_t i = 0; i < numAlphas; ++i) {
2745  k_alphas.view_host()(i) = static_cast<impl_scalar_type>(alphas[i]);
2746  }
2747  k_alphas.sync_device();
2748  // Invoke the scale() overload that takes a device View of coefficients.
2749  this->scale(k_alphas.view_device());
2750 }
2751 
2752 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2754  scale(const Kokkos::View<const impl_scalar_type*, device_type>& alphas) {
2755  using Kokkos::ALL;
2756  using Kokkos::subview;
2757 
2758  const size_t lclNumRows = this->getLocalLength();
2759  const size_t numVecs = this->getNumVectors();
2760  TEUCHOS_TEST_FOR_EXCEPTION(
2761  static_cast<size_t>(alphas.extent(0)) != numVecs,
2762  std::invalid_argument,
2763  "Tpetra::MultiVector::scale(alphas): "
2764  "alphas.extent(0) = "
2765  << alphas.extent(0)
2766  << " != this->getNumVectors () = " << numVecs << ".");
2767  const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2768  const std::pair<size_t, size_t> colRng(0, numVecs);
2769 
2770  // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2771  // type of the return value of subview. This is because if we
2772  // switch the array layout from LayoutLeft to LayoutRight
2773  // (preferred for performance of block operations), the types
2774  // below won't be valid. (A view of a column of a LayoutRight
2775  // multivector has LayoutStride, not LayoutLeft.)
2776 
2777  // If we need sync to device, then host has the most recent version.
2778  const bool useHostVersion = this->need_sync_device();
2779  if (useHostVersion) {
2780  // Work in host memory. This means we need to create a host
2781  // mirror of the input View of coefficients.
2782  auto alphas_h = Kokkos::create_mirror_view(alphas);
2783  // DEEP_COPY REVIEW - NOT TESTED
2784  Kokkos::deep_copy(alphas_h, alphas);
2785 
2786  auto Y_lcl = subview(this->getLocalViewHost(Access::ReadWrite), rowRng, ALL());
2787  if (isConstantStride()) {
2788  KokkosBlas::scal(Y_lcl, alphas_h, Y_lcl);
2789  } else {
2790  for (size_t k = 0; k < numVecs; ++k) {
2791  const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2792  auto Y_k = subview(Y_lcl, ALL(), Y_col);
2793  // We don't have to use the entire 1-D View here; we can use
2794  // the version that takes a scalar coefficient.
2795  KokkosBlas::scal(Y_k, alphas_h(k), Y_k);
2796  }
2797  }
2798  } else { // Work in device memory, using the input View 'alphas' directly.
2799  auto Y_lcl = subview(this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL());
2800  if (isConstantStride()) {
2801  KokkosBlas::scal(Y_lcl, alphas, Y_lcl);
2802  } else {
2803  // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2804  // as values on host, so copy them to host. Another approach
2805  // would be to fix scal() so that it takes a 0-D View as the
2806  // second argument.
2807  auto alphas_h = Kokkos::create_mirror_view(alphas);
2808  // DEEP_COPY REVIEW - NOT TESTED
2809  Kokkos::deep_copy(alphas_h, alphas);
2810 
2811  for (size_t k = 0; k < numVecs; ++k) {
2812  const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2813  auto Y_k = subview(Y_lcl, ALL(), Y_col);
2814  KokkosBlas::scal(Y_k, alphas_h(k), Y_k);
2815  }
2816  }
2817  }
2818 }
2819 
2820 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2822  scale(const Scalar& alpha,
2824  using Kokkos::ALL;
2825  using Kokkos::subview;
2826  const char tfecfFuncName[] = "scale: ";
2827 
2828  const size_t lclNumRows = getLocalLength();
2829  const size_t numVecs = getNumVectors();
2830 
2831  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2832  lclNumRows != A.getLocalLength(), std::invalid_argument,
2833  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2834  << A.getLocalLength() << ".");
2835  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2836  numVecs != A.getNumVectors(), std::invalid_argument,
2837  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2838  << A.getNumVectors() << ".");
2839 
2840  const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
2841  const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2842  const std::pair<size_t, size_t> colRng(0, numVecs);
2843 
2844  auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2845  auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2846  auto Y_lcl = subview(Y_lcl_orig, rowRng, ALL());
2847  auto X_lcl = subview(X_lcl_orig, rowRng, ALL());
2848 
2849  if (isConstantStride() && A.isConstantStride()) {
2850  KokkosBlas::scal(Y_lcl, theAlpha, X_lcl);
2851  } else {
2852  // Make sure that Kokkos only uses the local length for add.
2853  for (size_t k = 0; k < numVecs; ++k) {
2854  const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2855  const size_t X_col = A.isConstantStride() ? k : A.whichVectors_[k];
2856  auto Y_k = subview(Y_lcl, ALL(), Y_col);
2857  auto X_k = subview(X_lcl, ALL(), X_col);
2858 
2859  KokkosBlas::scal(Y_k, theAlpha, X_k);
2860  }
2861  }
2862 }
2863 
2864 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2867  const char tfecfFuncName[] = "reciprocal: ";
2868 
2869  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2870  getLocalLength() != A.getLocalLength(), std::runtime_error,
2871  "MultiVectors do not have the same local length. "
2872  "this->getLocalLength() = "
2873  << getLocalLength()
2874  << " != A.getLocalLength() = " << A.getLocalLength() << ".");
2875  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2876  A.getNumVectors() != this->getNumVectors(), std::runtime_error,
2877  ": MultiVectors do not have the same number of columns (vectors). "
2878  "this->getNumVectors() = "
2879  << getNumVectors()
2880  << " != A.getNumVectors() = " << A.getNumVectors() << ".");
2881 
2882  const size_t numVecs = getNumVectors();
2883 
2884  auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
2885  auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
2886 
2887  if (isConstantStride() && A.isConstantStride()) {
2888  KokkosBlas::reciprocal(this_view_dev, A_view_dev);
2889  } else {
2890  using Kokkos::ALL;
2891  using Kokkos::subview;
2892  for (size_t k = 0; k < numVecs; ++k) {
2893  const size_t this_col = isConstantStride() ? k : whichVectors_[k];
2894  auto vector_k = subview(this_view_dev, ALL(), this_col);
2895  const size_t A_col = isConstantStride() ? k : A.whichVectors_[k];
2896  auto vector_Ak = subview(A_view_dev, ALL(), A_col);
2897  KokkosBlas::reciprocal(vector_k, vector_Ak);
2898  }
2899  }
2900 }
2901 
2902 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2905  const char tfecfFuncName[] = "abs";
2906 
2907  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2908  getLocalLength() != A.getLocalLength(), std::runtime_error,
2909  ": MultiVectors do not have the same local length. "
2910  "this->getLocalLength() = "
2911  << getLocalLength()
2912  << " != A.getLocalLength() = " << A.getLocalLength() << ".");
2913  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2914  A.getNumVectors() != this->getNumVectors(), std::runtime_error,
2915  ": MultiVectors do not have the same number of columns (vectors). "
2916  "this->getNumVectors() = "
2917  << getNumVectors()
2918  << " != A.getNumVectors() = " << A.getNumVectors() << ".");
2919  const size_t numVecs = getNumVectors();
2920 
2921  auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
2922  auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
2923 
2924  if (isConstantStride() && A.isConstantStride()) {
2925  KokkosBlas::abs(this_view_dev, A_view_dev);
2926  } else {
2927  using Kokkos::ALL;
2928  using Kokkos::subview;
2929 
2930  for (size_t k = 0; k < numVecs; ++k) {
2931  const size_t this_col = isConstantStride() ? k : whichVectors_[k];
2932  auto vector_k = subview(this_view_dev, ALL(), this_col);
2933  const size_t A_col = isConstantStride() ? k : A.whichVectors_[k];
2934  auto vector_Ak = subview(A_view_dev, ALL(), A_col);
2935  KokkosBlas::abs(vector_k, vector_Ak);
2936  }
2937  }
2938 }
2939 
2940 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2942  update(const Scalar& alpha,
2944  const Scalar& beta) {
2945  const char tfecfFuncName[] = "update: ";
2946  using Kokkos::ALL;
2947  using Kokkos::subview;
2948 
2949  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::update(alpha,A,beta)");
2950 
2951  const size_t lclNumRows = getLocalLength();
2952  const size_t numVecs = getNumVectors();
2953 
2955  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2956  lclNumRows != A.getLocalLength(), std::invalid_argument,
2957  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2958  << A.getLocalLength() << ".");
2959  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2960  numVecs != A.getNumVectors(), std::invalid_argument,
2961  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2962  << A.getNumVectors() << ".");
2963  }
2964 
2965  const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
2966  const impl_scalar_type theBeta = static_cast<impl_scalar_type>(beta);
2967  const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2968  const std::pair<size_t, size_t> colRng(0, numVecs);
2969 
2970  auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2971  auto Y_lcl = subview(Y_lcl_orig, rowRng, Kokkos::ALL());
2972  auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2973  auto X_lcl = subview(X_lcl_orig, rowRng, Kokkos::ALL());
2974 
2975  // The device memory of *this is about to be modified
2976  if (isConstantStride() && A.isConstantStride()) {
2977  KokkosBlas::axpby(theAlpha, X_lcl, theBeta, Y_lcl);
2978  } else {
2979  // Make sure that Kokkos only uses the local length for add.
2980  for (size_t k = 0; k < numVecs; ++k) {
2981  const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2982  const size_t X_col = A.isConstantStride() ? k : A.whichVectors_[k];
2983  auto Y_k = subview(Y_lcl, ALL(), Y_col);
2984  auto X_k = subview(X_lcl, ALL(), X_col);
2985 
2986  KokkosBlas::axpby(theAlpha, X_k, theBeta, Y_k);
2987  }
2988  }
2989 }
2990 
2991 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2993  update(const Scalar& alpha,
2995  const Scalar& beta,
2997  const Scalar& gamma) {
2998  using Kokkos::ALL;
2999  using Kokkos::subview;
3000 
3001  const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
3002 
3003  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::update(alpha,A,beta,B,gamma)");
3004 
3005  const size_t lclNumRows = this->getLocalLength();
3006  const size_t numVecs = getNumVectors();
3007 
3009  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3010  lclNumRows != A.getLocalLength(), std::invalid_argument,
3011  "The input MultiVector A has " << A.getLocalLength() << " local "
3012  "row(s), but this MultiVector has "
3013  << lclNumRows << " local "
3014  "row(s).");
3015  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3016  lclNumRows != B.getLocalLength(), std::invalid_argument,
3017  "The input MultiVector B has " << B.getLocalLength() << " local "
3018  "row(s), but this MultiVector has "
3019  << lclNumRows << " local "
3020  "row(s).");
3021  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3022  A.getNumVectors() != numVecs, std::invalid_argument,
3023  "The input MultiVector A has " << A.getNumVectors() << " column(s), "
3024  "but this MultiVector has "
3025  << numVecs << " column(s).");
3026  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3027  B.getNumVectors() != numVecs, std::invalid_argument,
3028  "The input MultiVector B has " << B.getNumVectors() << " column(s), "
3029  "but this MultiVector has "
3030  << numVecs << " column(s).");
3031  }
3032 
3033  const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
3034  const impl_scalar_type theBeta = static_cast<impl_scalar_type>(beta);
3035  const impl_scalar_type theGamma = static_cast<impl_scalar_type>(gamma);
3036 
3037  const std::pair<size_t, size_t> rowRng(0, lclNumRows);
3038  const std::pair<size_t, size_t> colRng(0, numVecs);
3039 
3040  // Prefer 'auto' over specifying the type explicitly. This avoids
3041  // issues with a subview possibly having a different type than the
3042  // original view.
3043  auto C_lcl = subview(this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL());
3044  auto A_lcl = subview(A.getLocalViewDevice(Access::ReadOnly), rowRng, ALL());
3045  auto B_lcl = subview(B.getLocalViewDevice(Access::ReadOnly), rowRng, ALL());
3046 
3047  if (isConstantStride() && A.isConstantStride() && B.isConstantStride()) {
3048  KokkosBlas::update(theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
3049  } else {
3050  // Some input (or *this) is not constant stride,
3051  // so perform the update one column at a time.
3052  for (size_t k = 0; k < numVecs; ++k) {
3053  const size_t this_col = isConstantStride() ? k : whichVectors_[k];
3054  const size_t A_col = A.isConstantStride() ? k : A.whichVectors_[k];
3055  const size_t B_col = B.isConstantStride() ? k : B.whichVectors_[k];
3056  KokkosBlas::update(theAlpha, subview(A_lcl, rowRng, A_col),
3057  theBeta, subview(B_lcl, rowRng, B_col),
3058  theGamma, subview(C_lcl, rowRng, this_col));
3059  }
3060  }
3061 }
3062 
3063 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3064 Teuchos::ArrayRCP<const Scalar>
3066  getData(size_t j) const {
3067  using Kokkos::ALL;
3068  using IST = impl_scalar_type;
3069  const char tfecfFuncName[] = "getData: ";
3070 
3071  // Any MultiVector method that called the (classic) Kokkos Node's
3072  // viewBuffer or viewBufferNonConst methods always implied a
3073  // device->host synchronization. Thus, we synchronize here as
3074  // well.
3075 
3076  auto hostView = getLocalViewHost(Access::ReadOnly);
3077  const size_t col = isConstantStride() ? j : whichVectors_[j];
3078  auto hostView_j = Kokkos::subview(hostView, ALL(), col);
3079  Teuchos::ArrayRCP<const IST> dataAsArcp =
3080  Kokkos::Compat::persistingView(hostView_j, 0, getLocalLength());
3081 
3082  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(hostView_j.extent(0)) <
3083  static_cast<size_t>(dataAsArcp.size()),
3084  std::logic_error,
3085  "hostView_j.extent(0) = " << hostView_j.extent(0)
3086  << " < dataAsArcp.size() = " << dataAsArcp.size() << ". "
3087  "Please report this bug to the Tpetra developers.");
3088 
3089  return Teuchos::arcp_reinterpret_cast<const Scalar>(dataAsArcp);
3090 }
3091 
3092 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3093 Teuchos::ArrayRCP<Scalar>
3095  getDataNonConst(size_t j) {
3096  using Kokkos::ALL;
3097  using Kokkos::subview;
3098  using IST = impl_scalar_type;
3099  const char tfecfFuncName[] = "getDataNonConst: ";
3100 
3101  auto hostView = getLocalViewHost(Access::ReadWrite);
3102  const size_t col = isConstantStride() ? j : whichVectors_[j];
3103  auto hostView_j = subview(hostView, ALL(), col);
3104  Teuchos::ArrayRCP<IST> dataAsArcp =
3105  Kokkos::Compat::persistingView(hostView_j, 0, getLocalLength());
3106 
3107  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(hostView_j.extent(0)) <
3108  static_cast<size_t>(dataAsArcp.size()),
3109  std::logic_error,
3110  "hostView_j.extent(0) = " << hostView_j.extent(0)
3111  << " < dataAsArcp.size() = " << dataAsArcp.size() << ". "
3112  "Please report this bug to the Tpetra developers.");
3113 
3114  return Teuchos::arcp_reinterpret_cast<Scalar>(dataAsArcp);
3115 }
3116 
3117 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3118 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3120  subCopy(const Teuchos::ArrayView<const size_t>& cols) const {
3121  using Teuchos::RCP;
3123 
3124  // Check whether the index set in cols is contiguous. If it is,
3125  // use the more efficient Range1D version of subCopy.
3126  bool contiguous = true;
3127  const size_t numCopyVecs = static_cast<size_t>(cols.size());
3128  for (size_t j = 1; j < numCopyVecs; ++j) {
3129  if (cols[j] != cols[j - 1] + static_cast<size_t>(1)) {
3130  contiguous = false;
3131  break;
3132  }
3133  }
3134  if (contiguous && numCopyVecs > 0) {
3135  return this->subCopy(Teuchos::Range1D(cols[0], cols[numCopyVecs - 1]));
3136  } else {
3137  RCP<const MV> X_sub = this->subView(cols);
3138  RCP<MV> Y(new MV(this->getMap(), numCopyVecs, false));
3139  Y->assign(*X_sub);
3140  return Y;
3141  }
3142 }
3143 
3144 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3145 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3147  subCopy(const Teuchos::Range1D& colRng) const {
3148  using Teuchos::RCP;
3150 
3151  RCP<const MV> X_sub = this->subView(colRng);
3152  RCP<MV> Y(new MV(this->getMap(), static_cast<size_t>(colRng.size()), false));
3153  Y->assign(*X_sub);
3154  return Y;
3155 }
3156 
3157 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3158 size_t
3161  return view_.origExtent(0);
3162 }
3163 
3164 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3165 size_t
3168  return view_.origExtent(1);
3169 }
3170 
3171 template <class Scalar, class LO, class GO, class Node>
3174  const Teuchos::RCP<const map_type>& subMap,
3175  const local_ordinal_type rowOffset)
3176  : base_type(subMap) {
3177  using Kokkos::ALL;
3178  using Kokkos::subview;
3179  using std::endl;
3180  using Teuchos::outArg;
3181  using Teuchos::RCP;
3182  using Teuchos::rcp;
3183  using Teuchos::REDUCE_MIN;
3184  using Teuchos::reduceAll;
3186  const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3187  const char suffix[] = "Please report this bug to the Tpetra developers.";
3188  int lclGood = 1;
3189  int gblGood = 1;
3190  std::unique_ptr<std::ostringstream> errStrm;
3191  const bool debug = ::Tpetra::Details::Behavior::debug();
3192  const bool verbose = ::Tpetra::Details::Behavior::verbose();
3193 
3194  // Be careful to use the input Map's communicator, not X's. The
3195  // idea is that, on return, *this is a subview of X, using the
3196  // input Map.
3197  const auto comm = subMap->getComm();
3198  TEUCHOS_ASSERT(!comm.is_null());
3199  const int myRank = comm->getRank();
3200 
3201  const LO lclNumRowsBefore = static_cast<LO>(X.getLocalLength());
3202  const LO numCols = static_cast<LO>(X.getNumVectors());
3203  const LO newNumRows = static_cast<LO>(subMap->getLocalNumElements());
3204  if (verbose) {
3205  std::ostringstream os;
3206  os << "Proc " << myRank << ": " << prefix
3207  << "X: {lclNumRows: " << lclNumRowsBefore
3208  << ", origLclNumRows: " << X.getOrigNumLocalRows()
3209  << ", numCols: " << numCols << "}, "
3210  << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3211  std::cerr << os.str();
3212  }
3213  // We ask for the _original_ number of rows in X, because X could
3214  // be a shorter (fewer rows) view of a longer MV. (For example, X
3215  // could be a domain Map view of a column Map MV.)
3216  const bool tooManyElts =
3217  newNumRows + rowOffset > static_cast<LO>(X.getOrigNumLocalRows());
3218  if (tooManyElts) {
3219  errStrm = std::unique_ptr<std::ostringstream>(new std::ostringstream);
3220  *errStrm << " Proc " << myRank << ": subMap->getLocalNumElements() (="
3221  << newNumRows << ") + rowOffset (=" << rowOffset
3222  << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows()
3223  << ")." << endl;
3224  lclGood = 0;
3225  TEUCHOS_TEST_FOR_EXCEPTION(!debug && tooManyElts, std::invalid_argument,
3226  prefix << errStrm->str() << suffix);
3227  }
3228 
3229  if (debug) {
3230  reduceAll<int, int>(*comm, REDUCE_MIN, lclGood, outArg(gblGood));
3231  if (gblGood != 1) {
3232  std::ostringstream gblErrStrm;
3233  const std::string myErrStr =
3234  errStrm.get() != nullptr ? errStrm->str() : std::string("");
3235  ::Tpetra::Details::gathervPrint(gblErrStrm, myErrStr, *comm);
3236  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, gblErrStrm.str());
3237  }
3238  }
3239 
3240  using range_type = std::pair<LO, LO>;
3241  const range_type origRowRng(rowOffset, static_cast<LO>(X.view_.origExtent(0)));
3242  const range_type rowRng(rowOffset, rowOffset + newNumRows);
3243 
3244  wrapped_dual_view_type newView = takeSubview(X.view_, rowRng, ALL());
3245 
3246  // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3247  // handling subviews of degenerate Views quite so well. For some
3248  // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3249  // 0. We work around by creating a new empty DualView of the
3250  // desired (degenerate) dimensions.
3251  if (newView.extent(0) == 0 &&
3252  newView.extent(1) != X.view_.extent(1)) {
3253  newView =
3254  allocDualView<Scalar, LO, GO, Node>(0, X.getNumVectors());
3255  }
3256 
3257  MV subViewMV = X.isConstantStride() ? MV(subMap, newView) : MV(subMap, newView, X.whichVectors_());
3258 
3259  if (debug) {
3260  const LO lclNumRowsRet = static_cast<LO>(subViewMV.getLocalLength());
3261  const LO numColsRet = static_cast<LO>(subViewMV.getNumVectors());
3262  if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3263  lclGood = 0;
3264  if (errStrm.get() == nullptr) {
3265  errStrm = std::unique_ptr<std::ostringstream>(new std::ostringstream);
3266  }
3267  *errStrm << " Proc " << myRank << ": subMap.getLocalNumElements(): " << newNumRows << ", subViewMV.getLocalLength(): " << lclNumRowsRet << ", X.getNumVectors(): " << numCols << ", subViewMV.getNumVectors(): " << numColsRet << endl;
3268  }
3269  reduceAll<int, int>(*comm, REDUCE_MIN, lclGood, outArg(gblGood));
3270  if (gblGood != 1) {
3271  std::ostringstream gblErrStrm;
3272  if (myRank == 0) {
3273  gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3274  "dimensions on one or more processes:"
3275  << endl;
3276  }
3277  const std::string myErrStr =
3278  errStrm.get() != nullptr ? errStrm->str() : std::string("");
3279  ::Tpetra::Details::gathervPrint(gblErrStrm, myErrStr, *comm);
3280  gblErrStrm << suffix << endl;
3281  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, gblErrStrm.str());
3282  }
3283  }
3284 
3285  if (verbose) {
3286  std::ostringstream os;
3287  os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3288  std::cerr << os.str();
3289  }
3290 
3291  *this = subViewMV; // shallow copy
3292 
3293  if (verbose) {
3294  std::ostringstream os;
3295  os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3296  std::cerr << os.str();
3297  }
3298 }
3299 
3300 template <class Scalar, class LO, class GO, class Node>
3303  const map_type& subMap,
3304  const size_t rowOffset)
3305  : MultiVector(X, Teuchos::RCP<const map_type>(new map_type(subMap)),
3306  static_cast<local_ordinal_type>(rowOffset)) {}
3307 
3308 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3309 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3311  offsetView(const Teuchos::RCP<const map_type>& subMap,
3312  const size_t offset) const {
3314  return Teuchos::rcp(new MV(*this, *subMap, offset));
3315 }
3316 
3317 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3318 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3320  offsetViewNonConst(const Teuchos::RCP<const map_type>& subMap,
3321  const size_t offset) {
3323  return Teuchos::rcp(new MV(*this, *subMap, offset));
3324 }
3325 
3326 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3327 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3329  subView(const Teuchos::ArrayView<const size_t>& cols) const {
3330  using Teuchos::Array;
3331  using Teuchos::rcp;
3333 
3334  const size_t numViewCols = static_cast<size_t>(cols.size());
3335  TEUCHOS_TEST_FOR_EXCEPTION(
3336  numViewCols < 1, std::runtime_error,
3337  "Tpetra::MultiVector::subView"
3338  "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3339  "contain at least one entry, but cols.size() = "
3340  << cols.size()
3341  << " == 0.");
3342 
3343  // Check whether the index set in cols is contiguous. If it is,
3344  // use the more efficient Range1D version of subView.
3345  bool contiguous = true;
3346  for (size_t j = 1; j < numViewCols; ++j) {
3347  if (cols[j] != cols[j - 1] + static_cast<size_t>(1)) {
3348  contiguous = false;
3349  break;
3350  }
3351  }
3352  if (contiguous) {
3353  if (numViewCols == 0) {
3354  // The output MV has no columns, so there is nothing to view.
3355  return rcp(new MV(this->getMap(), numViewCols));
3356  } else {
3357  // Use the more efficient contiguous-index-range version.
3358  return this->subView(Teuchos::Range1D(cols[0], cols[numViewCols - 1]));
3359  }
3360  }
3361 
3362  if (isConstantStride()) {
3363  return rcp(new MV(this->getMap(), view_, cols));
3364  } else {
3365  Array<size_t> newcols(cols.size());
3366  for (size_t j = 0; j < numViewCols; ++j) {
3367  newcols[j] = whichVectors_[cols[j]];
3368  }
3369  return rcp(new MV(this->getMap(), view_, newcols()));
3370  }
3371 }
3372 
3373 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3374 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3376  subView(const Teuchos::Range1D& colRng) const {
3377  using Kokkos::ALL;
3378  using Kokkos::subview;
3379  using Teuchos::Array;
3380  using Teuchos::RCP;
3381  using Teuchos::rcp;
3382  using ::Tpetra::Details::Behavior;
3384  const char tfecfFuncName[] = "subView(Range1D): ";
3385 
3386  const size_t lclNumRows = this->getLocalLength();
3387  const size_t numVecs = this->getNumVectors();
3388  // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3389  // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3390  // "at least one vector.");
3391  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3392  static_cast<size_t>(colRng.size()) > numVecs, std::runtime_error,
3393  "colRng.size() = " << colRng.size() << " > this->getNumVectors() = "
3394  << numVecs << ".");
3395  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3396  numVecs != 0 && colRng.size() != 0 &&
3397  (colRng.lbound() < static_cast<Teuchos::Ordinal>(0) ||
3398  static_cast<size_t>(colRng.ubound()) >= numVecs),
3399  std::invalid_argument, "Nonempty input range [" << colRng.lbound() << "," << colRng.ubound() << "] exceeds the valid range of column indices "
3400  "[0, "
3401  << numVecs << "].");
3402 
3403  RCP<const MV> X_ret; // the MultiVector subview to return
3404 
3405  // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3406  // broken for the case of views with zero rows. I will brutally
3407  // enforce that the subview has the correct dimensions. In
3408  // particular, in the case of zero rows, I will, if necessary,
3409  // create a new dual_view_type with zero rows and the correct
3410  // number of columns. In a debug build, I will use an all-reduce
3411  // to ensure that it has the correct dimensions on all processes.
3412 
3413  const std::pair<size_t, size_t> rows(0, lclNumRows);
3414  if (colRng.size() == 0) {
3415  const std::pair<size_t, size_t> cols(0, 0); // empty range
3416  wrapped_dual_view_type X_sub = takeSubview(this->view_, ALL(), cols);
3417  X_ret = rcp(new MV(this->getMap(), X_sub));
3418  } else {
3419  // Returned MultiVector is constant stride only if *this is.
3420  if (isConstantStride()) {
3421  const std::pair<size_t, size_t> cols(colRng.lbound(),
3422  colRng.ubound() + 1);
3423  wrapped_dual_view_type X_sub = takeSubview(this->view_, ALL(), cols);
3424  X_ret = rcp(new MV(this->getMap(), X_sub));
3425  } else {
3426  if (static_cast<size_t>(colRng.size()) == static_cast<size_t>(1)) {
3427  // We're only asking for one column, so the result does have
3428  // constant stride, even though this MultiVector does not.
3429  const std::pair<size_t, size_t> col(whichVectors_[0] + colRng.lbound(),
3430  whichVectors_[0] + colRng.ubound() + 1);
3431  wrapped_dual_view_type X_sub = takeSubview(view_, ALL(), col);
3432  X_ret = rcp(new MV(this->getMap(), X_sub));
3433  } else {
3434  Array<size_t> which(whichVectors_.begin() + colRng.lbound(),
3435  whichVectors_.begin() + colRng.ubound() + 1);
3436  X_ret = rcp(new MV(this->getMap(), view_, which));
3437  }
3438  }
3439  }
3440 
3441  const bool debug = Behavior::debug();
3442  if (debug) {
3443  using Teuchos::Comm;
3444  using Teuchos::outArg;
3445  using Teuchos::REDUCE_MIN;
3446  using Teuchos::reduceAll;
3447 
3448  RCP<const Comm<int> > comm = this->getMap().is_null() ? Teuchos::null : this->getMap()->getComm();
3449  if (!comm.is_null()) {
3450  int lclSuccess = 1;
3451  int gblSuccess = 1;
3452 
3453  if (X_ret.is_null()) {
3454  lclSuccess = 0;
3455  }
3456  reduceAll<int, int>(*comm, REDUCE_MIN, lclSuccess, outArg(gblSuccess));
3457  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclSuccess != 1, std::logic_error,
3458  "X_ret (the subview of this "
3459  "MultiVector; the return value of this method) is null on some MPI "
3460  "process in this MultiVector's communicator. This should never "
3461  "happen. Please report this bug to the Tpetra developers.");
3462  if (!X_ret.is_null() &&
3463  X_ret->getNumVectors() != static_cast<size_t>(colRng.size())) {
3464  lclSuccess = 0;
3465  }
3466  reduceAll<int, int>(*comm, REDUCE_MIN, lclSuccess,
3467  outArg(gblSuccess));
3468  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclSuccess != 1, std::logic_error,
3469  "X_ret->getNumVectors() != "
3470  "colRng.size(), on at least one MPI process in this MultiVector's "
3471  "communicator. This should never happen. "
3472  "Please report this bug to the Tpetra developers.");
3473  }
3474  }
3475  return X_ret;
3476 }
3477 
3478 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3479 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3481  subViewNonConst(const Teuchos::ArrayView<const size_t>& cols) {
3483  return Teuchos::rcp_const_cast<MV>(this->subView(cols));
3484 }
3485 
3486 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3487 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3489  subViewNonConst(const Teuchos::Range1D& colRng) {
3491  return Teuchos::rcp_const_cast<MV>(this->subView(colRng));
3492 }
3493 
3494 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3497  const size_t j)
3498  : base_type(X.getMap()) {
3499  using Kokkos::subview;
3500  typedef std::pair<size_t, size_t> range_type;
3501  const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3502 
3503  const size_t numCols = X.getNumVectors();
3504  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(j >= numCols, std::invalid_argument, "Input index j (== " << j << ") exceeds valid column index range [0, " << numCols << " - 1].");
3505  const size_t jj = X.isConstantStride() ? static_cast<size_t>(j) : static_cast<size_t>(X.whichVectors_[j]);
3506  this->view_ = takeSubview(X.view_, Kokkos::ALL(), range_type(jj, jj + 1));
3507 
3508  // mfh 31 Jul 2017: It would be unwise to execute concurrent
3509  // Export or Import operations with different subviews of a
3510  // MultiVector. Thus, it is safe to reuse communication buffers.
3511  // See #1560 discussion.
3512  //
3513  // We only need one column's worth of buffer for imports_ and
3514  // exports_. Taking subviews now ensures that their lengths will
3515  // be exactly what we need, so we won't have to resize them later.
3516  {
3517  const size_t newSize = X.imports_.extent(0) / numCols;
3518  const size_t offset = jj * newSize;
3519  auto device_view = subview(X.imports_.view_device(),
3520  range_type(offset, offset + newSize));
3521  auto host_view = subview(X.imports_.view_host(),
3522  range_type(offset, offset + newSize));
3523  this->imports_ = decltype(X.imports_)(device_view, host_view);
3524  }
3525  {
3526  const size_t newSize = X.exports_.extent(0) / numCols;
3527  const size_t offset = jj * newSize;
3528  auto device_view = subview(X.exports_.view_device(),
3529  range_type(offset, offset + newSize));
3530  auto host_view = subview(X.exports_.view_host(),
3531  range_type(offset, offset + newSize));
3532  this->exports_ = decltype(X.exports_)(device_view, host_view);
3533  }
3534  // These two DualViews already either have the right number of
3535  // entries, or zero entries. This means that we don't need to
3536  // resize them.
3539 }
3540 
3541 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3542 Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3544  getVector(const size_t j) const {
3546  return Teuchos::rcp(new V(*this, j));
3547 }
3548 
3549 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3550 Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3552  getVectorNonConst(const size_t j) {
3554  return Teuchos::rcp(new V(*this, j));
3555 }
3556 
3557 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3559  get1dCopy(const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const {
3560  using IST = impl_scalar_type;
3561  using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3562  Kokkos::HostSpace,
3563  Kokkos::MemoryUnmanaged>;
3564  const char tfecfFuncName[] = "get1dCopy: ";
3565 
3566  const size_t numRows = this->getLocalLength();
3567  const size_t numCols = this->getNumVectors();
3568 
3569  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < numRows, std::runtime_error,
3570  "LDA = " << LDA << " < numRows = " << numRows << ".");
3571  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numRows > size_t(0) && numCols > size_t(0) &&
3572  size_t(A.size()) < LDA * (numCols - 1) + numRows,
3573  std::runtime_error,
3574  "A.size() = " << A.size() << ", but its size must be at least "
3575  << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3576 
3577  const std::pair<size_t, size_t> rowRange(0, numRows);
3578  const std::pair<size_t, size_t> colRange(0, numCols);
3579 
3580  input_view_type A_view_orig(reinterpret_cast<IST*>(A.getRawPtr()),
3581  LDA, numCols);
3582  auto A_view = Kokkos::subview(A_view_orig, rowRange, colRange);
3583 
3595  if (this->need_sync_host() && this->need_sync_device()) {
3598  throw std::runtime_error("Tpetra::MultiVector: A non-const view is alive outside and we cannot give a copy where host or device view can be modified outside");
3599  } else {
3600  const bool useHostView = view_.host_view_use_count() >= view_.device_view_use_count();
3601  if (this->isConstantStride()) {
3602  if (useHostView) {
3603  auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3604  // DEEP_COPY REVIEW - NOT TESTED
3605  Kokkos::deep_copy(A_view, srcView_host);
3606  } else {
3607  auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3608  // DEEP_COPY REVIEW - NOT TESTED
3609  Kokkos::deep_copy(A_view, srcView_device);
3610  }
3611  } else {
3612  for (size_t j = 0; j < numCols; ++j) {
3613  const size_t srcCol = this->whichVectors_[j];
3614  auto dstColView = Kokkos::subview(A_view, rowRange, j);
3615 
3616  if (useHostView) {
3617  auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3618  auto srcColView_host = Kokkos::subview(srcView_host, rowRange, srcCol);
3619  // DEEP_COPY REVIEW - NOT TESTED
3620  Kokkos::deep_copy(dstColView, srcColView_host);
3621  } else {
3622  auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3623  auto srcColView_device = Kokkos::subview(srcView_device, rowRange, srcCol);
3624  // DEEP_COPY REVIEW - NOT TESTED
3625  Kokkos::deep_copy(dstColView, srcColView_device);
3626  }
3627  }
3628  }
3629  }
3630 }
3631 
3632 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3634  get2dCopy(const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const {
3636  const char tfecfFuncName[] = "get2dCopy: ";
3637  const size_t numRows = this->getLocalLength();
3638  const size_t numCols = this->getNumVectors();
3639 
3640  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3641  static_cast<size_t>(ArrayOfPtrs.size()) != numCols,
3642  std::runtime_error,
3643  "Input array of pointers must contain as many "
3644  "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3645  << ArrayOfPtrs.size() << " != getNumVectors() = " << numCols << ".");
3646 
3647  if (numRows != 0 && numCols != 0) {
3648  // No side effects until we've validated the input.
3649  for (size_t j = 0; j < numCols; ++j) {
3650  const size_t dstLen = static_cast<size_t>(ArrayOfPtrs[j].size());
3651  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3652  dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3653  "the input array of arrays is not long enough to fit all entries in "
3654  "that column of the MultiVector. ArrayOfPtrs[j].size() = "
3655  << dstLen << " < getLocalLength() = " << numRows << ".");
3656  }
3657 
3658  // We've validated the input, so it's safe to start copying.
3659  for (size_t j = 0; j < numCols; ++j) {
3660  Teuchos::RCP<const V> X_j = this->getVector(j);
3661  const size_t LDA = static_cast<size_t>(ArrayOfPtrs[j].size());
3662  X_j->get1dCopy(ArrayOfPtrs[j], LDA);
3663  }
3664  }
3665 }
3666 
3667 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3668 Teuchos::ArrayRCP<const Scalar>
3670  get1dView() const {
3671  if (getLocalLength() == 0 || getNumVectors() == 0) {
3672  return Teuchos::null;
3673  } else {
3674  TEUCHOS_TEST_FOR_EXCEPTION(
3675  !isConstantStride(), std::runtime_error,
3676  "Tpetra::MultiVector::"
3677  "get1dView: This MultiVector does not have constant stride, so it is "
3678  "not possible to view its data as a single array. You may check "
3679  "whether a MultiVector has constant stride by calling "
3680  "isConstantStride().");
3681  // Since get1dView() is and was always marked const, I have to
3682  // cast away const here in order not to break backwards
3683  // compatibility.
3684  auto X_lcl = getLocalViewHost(Access::ReadOnly);
3685  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3686  Kokkos::Compat::persistingView(X_lcl);
3687  return Teuchos::arcp_reinterpret_cast<const Scalar>(dataAsArcp);
3688  }
3689 }
3690 
3691 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3692 Teuchos::ArrayRCP<Scalar>
3695  if (this->getLocalLength() == 0 || this->getNumVectors() == 0) {
3696  return Teuchos::null;
3697  } else {
3698  TEUCHOS_TEST_FOR_EXCEPTION(!isConstantStride(), std::runtime_error,
3699  "Tpetra::MultiVector::"
3700  "get1dViewNonConst: This MultiVector does not have constant stride, "
3701  "so it is not possible to view its data as a single array. You may "
3702  "check whether a MultiVector has constant stride by calling "
3703  "isConstantStride().");
3704  auto X_lcl = getLocalViewHost(Access::ReadWrite);
3705  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3706  Kokkos::Compat::persistingView(X_lcl);
3707  return Teuchos::arcp_reinterpret_cast<Scalar>(dataAsArcp);
3708  }
3709 }
3710 
3711 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3712 Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3715  auto X_lcl = getLocalViewHost(Access::ReadWrite);
3716 
3717  // Don't use the row range here on the outside, in order to avoid
3718  // a strided return type (in case Kokkos::subview is conservative
3719  // about that). Instead, use the row range for the column views
3720  // in the loop.
3721  const size_t myNumRows = this->getLocalLength();
3722  const size_t numCols = this->getNumVectors();
3723  const Kokkos::pair<size_t, size_t> rowRange(0, myNumRows);
3724 
3725  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views(numCols);
3726  for (size_t j = 0; j < numCols; ++j) {
3727  const size_t col = this->isConstantStride() ? j : this->whichVectors_[j];
3728  auto X_lcl_j = Kokkos::subview(X_lcl, rowRange, col);
3729  Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3730  Kokkos::Compat::persistingView(X_lcl_j);
3731  views[j] = Teuchos::arcp_reinterpret_cast<Scalar>(X_lcl_j_arcp);
3732  }
3733  return views;
3734 }
3735 
3736 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3739  getLocalViewHost(Access::ReadOnlyStruct s) const {
3740  return view_.getHostView(s);
3741 }
3742 
3743 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3746  getLocalViewHost(Access::ReadWriteStruct s) {
3747  return view_.getHostView(s);
3748 }
3749 
3750 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3753  getLocalViewHost(Access::OverwriteAllStruct s) {
3754  return view_.getHostView(s);
3755 }
3756 
3757 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3760  getLocalViewDevice(Access::ReadOnlyStruct s) const {
3761  return view_.getDeviceView(s);
3762 }
3763 
3764 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3767  getLocalViewDevice(Access::ReadWriteStruct s) {
3768  return view_.getDeviceView(s);
3769 }
3770 
3771 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3774  getLocalViewDevice(Access::OverwriteAllStruct s) {
3775  return view_.getDeviceView(s);
3776 }
3777 
3778 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3782  return view_;
3783 }
3784 
3785 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3786 Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
3788  get2dView() const {
3789  // Since get2dView() is and was always marked const, I have to
3790  // cast away const here in order not to break backwards
3791  // compatibility.
3792  auto X_lcl = getLocalViewHost(Access::ReadOnly);
3793 
3794  // Don't use the row range here on the outside, in order to avoid
3795  // a strided return type (in case Kokkos::subview is conservative
3796  // about that). Instead, use the row range for the column views
3797  // in the loop.
3798  const size_t myNumRows = this->getLocalLength();
3799  const size_t numCols = this->getNumVectors();
3800  const Kokkos::pair<size_t, size_t> rowRange(0, myNumRows);
3801 
3802  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views(numCols);
3803  for (size_t j = 0; j < numCols; ++j) {
3804  const size_t col = this->isConstantStride() ? j : this->whichVectors_[j];
3805  auto X_lcl_j = Kokkos::subview(X_lcl, rowRange, col);
3806  Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
3807  Kokkos::Compat::persistingView(X_lcl_j);
3808  views[j] = Teuchos::arcp_reinterpret_cast<const Scalar>(X_lcl_j_arcp);
3809  }
3810  return views;
3811 }
3812 
3813 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3815  multiply(Teuchos::ETransp transA,
3816  Teuchos::ETransp transB,
3817  const Scalar& alpha,
3820  const Scalar& beta) {
3821  using std::endl;
3822  using Teuchos::CONJ_TRANS;
3823  using Teuchos::NO_TRANS;
3824  using Teuchos::RCP;
3825  using Teuchos::rcp;
3826  using Teuchos::rcpFromRef;
3827  using Teuchos::TRANS;
3828  using ::Tpetra::Details::ProfilingRegion;
3829 #if KOKKOS_VERSION >= 40799
3830  using ATS = KokkosKernels::ArithTraits<impl_scalar_type>;
3831 #else
3832  using ATS = Kokkos::ArithTraits<impl_scalar_type>;
3833 #endif
3834  using LO = local_ordinal_type;
3835  using STS = Teuchos::ScalarTraits<Scalar>;
3837  const char tfecfFuncName[] = "multiply: ";
3838  ProfilingRegion region("Tpetra::MV::multiply");
3839 
3840  // This routine performs a variety of matrix-matrix multiply
3841  // operations, interpreting the MultiVector (this-aka C , A and B)
3842  // as 2D matrices. Variations are due to the fact that A, B and C
3843  // can be locally replicated or globally distributed MultiVectors
3844  // and that we may or may not operate with the transpose of A and
3845  // B. Possible cases are:
3846  //
3847  // Operations # Cases Notes
3848  // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
3849  // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
3850  // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
3851  //
3852  // The following operations are not meaningful for 1-D
3853  // distributions:
3854  //
3855  // u1) C(local) = A^T(distr) * B^T(distr) 1
3856  // u2) C(local) = A (distr) * B^X(distr) 2
3857  // u3) C(distr) = A^X(local) * B^X(local) 4
3858  // u4) C(distr) = A^X(local) * B^X(distr) 4
3859  // u5) C(distr) = A^T(distr) * B^X(local) 2
3860  // u6) C(local) = A^X(distr) * B^X(local) 4
3861  // u7) C(distr) = A^X(distr) * B^X(local) 4
3862  // u8) C(local) = A^X(local) * B^X(distr) 4
3863  //
3864  // Total number of cases: 32 (= 2^5).
3865 
3866  impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
3867 
3868  const bool A_is_local = !A.isDistributed();
3869  const bool B_is_local = !B.isDistributed();
3870  const bool C_is_local = !this->isDistributed();
3871 
3872  // In debug mode, check compatibility of local dimensions. We
3873  // only do this in debug mode, since it requires an all-reduce
3874  // to ensure correctness on all processses. It's entirely
3875  // possible that only some processes may have incompatible local
3876  // dimensions. Throwing an exception only on those processes
3877  // could cause this method to hang.
3878  const bool debug = ::Tpetra::Details::Behavior::debug();
3879  if (debug) {
3880  auto myMap = this->getMap();
3881  if (!myMap.is_null() && !myMap->getComm().is_null()) {
3882  using Teuchos::outArg;
3883  using Teuchos::REDUCE_MIN;
3884  using Teuchos::reduceAll;
3885 
3886  auto comm = myMap->getComm();
3887  const size_t A_nrows =
3888  (transA != NO_TRANS) ? A.getNumVectors() : A.getLocalLength();
3889  const size_t A_ncols =
3890  (transA != NO_TRANS) ? A.getLocalLength() : A.getNumVectors();
3891  const size_t B_nrows =
3892  (transB != NO_TRANS) ? B.getNumVectors() : B.getLocalLength();
3893  const size_t B_ncols =
3894  (transB != NO_TRANS) ? B.getLocalLength() : B.getNumVectors();
3895 
3896  const bool lclBad = this->getLocalLength() != A_nrows ||
3897  this->getNumVectors() != B_ncols || A_ncols != B_nrows;
3898 
3899  const int myRank = comm->getRank();
3900  std::ostringstream errStrm;
3901  if (this->getLocalLength() != A_nrows) {
3902  errStrm << "Proc " << myRank << ": this->getLocalLength()="
3903  << this->getLocalLength() << " != A_nrows=" << A_nrows
3904  << "." << std::endl;
3905  }
3906  if (this->getNumVectors() != B_ncols) {
3907  errStrm << "Proc " << myRank << ": this->getNumVectors()="
3908  << this->getNumVectors() << " != B_ncols=" << B_ncols
3909  << "." << std::endl;
3910  }
3911  if (A_ncols != B_nrows) {
3912  errStrm << "Proc " << myRank << ": A_ncols="
3913  << A_ncols << " != B_nrows=" << B_nrows
3914  << "." << std::endl;
3915  }
3916 
3917  const int lclGood = lclBad ? 0 : 1;
3918  int gblGood = 0;
3919  reduceAll<int, int>(*comm, REDUCE_MIN, lclGood, outArg(gblGood));
3920  if (gblGood != 1) {
3921  std::ostringstream os;
3922  ::Tpetra::Details::gathervPrint(os, errStrm.str(), *comm);
3923 
3924  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
3925  "Inconsistent local dimensions on at "
3926  "least one process in this object's communicator."
3927  << std::endl
3928  << "Operation: "
3929  << "C(" << (C_is_local ? "local" : "distr") << ") = "
3930  << alpha << "*A"
3931  << (transA == Teuchos::TRANS ? "^T" : (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3932  << "(" << (A_is_local ? "local" : "distr") << ") * "
3933  << beta << "*B"
3934  << (transA == Teuchos::TRANS ? "^T" : (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3935  << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
3936  << "Global dimensions: C(" << this->getGlobalLength() << ", "
3937  << this->getNumVectors() << "), A(" << A.getGlobalLength()
3938  << ", " << A.getNumVectors() << "), B(" << B.getGlobalLength()
3939  << ", " << B.getNumVectors() << ")." << std::endl
3940  << os.str());
3941  }
3942  }
3943  }
3944 
3945  // Case 1: C(local) = A^X(local) * B^X(local)
3946  const bool Case1 = C_is_local && A_is_local && B_is_local;
3947  // Case 2: C(local) = A^T(distr) * B (distr)
3948  const bool Case2 = C_is_local && !A_is_local && !B_is_local &&
3949  transA != NO_TRANS &&
3950  transB == NO_TRANS;
3951  // Case 3: C(distr) = A (distr) * B^X(local)
3952  const bool Case3 = !C_is_local && !A_is_local && B_is_local &&
3953  transA == NO_TRANS;
3954 
3955  // Test that we are considering a meaningful case
3956  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!Case1 && !Case2 && !Case3, std::runtime_error,
3957  "Multiplication of op(A) and op(B) into *this is not a "
3958  "supported use case.");
3959 
3960  if (beta != STS::zero() && Case2) {
3961  // If Case2, then C is local and contributions must be summed
3962  // across all processes. However, if beta != 0, then accumulate
3963  // beta*C into the sum. When summing across all processes, we
3964  // only want to accumulate this once, so set beta == 0 on all
3965  // processes except Process 0.
3966  const int myRank = this->getMap()->getComm()->getRank();
3967  if (myRank != 0) {
3968  beta_local = ATS::zero();
3969  }
3970  }
3971 
3972  // We only know how to do matrix-matrix multiplies if all the
3973  // MultiVectors have constant stride. If not, we have to make
3974  // temporary copies of those MultiVectors (including possibly
3975  // *this) that don't have constant stride.
3976  RCP<MV> C_tmp;
3977  if (!isConstantStride()) {
3978  C_tmp = rcp(new MV(*this, Teuchos::Copy)); // deep copy
3979  } else {
3980  C_tmp = rcp(this, false);
3981  }
3982 
3983  RCP<const MV> A_tmp;
3984  if (!A.isConstantStride()) {
3985  A_tmp = rcp(new MV(A, Teuchos::Copy)); // deep copy
3986  } else {
3987  A_tmp = rcpFromRef(A);
3988  }
3989 
3990  RCP<const MV> B_tmp;
3991  if (!B.isConstantStride()) {
3992  B_tmp = rcp(new MV(B, Teuchos::Copy)); // deep copy
3993  } else {
3994  B_tmp = rcpFromRef(B);
3995  }
3996 
3997  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!C_tmp->isConstantStride() || !B_tmp->isConstantStride() ||
3998  !A_tmp->isConstantStride(),
3999  std::logic_error,
4000  "Failed to make temporary constant-stride copies of MultiVectors.");
4001 
4002  {
4003  const LO A_lclNumRows = A_tmp->getLocalLength();
4004  const LO A_numVecs = A_tmp->getNumVectors();
4005  auto A_lcl = A_tmp->getLocalViewDevice(Access::ReadOnly);
4006  auto A_sub = Kokkos::subview(A_lcl,
4007  std::make_pair(LO(0), A_lclNumRows),
4008  std::make_pair(LO(0), A_numVecs));
4009 
4010  const LO B_lclNumRows = B_tmp->getLocalLength();
4011  const LO B_numVecs = B_tmp->getNumVectors();
4012  auto B_lcl = B_tmp->getLocalViewDevice(Access::ReadOnly);
4013  auto B_sub = Kokkos::subview(B_lcl,
4014  std::make_pair(LO(0), B_lclNumRows),
4015  std::make_pair(LO(0), B_numVecs));
4016 
4017  const LO C_lclNumRows = C_tmp->getLocalLength();
4018  const LO C_numVecs = C_tmp->getNumVectors();
4019 
4020  auto C_lcl = C_tmp->getLocalViewDevice(Access::ReadWrite);
4021  auto C_sub = Kokkos::subview(C_lcl,
4022  std::make_pair(LO(0), C_lclNumRows),
4023  std::make_pair(LO(0), C_numVecs));
4024  const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' : (transA == Teuchos::TRANS ? 'T' : 'C'));
4025  const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' : (transB == Teuchos::TRANS ? 'T' : 'C'));
4026  const impl_scalar_type alpha_IST(alpha);
4027 
4028  ProfilingRegion regionGemm("Tpetra::MV::multiply-call-gemm");
4029 
4030  KokkosBlas::gemm(&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
4031  beta_local, C_sub);
4032  }
4033 
4034  if (!isConstantStride()) {
4035  ::Tpetra::deep_copy(*this, *C_tmp); // Copy the result back into *this.
4036  }
4037 
4038  // Dispose of (possibly) extra copies of A and B.
4039  A_tmp = Teuchos::null;
4040  B_tmp = Teuchos::null;
4041 
4042  // If Case 2 then sum up *this and distribute it to all processes.
4043  if (Case2) {
4044  this->reduce();
4045  }
4046 }
4047 
4048 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4050  elementWiseMultiply(Scalar scalarAB,
4053  Scalar scalarThis) {
4054  using Kokkos::ALL;
4055  using Kokkos::subview;
4056  const char tfecfFuncName[] = "elementWiseMultiply: ";
4057 
4058  const size_t lclNumRows = this->getLocalLength();
4059  const size_t numVecs = this->getNumVectors();
4060 
4061  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclNumRows != A.getLocalLength() || lclNumRows != B.getLocalLength(),
4062  std::runtime_error, "MultiVectors do not have the same local length.");
4063  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4064  numVecs != B.getNumVectors(), std::runtime_error,
4065  "this->getNumVectors"
4066  "() = "
4067  << numVecs << " != B.getNumVectors() = " << B.getNumVectors()
4068  << ".");
4069 
4070  auto this_view = this->getLocalViewDevice(Access::ReadWrite);
4071  auto A_view = A.getLocalViewDevice(Access::ReadOnly);
4072  auto B_view = B.getLocalViewDevice(Access::ReadOnly);
4073 
4074  if (isConstantStride() && B.isConstantStride()) {
4075  // A is just a Vector; it only has one column, so it always has
4076  // constant stride.
4077  //
4078  // If both *this and B have constant stride, we can do an
4079  // element-wise multiply on all columns at once.
4080  KokkosBlas::mult(scalarThis,
4081  this_view,
4082  scalarAB,
4083  subview(A_view, ALL(), 0),
4084  B_view);
4085  } else {
4086  for (size_t j = 0; j < numVecs; ++j) {
4087  const size_t C_col = isConstantStride() ? j : whichVectors_[j];
4088  const size_t B_col = B.isConstantStride() ? j : B.whichVectors_[j];
4089  KokkosBlas::mult(scalarThis,
4090  subview(this_view, ALL(), C_col),
4091  scalarAB,
4092  subview(A_view, ALL(), 0),
4093  subview(B_view, ALL(), B_col));
4094  }
4095  }
4096 }
4097 
4098 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4102  using ::Tpetra::Details::ProfilingRegion;
4103  ProfilingRegion region("Tpetra::MV::reduce");
4104 
4105  const auto map = this->getMap();
4106  if (map.get() == nullptr) {
4107  return;
4108  }
4109  const auto comm = map->getComm();
4110  if (comm.get() == nullptr) {
4111  return;
4112  }
4113 
4114  // Avoid giving device buffers to MPI. Even if MPI handles them
4115  // correctly, doing so may not perform well.
4116  const bool changed_on_device = this->need_sync_host();
4117  if (changed_on_device) {
4118  // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4119  // and host will be separate allocations. In that case, it may
4120  // pay to do the all-reduce from device to host.
4121  Kokkos::fence("MultiVector::reduce"); // for UVM getLocalViewDevice is UVM which can be read as host by allReduceView, so we must not read until device is fenced
4122  auto X_lcl = this->getLocalViewDevice(Access::ReadWrite);
4123  allReduceView(X_lcl, X_lcl, *comm);
4124  } else {
4125  auto X_lcl = this->getLocalViewHost(Access::ReadWrite);
4126  allReduceView(X_lcl, X_lcl, *comm);
4127  }
4128 }
4129 
4130 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4132  replaceLocalValue(const LocalOrdinal lclRow,
4133  const size_t col,
4134  const impl_scalar_type& ScalarValue) {
4136  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4137  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4138  TEUCHOS_TEST_FOR_EXCEPTION(
4139  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4140  std::runtime_error,
4141  "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4142  << " is invalid. The range of valid row indices on this process "
4143  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4144  << ", " << maxLocalIndex << "].");
4145  TEUCHOS_TEST_FOR_EXCEPTION(
4146  vectorIndexOutOfRange(col),
4147  std::runtime_error,
4148  "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4149  << " of the multivector is invalid.");
4150  }
4151 
4152  auto view = this->getLocalViewHost(Access::ReadWrite);
4153  const size_t colInd = isConstantStride() ? col : whichVectors_[col];
4154  view(lclRow, colInd) = ScalarValue;
4155 }
4156 
4157 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4159  sumIntoLocalValue(const LocalOrdinal lclRow,
4160  const size_t col,
4161  const impl_scalar_type& value,
4162  const bool atomic) {
4164  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4165  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4166  TEUCHOS_TEST_FOR_EXCEPTION(
4167  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4168  std::runtime_error,
4169  "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4170  << " is invalid. The range of valid row indices on this process "
4171  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4172  << ", " << maxLocalIndex << "].");
4173  TEUCHOS_TEST_FOR_EXCEPTION(
4174  vectorIndexOutOfRange(col),
4175  std::runtime_error,
4176  "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4177  << " of the multivector is invalid.");
4178  }
4179 
4180  const size_t colInd = isConstantStride() ? col : whichVectors_[col];
4181 
4182  auto view = this->getLocalViewHost(Access::ReadWrite);
4183  if (atomic) {
4184  Kokkos::atomic_add(&(view(lclRow, colInd)), value);
4185  } else {
4186  view(lclRow, colInd) += value;
4187  }
4188 }
4189 
4190 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4192  replaceGlobalValue(const GlobalOrdinal gblRow,
4193  const size_t col,
4194  const impl_scalar_type& ScalarValue) {
4195  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4196  // touches the RCP's reference count, which isn't thread safe.
4197  const LocalOrdinal lclRow = this->map_->getLocalElement(gblRow);
4198 
4200  const char tfecfFuncName[] = "replaceGlobalValue: ";
4201  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid(),
4202  std::runtime_error,
4203  "Global row index " << gblRow << " is not present on this process "
4204  << this->getMap()->getComm()->getRank() << ".");
4205  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->vectorIndexOutOfRange(col), std::runtime_error,
4206  "Vector index " << col << " of the MultiVector is invalid.");
4207  }
4208 
4209  this->replaceLocalValue(lclRow, col, ScalarValue);
4210 }
4211 
4212 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4214  sumIntoGlobalValue(const GlobalOrdinal globalRow,
4215  const size_t col,
4216  const impl_scalar_type& value,
4217  const bool atomic) {
4218  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4219  // touches the RCP's reference count, which isn't thread safe.
4220  const LocalOrdinal lclRow = this->map_->getLocalElement(globalRow);
4221 
4223  TEUCHOS_TEST_FOR_EXCEPTION(
4224  lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid(),
4225  std::runtime_error,
4226  "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4227  << " is not present on this process "
4228  << this->getMap()->getComm()->getRank() << ".");
4229  TEUCHOS_TEST_FOR_EXCEPTION(
4230  vectorIndexOutOfRange(col),
4231  std::runtime_error,
4232  "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4233  << " of the multivector is invalid.");
4234  }
4235 
4236  this->sumIntoLocalValue(lclRow, col, value, atomic);
4237 }
4238 
4239 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4240 template <class T>
4241 Teuchos::ArrayRCP<T>
4243  getSubArrayRCP(Teuchos::ArrayRCP<T> arr,
4244  size_t j) const {
4245  typedef Kokkos::DualView<impl_scalar_type*,
4246  typename dual_view_type::array_layout,
4248  col_dual_view_type;
4249  const size_t col = isConstantStride() ? j : whichVectors_[j];
4250  col_dual_view_type X_col =
4251  Kokkos::subview(view_, Kokkos::ALL(), col);
4252  return Kokkos::Compat::persistingView(X_col.view_device());
4253 }
4254 
4255 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4258  return view_.need_sync_host();
4259 }
4260 
4261 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4264  return view_.need_sync_device();
4265 }
4266 
4267 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4268 std::string
4270  descriptionImpl(const std::string& className) const {
4271  using Teuchos::TypeNameTraits;
4272 
4273  std::ostringstream out;
4274  out << "\"" << className << "\": {";
4275  out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name()
4276  << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name()
4277  << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name()
4278  << ", Node" << Node::name()
4279  << "}, ";
4280  if (this->getObjectLabel() != "") {
4281  out << "Label: \"" << this->getObjectLabel() << "\", ";
4282  }
4283  out << ", numRows: " << this->getGlobalLength();
4284  if (className != "Tpetra::Vector") {
4285  out << ", numCols: " << this->getNumVectors()
4286  << ", isConstantStride: " << this->isConstantStride();
4287  }
4288  if (this->isConstantStride()) {
4289  out << ", columnStride: " << this->getStride();
4290  }
4291  out << "}";
4292 
4293  return out.str();
4294 }
4295 
4296 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4297 std::string
4299  description() const {
4300  return this->descriptionImpl("Tpetra::MultiVector");
4301 }
4302 
4303 namespace Details {
4304 template <typename ViewType>
4305 void print_vector(Teuchos::FancyOStream& out, const char* prefix, const ViewType& v) {
4306  using std::endl;
4307  static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename ViewType::memory_space>::accessible,
4308  "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a host-accessible view.");
4309  static_assert(ViewType::rank == 2,
4310  "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a rank-2 view.");
4311  // The square braces [] and their contents are in Matlab
4312  // format, so users may copy and paste directly into Matlab.
4313  out << "Values(" << prefix << "): " << std::endl
4314  << "[";
4315  const size_t numRows = v.extent(0);
4316  const size_t numCols = v.extent(1);
4317  if (numCols == 1) {
4318  for (size_t i = 0; i < numRows; ++i) {
4319  out << v(i, 0);
4320  if (i + 1 < numRows) {
4321  out << "; ";
4322  }
4323  }
4324  } else {
4325  for (size_t i = 0; i < numRows; ++i) {
4326  for (size_t j = 0; j < numCols; ++j) {
4327  out << v(i, j);
4328  if (j + 1 < numCols) {
4329  out << ", ";
4330  }
4331  }
4332  if (i + 1 < numRows) {
4333  out << "; ";
4334  }
4335  }
4336  }
4337  out << "]" << endl;
4338 }
4339 } // namespace Details
4340 
4341 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4342 std::string
4344  localDescribeToString(const Teuchos::EVerbosityLevel vl) const {
4345  typedef LocalOrdinal LO;
4346  using std::endl;
4347 
4348  if (vl <= Teuchos::VERB_LOW) {
4349  return std::string();
4350  }
4351  auto map = this->getMap();
4352  if (map.is_null()) {
4353  return std::string();
4354  }
4355  auto outStringP = Teuchos::rcp(new std::ostringstream());
4356  auto outp = Teuchos::getFancyOStream(outStringP);
4357  Teuchos::FancyOStream& out = *outp;
4358  auto comm = map->getComm();
4359  const int myRank = comm->getRank();
4360  const int numProcs = comm->getSize();
4361 
4362  out << "Process " << myRank << " of " << numProcs << ":" << endl;
4363  Teuchos::OSTab tab1(out);
4364 
4365  // VERB_MEDIUM and higher prints getLocalLength()
4366  const LO lclNumRows = static_cast<LO>(this->getLocalLength());
4367  out << "Local number of rows: " << lclNumRows << endl;
4368 
4369  if (vl > Teuchos::VERB_MEDIUM) {
4370  // VERB_HIGH and higher prints isConstantStride() and getStride().
4371  // The first is only relevant if the Vector has multiple columns.
4372  if (this->getNumVectors() != static_cast<size_t>(1)) {
4373  out << "isConstantStride: " << this->isConstantStride() << endl;
4374  }
4375  if (this->isConstantStride()) {
4376  out << "Column stride: " << this->getStride() << endl;
4377  }
4378 
4379  if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4380  // VERB_EXTREME prints values. If we're not doing univied memory, we'll
4382  // so we can't use our regular accessor functins
4383 
4384  // NOTE: This is an occasion where we do *not* want the auto-sync stuff
4385  // to trigger (since this function is conceptually const). Thus, we
4386  // get *copies* of the view's data instead.
4387  auto X_dev = view_.getDeviceCopy();
4388  auto X_host = view_.getHostCopy();
4389 
4390  if (X_dev.data() == X_host.data()) {
4391  // One single allocation
4392  Details::print_vector(out, "unified", X_host);
4393  } else {
4394  Details::print_vector(out, "host", X_host);
4395  Details::print_vector(out, "dev", X_dev);
4396  }
4397  }
4398  }
4399  out.flush(); // make sure the ostringstream got everything
4400  return outStringP->str();
4401 }
4402 
4403 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4405  describeImpl(Teuchos::FancyOStream& out,
4406  const std::string& className,
4407  const Teuchos::EVerbosityLevel verbLevel) const {
4408  using std::endl;
4409  using Teuchos::TypeNameTraits;
4410  using Teuchos::VERB_DEFAULT;
4411  using Teuchos::VERB_LOW;
4412  using Teuchos::VERB_NONE;
4413  const Teuchos::EVerbosityLevel vl =
4414  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4415 
4416  if (vl == VERB_NONE) {
4417  return; // don't print anything
4418  }
4419  // If this Vector's Comm is null, then the Vector does not
4420  // participate in collective operations with the other processes.
4421  // In that case, it is not even legal to call this method. The
4422  // reasonable thing to do in that case is nothing.
4423  auto map = this->getMap();
4424  if (map.is_null()) {
4425  return;
4426  }
4427  auto comm = map->getComm();
4428  if (comm.is_null()) {
4429  return;
4430  }
4431 
4432  const int myRank = comm->getRank();
4433 
4434  // Only Process 0 should touch the output stream, but this method
4435  // in general may need to do communication. Thus, we may need to
4436  // preserve the current tab level across multiple "if (myRank ==
4437  // 0) { ... }" inner scopes. This is why we sometimes create
4438  // OSTab instances by pointer, instead of by value. We only need
4439  // to create them by pointer if the tab level must persist through
4440  // multiple inner scopes.
4441  Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4442 
4443  // VERB_LOW and higher prints the equivalent of description().
4444  if (myRank == 0) {
4445  tab0 = Teuchos::rcp(new Teuchos::OSTab(out));
4446  out << "\"" << className << "\":" << endl;
4447  tab1 = Teuchos::rcp(new Teuchos::OSTab(out));
4448  {
4449  out << "Template parameters:" << endl;
4450  Teuchos::OSTab tab2(out);
4451  out << "Scalar: " << TypeNameTraits<Scalar>::name() << endl
4452  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name() << endl
4453  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name() << endl
4454  << "Node: " << Node::name() << endl;
4455  }
4456  if (this->getObjectLabel() != "") {
4457  out << "Label: \"" << this->getObjectLabel() << "\", ";
4458  }
4459  out << "Global number of rows: " << this->getGlobalLength() << endl;
4460  if (className != "Tpetra::Vector") {
4461  out << "Number of columns: " << this->getNumVectors() << endl;
4462  }
4463  // getStride() may differ on different processes, so it (and
4464  // isConstantStride()) properly belong to per-process data.
4465  }
4466 
4467  // This is collective over the Map's communicator.
4468  if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4469  const std::string lclStr = this->localDescribeToString(vl);
4470  ::Tpetra::Details::gathervPrint(out, lclStr, *comm);
4471  }
4472 }
4473 
4474 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4476  describe(Teuchos::FancyOStream& out,
4477  const Teuchos::EVerbosityLevel verbLevel) const {
4478  this->describeImpl(out, "Tpetra::MultiVector", verbLevel);
4479 }
4480 
4481 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4483  removeEmptyProcessesInPlace(const Teuchos::RCP<const map_type>& newMap) {
4484  replaceMap(newMap);
4485 }
4486 
4487 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4490  using ::Tpetra::Details::localDeepCopy;
4491  const char prefix[] = "Tpetra::MultiVector::assign: ";
4492 
4493  TEUCHOS_TEST_FOR_EXCEPTION(this->getGlobalLength() != src.getGlobalLength() ||
4494  this->getNumVectors() != src.getNumVectors(),
4495  std::invalid_argument,
4496  prefix << "Global dimensions of the two Tpetra::MultiVector "
4497  "objects do not match. src has dimensions ["
4498  << src.getGlobalLength()
4499  << "," << src.getNumVectors() << "], and *this has dimensions ["
4500  << this->getGlobalLength() << "," << this->getNumVectors() << "].");
4501  // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4502  TEUCHOS_TEST_FOR_EXCEPTION(this->getLocalLength() != src.getLocalLength(), std::invalid_argument,
4503  prefix << "The local row counts of the two Tpetra::MultiVector "
4504  "objects do not match. src has "
4505  << src.getLocalLength() << " row(s) "
4506  << " and *this has " << this->getLocalLength() << " row(s).");
4507 
4508  // See #1510. We're writing to *this, so we don't care about its
4509  // contents in either memory space, and we don't want
4510  // DualView::modify to complain about "concurrent modification" of
4511  // host and device Views.
4512 
4513  // KJ: this is problematic. assign funtion is used to construct a subvector
4514  // if the sync flag is reset here, it lose all our control over getLocalView interface
4515  // this->clear_sync_state();
4516 
4517  // If need sync to device, then host has most recent version.
4518  const bool src_last_updated_on_host = src.need_sync_device();
4519 
4520  if (src_last_updated_on_host) {
4521  localDeepCopy(this->getLocalViewHost(Access::ReadWrite),
4522  src.getLocalViewHost(Access::ReadOnly),
4523  this->isConstantStride(),
4524  src.isConstantStride(),
4525  this->whichVectors_(),
4526  src.whichVectors_());
4527  } else {
4528  localDeepCopy(this->getLocalViewDevice(Access::ReadWrite),
4529  src.getLocalViewDevice(Access::ReadOnly),
4530  this->isConstantStride(),
4531  src.isConstantStride(),
4532  this->whichVectors_(),
4533  src.whichVectors_());
4534  }
4535 }
4536 
4537 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4538 template <class T>
4539 Teuchos::RCP<MultiVector<T, LocalOrdinal, GlobalOrdinal, Node> >
4541  convert() const {
4542  using Teuchos::RCP;
4543 
4544  auto newMV = Teuchos::rcp(new MultiVector<T, LocalOrdinal, GlobalOrdinal, Node>(this->getMap(),
4545  this->getNumVectors(),
4546  /*zeroOut=*/false));
4547  Tpetra::deep_copy(*newMV, *this);
4548  return newMV;
4549 }
4550 
4551 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4554  using ::Tpetra::Details::PackTraits;
4555 
4556  const size_t l1 = this->getLocalLength();
4557  const size_t l2 = vec.getLocalLength();
4558  if ((l1 != l2) || (this->getNumVectors() != vec.getNumVectors())) {
4559  return false;
4560  }
4561 
4562  return true;
4563 }
4564 
4565 template <class ST, class LO, class GO, class NT>
4568  std::swap(mv.map_, this->map_);
4569  std::swap(mv.view_, this->view_);
4570  std::swap(mv.whichVectors_, this->whichVectors_);
4571 }
4572 
4573 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4574 template <class ST, class LO, class GO, class NT>
4576  const Teuchos::SerialDenseMatrix<int, ST>& src) {
4577  using ::Tpetra::Details::localDeepCopy;
4578  using MV = MultiVector<ST, LO, GO, NT>;
4579  using IST = typename MV::impl_scalar_type;
4580  using input_view_type =
4581  Kokkos::View<const IST**, Kokkos::LayoutLeft,
4582  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4583  using pair_type = std::pair<LO, LO>;
4584 
4585  const LO numRows = static_cast<LO>(src.numRows());
4586  const LO numCols = static_cast<LO>(src.numCols());
4587 
4588  TEUCHOS_TEST_FOR_EXCEPTION(numRows != static_cast<LO>(dst.getLocalLength()) ||
4589  numCols != static_cast<LO>(dst.getNumVectors()),
4590  std::invalid_argument, "Tpetra::deep_copy: On Process " << dst.getMap()->getComm()->getRank() << ", dst is " << dst.getLocalLength() << " x " << dst.getNumVectors() << ", but src is " << numRows << " x " << numCols << ".");
4591 
4592  const IST* src_raw = reinterpret_cast<const IST*>(src.values());
4593  input_view_type src_orig(src_raw, src.stride(), numCols);
4594  input_view_type src_view =
4595  Kokkos::subview(src_orig, pair_type(0, numRows), Kokkos::ALL());
4596 
4597  constexpr bool src_isConstantStride = true;
4598  Teuchos::ArrayView<const size_t> srcWhichVectors(nullptr, 0);
4599  localDeepCopy(dst.getLocalViewHost(Access::ReadWrite),
4600  src_view,
4601  dst.isConstantStride(),
4602  src_isConstantStride,
4603  getMultiVectorWhichVectors(dst),
4604  srcWhichVectors);
4605 }
4606 
4607 template <class ST, class LO, class GO, class NT>
4608 void deep_copy(Teuchos::SerialDenseMatrix<int, ST>& dst,
4609  const MultiVector<ST, LO, GO, NT>& src) {
4610  using ::Tpetra::Details::localDeepCopy;
4611  using MV = MultiVector<ST, LO, GO, NT>;
4612  using IST = typename MV::impl_scalar_type;
4613  using output_view_type =
4614  Kokkos::View<IST**, Kokkos::LayoutLeft,
4615  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4616  using pair_type = std::pair<LO, LO>;
4617 
4618  const LO numRows = static_cast<LO>(dst.numRows());
4619  const LO numCols = static_cast<LO>(dst.numCols());
4620 
4621  TEUCHOS_TEST_FOR_EXCEPTION(numRows != static_cast<LO>(src.getLocalLength()) ||
4622  numCols != static_cast<LO>(src.getNumVectors()),
4623  std::invalid_argument, "Tpetra::deep_copy: On Process " << src.getMap()->getComm()->getRank() << ", src is " << src.getLocalLength() << " x " << src.getNumVectors() << ", but dst is " << numRows << " x " << numCols << ".");
4624 
4625  IST* dst_raw = reinterpret_cast<IST*>(dst.values());
4626  output_view_type dst_orig(dst_raw, dst.stride(), numCols);
4627  auto dst_view =
4628  Kokkos::subview(dst_orig, pair_type(0, numRows), Kokkos::ALL());
4629 
4630  constexpr bool dst_isConstantStride = true;
4631  Teuchos::ArrayView<const size_t> dstWhichVectors(nullptr, 0);
4632 
4633  // Prefer the host version of src's data.
4634  localDeepCopy(dst_view,
4635  src.getLocalViewHost(Access::ReadOnly),
4636  dst_isConstantStride,
4637  src.isConstantStride(),
4638  dstWhichVectors,
4639  getMultiVectorWhichVectors(src));
4640 }
4641 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4642 
4643 template <class Scalar, class LO, class GO, class NT>
4644 Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4645 createMultiVector(const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4646  size_t numVectors) {
4648  return Teuchos::rcp(new MV(map, numVectors));
4649 }
4650 
4651 template <class ST, class LO, class GO, class NT>
4654  typedef MultiVector<ST, LO, GO, NT> MV;
4655  MV cpy(src.getMap(), src.getNumVectors(), false);
4656  cpy.assign(src);
4657  return cpy;
4658 }
4659 
4660 } // namespace Tpetra
4661 
4662 //
4663 // Explicit instantiation macro
4664 //
4665 // Must be expanded from within the Tpetra namespace!
4666 //
4667 
4668 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4669 #define TPETRA_MULTIVECTOR_INSTANT(SCALAR, LO, GO, NODE) \
4670  template class MultiVector<SCALAR, LO, GO, NODE>; \
4671  template MultiVector<SCALAR, LO, GO, NODE> createCopy(const MultiVector<SCALAR, LO, GO, NODE>& src); \
4672  template Teuchos::RCP<MultiVector<SCALAR, LO, GO, NODE> > createMultiVector(const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4673  template void deep_copy(MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4674  template void deep_copy(Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4675 
4676 #else
4677 #define TPETRA_MULTIVECTOR_INSTANT(SCALAR, LO, GO, NODE) \
4678  template class MultiVector<SCALAR, LO, GO, NODE>; \
4679  template MultiVector<SCALAR, LO, GO, NODE> createCopy(const MultiVector<SCALAR, LO, GO, NODE>& src);
4680 
4681 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4682 
4683 #define TPETRA_MULTIVECTOR_CONVERT_INSTANT(SO, SI, LO, GO, NODE) \
4684  \
4685  template Teuchos::RCP<MultiVector<SO, LO, GO, NODE> > \
4686  MultiVector<SI, LO, GO, NODE>::convert<SO>() const;
4687 
4688 #endif // TPETRA_MULTIVECTOR_DEF_HPP
typename map_type::local_ordinal_type local_ordinal_type
The type of local indices that this class uses.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
typename device_type::execution_space execution_space
Type of the (new) Kokkos execution space.
Kokkos::DualView< size_t *, buffer_device_type > numExportPacketsPerLID_
Number of packets to send for each send operation.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetView(const Teuchos::RCP< const map_type > &subMap, const size_t offset) const
Return a const view of a subset of rows.
Declaration and definition of Tpetra::Details::Blas::fill, an implementation detail of Tpetra::MultiV...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
size_t getNumVectors() const
Number of columns in the multivector.
size_t getLocalLength() const
Local number of rows on the calling process.
Declaration of a function that prints strings from each process.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
One or more distributed dense vectors.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t multivectorKernelLocationThreshold()
the threshold for transitioning from device to host
bool isDistributed() const
Whether this is a globally distributed object.
void removeEmptyProcessesInPlace(Teuchos::RCP< DistObjectType > &input, const Teuchos::RCP< const Map< typename DistObjectType::local_ordinal_type, typename DistObjectType::global_ordinal_type, typename DistObjectType::node_type > > &newMap)
Remove processes which contain no elements in this object&#39;s Map.
static bool debug()
Whether Tpetra is in debug mode.
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.
Declaration of functions for checking whether a given pointer is accessible from a given Kokkos execu...
int local_ordinal_type
Default value of Scalar template parameter.
bool need_sync_device() const
Whether this MultiVector needs synchronization to the device.
typename Kokkos::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
size_t global_size_t
Global size_t object.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector&#39;s local data on host. This requires that ther...
Kokkos::DualView< size_t *, buffer_device_type > numImportPacketsPerLID_
Number of packets to receive for each receive operation.
Insert new values that don&#39;t currently exist.
Kokkos::DualView< packet_type *, buffer_device_type > imports_
Buffer into which packed data are imported (received from other processes).
Kokkos::DualView< packet_type *, buffer_device_type > exports_
Buffer from which packed data are exported (sent to other processes).
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
wrapped_dual_view_type view_
The Kokkos::DualView containing the MultiVector&#39;s data.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &src)
Copy the contents of src into *this (deep copy).
static void allReduceView(const OutputViewType &output, const InputViewType &input, const Teuchos::Comm< int > &comm)
All-reduce from input Kokkos::View to output Kokkos::View.
Functions for initializing and finalizing Tpetra.
static bool verbose()
Whether Tpetra is in verbose mode.
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
Teuchos::RCP< const map_type > map_
The Map over which this object is distributed.
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
Declaration of Tpetra::Details::isInterComm.
bool need_sync_host() const
Whether this MultiVector needs synchronization to the host.
Replace old value with maximum of magnitudes of old and new values.
Abstract base class for objects that can be the source of an Import or Export operation.
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, device_type > dual_view_type
Kokkos::DualView specialization used by this class.
Replace existing values with new values.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector&#39;s local data on device. This requires that th...
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
Teuchos::RCP< const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVector(const size_t j) const
Return a Vector which is a const view of column j.
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
void start()
Start the deep_copy counter.
A parallel distribution of indices over processes.
A distributed dense vector.
Stand-alone utility functions and macros.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Declaration and definition of Tpetra::Details::normImpl, which is an implementation detail of Tpetra:...
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
Base class for distributed Tpetra objects that support data redistribution.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node >> &map, const size_t numVectors)
Nonmember MultiVector &quot;constructor&quot;: Create a MultiVector from a given Map.
EWhichNorm
Input argument for normImpl() (which see).
Accumulate new values into existing values (may not be supported in all classes)
size_t getOrigNumLocalRows() const
&quot;Original&quot; number of rows in the (local) data.
All-reduce a 1-D or 2-D Kokkos::View.
Declaration and definition of Tpetra::Details::lclDot, an implementation detail of Tpetra::MultiVecto...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.