Tpetra_MultiVector_def.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 // clang-format off
41 #ifndef TPETRA_MULTIVECTOR_DEF_HPP
42 #define TPETRA_MULTIVECTOR_DEF_HPP
43 
51 
52 #include "Tpetra_Core.hpp"
53 #include "Tpetra_Util.hpp"
54 #include "Tpetra_Vector.hpp"
58 #include "Tpetra_Details_fill.hpp"
66 #include "Tpetra_Details_Random.hpp"
67 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
68 # include "Teuchos_SerialDenseMatrix.hpp"
69 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
70 #include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
71 #include "KokkosCompat_View.hpp"
72 #include "KokkosBlas.hpp"
73 #include "KokkosKernels_Utils.hpp"
74 #include "Kokkos_Random.hpp"
75 #include "Kokkos_ArithTraits.hpp"
76 #include <memory>
77 #include <sstream>
78 
79 #ifdef HAVE_TPETRA_INST_FLOAT128
80 namespace Kokkos {
81  // FIXME (mfh 04 Sep 2015) Just a stub for now!
82  template<class Generator>
83  struct rand<Generator, __float128> {
84  static KOKKOS_INLINE_FUNCTION __float128 max ()
85  {
86  return static_cast<__float128> (1.0);
87  }
88  static KOKKOS_INLINE_FUNCTION __float128
89  draw (Generator& gen)
90  {
 91  // Half the smallest normalized double is the scaling factor of
 92  // the lower-order term in the double-double representation.
93  const __float128 scalingFactor =
94  static_cast<__float128> (std::numeric_limits<double>::min ()) /
95  static_cast<__float128> (2.0);
96  const __float128 higherOrderTerm = static_cast<__float128> (gen.drand ());
97  const __float128 lowerOrderTerm =
98  static_cast<__float128> (gen.drand ()) * scalingFactor;
99  return higherOrderTerm + lowerOrderTerm;
100  }
101  static KOKKOS_INLINE_FUNCTION __float128
102  draw (Generator& gen, const __float128& range)
103  {
104  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
105  const __float128 scalingFactor =
106  static_cast<__float128> (std::numeric_limits<double>::min ()) /
107  static_cast<__float128> (2.0);
108  const __float128 higherOrderTerm =
109  static_cast<__float128> (gen.drand (range));
110  const __float128 lowerOrderTerm =
111  static_cast<__float128> (gen.drand (range)) * scalingFactor;
112  return higherOrderTerm + lowerOrderTerm;
113  }
114  static KOKKOS_INLINE_FUNCTION __float128
115  draw (Generator& gen, const __float128& start, const __float128& end)
116  {
117  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
118  const __float128 scalingFactor =
119  static_cast<__float128> (std::numeric_limits<double>::min ()) /
120  static_cast<__float128> (2.0);
121  const __float128 higherOrderTerm =
122  static_cast<__float128> (gen.drand (start, end));
123  const __float128 lowerOrderTerm =
124  static_cast<__float128> (gen.drand (start, end)) * scalingFactor;
125  return higherOrderTerm + lowerOrderTerm;
126  }
127  };
128 } // namespace Kokkos
129 #endif // HAVE_TPETRA_INST_FLOAT128
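// Example sketch (not from the original file; the specialization above is
// still a stub): it lets Kokkos' random-pool machinery draw __float128
// values by combining two double-precision draws, the second scaled down to
// fill in low-order bits. Assuming an execution space ExecSpace and a
// rank-1 View of __float128 named 'vals':
//
//   Kokkos::Random_XorShift64_Pool<ExecSpace> pool (/* seed = */ 42);
//   // fill_random dispatches to Kokkos::rand<Generator, __float128>::draw
//   Kokkos::fill_random (vals, pool, static_cast<__float128> (1.0));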
130 
131 namespace { // (anonymous)
132 
147  template<class ST, class LO, class GO, class NT>
 148  typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type
 149  allocDualView (const size_t lclNumRows,
150  const size_t numCols,
151  const bool zeroOut = true,
152  const bool allowPadding = false)
153  {
154  using ::Tpetra::Details::Behavior;
155  using Kokkos::AllowPadding;
156  using Kokkos::view_alloc;
157  using Kokkos::WithoutInitializing;
158  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
159  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type wrapped_dual_view_type;
160  typedef typename dual_view_type::t_dev dev_view_type;
161 
162  // This needs to be a string and not a char*, if given as an
163  // argument to Kokkos::view_alloc. This is because view_alloc
164  // also allows a raw pointer as its first argument. See
165  // https://github.com/kokkos/kokkos/issues/434.
166  const std::string label ("MV::DualView");
167  const bool debug = Behavior::debug ();
168 
169  // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
170  // creation of the DualView's Views works around
171  // Kokkos::DualView's current inability to accept an
172  // AllocationProperties initial argument (as Kokkos::View does).
173  // However, the work-around is harmless, since it does what the
174  // (currently nonexistent) equivalent DualView constructor would
175  // have done anyway.
176 
177  dev_view_type d_view;
178  if (zeroOut) {
179  if (allowPadding) {
180  d_view = dev_view_type (view_alloc (label, AllowPadding),
181  lclNumRows, numCols);
182  }
183  else {
184  d_view = dev_view_type (view_alloc (label),
185  lclNumRows, numCols);
186  }
187  }
188  else {
189  if (allowPadding) {
190  d_view = dev_view_type (view_alloc (label,
191  WithoutInitializing,
192  AllowPadding),
193  lclNumRows, numCols);
194  }
195  else {
196  d_view = dev_view_type (view_alloc (label, WithoutInitializing),
197  lclNumRows, numCols);
198  }
199  if (debug) {
200  // Filling with NaN is a cheap and effective way to tell if
201  // downstream code is trying to use a MultiVector's data
202  // without them having been initialized. ArithTraits lets us
203  // call nan() even if the scalar type doesn't define it; it
204  // just returns some undefined value in the latter case. This
205  // won't hurt anything because by setting zeroOut=false, users
206  // already agreed that they don't care about the contents of
207  // the MultiVector.
208  const ST nan = Kokkos::ArithTraits<ST>::nan ();
209  KokkosBlas::fill (d_view, nan);
210  }
211  }
212  if (debug) {
213  TEUCHOS_TEST_FOR_EXCEPTION
214  (static_cast<size_t> (d_view.extent (0)) != lclNumRows ||
215  static_cast<size_t> (d_view.extent (1)) != numCols, std::logic_error,
 216  "allocDualView: d_view's actual dimensions do not match "
217  "requested dimensions. d_view is " << d_view.extent (0) << " x " <<
218  d_view.extent (1) << "; requested " << lclNumRows << " x " << numCols
219  << ". Please report this bug to the Tpetra developers.");
220  }
221 
222  return wrapped_dual_view_type(d_view);
223  }
224 
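  // Example sketch (not part of the original file) of how this helper is
  // used within this translation unit; the node type name is illustrative:
  //
  //   // 100 local rows, 3 columns, zero-initialized, no padding:
  //   auto wdv = allocDualView<double, int, long long, node_type> (100, 3);
  //   // Uninitialized allocation (NaN-filled when Tpetra debug mode is on):
  //   auto wdv2 = allocDualView<double, int, long long, node_type> (100, 3, false);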
225  // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
226  //
227  // T: The type of the entries of the View.
228  // ExecSpace: The Kokkos execution space.
229  template<class T, class ExecSpace>
230  struct MakeUnmanagedView {
231  // The 'false' part of the branch carefully ensures that this
232  // won't attempt to use a host execution space that hasn't been
233  // initialized. For example, if Kokkos::OpenMP is disabled and
234  // Kokkos::Threads is enabled, the latter is always the default
235  // execution space of Kokkos::HostSpace, even when ExecSpace is
236  // Kokkos::Serial. That's why we go through the trouble of asking
237  // Kokkos::DualView what _its_ space is. That seems to work
238  // around this default execution space issue.
239  //
240  typedef typename std::conditional<
241  Kokkos::SpaceAccessibility<
242  typename ExecSpace::memory_space,
243  Kokkos::HostSpace>::accessible,
244  typename ExecSpace::device_type,
245  typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
246  typedef Kokkos::LayoutLeft array_layout;
247  typedef Kokkos::View<T*, array_layout, host_exec_space,
248  Kokkos::MemoryUnmanaged> view_type;
249 
250  static view_type getView (const Teuchos::ArrayView<T>& x_in)
251  {
252  const size_t numEnt = static_cast<size_t> (x_in.size ());
253  if (numEnt == 0) {
254  return view_type ();
255  } else {
256  return view_type (x_in.getRawPtr (), numEnt);
257  }
258  }
259  };
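  // Example sketch (not part of the original file): wrapping a
  // Teuchos::ArrayView as an unmanaged host View. 'exec_space', 'ptr', and
  // 'len' are hypothetical names:
  //
  //   Teuchos::ArrayView<double> av (ptr, len);
  //   auto hostView = MakeUnmanagedView<double, exec_space>::getView (av);
  //   // hostView aliases av's memory; no allocation or copy takes place.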
260 
261 
262  template<class WrappedDualViewType>
263  WrappedDualViewType
264  takeSubview (const WrappedDualViewType& X,
265  const std::pair<size_t, size_t>& rowRng,
266  const Kokkos::ALL_t& colRng)
267 
268  {
269  // The bug we saw below should be harder to trigger here.
270  return WrappedDualViewType(X,rowRng,colRng);
271  }
272 
273  template<class WrappedDualViewType>
274  WrappedDualViewType
275  takeSubview (const WrappedDualViewType& X,
276  const Kokkos::ALL_t& rowRng,
277  const std::pair<size_t, size_t>& colRng)
278  {
279  using DualViewType = typename WrappedDualViewType::DVT;
 280  // Look carefully at the comment in the version of this function below.
281  // The comment applies here as well.
282  if (X.extent (0) == 0 && X.extent (1) != 0) {
283  return WrappedDualViewType(DualViewType ("MV::DualView", 0, colRng.second - colRng.first));
284  }
285  else {
286  return WrappedDualViewType(X,rowRng,colRng);
287  }
288  }
289 
290  template<class WrappedDualViewType>
291  WrappedDualViewType
292  takeSubview (const WrappedDualViewType& X,
293  const std::pair<size_t, size_t>& rowRng,
294  const std::pair<size_t, size_t>& colRng)
295  {
296  using DualViewType = typename WrappedDualViewType::DVT;
 297  // If you take a subview of a view with zero rows, Kokkos::subview()
 298  // always returns a DualView with the same data pointers. This will break
 299  // pointer equality testing between two subviews of the same 2D View if
300  // it has zero row extent. While the one (known) case where this was actually used
301  // has been fixed, that sort of check could very easily be reintroduced in the future,
302  // hence I've added this if check here.
303  //
304  // This is not a bug in Kokkos::subview(), just some very subtle behavior which
305  // future developers should be wary of.
306  if (X.extent (0) == 0 && X.extent (1) != 0) {
307  return WrappedDualViewType(DualViewType ("MV::DualView", 0, colRng.second - colRng.first));
308  }
309  else {
310  return WrappedDualViewType(X,rowRng,colRng);
311  }
312  }
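  // Example sketch (not part of the original file): why the zero-row special
  // case above matters. Assume a wrapped dual view X with extents (0, 4):
  //
  //   auto Y = takeSubview (X, Kokkos::ALL (),
  //                         std::make_pair (size_t (1), size_t (3)));
  //   // Y gets a freshly allocated 0 x 2 view rather than aliasing X's data
  //   // pointer, so pointer-equality tests between column subviews of X
  //   // cannot spuriously succeed.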
313 
314  template<class WrappedOrNotDualViewType>
315  size_t
316  getDualViewStride (const WrappedOrNotDualViewType& dv)
317  {
318  // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
319  // method yet, but its Views do.
 320  // NOTE: dv.stride() fills an array of length one
 321  // more than the view's rank
322  size_t strides[WrappedOrNotDualViewType::t_dev::rank+1];
323  dv.stride(strides);
324  const size_t LDA = strides[1];
325  const size_t numRows = dv.extent (0);
326 
327  if (LDA == 0) {
328  return (numRows == 0) ? size_t (1) : numRows;
329  }
330  else {
331  return LDA;
332  }
333  }
334 
335  template<class ViewType>
336  size_t
337  getViewStride (const ViewType& view)
338  {
339  const size_t LDA = view.stride (1);
340  const size_t numRows = view.extent (0);
341 
342  if (LDA == 0) {
343  return (numRows == 0) ? size_t (1) : numRows;
344  }
345  else {
346  return LDA;
347  }
348  }
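  // Example sketch (not part of the original file): for a LayoutLeft
  // (column-major) View, stride(1) is the leading dimension (LDA) that BLAS
  // expects:
  //
  //   Kokkos::View<double**, Kokkos::LayoutLeft> A ("A", 5, 3);
  //   const size_t lda = getViewStride (A);   // 5 for an unpadded allocation
  //   // For a 0 x 0 view the raw stride would be 0; the helpers above
  //   // return 1 instead, since BLAS routines reject LDA = 0.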
349 
350  template <class impl_scalar_type, class buffer_device_type>
351  bool
352  runKernelOnHost (
353  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports
354  )
355  {
356  if (! imports.need_sync_device ()) {
357  return false; // most up-to-date on device
358  }
359  else { // most up-to-date on host,
360  // but if large enough, worth running on device anyway
361  size_t localLengthThreshold =
 362  Tpetra::Details::Behavior::multivectorKernelLocationThreshold ();
 363  return imports.extent(0) <= localLengthThreshold;
364  }
365  }
366 
367 
368  template <class SC, class LO, class GO, class NT>
369  bool
370  runKernelOnHost (const ::Tpetra::MultiVector<SC, LO, GO, NT>& X)
371  {
372  if (! X.need_sync_device ()) {
373  return false; // most up-to-date on device
374  }
375  else { // most up-to-date on host
376  // but if large enough, worth running on device anyway
377  size_t localLengthThreshold =
 378  Tpetra::Details::Behavior::multivectorKernelLocationThreshold ();
 379  return X.getLocalLength () <= localLengthThreshold;
380  }
381  }
382 
383  template <class SC, class LO, class GO, class NT>
384  void
 385  multiVectorNormImpl (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
 386  const ::Tpetra::MultiVector<SC, LO, GO, NT>& X,
 387  const ::Tpetra::Details::EWhichNorm whichNorm)
 388  {
 389  using namespace Tpetra;
 390  using ::Tpetra::Details::normImpl;
 391  using MV = MultiVector<SC, LO, GO, NT>;
392  using val_type = typename MV::impl_scalar_type;
393  using mag_type = typename MV::mag_type;
394  using dual_view_type = typename MV::dual_view_type;
395 
396  auto map = X.getMap ();
397  auto comm = map.is_null () ? nullptr : map->getComm ().getRawPtr ();
398  auto whichVecs = getMultiVectorWhichVectors (X);
399  const bool isConstantStride = X.isConstantStride ();
400  const bool isDistributed = X.isDistributed ();
401 
402  const bool runOnHost = runKernelOnHost (X);
403  if (runOnHost) {
404  using view_type = typename dual_view_type::t_host;
405  using array_layout = typename view_type::array_layout;
406  using device_type = typename view_type::device_type;
407 
408  auto X_lcl = X.getLocalViewHost(Access::ReadOnly);
409  normImpl<val_type, array_layout, device_type,
410  mag_type> (norms, X_lcl, whichNorm, whichVecs,
411  isConstantStride, isDistributed, comm);
412  }
413  else {
414  using view_type = typename dual_view_type::t_dev;
415  using array_layout = typename view_type::array_layout;
416  using device_type = typename view_type::device_type;
417 
418  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
419  normImpl<val_type, array_layout, device_type,
420  mag_type> (norms, X_lcl, whichNorm, whichVecs,
421  isConstantStride, isDistributed, comm);
422  }
423  }
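  // Example sketch (not part of the original file): multiVectorNormImpl backs
  // the public norm methods of MultiVector. A user-level view, with MV and X
  // as illustrative names:
  //
  //   std::vector<typename MV::mag_type> nrms (X.getNumVectors ());
  //   X.norm2 (Teuchos::arrayViewFromVector (nrms));  // one 2-norm per column
  //   // Whether the reduction runs on host or device follows the same
  //   // runKernelOnHost heuristic defined above.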
424 } // namespace (anonymous)
425 
426 
427 namespace Tpetra {
428 
429  namespace Details {
430  template <typename DstView, typename SrcView>
431  struct AddAssignFunctor {
432  // This functor would be better as a lambda, but CUDA cannot compile
433  // lambdas in protected functions. It compiles fine with the functor.
434  AddAssignFunctor(DstView &tgt_, SrcView &src_) : tgt(tgt_), src(src_) {}
435 
436  KOKKOS_INLINE_FUNCTION void
437  operator () (const size_t k) const { tgt(k) += src(k); }
438 
439  DstView tgt;
440  SrcView src;
441  };
442  }
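  // Example sketch (not part of the original file): applying AddAssignFunctor
  // over two rank-1 views tgt and src of equal length n:
  //
  //   Details::AddAssignFunctor<decltype (tgt), decltype (src)> aaf (tgt, src);
  //   Kokkos::parallel_for (Kokkos::RangePolicy<execution_space, size_t> (0, n), aaf);
  //   // Each iteration performs tgt(k) += src(k); see the ADD_ASSIGN handling
  //   // in copyAndPermute below.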
443 
444  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
445  bool
 446  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 447  vectorIndexOutOfRange (const size_t VectorIndex) const {
448  return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
449  }
450 
451  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 452  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 453  MultiVector () :
 454  base_type (Teuchos::rcp (new map_type ()))
455  {}
456 
457  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 458  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 459  MultiVector (const Teuchos::RCP<const map_type>& map,
460  const size_t numVecs,
461  const bool zeroOut) : /* default is true */
462  base_type (map)
463  {
464  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,numVecs,zeroOut)");
465 
466  const size_t lclNumRows = this->getLocalLength ();
467  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
468  }
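  // Example sketch (not part of the original file): constructing a
  // MultiVector with this constructor, using default template parameters.
  // The map sizes are illustrative:
  //
  //   auto comm = Tpetra::getDefaultComm ();
  //   auto map  = Teuchos::rcp (new Tpetra::Map<> (100, 0, comm));
  //   Tpetra::MultiVector<> X (map, 3);         // 3 columns, zero-initialized
  //   Tpetra::MultiVector<> Y (map, 3, false);  // contents undefined (NaN in debug)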
469 
470  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 471  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 472  MultiVector (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& source,
 473  const Teuchos::DataAccess copyOrView) :
474  base_type (source),
475  view_ (source.view_),
476  whichVectors_ (source.whichVectors_)
477  {
 478  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
 479  const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
480  "const Teuchos::DataAccess): ";
481 
482  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV 2-arg \"copy\" ctor");
483 
484  if (copyOrView == Teuchos::Copy) {
485  // Reuse the conveniently already existing function that creates
486  // a deep copy.
487  MV cpy = createCopy (source);
488  this->view_ = cpy.view_;
489  this->whichVectors_ = cpy.whichVectors_;
490  }
491  else if (copyOrView == Teuchos::View) {
492  }
493  else {
494  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
495  true, std::invalid_argument, "Second argument 'copyOrView' has an "
496  "invalid value " << copyOrView << ". Valid values include "
497  "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
498  << Teuchos::View << ".");
499  }
500  }
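  // Example sketch (not part of the original file): the two
  // Teuchos::DataAccess modes, for an existing MultiVector X:
  //
  //   Tpetra::MultiVector<> Z (X, Teuchos::Copy);  // deep copy; Z owns new storage
  //   Tpetra::MultiVector<> W (X, Teuchos::View);  // shallow; W shares X's DualView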
501 
502  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 503  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 504  MultiVector (const Teuchos::RCP<const map_type>& map,
505  const dual_view_type& view) :
506  base_type (map),
507  view_ (wrapped_dual_view_type(view))
508  {
509  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
510  const size_t lclNumRows_map = map.is_null () ? size_t (0) :
511  map->getLocalNumElements ();
512  const size_t lclNumRows_view = view.extent (0);
513  const size_t LDA = getDualViewStride (view_);
514 
515  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
516  (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
517  std::invalid_argument, "Kokkos::DualView does not match Map. "
518  "map->getLocalNumElements() = " << lclNumRows_map
519  << ", view.extent(0) = " << lclNumRows_view
520  << ", and getStride() = " << LDA << ".");
521 
522  using ::Tpetra::Details::Behavior;
523  const bool debug = Behavior::debug ();
524  if (debug) {
525  using ::Tpetra::Details::checkGlobalDualViewValidity;
526  std::ostringstream gblErrStrm;
527  const bool verbose = Behavior::verbose ();
528  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
529  const bool gblValid =
530  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
531  comm.getRawPtr ());
532  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
533  (! gblValid, std::runtime_error, gblErrStrm.str ());
534  }
535  }
536 
537 
538  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 539  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 540  MultiVector (const Teuchos::RCP<const map_type>& map,
541  const wrapped_dual_view_type& view) :
542  base_type (map),
543  view_ (view)
544  {
545  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
546  const size_t lclNumRows_map = map.is_null () ? size_t (0) :
547  map->getLocalNumElements ();
548  const size_t lclNumRows_view = view.extent (0);
549  const size_t LDA = getDualViewStride (view);
550 
551  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
552  (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
553  std::invalid_argument, "Kokkos::DualView does not match Map. "
554  "map->getLocalNumElements() = " << lclNumRows_map
555  << ", view.extent(0) = " << lclNumRows_view
556  << ", and getStride() = " << LDA << ".");
557 
558  using ::Tpetra::Details::Behavior;
559  const bool debug = Behavior::debug ();
560  if (debug) {
561  using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
562  std::ostringstream gblErrStrm;
563  const bool verbose = Behavior::verbose ();
564  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
565  const bool gblValid =
566  checkGlobalWrappedDualViewValidity (&gblErrStrm, view, verbose,
567  comm.getRawPtr ());
568  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
569  (! gblValid, std::runtime_error, gblErrStrm.str ());
570  }
571  }
572 
573 
574 
575  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 576  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 577  MultiVector (const Teuchos::RCP<const map_type>& map,
578  const typename dual_view_type::t_dev& d_view) :
579  base_type (map)
580  {
581  using Teuchos::ArrayRCP;
582  using Teuchos::RCP;
583  const char tfecfFuncName[] = "MultiVector(map,d_view): ";
584 
585  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,d_view)");
586 
587  const size_t LDA = getViewStride (d_view);
588  const size_t lclNumRows = map->getLocalNumElements ();
589  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
590  (LDA < lclNumRows, std::invalid_argument, "Map does not match "
591  "Kokkos::View. map->getLocalNumElements() = " << lclNumRows
592  << ", View's column stride = " << LDA
593  << ", and View's extent(0) = " << d_view.extent (0) << ".");
594 
595  auto h_view = Kokkos::create_mirror_view (d_view);
596  auto dual_view = dual_view_type (d_view, h_view);
597  view_ = wrapped_dual_view_type(dual_view);
598 
599  using ::Tpetra::Details::Behavior;
600  const bool debug = Behavior::debug ();
601  if (debug) {
602  using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
603  std::ostringstream gblErrStrm;
604  const bool verbose = Behavior::verbose ();
605  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
606  const bool gblValid =
607  checkGlobalWrappedDualViewValidity (&gblErrStrm, view_, verbose,
608  comm.getRawPtr ());
609  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
610  (! gblValid, std::runtime_error, gblErrStrm.str ());
611  }
612  // The user gave us a device view. In order to respect its
613  // initial contents, we mark the DualView as "modified on device."
614  // That way, the next sync will synchronize it with the host view.
615  dual_view.modify_device ();
616  }
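  // Example sketch (not part of the original file): wrapping an existing
  // device View without copying it, using default template parameters; the
  // view label and extents are illustrative:
  //
  //   using mv_type  = Tpetra::MultiVector<>;
  //   using dev_view = typename mv_type::dual_view_type::t_dev;
  //   dev_view d ("user data", map->getLocalNumElements (), 2);
  //   mv_type V (map, d);  // V aliases d and is marked modified on device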
617 
618  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 619  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 620  MultiVector (const Teuchos::RCP<const map_type>& map,
621  const dual_view_type& view,
622  const dual_view_type& origView) :
623  base_type (map),
624  view_ (wrapped_dual_view_type(view,origView))
625  {
626  const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
627 
628  const size_t LDA = getDualViewStride (origView);
629  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
630  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
631  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
632  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
633  << ". This may also mean that the input origView's first dimension (number "
 634  "of rows = " << origView.extent (0) << ") does not match the number "
635  "of entries in the Map on the calling process.");
636 
637  using ::Tpetra::Details::Behavior;
638  const bool debug = Behavior::debug ();
639  if (debug) {
640  using ::Tpetra::Details::checkGlobalDualViewValidity;
641  std::ostringstream gblErrStrm;
642  const bool verbose = Behavior::verbose ();
643  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
644  const bool gblValid_0 =
645  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
646  comm.getRawPtr ());
647  const bool gblValid_1 =
648  checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
649  comm.getRawPtr ());
650  const bool gblValid = gblValid_0 && gblValid_1;
651  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
652  (! gblValid, std::runtime_error, gblErrStrm.str ());
653  }
654  }
655 
656 
657  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 658  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 659  MultiVector (const Teuchos::RCP<const map_type>& map,
660  const dual_view_type& view,
661  const Teuchos::ArrayView<const size_t>& whichVectors) :
662  base_type (map),
663  view_ (view),
664  whichVectors_ (whichVectors.begin (), whichVectors.end ())
665  {
666  using Kokkos::ALL;
667  using Kokkos::subview;
668  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
669 
670  using ::Tpetra::Details::Behavior;
671  const bool debug = Behavior::debug ();
672  if (debug) {
673  using ::Tpetra::Details::checkGlobalDualViewValidity;
674  std::ostringstream gblErrStrm;
675  const bool verbose = Behavior::verbose ();
676  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
677  const bool gblValid =
678  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
679  comm.getRawPtr ());
680  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
681  (! gblValid, std::runtime_error, gblErrStrm.str ());
682  }
683 
684  const size_t lclNumRows = map.is_null () ? size_t (0) :
685  map->getLocalNumElements ();
686  // Check dimensions of the input DualView. We accept that Kokkos
687  // might not allow construction of a 0 x m (Dual)View with m > 0,
688  // so we only require the number of rows to match if the
689  // (Dual)View has more than zero columns. Likewise, we only
690  // require the number of columns to match if the (Dual)View has
691  // more than zero rows.
692  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
693  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
694  std::invalid_argument, "view.extent(0) = " << view.extent (0)
695  << " < map->getLocalNumElements() = " << lclNumRows << ".");
696  if (whichVectors.size () != 0) {
697  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
 698  view.extent (1) == 0,
699  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
700  " = " << whichVectors.size () << " > 0.");
701  size_t maxColInd = 0;
702  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
703  for (size_type k = 0; k < whichVectors.size (); ++k) {
704  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
705  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
706  std::invalid_argument, "whichVectors[" << k << "] = "
707  "Teuchos::OrdinalTraits<size_t>::invalid().");
708  maxColInd = std::max (maxColInd, whichVectors[k]);
709  }
710  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
711  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
712  std::invalid_argument, "view.extent(1) = " << view.extent (1)
713  << " <= max(whichVectors) = " << maxColInd << ".");
714  }
715 
716  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
717  // zero strides, so modify in that case.
718  const size_t LDA = getDualViewStride (view);
719  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
720  (LDA < lclNumRows, std::invalid_argument,
721  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
722 
723  if (whichVectors.size () == 1) {
724  // If whichVectors has only one entry, we don't need to bother
725  // with nonconstant stride. Just shift the view over so it
726  // points to the desired column.
727  //
728  // NOTE (mfh 10 May 2014) This is a special case where we set
729  // origView_ just to view that one column, not all of the
730  // original columns. This ensures that the use of origView_ in
731  // offsetView works correctly.
732  //
733  const std::pair<size_t, size_t> colRng (whichVectors[0],
734  whichVectors[0] + 1);
735  view_ = takeSubview (view_, ALL (), colRng);
736  // whichVectors_.size() == 0 means "constant stride."
737  whichVectors_.clear ();
738  }
739  }
740 
741  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 742  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 743  MultiVector (const Teuchos::RCP<const map_type>& map,
744  const wrapped_dual_view_type& view,
745  const Teuchos::ArrayView<const size_t>& whichVectors) :
746  base_type (map),
747  view_ (view),
748  whichVectors_ (whichVectors.begin (), whichVectors.end ())
749  {
750  using Kokkos::ALL;
751  using Kokkos::subview;
752  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
753 
754  using ::Tpetra::Details::Behavior;
755  const bool debug = Behavior::debug ();
756  if (debug) {
757  using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
758  std::ostringstream gblErrStrm;
759  const bool verbose = Behavior::verbose ();
760  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
761  const bool gblValid =
762  checkGlobalWrappedDualViewValidity (&gblErrStrm, view, verbose,
763  comm.getRawPtr ());
764  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
765  (! gblValid, std::runtime_error, gblErrStrm.str ());
766  }
767 
768  const size_t lclNumRows = map.is_null () ? size_t (0) :
769  map->getLocalNumElements ();
770  // Check dimensions of the input DualView. We accept that Kokkos
771  // might not allow construction of a 0 x m (Dual)View with m > 0,
772  // so we only require the number of rows to match if the
773  // (Dual)View has more than zero columns. Likewise, we only
774  // require the number of columns to match if the (Dual)View has
775  // more than zero rows.
776  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
777  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
778  std::invalid_argument, "view.extent(0) = " << view.extent (0)
779  << " < map->getLocalNumElements() = " << lclNumRows << ".");
780  if (whichVectors.size () != 0) {
781  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
 782  view.extent (1) == 0,
783  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
784  " = " << whichVectors.size () << " > 0.");
785  size_t maxColInd = 0;
786  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
787  for (size_type k = 0; k < whichVectors.size (); ++k) {
788  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
789  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
790  std::invalid_argument, "whichVectors[" << k << "] = "
791  "Teuchos::OrdinalTraits<size_t>::invalid().");
792  maxColInd = std::max (maxColInd, whichVectors[k]);
793  }
794  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
795  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
796  std::invalid_argument, "view.extent(1) = " << view.extent (1)
797  << " <= max(whichVectors) = " << maxColInd << ".");
798  }
799 
800  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
801  // zero strides, so modify in that case.
802  const size_t LDA = getDualViewStride (view);
803  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
804  (LDA < lclNumRows, std::invalid_argument,
805  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
806 
807  if (whichVectors.size () == 1) {
808  // If whichVectors has only one entry, we don't need to bother
809  // with nonconstant stride. Just shift the view over so it
810  // points to the desired column.
811  //
812  // NOTE (mfh 10 May 2014) This is a special case where we set
813  // origView_ just to view that one column, not all of the
814  // original columns. This ensures that the use of origView_ in
815  // offsetView works correctly.
816  //
817  const std::pair<size_t, size_t> colRng (whichVectors[0],
818  whichVectors[0] + 1);
819  view_ = takeSubview (view_, ALL (), colRng);
820  // whichVectors_.size() == 0 means "constant stride."
821  whichVectors_.clear ();
822  }
823  }
824 
825 
826  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 827  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 828  MultiVector (const Teuchos::RCP<const map_type>& map,
829  const dual_view_type& view,
830  const dual_view_type& origView,
831  const Teuchos::ArrayView<const size_t>& whichVectors) :
832  base_type (map),
833  view_ (wrapped_dual_view_type(view,origView)),
834  whichVectors_ (whichVectors.begin (), whichVectors.end ())
835  {
836  using Kokkos::ALL;
837  using Kokkos::subview;
838  const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
839 
840  using ::Tpetra::Details::Behavior;
841  const bool debug = Behavior::debug ();
842  if (debug) {
843  using ::Tpetra::Details::checkGlobalDualViewValidity;
844  std::ostringstream gblErrStrm;
845  const bool verbose = Behavior::verbose ();
846  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
847  const bool gblValid_0 =
848  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
849  comm.getRawPtr ());
850  const bool gblValid_1 =
851  checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
852  comm.getRawPtr ());
853  const bool gblValid = gblValid_0 && gblValid_1;
854  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
855  (! gblValid, std::runtime_error, gblErrStrm.str ());
856  }
857 
858  const size_t lclNumRows = this->getLocalLength ();
859  // Check dimensions of the input DualView. We accept that Kokkos
860  // might not allow construction of a 0 x m (Dual)View with m > 0,
861  // so we only require the number of rows to match if the
862  // (Dual)View has more than zero columns. Likewise, we only
863  // require the number of columns to match if the (Dual)View has
864  // more than zero rows.
865  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
866  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
867  std::invalid_argument, "view.extent(0) = " << view.extent (0)
868  << " < map->getLocalNumElements() = " << lclNumRows << ".");
869  if (whichVectors.size () != 0) {
870  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
 871  view.extent (1) == 0,
872  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
873  " = " << whichVectors.size () << " > 0.");
874  size_t maxColInd = 0;
875  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
876  for (size_type k = 0; k < whichVectors.size (); ++k) {
877  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
878  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
879  std::invalid_argument, "whichVectors[" << k << "] = "
880  "Teuchos::OrdinalTraits<size_t>::invalid().");
881  maxColInd = std::max (maxColInd, whichVectors[k]);
882  }
883  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
884  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
885  std::invalid_argument, "view.extent(1) = " << view.extent (1)
886  << " <= max(whichVectors) = " << maxColInd << ".");
887  }
888 
889  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
890  // zero strides, so modify in that case.
891  const size_t LDA = getDualViewStride (origView);
892  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
893  (LDA < lclNumRows, std::invalid_argument, "Map and DualView origView "
894  "do not match. LDA = " << LDA << " < this->getLocalLength() = " <<
895  lclNumRows << ". origView.extent(0) = " << origView.extent(0)
896  << ", origView.stride(1) = " << origView.d_view.stride(1) << ".");
897 
898  if (whichVectors.size () == 1) {
899  // If whichVectors has only one entry, we don't need to bother
900  // with nonconstant stride. Just shift the view over so it
901  // points to the desired column.
902  //
903  // NOTE (mfh 10 May 2014) This is a special case where we set
904  // origView_ just to view that one column, not all of the
905  // original columns. This ensures that the use of origView_ in
906  // offsetView works correctly.
907  const std::pair<size_t, size_t> colRng (whichVectors[0],
908  whichVectors[0] + 1);
909  view_ = takeSubview (view_, ALL (), colRng);
910  // origView_ = takeSubview (origView_, ALL (), colRng);
911  // whichVectors_.size() == 0 means "constant stride."
912  whichVectors_.clear ();
913  }
914  }
915 
916  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 917  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 918  MultiVector (const Teuchos::RCP<const map_type>& map,
919  const Teuchos::ArrayView<const Scalar>& data,
920  const size_t LDA,
921  const size_t numVecs) :
922  base_type (map)
923  {
924  typedef LocalOrdinal LO;
925  typedef GlobalOrdinal GO;
926  const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
927  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
928 
929  // Deep copy constructor, constant stride (NO whichVectors_).
930  // There is no need for a deep copy constructor with nonconstant stride.
931 
932  const size_t lclNumRows =
933  map.is_null () ? size_t (0) : map->getLocalNumElements ();
934  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
935  (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
936  "map->getLocalNumElements() = " << lclNumRows << ".");
937  if (numVecs != 0) {
938  const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
939  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
940  (static_cast<size_t> (data.size ()) < minNumEntries,
941  std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
942  "entries, given the input Map and number of vectors in the MultiVector."
943  " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
944  "map->getLocalNumElements () = " << minNumEntries << ".");
945  }
946 
947  this->view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
948  //Note: X_out will be completely overwritten
949  auto X_out = this->getLocalViewDevice(Access::OverwriteAll);
950 
951  // Make an unmanaged host Kokkos::View of the input data. First
952  // create a View (X_in_orig) with the original stride. Then,
953  // create a subview (X_in) with the right number of columns.
954  const impl_scalar_type* const X_in_raw =
955  reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
956  Kokkos::View<const impl_scalar_type**,
957  Kokkos::LayoutLeft,
958  Kokkos::HostSpace,
959  Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
960  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
961  auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
962 
963  // If LDA != X_out's column stride, then we need to copy one
964  // column at a time; Kokkos::deep_copy refuses to work in that
965  // case.
966  const size_t outStride =
967  X_out.extent (1) == 0 ? size_t (1) : X_out.stride (1);
968  if (LDA == outStride) { // strides are the same; deep_copy once
969  // This only works because MultiVector uses LayoutLeft.
970  // We would need a custom copy functor otherwise.
971  // DEEP_COPY REVIEW - HOST-TO-DEVICE
972  Kokkos::deep_copy (execution_space(), X_out, X_in);
973  }
974  else { // strides differ; copy one column at a time
975  typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
976  out_col_view_type;
977  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
978  in_col_view_type;
979  for (size_t j = 0; j < numVecs; ++j) {
980  out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
981  in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
982  // DEEP_COPY REVIEW - HOST-TO-DEVICE
983  Kokkos::deep_copy (execution_space(), X_out_j, X_in_j);
984  }
985  }
986  }
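  // Example sketch (not part of the original file): using this deep-copying
  // constructor with a column stride (LDA) larger than the local row count.
  // Assume 'map' has 3 local rows on each process:
  //
  //   std::vector<double> buf (4 * 2);   // LDA = 4, numVecs = 2
  //   Teuchos::ArrayView<const double> data (buf.data (), buf.size ());
  //   Tpetra::MultiVector<> M (map, data, 4, 2);  // copies rows 0..2 of each column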
987 
988  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 989  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 990  MultiVector (const Teuchos::RCP<const map_type>& map,
991  const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
992  const size_t numVecs) :
993  base_type (map)
994  {
995  typedef impl_scalar_type IST;
996  typedef LocalOrdinal LO;
997  typedef GlobalOrdinal GO;
998  const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
999  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
1000 
1001  const size_t lclNumRows =
1002  map.is_null () ? size_t (0) : map->getLocalNumElements ();
1003  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1004  (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
1005  std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
1006  "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
1007  for (size_t j = 0; j < numVecs; ++j) {
1008  Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
1009  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1010  static_cast<size_t> (X_j_av.size ()) < lclNumRows,
1011  std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
1012  << X_j_av.size () << " < map->getLocalNumElements() = " << lclNumRows
1013  << ".");
1014  }
1015 
1016  view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
1017  auto X_out = this->getLocalViewDevice(Access::ReadWrite);
1018 
1019  // Make sure that the type of a single input column has the same
1020  // array layout as each output column does, so we can deep_copy.
1021  using array_layout = typename decltype (X_out)::array_layout;
1022  using input_col_view_type = typename Kokkos::View<const IST*,
1023  array_layout,
1024  Kokkos::HostSpace,
1025  Kokkos::MemoryUnmanaged>;
1026 
1027  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
1028  for (size_t j = 0; j < numVecs; ++j) {
1029  Teuchos::ArrayView<const IST> X_j_av =
1030  Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
1031  input_col_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
1032  auto X_j_out = Kokkos::subview (X_out, rowRng, j);
1033  // DEEP_COPY REVIEW - HOST-TO-DEVICE
1034  Kokkos::deep_copy (execution_space(), X_j_out, X_j_in);
1035  }
1036  }
1037 
1038  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 1039  bool
 1040  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::isConstantStride () const {
 1041  return whichVectors_.empty ();
1042  }
1043 
1044  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1045  size_t
 1046  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1047  getLocalLength () const
 1048  {
1049  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
1050  return static_cast<size_t> (0);
1051  } else {
1052  return this->getMap ()->getLocalNumElements ();
1053  }
1054  }
1055 
1056  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 1057  global_size_t
 1058  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1059  getGlobalLength () const
 1060  {
1061  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
1062  return static_cast<size_t> (0);
1063  } else {
1064  return this->getMap ()->getGlobalNumElements ();
1065  }
1066  }
1067 
1068  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1069  size_t
 1070  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1071  getStride () const
1072  {
1073  return isConstantStride () ? getDualViewStride (view_) : size_t (0);
1074  }
1075 
1076  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1077  bool
1080  {
1081  //Don't actually get a view, just get pointers.
1082  auto thisData = view_.getDualView().h_view.data();
1083  auto otherData = other.view_.getDualView().h_view.data();
1084  size_t thisSpan = view_.getDualView().h_view.span();
1085  size_t otherSpan = other.view_.getDualView().h_view.span();
1086  return (otherData <= thisData && thisData < otherData + otherSpan)
1087  || (thisData <= otherData && otherData < thisData + thisSpan);
1088  }
1089 
1090  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1091  bool
 1092  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1093  checkSizes (const SrcDistObject& sourceObj)
1094  {
1095  // Check whether the source object is a MultiVector. If not, then
1096  // we can't even compare sizes, so it's definitely not OK to
1097  // Import or Export from it.
 1098  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
 1099  const MV* src = dynamic_cast<const MV*> (&sourceObj);
1100  if (src == nullptr) {
1101  return false;
1102  }
1103  else {
1104  // The target of the Import or Export calls checkSizes() in
1105  // DistObject::doTransfer(). By that point, we've already
1106  // constructed an Import or Export object using the two
1107  // multivectors' Maps, which means that (hopefully) we've
1108  // already checked other attributes of the multivectors. Thus,
1109  // all we need to do here is check the number of columns.
1110  return src->getNumVectors () == this->getNumVectors ();
1111  }
1112  }
1113 
1114  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1115  size_t
 1116  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1117  constantNumberOfPackets () const {
 1118  return this->getNumVectors ();
1119  }
1120 
1121  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1122  void
 1123  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1124  copyAndPermute
 1125  (const SrcDistObject& sourceObj,
1126  const size_t numSameIDs,
1127  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1128  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1129  const CombineMode CM,
1130  const execution_space &space)
1131  {
1132  using ::Tpetra::Details::Behavior;
 1133  using ::Tpetra::Details::getDualViewCopyFromArrayView;
 1134  using ::Tpetra::Details::ProfilingRegion;
1135  using std::endl;
1136  using KokkosRefactor::Details::permute_array_multi_column;
1137  using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
1138  using Kokkos::Compat::create_const_view;
 1139  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
 1140 
1141  // We've already called checkSizes(), so this cast must succeed.
1142  MV& sourceMV = const_cast<MV &>(dynamic_cast<const MV&> (sourceObj));
1143  const bool copyOnHost = runKernelOnHost(sourceMV);
1144  const char longFuncNameHost[] = "Tpetra::MultiVector::copyAndPermute[Host]";
1145  const char longFuncNameDevice[] = "Tpetra::MultiVector::copyAndPermute[Device]";
1146  const char tfecfFuncName[] = "copyAndPermute: ";
1147  ProfilingRegion regionCAP (copyOnHost ? longFuncNameHost : longFuncNameDevice);
1148 
1149  const bool verbose = Behavior::verbose ();
1150  std::unique_ptr<std::string> prefix;
1151  if (verbose) {
1152  auto map = this->getMap ();
1153  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1154  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1155  std::ostringstream os;
1156  os << "Proc " << myRank << ": MV::copyAndPermute: ";
1157  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1158  os << "Start" << endl;
1159  std::cerr << os.str ();
1160  }
1161 
1162  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1163  (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0),
1164  std::logic_error, "permuteToLIDs.extent(0) = "
1165  << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = "
1166  << permuteFromLIDs.extent (0) << ".");
1167  const size_t numCols = this->getNumVectors ();
1168 
1169  // sourceMV doesn't belong to us, so we can't sync it. Do the
1170  // copying where it's currently sync'd.
1171  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1172  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1173  std::logic_error, "Input MultiVector needs sync to both host "
1174  "and device.");
1175  if (verbose) {
1176  std::ostringstream os;
1177  os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1178  std::cerr << os.str ();
1179  }
1180 
1181  if (verbose) {
1182  std::ostringstream os;
1183  os << *prefix << "Copy" << endl;
1184  std::cerr << os.str ();
1185  }
1186 
1187  // TODO (mfh 15 Sep 2013) When we replace
1188  // KokkosClassic::MultiVector with a Kokkos::View, there are two
1189  // ways to copy the data:
1190  //
1191  // 1. Get a (sub)view of each column and call deep_copy on that.
1192  // 2. Write a custom kernel to copy the data.
1193  //
1194  // The first is easier, but the second might be more performant in
1195  // case we decide to use layouts other than LayoutLeft. It might
1196  // even make sense to hide whichVectors_ in an entirely new layout
1197  // for Kokkos Views.
1198 
1199  // Copy rows [0, numSameIDs-1] of the local multivectors.
1200  //
1201  // Note (ETP 2 Jul 2014) We need to always copy one column at a
1202  // time, even when both multivectors are constant-stride, since
1203  // deep_copy between strided subviews with more than one column
1204  // doesn't currently work.
1205 
1206  // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1207  // both source and target are constant stride and have multiple
1208  // columns.
1209  if (numSameIDs > 0) {
1210  const std::pair<size_t, size_t> rows (0, numSameIDs);
1211  if (copyOnHost) {
1212  auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1213  auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1214 
1215  for (size_t j = 0; j < numCols; ++j) {
1216  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1217  const size_t srcCol =
1218  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1219 
1220  auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
1221  auto src_j = Kokkos::subview (src_h, rows, srcCol);
1222  if (CM == ADD_ASSIGN) {
1223  // Sum src_j into tgt_j
1224  using range_t =
1225  Kokkos::RangePolicy<execution_space, size_t>;
1226  range_t rp(space, 0,numSameIDs);
1227  Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1228  aaf(tgt_j, src_j);
1229  Kokkos::parallel_for(rp, aaf);
1230  }
1231  else {
1232  // Copy src_j into tgt_j
1233  // DEEP_COPY REVIEW - HOSTMIRROR-TO-HOSTMIRROR
1234  Kokkos::deep_copy (space, tgt_j, src_j);
1235  space.fence();
1236  }
1237  }
1238  }
1239  else { // copy on device
1240  auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1241  auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1242 
1243  for (size_t j = 0; j < numCols; ++j) {
1244  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1245  const size_t srcCol =
1246  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1247 
1248  auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
1249  auto src_j = Kokkos::subview (src_d, rows, srcCol);
1250  if (CM == ADD_ASSIGN) {
1251  // Sum src_j into tgt_j
1252  using range_t =
1253  Kokkos::RangePolicy<execution_space, size_t>;
1254  range_t rp(space, 0,numSameIDs);
1255  Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1256  aaf(tgt_j, src_j);
1257  Kokkos::parallel_for(rp, aaf);
1258  }
1259  else {
1260  // Copy src_j into tgt_j
1261  // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
1262  Kokkos::deep_copy (space, tgt_j, src_j);
1263  space.fence();
1264  }
1265  }
1266  }
1267  }
1268 
1269 
1270  // For the remaining GIDs, execute the permutations. This may
1271  // involve noncontiguous access of both source and destination
1272  // vectors, depending on the LID lists.
1273  //
1274  // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1275  // the same process, this merges their values by replacement of
1276  // the last encountered GID, not by the specified merge rule
1277  // (such as ADD).
1278 
1279  // If there are no permutations, we are done
1280  if (permuteFromLIDs.extent (0) == 0 ||
1281  permuteToLIDs.extent (0) == 0) {
1282  if (verbose) {
1283  std::ostringstream os;
1284  os << *prefix << "No permutations. Done!" << endl;
1285  std::cerr << os.str ();
1286  }
1287  return;
1288  }
1289 
1290  if (verbose) {
1291  std::ostringstream os;
1292  os << *prefix << "Permute" << endl;
1293  std::cerr << os.str ();
1294  }
1295 
1296  // We could in theory optimize for the case where exactly one of
1297  // them is constant stride, but we don't currently do that.
1298  const bool nonConstStride =
1299  ! this->isConstantStride () || ! sourceMV.isConstantStride ();
1300 
1301  if (verbose) {
1302  std::ostringstream os;
1303  os << *prefix << "nonConstStride="
1304  << (nonConstStride ? "true" : "false") << endl;
1305  std::cerr << os.str ();
1306  }
1307 
1308  // We only need the "which vectors" arrays if either the source or
1309  // target MV is not constant stride. Since we only have one
1310  // kernel that must do double-duty, we have to create a "which
1311  // vectors" array for the MV that _is_ constant stride.
1312  Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1313  Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1314  if (nonConstStride) {
1315  if (this->whichVectors_.size () == 0) {
1316  Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
1317  tmpTgt.modify_host ();
1318  for (size_t j = 0; j < numCols; ++j) {
1319  tmpTgt.h_view(j) = j;
1320  }
1321  if (! copyOnHost) {
1322  tmpTgt.sync_device ();
1323  }
1324  tgtWhichVecs = tmpTgt;
1325  }
1326  else {
1327  Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
1328  tgtWhichVecs =
1329  getDualViewCopyFromArrayView<size_t, device_type> (tgtWhichVecsT,
1330  "tgtWhichVecs",
1331  copyOnHost);
1332  }
1333  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1334  (static_cast<size_t> (tgtWhichVecs.extent (0)) !=
1335  this->getNumVectors (),
1336  std::logic_error, "tgtWhichVecs.extent(0) = " <<
1337  tgtWhichVecs.extent (0) << " != this->getNumVectors() = " <<
1338  this->getNumVectors () << ".");
1339 
1340  if (sourceMV.whichVectors_.size () == 0) {
1341  Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
1342  tmpSrc.modify_host ();
1343  for (size_t j = 0; j < numCols; ++j) {
1344  tmpSrc.h_view(j) = j;
1345  }
1346  if (! copyOnHost) {
1347  tmpSrc.sync_device ();
1348  }
1349  srcWhichVecs = tmpSrc;
1350  }
1351  else {
1352  Teuchos::ArrayView<const size_t> srcWhichVecsT =
1353  sourceMV.whichVectors_ ();
1354  srcWhichVecs =
1355  getDualViewCopyFromArrayView<size_t, device_type> (srcWhichVecsT,
1356  "srcWhichVecs",
1357  copyOnHost);
1358  }
1359  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1360  (static_cast<size_t> (srcWhichVecs.extent (0)) !=
1361  sourceMV.getNumVectors (), std::logic_error,
1362  "srcWhichVecs.extent(0) = " << srcWhichVecs.extent (0)
1363  << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
1364  << ".");
1365  }
1366 
1367  if (copyOnHost) { // permute on host too
1368  if (verbose) {
1369  std::ostringstream os;
1370  os << *prefix << "Get permute LIDs on host" << std::endl;
1371  std::cerr << os.str ();
1372  }
1373  auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1374  auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1375 
1376  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
1377  auto permuteToLIDs_h = create_const_view (permuteToLIDs.view_host ());
1378  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
1379  auto permuteFromLIDs_h =
1380  create_const_view (permuteFromLIDs.view_host ());
1381 
1382  if (verbose) {
1383  std::ostringstream os;
1384  os << *prefix << "Permute on host" << endl;
1385  std::cerr << os.str ();
1386  }
1387  if (nonConstStride) {
1388  // No need to sync first, because copyOnHost argument to
1389  // getDualViewCopyFromArrayView puts them in the right place.
1390  auto tgtWhichVecs_h =
1391  create_const_view (tgtWhichVecs.view_host ());
1392  auto srcWhichVecs_h =
1393  create_const_view (srcWhichVecs.view_host ());
1394  if (CM == ADD_ASSIGN) {
1395  using op_type = KokkosRefactor::Details::AddOp;
1396  permute_array_multi_column_variable_stride (tgt_h, src_h,
1397  permuteToLIDs_h,
1398  permuteFromLIDs_h,
1399  tgtWhichVecs_h,
1400  srcWhichVecs_h, numCols,
1401  op_type());
1402  }
1403  else {
1404  using op_type = KokkosRefactor::Details::InsertOp;
1405  permute_array_multi_column_variable_stride (tgt_h, src_h,
1406  permuteToLIDs_h,
1407  permuteFromLIDs_h,
1408  tgtWhichVecs_h,
1409  srcWhichVecs_h, numCols,
1410  op_type());
1411  }
1412  }
1413  else {
1414  if (CM == ADD_ASSIGN) {
1415  using op_type = KokkosRefactor::Details::AddOp;
1416  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1417  permuteFromLIDs_h, numCols, op_type());
1418  }
1419  else {
1420  using op_type = KokkosRefactor::Details::InsertOp;
1421  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1422  permuteFromLIDs_h, numCols, op_type());
1423  }
1424  }
1425  }
1426  else { // permute on device
1427  if (verbose) {
1428  std::ostringstream os;
1429  os << *prefix << "Get permute LIDs on device" << endl;
1430  std::cerr << os.str ();
1431  }
1432  auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1433  auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1434 
1435  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () );
1436  auto permuteToLIDs_d = create_const_view (permuteToLIDs.view_device ());
1437  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () );
1438  auto permuteFromLIDs_d =
1439  create_const_view (permuteFromLIDs.view_device ());
1440 
1441  if (verbose) {
1442  std::ostringstream os;
1443  os << *prefix << "Permute on device" << endl;
1444  std::cerr << os.str ();
1445  }
1446  if (nonConstStride) {
1447  // No need to sync first, because copyOnHost argument to
1448  // getDualViewCopyFromArrayView puts them in the right place.
1449  auto tgtWhichVecs_d = create_const_view (tgtWhichVecs.view_device ());
1450  auto srcWhichVecs_d = create_const_view (srcWhichVecs.view_device ());
1451  if (CM == ADD_ASSIGN) {
1452  using op_type = KokkosRefactor::Details::AddOp;
1453  permute_array_multi_column_variable_stride (space, tgt_d, src_d,
1454  permuteToLIDs_d,
1455  permuteFromLIDs_d,
1456  tgtWhichVecs_d,
1457  srcWhichVecs_d, numCols,
1458  op_type());
1459  }
1460  else {
1461  using op_type = KokkosRefactor::Details::InsertOp;
1462  permute_array_multi_column_variable_stride (space, tgt_d, src_d,
1463  permuteToLIDs_d,
1464  permuteFromLIDs_d,
1465  tgtWhichVecs_d,
1466  srcWhichVecs_d, numCols,
1467  op_type());
1468  }
1469  }
1470  else {
1471  if (CM == ADD_ASSIGN) {
1472  using op_type = KokkosRefactor::Details::AddOp;
1473  permute_array_multi_column (space, tgt_d, src_d, permuteToLIDs_d,
1474  permuteFromLIDs_d, numCols, op_type());
1475  }
1476  else {
1477  using op_type = KokkosRefactor::Details::InsertOp;
1478  permute_array_multi_column (space, tgt_d, src_d, permuteToLIDs_d,
1479  permuteFromLIDs_d, numCols, op_type());
1480  }
1481  }
1482  }
1483 
1484  if (verbose) {
1485  std::ostringstream os;
1486  os << *prefix << "Done!" << endl;
1487  std::cerr << os.str ();
1488  }
1489  }
1490 
1491 // clang-format on
1492 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1493 void MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::copyAndPermute(
1494  const SrcDistObject &sourceObj, const size_t numSameIDs,
1495  const Kokkos::DualView<const local_ordinal_type *, buffer_device_type>
1496  &permuteToLIDs,
1497  const Kokkos::DualView<const local_ordinal_type *, buffer_device_type>
1498  &permuteFromLIDs,
1499  const CombineMode CM) {
1500  copyAndPermute(sourceObj, numSameIDs, permuteToLIDs, permuteFromLIDs, CM,
1501  execution_space());
1502 }
1503 // clang-format off
1504 
1505  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1506  void
1507  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
1508  packAndPrepare
1509  (const SrcDistObject& sourceObj,
1510  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1511  Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1512  Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1513  size_t& constantNumPackets,
1514  const execution_space &space)
1515  {
1516  using ::Tpetra::Details::Behavior;
1517  using ::Tpetra::Details::ProfilingRegion;
1518  using ::Tpetra::Details::reallocDualViewIfNeeded;
1519  using Kokkos::Compat::create_const_view;
1520  using Kokkos::Compat::getKokkosViewDeepCopy;
1521  using std::endl;
1522  using MV = MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;;
1523 
1524  // We've already called checkSizes(), so this cast must succeed.
1525  MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&> (sourceObj));
1526 
1527  const bool packOnHost = runKernelOnHost(sourceMV);
1528  const char longFuncNameHost[] = "Tpetra::MultiVector::packAndPrepare[Host]";
1529  const char longFuncNameDevice[] = "Tpetra::MultiVector::packAndPrepare[Device]";
1530  const char tfecfFuncName[] = "packAndPrepare: ";
1531  ProfilingRegion regionPAP (packOnHost ? longFuncNameHost : longFuncNameDevice);
1532 
1533  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1534  // have the option to check indices. We do so when Tpetra is in
1535  // debug mode. It is in debug mode by default in a debug build,
1536  // but you may control this at run time, before launching the
1537  // executable, by setting the TPETRA_DEBUG environment variable to
1538  // "1" (or "TRUE").
1539  const bool debugCheckIndices = Behavior::debug ();
1540  // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1541  // environment variable to "1" (or "TRUE") for copious debug
1542  // output to std::cerr on every MPI process. This is unwise for
1543  // runs with large numbers of MPI processes.
1544  const bool printDebugOutput = Behavior::verbose ();
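 // A hedged usage sketch (not part of this file): to turn these checks
 // on at run time before launching the executable, one might do, e.g.,
 //
 //   export TPETRA_DEBUG=1     # extra index checking in pack/unpack
 //   export TPETRA_VERBOSE=1   # per-process debug output to std::cerr
 //   mpirun -np 4 ./myApp      # "myApp" is an illustrative name only
 //
 // Both flags are read through ::Tpetra::Details::Behavior, as above.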
1545 
1546  std::unique_ptr<std::string> prefix;
1547  if (printDebugOutput) {
1548  auto map = this->getMap ();
1549  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1550  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1551  std::ostringstream os;
1552  os << "Proc " << myRank << ": MV::packAndPrepare: ";
1553  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1554  os << "Start" << endl;
1555  std::cerr << os.str ();
1556  }
1557 
1558 
1559  const size_t numCols = sourceMV.getNumVectors ();
1560 
1561  // This spares us from needing to fill numExportPacketsPerLID.
1562  // Setting constantNumPackets to a nonzero value signals that
1563  // all packets have the same number of entries.
1564  constantNumPackets = numCols;
1565 
1566  // If we have no exports, there is nothing to do. Make sure this
1567  // goes _after_ setting constantNumPackets correctly.
1568  if (exportLIDs.extent (0) == 0) {
1569  if (printDebugOutput) {
1570  std::ostringstream os;
1571  os << *prefix << "No exports on this proc, DONE" << std::endl;
1572  std::cerr << os.str ();
1573  }
1574  return;
1575  }
1576 
1577  /* The layout in the export for MultiVectors is as follows:
1578  exports = { all of the data from row exportLIDs.front() ;
1579  ....
1580  all of the data from row exportLIDs.back() }
1581  This doesn't have the best locality, but is necessary because
1582  the data for a Packet (all data associated with an LID) is
1583  required to be contiguous. */
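 /* A hedged illustration of the layout above (a sketch, not code from
 this file): with constantNumPackets == numCols, column j of the row
 with local index exportLIDs(i) is packed at

 exports( i * numCols + j ) = src( exportLIDs(i), j );

 so the numCols values belonging to one LID stay contiguous. */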
1584 
1585  // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1586  // packing scheme in the above comment? The data going to a
1587  // particular process must be contiguous, of course, but those
1588  // data could include entries from multiple LIDs. DistObject just
1589  // needs to know how to index into that data. Kokkos is good at
1590  // decoupling storage intent from data layout choice.
1591 
1592  const size_t numExportLIDs = exportLIDs.extent (0);
1593  const size_t newExportsSize = numCols * numExportLIDs;
1594  if (printDebugOutput) {
1595  std::ostringstream os;
1596  os << *prefix << "realloc: "
1597  << "numExportLIDs: " << numExportLIDs
1598  << ", exports.extent(0): " << exports.extent (0)
1599  << ", newExportsSize: " << newExportsSize << std::endl;
1600  std::cerr << os.str ();
1601  }
1602  reallocDualViewIfNeeded (exports, newExportsSize, "exports");
1603 
1604  // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1605  // sync it. Pack it where it's currently sync'd.
1606  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1607  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1608  std::logic_error, "Input MultiVector needs sync to both host "
1609  "and device.");
1610  if (printDebugOutput) {
1611  std::ostringstream os;
1612  os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1613  std::cerr << os.str ();
1614  }
1615 
1616  // Mark 'exports' here, since we might have resized it above.
1617  // Resizing currently requires calling the constructor, which
1618  // clears out the 'modified' flags.
1619  if (packOnHost) {
1620  // nde 06 Feb 2020: If 'exports' does not require resize
1621  // when reallocDualViewIfNeeded is called, the modified flags
1622  // are not cleared out. This can result in host and device views
1623  // being out-of-sync, resulting in an error in exports.modify_* calls.
1624  // Clearing the sync flags prevents this possible case.
1625  exports.clear_sync_state ();
1626  exports.modify_host ();
1627  }
1628  else {
1629  // nde 06 Feb 2020: If 'exports' does not require resize
1630  // when reallocDualViewIfNeeded is called, the modified flags
1631  // are not cleared out. This can result in host and device views
1632  // being out-of-sync, resulting in an error in exports.modify_* calls.
1633  // Clearing the sync flags prevents this possible case.
1634  exports.clear_sync_state ();
1635  exports.modify_device ();
1636  }
1637 
1638  if (numCols == 1) { // special case for one column only
1639  // MultiVector always represents a single column with constant
1640  // stride, but it doesn't hurt to implement both cases anyway.
1641  //
1642  // ETP: I'm not sure I agree with the above statement. Can't a single-
1643  // column multivector be a subview of another multi-vector, in which case
1644  // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1645  // separately.
1646  //
1647  // mfh 18 Jan 2016: In answer to ETP's comment above:
1648  // MultiVector treats single-column MultiVectors created using a
1649  // "nonconstant stride constructor" as a special case, and makes
1650  // them constant stride (by making whichVectors_ have length 0).
1651  if (sourceMV.isConstantStride ()) {
1652  using KokkosRefactor::Details::pack_array_single_column;
1653  if (printDebugOutput) {
1654  std::ostringstream os;
1655  os << *prefix << "Pack numCols=1 const stride" << endl;
1656  std::cerr << os.str ();
1657  }
1658  if (packOnHost) {
1659  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1660  pack_array_single_column (exports.view_host (),
1661  src_host,
1662  exportLIDs.view_host (),
1663  0,
1664  debugCheckIndices);
1665  }
1666  else { // pack on device
1667  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1668  pack_array_single_column (exports.view_device (),
1669  src_dev,
1670  exportLIDs.view_device (),
1671  0,
1672  debugCheckIndices,
1673  space);
1674  }
1675  }
1676  else {
1677  using KokkosRefactor::Details::pack_array_single_column;
1678  if (printDebugOutput) {
1679  std::ostringstream os;
1680  os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1681  std::cerr << os.str ();
1682  }
1683  if (packOnHost) {
1684  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1685  pack_array_single_column (exports.view_host (),
1686  src_host,
1687  exportLIDs.view_host (),
1688  sourceMV.whichVectors_[0],
1689  debugCheckIndices);
1690  }
1691  else { // pack on device
1692  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1693  pack_array_single_column (exports.view_device (),
1694  src_dev,
1695  exportLIDs.view_device (),
1696  sourceMV.whichVectors_[0],
1697  debugCheckIndices,
1698  space);
1699  }
1700  }
1701  }
1702  else { // the source MultiVector has multiple columns
1703  if (sourceMV.isConstantStride ()) {
1704  using KokkosRefactor::Details::pack_array_multi_column;
1705  if (printDebugOutput) {
1706  std::ostringstream os;
1707  os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1708  std::cerr << os.str ();
1709  }
1710  if (packOnHost) {
1711  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1712  pack_array_multi_column (exports.view_host (),
1713  src_host,
1714  exportLIDs.view_host (),
1715  numCols,
1716  debugCheckIndices);
1717  }
1718  else { // pack on device
1719  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1720  pack_array_multi_column (exports.view_device (),
1721  src_dev,
1722  exportLIDs.view_device (),
1723  numCols,
1724  debugCheckIndices,
1725  space);
1726  }
1727  }
1728  else {
1729  using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1730  if (printDebugOutput) {
1731  std::ostringstream os;
1732  os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1733  << endl;
1734  std::cerr << os.str ();
1735  }
1736  // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1737  // whichVectors_ can be expensive, but pack and unpack for
1738  // nonconstant-stride MultiVectors is slower anyway.
1739  using IST = impl_scalar_type;
1740  using DV = Kokkos::DualView<IST*, device_type>;
1741  using HES = typename DV::t_host::execution_space;
1742  using DES = typename DV::t_dev::execution_space;
1743  Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_ ();
1744  if (packOnHost) {
1745  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1746  pack_array_multi_column_variable_stride
1747  (exports.view_host (),
1748  src_host,
1749  exportLIDs.view_host (),
1750  getKokkosViewDeepCopy<HES> (whichVecs),
1751  numCols,
1752  debugCheckIndices);
1753  }
1754  else { // pack on device
1755  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1756  pack_array_multi_column_variable_stride
1757  (exports.view_device (),
1758  src_dev,
1759  exportLIDs.view_device (),
1760  getKokkosViewDeepCopy<DES> (whichVecs),
1761  numCols,
1762  debugCheckIndices, space);
1763  }
1764  }
1765  }
1766 
1767  if (printDebugOutput) {
1768  std::ostringstream os;
1769  os << *prefix << "Done!" << endl;
1770  std::cerr << os.str ();
1771  }
1772 
1773  }
1774 
1775 // clang-format on
1776  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1777  void
1778  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
1779  packAndPrepare
1780  (const SrcDistObject& sourceObj,
1781  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1782  Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1783  Kokkos::DualView<size_t*, buffer_device_type> numExportPacketsPerLID,
1784  size_t& constantNumPackets) {
1785  packAndPrepare(sourceObj, exportLIDs, exports, numExportPacketsPerLID, constantNumPackets, execution_space());
1786  }
1787 // clang-format off
1788 
1789  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1790  template <class NO>
1791  typename std::enable_if<std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1792  typename NO::device_type::memory_space>::value,
1793  bool>::type
1794  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
1795  reallocImportsIfNeededImpl (const size_t newSize,
1796  const bool verbose,
1797  const std::string* prefix,
1798  const bool areRemoteLIDsContiguous,
1799  const CombineMode CM)
1800  {
1801  // This implementation of reallocImportsIfNeeded is an
1802  // optimization that is specific to MultiVector. We check if the
1803  // imports_ view can be aliased to the end of the data view_. If
1804  // that is the case, we can skip the unpackAndCombine call.
1805 
1806  if (verbose) {
1807  std::ostringstream os;
1808  os << *prefix << "Realloc (if needed) imports_ from "
1809  << this->imports_.extent (0) << " to " << newSize << std::endl;
1810  std::cerr << os.str ();
1811  }
1812 
1813  bool reallocated = false;
1814 
1815  using IST = impl_scalar_type;
1816  using DV = Kokkos::DualView<IST*, Kokkos::LayoutLeft, buffer_device_type>;
1817 
1818  // Conditions for aliasing memory:
1819  // - When both sides of the dual view are in the same memory
1820  // space, we do not need to worry about syncing things.
1821  // - When both memory spaces are different, we only alias if this
1822  // does not incur additional sync'ing.
1823  // - The remote LIDs need to be contiguous, so that we do not need
1824  // to reorder received information.
1825  // - CombineMode needs to be INSERT.
1826  // - The number of vectors needs to be 1, otherwise we need to
1827  // reorder the received data.
1828  if ((dual_view_type::impl_dualview_is_single_device::value ||
1829  (Details::Behavior::assumeMpiIsGPUAware () && !this->need_sync_device()) ||
1830  (!Details::Behavior::assumeMpiIsGPUAware () && !this->need_sync_host())) &&
1831  areRemoteLIDsContiguous &&
1832  (CM == INSERT || CM == REPLACE) &&
1833  (getNumVectors() == 1) &&
1834  (newSize > 0)) {
1835 
1836  size_t offset = getLocalLength () - newSize;
1837  reallocated = this->imports_.d_view.data() != view_.getDualView().d_view.data() + offset;
1838  if (reallocated) {
1839  typedef std::pair<size_t, size_t> range_type;
1840  this->imports_ = DV(view_.getDualView(),
1841  range_type (offset, getLocalLength () ),
1842  0);
1843 
1844  if (verbose) {
1845  std::ostringstream os;
1846  os << *prefix << "Aliased imports_ to MV.view_" << std::endl;
1847  std::cerr << os.str ();
1848  }
1849  }
1850  return reallocated;
1851  }
1852  {
1854  reallocated =
1855  reallocDualViewIfNeeded (this->unaliased_imports_, newSize, "imports");
1856  if (verbose) {
1857  std::ostringstream os;
1858  os << *prefix << "Finished realloc'ing unaliased_imports_" << std::endl;
1859  std::cerr << os.str ();
1860  }
1861  this->imports_ = this->unaliased_imports_;
1862  }
1863  return reallocated;
1864  }
1865 
1866  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1867  template <class NO>
1868  typename std::enable_if<!std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1869  typename NO::device_type::memory_space>::value,
1870  bool>::type
1871  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
1872  reallocImportsIfNeededImpl (const size_t newSize,
1873  const bool verbose,
1874  const std::string* prefix,
1875  const bool areRemoteLIDsContiguous,
1876  const CombineMode CM)
1877  {
1878  return DistObject<Scalar, LocalOrdinal, GlobalOrdinal, Node>::reallocImportsIfNeeded(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1879  }
1880 
1881  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1882  bool
1883  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
1884  reallocImportsIfNeeded(const size_t newSize,
1885  const bool verbose,
1886  const std::string* prefix,
1887  const bool areRemoteLIDsContiguous,
1888  const CombineMode CM) {
1890  return reallocImportsIfNeededImpl<Node>(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1891  }
1892 
1893  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1894  bool
1895  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
1896  importsAreAliased () {
1897  return (this->imports_.d_view.data() + this->imports_.d_view.extent(0) ==
1898  view_.getDualView().d_view.data() + view_.getDualView().d_view.extent(0));
1899  }
1900 
1901 
1902  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1903  void
1904  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
1905  unpackAndCombine
1906  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1907  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1908  Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1909  const size_t constantNumPackets,
1910  const CombineMode CM,
1911  const execution_space &space)
1912  {
1913  using ::Tpetra::Details::Behavior;
1914  using ::Tpetra::Details::ProfilingRegion;
1915  using KokkosRefactor::Details::unpack_array_multi_column;
1916  using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1917  using Kokkos::Compat::getKokkosViewDeepCopy;
1918  using std::endl;
1919 
1920  const bool unpackOnHost = runKernelOnHost(imports);
1921 
1922  const char longFuncNameHost[] = "Tpetra::MultiVector::unpackAndCombine[Host]";
1923  const char longFuncNameDevice[] = "Tpetra::MultiVector::unpackAndCombine[Device]";
1924  const char * longFuncName = unpackOnHost ? longFuncNameHost : longFuncNameDevice;
1925  const char tfecfFuncName[] = "unpackAndCombine: ";
1926  ProfilingRegion regionUAC (longFuncName);
1927 
1928  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1929  // have the option to check indices. We do so when Tpetra is in
1930  // debug mode. It is in debug mode by default in a debug build,
1931  // but you may control this at run time, before launching the
1932  // executable, by setting the TPETRA_DEBUG environment variable to
1933  // "1" (or "TRUE").
1934  const bool debugCheckIndices = Behavior::debug ();
1935 
1936  const bool printDebugOutput = Behavior::verbose ();
1937  std::unique_ptr<std::string> prefix;
1938  if (printDebugOutput) {
1939  auto map = this->getMap ();
1940  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1941  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1942  std::ostringstream os;
1943  os << "Proc " << myRank << ": " << longFuncName << ": ";
1944  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1945  os << "Start" << endl;
1946  std::cerr << os.str ();
1947  }
1948 
1949  // If we have no imports, there is nothing to do
1950  if (importLIDs.extent (0) == 0) {
1951  if (printDebugOutput) {
1952  std::ostringstream os;
1953  os << *prefix << "No imports. Done!" << endl;
1954  std::cerr << os.str ();
1955  }
1956  return;
1957  }
1958 
1959  // Check whether imports_ is a subview of the MV view.
1960  if (importsAreAliased()) {
1961  if (printDebugOutput) {
1962  std::ostringstream os;
1963  os << *prefix << "Skipping unpack, since imports_ is aliased to MV.view_. Done!" << endl;
1964  std::cerr << os.str ();
1965  }
1966  return;
1967  }
1968 
1969 
1970  const size_t numVecs = getNumVectors ();
1971  if (debugCheckIndices) {
1972  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1973  (static_cast<size_t> (imports.extent (0)) !=
1974  numVecs * importLIDs.extent (0),
1975  std::runtime_error,
1976  "imports.extent(0) = " << imports.extent (0)
1977  << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1978  << " * " << importLIDs.extent (0) << " = "
1979  << numVecs * importLIDs.extent (0) << ".");
1980 
1981  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1982  (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1983  "constantNumPackets input argument must be nonzero.");
1984 
1985  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1986  (static_cast<size_t> (numVecs) !=
1987  static_cast<size_t> (constantNumPackets),
1988  std::runtime_error, "constantNumPackets must equal numVecs.");
1989  }
1990 
1991  // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1992  // the memory space in which the imports buffer was last modified and
1993  // the size of the imports buffer.
1994  // DistObject::doTransferNew decides where it was last modified (based on
1995  // whether communication buffers used were on host or device).
1996  if (unpackOnHost) {
1997  if (this->imports_.need_sync_host()) this->imports_.sync_host();
1998  }
1999  else {
2000  if (this->imports_.need_sync_device()) this->imports_.sync_device();
2001  }
2002 
2003  if (printDebugOutput) {
2004  std::ostringstream os;
2005  os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
2006  << endl;
2007  std::cerr << os.str ();
2008  }
2009 
2010  // We have to sync before modifying, because this method may read
2011  // as well as write (depending on the CombineMode).
2012  auto imports_d = imports.view_device ();
2013  auto imports_h = imports.view_host ();
2014  auto importLIDs_d = importLIDs.view_device ();
2015  auto importLIDs_h = importLIDs.view_host ();
2016 
2017  Kokkos::DualView<size_t*, device_type> whichVecs;
2018  if (! isConstantStride ()) {
2019  Kokkos::View<const size_t*, Kokkos::HostSpace,
2020  Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
2021  numVecs);
2022  whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
2023  if (unpackOnHost) {
2024  whichVecs.modify_host ();
2025  // DEEP_COPY REVIEW - NOT TESTED FOR CUDA BUILD
2026  Kokkos::deep_copy (whichVecs.view_host (), whichVecsIn);
2027  }
2028  else {
2029  whichVecs.modify_device ();
2030  // DEEP_COPY REVIEW - HOST-TO-DEVICE
2031  Kokkos::deep_copy (whichVecs.view_device (), whichVecsIn);
2032  }
2033  }
2034  auto whichVecs_d = whichVecs.view_device ();
2035  auto whichVecs_h = whichVecs.view_host ();
2036 
2037  /* The layout in the import for MultiVectors is as follows:
2038  imports = { all of the data from row importLIDs.front() ;
2039  ....
2040  all of the data from row importLIDs.back() }
2041  This doesn't have the best locality, but is necessary because
2042  the data for a Packet (all data associated with an LID) is
2043  required to be contiguous. */
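 /* A hedged illustration of the unpack step (a sketch, not code from
 this file): for CombineMode INSERT, column j of the row with local
 index importLIDs(i) is taken from imports( i * numVecs + j ); the
 ADD and ABSMAX modes combine with, rather than overwrite, the
 existing entry. */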
2044 
2045  if (numVecs > 0 && importLIDs.extent (0) > 0) {
2046  using dev_exec_space = typename dual_view_type::t_dev::execution_space;
2047  using host_exec_space = typename dual_view_type::t_host::execution_space;
2048 
2049  // This fixes GitHub Issue #4418.
2050  const bool use_atomic_updates = unpackOnHost ?
2051  host_exec_space().concurrency () != 1 :
2052  dev_exec_space().concurrency () != 1;
2053 
2054  if (printDebugOutput) {
2055  std::ostringstream os;
2056  os << *prefix << "Unpack: " << combineModeToString (CM) << endl;
2057  std::cerr << os.str ();
2058  }
2059 
2060  // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
2061  // custom combine modes, start editing here.
2062 
2063  if (CM == INSERT || CM == REPLACE) {
2064  using op_type = KokkosRefactor::Details::InsertOp;
2065  if (isConstantStride ()) {
2066  if (unpackOnHost) {
2067  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2068  unpack_array_multi_column (host_exec_space (),
2069  X_h, imports_h, importLIDs_h,
2070  op_type (), numVecs,
2071  use_atomic_updates,
2072  debugCheckIndices);
2073 
2074  }
2075  else { // unpack on device
2076  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2077  unpack_array_multi_column (space,
2078  X_d, imports_d, importLIDs_d,
2079  op_type (), numVecs,
2080  use_atomic_updates,
2081  debugCheckIndices);
2082  }
2083  }
2084  else { // not constant stride
2085  if (unpackOnHost) {
2086  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2087  unpack_array_multi_column_variable_stride (host_exec_space (),
2088  X_h, imports_h,
2089  importLIDs_h,
2090  whichVecs_h,
2091  op_type (),
2092  numVecs,
2093  use_atomic_updates,
2094  debugCheckIndices);
2095  }
2096  else { // unpack on device
2097  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2098  unpack_array_multi_column_variable_stride (space,
2099  X_d, imports_d,
2100  importLIDs_d,
2101  whichVecs_d,
2102  op_type (),
2103  numVecs,
2104  use_atomic_updates,
2105  debugCheckIndices);
2106  }
2107  }
2108  }
2109  else if (CM == ADD || CM == ADD_ASSIGN) {
2110  using op_type = KokkosRefactor::Details::AddOp;
2111  if (isConstantStride ()) {
2112  if (unpackOnHost) {
2113  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2114  unpack_array_multi_column (host_exec_space (),
2115  X_h, imports_h, importLIDs_h,
2116  op_type (), numVecs,
2117  use_atomic_updates,
2118  debugCheckIndices);
2119  }
2120  else { // unpack on device
2121  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2122  unpack_array_multi_column (space,
2123  X_d, imports_d, importLIDs_d,
2124  op_type (), numVecs,
2125  use_atomic_updates,
2126  debugCheckIndices);
2127  }
2128  }
2129  else { // not constant stride
2130  if (unpackOnHost) {
2131  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2132  unpack_array_multi_column_variable_stride (host_exec_space (),
2133  X_h, imports_h,
2134  importLIDs_h,
2135  whichVecs_h,
2136  op_type (),
2137  numVecs,
2138  use_atomic_updates,
2139  debugCheckIndices);
2140  }
2141  else { // unpack on device
2142  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2143  unpack_array_multi_column_variable_stride (space,
2144  X_d, imports_d,
2145  importLIDs_d,
2146  whichVecs_d,
2147  op_type (),
2148  numVecs,
2149  use_atomic_updates,
2150  debugCheckIndices);
2151  }
2152  }
2153  }
2154  else if (CM == ABSMAX) {
2155  using op_type = KokkosRefactor::Details::AbsMaxOp;
2156  if (isConstantStride ()) {
2157  if (unpackOnHost) {
2158  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2159  unpack_array_multi_column (host_exec_space (),
2160  X_h, imports_h, importLIDs_h,
2161  op_type (), numVecs,
2162  use_atomic_updates,
2163  debugCheckIndices);
2164  }
2165  else { // unpack on device
2166  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2167  unpack_array_multi_column (space,
2168  X_d, imports_d, importLIDs_d,
2169  op_type (), numVecs,
2170  use_atomic_updates,
2171  debugCheckIndices);
2172  }
2173  }
2174  else {
2175  if (unpackOnHost) {
2176  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2177  unpack_array_multi_column_variable_stride (host_exec_space (),
2178  X_h, imports_h,
2179  importLIDs_h,
2180  whichVecs_h,
2181  op_type (),
2182  numVecs,
2183  use_atomic_updates,
2184  debugCheckIndices);
2185  }
2186  else { // unpack on device
2187  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2188  unpack_array_multi_column_variable_stride (space,
2189  X_d, imports_d,
2190  importLIDs_d,
2191  whichVecs_d,
2192  op_type (),
2193  numVecs,
2194  use_atomic_updates,
2195  debugCheckIndices);
2196  }
2197  }
2198  }
2199  else {
2200  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2201  (true, std::logic_error, "Invalid CombineMode");
2202  }
2203  }
2204  else {
2205  if (printDebugOutput) {
2206  std::ostringstream os;
2207  os << *prefix << "Nothing to unpack" << endl;
2208  std::cerr << os.str ();
2209  }
2210  }
2211 
2212  if (printDebugOutput) {
2213  std::ostringstream os;
2214  os << *prefix << "Done!" << endl;
2215  std::cerr << os.str ();
2216  }
2217  }
2218 
2219  // clang-format on
2220  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2221  void
2222  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2223  unpackAndCombine
2224  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
2225  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
2226  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
2227  const size_t constantNumPackets,
2228  const CombineMode CM) {
2229  unpackAndCombine(importLIDs, imports, numPacketsPerLID, constantNumPackets, CM, execution_space());
2230  }
2231  // clang-format off
2232 
2233  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2234  size_t
2235  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2236  getNumVectors () const
2237  {
2238  if (isConstantStride ()) {
2239  return static_cast<size_t> (view_.extent (1));
2240  } else {
2241  return static_cast<size_t> (whichVectors_.size ());
2242  }
2243  }
2244 
2245  namespace { // (anonymous)
2246 
2247  template<class RV>
2248  void
2249  gblDotImpl (const RV& dotsOut,
2250  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
2251  const bool distributed)
2252  {
2253  using Teuchos::REDUCE_MAX;
2254  using Teuchos::REDUCE_SUM;
2255  using Teuchos::reduceAll;
2256  typedef typename RV::non_const_value_type dot_type;
2257 
2258  const size_t numVecs = dotsOut.extent (0);
2259 
2260  // If the MultiVector is distributed over multiple processes, do
2261  // the distributed (interprocess) part of the dot product. We
2262  // assume that the MPI implementation can read from and write to
2263  // device memory.
2264  //
2265  // replaceMap() may have removed some processes. Those
2266  // processes have a null Map. They must not participate in any
2267  // collective operations. We ask first whether the Map is null,
2268  // because isDistributed() defers that question to the Map. We
2269  // still compute and return local dots for processes not
2270  // participating in collective operations; those probably don't
2271  // make any sense, but it doesn't hurt to do them, since it's
2272  // illegal to call dot() on those processes anyway.
2273  if (distributed && ! comm.is_null ()) {
2274  // The calling process only participates in the collective if
2275  // both the Map and its Comm on that process are nonnull.
2276  const int nv = static_cast<int> (numVecs);
2277  const bool commIsInterComm = ::Tpetra::Details::isInterComm (*comm);
2278 
2279  if (commIsInterComm) {
2280  // If comm is an intercomm, then we may not alias input and
2281  // output buffers, so we have to make a copy of the local
2282  // sum.
2283  typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
2284  // DEEP_COPY REVIEW - NOT TESTED
2285  Kokkos::deep_copy (lclDots, dotsOut);
2286  const dot_type* const lclSum = lclDots.data ();
2287  dot_type* const gblSum = dotsOut.data ();
2288  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
2289  }
2290  else {
2291  dot_type* const inout = dotsOut.data ();
2292  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, inout, inout);
2293  }
2294  }
2295  }
2296  } // namespace (anonymous)
2297 
2298  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2299  void
2300  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2301  dot (const MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;& A,
2302  const Kokkos::View&lt;dot_type*, Kokkos::HostSpace&gt;& dots) const
2303  {
2304  using ::Tpetra::Details::Behavior;
2305  using Kokkos::subview;
2306  using Teuchos::Comm;
2307  using Teuchos::null;
2308  using Teuchos::RCP;
2309  // View of all the dot product results.
2310  typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
2311  typedef typename dual_view_type::t_dev_const XMV;
2312  const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
2313 
2314  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Kokkos::View)");
2315 
2316  const size_t numVecs = this->getNumVectors ();
2317  if (numVecs == 0) {
2318  return; // nothing to do
2319  }
2320  const size_t lclNumRows = this->getLocalLength ();
2321  const size_t numDots = static_cast<size_t> (dots.extent (0));
2322  const bool debug = Behavior::debug ();
2323 
2324  if (debug) {
2325  const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
2326  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2327  (! compat, std::invalid_argument, "'*this' MultiVector is not "
2328  "compatible with the input MultiVector A. We only test for this "
2329  "in debug mode.");
2330  }
2331 
2332  // FIXME (mfh 11 Jul 2014) These exception tests may not
2333  // necessarily be thrown on all processes consistently. We should
2334  // instead pass along error state with the inner product. We
2335  // could do this by setting an extra slot to
2336  // Kokkos::ArithTraits<dot_type>::one() on error. The
2337  // final sum should be
2338  // Kokkos::ArithTraits<dot_type>::zero() if not error.
2339  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2340  lclNumRows != A.getLocalLength (), std::runtime_error,
2341  "MultiVectors do not have the same local length. "
2342  "this->getLocalLength() = " << lclNumRows << " != "
2343  "A.getLocalLength() = " << A.getLocalLength () << ".");
2344  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2345  numVecs != A.getNumVectors (), std::runtime_error,
2346  "MultiVectors must have the same number of columns (vectors). "
2347  "this->getNumVectors() = " << numVecs << " != "
2348  "A.getNumVectors() = " << A.getNumVectors () << ".");
2349  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2350  numDots != numVecs, std::runtime_error,
2351  "The output array 'dots' must have the same number of entries as the "
2352  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2353  numDots << " != this->getNumVectors() = " << numVecs << ".");
2354 
2355  const std::pair<size_t, size_t> colRng (0, numVecs);
2356  RV dotsOut = subview (dots, colRng);
2357  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
2358  this->getMap ()->getComm ();
2359 
2360  auto thisView = this->getLocalViewDevice(Access::ReadOnly);
2361  auto A_view = A.getLocalViewDevice(Access::ReadOnly);
2362 
2363  ::Tpetra::Details::lclDot<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
2364  this->whichVectors_.getRawPtr (),
2365  A.whichVectors_.getRawPtr (),
2366  this->isConstantStride (), A.isConstantStride ());
2367 
2368  // lbv 15 mar 2023: Kokkos Kernels provides non-blocking BLAS
2369  // functions unless they explicitly return a value to Host.
2370  // Here, while the lclDot results are on host, they are not a return
2371  // value, therefore they might not be available to us immediately.
2372  // Adding a fence here guarantees that we will have the lclDot
2373  // ahead of the MPI reduction.
2374  execution_space exec_space_instance = execution_space();
2375  exec_space_instance.fence();
2376 
2377  gblDotImpl (dotsOut, comm, this->isDistributed ());
2378  }
2379 
2380  namespace { // (anonymous)
2381  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2382  typename MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::dot_type
2383  multiVectorSingleColumnDot (const MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;& x,
2384  const MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;& y)
2385  {
2386  using ::Tpetra::Details::ProfilingRegion;
2387  using MV = MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;;
2388  using dot_type = typename MV::dot_type;
2389  ProfilingRegion region ("Tpetra::multiVectorSingleColumnDot");
2390 
2391  auto map = x.getMap ();
2392  Teuchos::RCP<const Teuchos::Comm<int> > comm =
2393  map.is_null () ? Teuchos::null : map->getComm ();
2394  if (comm.is_null ()) {
2395  return Kokkos::ArithTraits<dot_type>::zero ();
2396  }
2397  else {
2398  using LO = LocalOrdinal;
2399  // The min just ensures that we don't overwrite memory that
2400  // doesn't belong to us, in the erroneous input case where x
2401  // and y have different numbers of rows.
2402  const LO lclNumRows = static_cast<LO> (std::min (x.getLocalLength (),
2403  y.getLocalLength ()));
2404  const Kokkos::pair<LO, LO> rowRng (0, lclNumRows);
2405  dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero ();
2406  dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();
2407 
2408  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2409  //const_cast<MV&>(x).sync_device ();
2410  //const_cast<MV&>(y).sync_device ();
2411 
2412  auto x_2d = x.getLocalViewDevice(Access::ReadOnly);
2413  auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
2414  auto y_2d = y.getLocalViewDevice(Access::ReadOnly);
2415  auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
2416  lclDot = KokkosBlas::dot (x_1d, y_1d);
2417 
2418  if (x.isDistributed ()) {
2419  using Teuchos::outArg;
2420  using Teuchos::REDUCE_SUM;
2421  using Teuchos::reduceAll;
2422  reduceAll<int, dot_type> (*comm, REDUCE_SUM, lclDot, outArg (gblDot));
2423  }
2424  else {
2425  gblDot = lclDot;
2426  }
2427  return gblDot;
2428  }
2429  }
2430  } // namespace (anonymous)
2431 
2432 
2433 
2434  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2435  void
2436  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2437  dot (const MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;& A,
2438  const Teuchos::ArrayView&lt;dot_type&gt;& dots) const
2439  {
2440  const char tfecfFuncName[] = "dot: ";
2441  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Teuchos::ArrayView)");
2442 
2443  const size_t numVecs = this->getNumVectors ();
2444  const size_t lclNumRows = this->getLocalLength ();
2445  const size_t numDots = static_cast<size_t> (dots.size ());
2446 
2447  // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
2448  // not necessarily be thrown on all processes consistently. We
2449  // keep them for now, because MultiVector's unit tests insist on
2450  // them. In the future, we should instead pass along error state
2451  // with the inner product. We could do this by setting an extra
2452  // slot to Kokkos::ArithTraits<dot_type>::one() on error.
2453  // The final sum should be
2454  // Kokkos::ArithTraits<dot_type>::zero() if not error.
2455  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2456  (lclNumRows != A.getLocalLength (), std::runtime_error,
2457  "MultiVectors do not have the same local length. "
2458  "this->getLocalLength() = " << lclNumRows << " != "
2459  "A.getLocalLength() = " << A.getLocalLength () << ".");
2460  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2461  (numVecs != A.getNumVectors (), std::runtime_error,
2462  "MultiVectors must have the same number of columns (vectors). "
2463  "this->getNumVectors() = " << numVecs << " != "
2464  "A.getNumVectors() = " << A.getNumVectors () << ".");
2465  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2466  (numDots != numVecs, std::runtime_error,
2467  "The output array 'dots' must have the same number of entries as the "
2468  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2469  numDots << " != this->getNumVectors() = " << numVecs << ".");
2470 
2471  if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
2472  const dot_type gblDot = multiVectorSingleColumnDot (*this, A);
2473  dots[0] = gblDot;
2474  }
2475  else {
2476  this->dot (A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr (), numDots));
2477  }
2478  }
2479 
2480  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2481  void
2482  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2483  norm2 (const Teuchos::ArrayView&lt;mag_type&gt;& norms) const
2484  {
2485  using MV = MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;;
2486  using ::Tpetra::Details::NORM_TWO;
2487  using ::Tpetra::Details::ProfilingRegion;
2488  ProfilingRegion region ("Tpetra::MV::norm2 (host output)");
2489 
2490  // The function needs to be able to sync X.
2491  MV& X = const_cast<MV&> (*this);
2492  multiVectorNormImpl (norms.getRawPtr (), X, NORM_TWO);
2493  }
2494 
2495  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2496  void
2497  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2498  norm2 (const Kokkos::View&lt;mag_type*, Kokkos::HostSpace&gt;& norms) const
2499  {
2500  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2501  this->norm2 (norms_av);
2502  }
2503 
2504 
2505  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2506  void
2507  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2508  norm1 (const Teuchos::ArrayView&lt;mag_type&gt;& norms) const
2509  {
2510  using MV = MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;;
2511  using ::Tpetra::Details::NORM_ONE;
2512  using ::Tpetra::Details::ProfilingRegion;
2513  ProfilingRegion region ("Tpetra::MV::norm1 (host output)");
2514 
2515  // The function needs to be able to sync X.
2516  MV& X = const_cast<MV&> (*this);
2517  multiVectorNormImpl (norms.data (), X, NORM_ONE);
2518  }
2519 
2520  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2521  void
2522  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2523  norm1 (const Kokkos::View&lt;mag_type*, Kokkos::HostSpace&gt;& norms) const
2524  {
2525  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2526  this->norm1 (norms_av);
2527  }
2528 
2529  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2530  void
2531  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2532  normInf (const Teuchos::ArrayView&lt;mag_type&gt;& norms) const
2533  {
2534  using MV = MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;;
2535  using ::Tpetra::Details::NORM_INF;
2536  using ::Tpetra::Details::ProfilingRegion;
2537  ProfilingRegion region ("Tpetra::MV::normInf (host output)");
2538 
2539  // The function needs to be able to sync X.
2540  MV& X = const_cast<MV&> (*this);
2541  multiVectorNormImpl (norms.getRawPtr (), X, NORM_INF);
2542  }
2543 
2544  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2545  void
2546  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2547  normInf (const Kokkos::View&lt;mag_type*, Kokkos::HostSpace&gt;& norms) const
2548  {
2549  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2550  this->normInf (norms_av);
2551  }
2552 
2553  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2554  void
2555  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2556  meanValue (const Teuchos::ArrayView&lt;impl_scalar_type&gt;& means) const
2557  {
2558  // KR FIXME Overload this method to take a View.
2559  using Kokkos::ALL;
2560  using Kokkos::subview;
2561  using Teuchos::Comm;
2562  using Teuchos::RCP;
2563  using Teuchos::reduceAll;
2564  using Teuchos::REDUCE_SUM;
2565  typedef Kokkos::ArithTraits<impl_scalar_type> ATS;
2566 
2567  const size_t lclNumRows = this->getLocalLength ();
2568  const size_t numVecs = this->getNumVectors ();
2569  const size_t numMeans = static_cast<size_t> (means.size ());
2570 
2571  TEUCHOS_TEST_FOR_EXCEPTION(
2572  numMeans != numVecs, std::runtime_error,
2573  "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2574  << " != this->getNumVectors() = " << numVecs << ".");
2575 
2576  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2577  const std::pair<size_t, size_t> colRng (0, numVecs);
2578 
2579  // Make sure that the final output view has the same layout as the
2580  // temporary view's HostMirror. Left or Right doesn't matter for
2581  // a 1-D array anyway; this is just to placate the compiler.
2582  typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2583  typedef Kokkos::View<impl_scalar_type*,
2584  typename local_view_type::HostMirror::array_layout,
2585  Kokkos::HostSpace,
2586  Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
2587  host_local_view_type meansOut (means.getRawPtr (), numMeans);
2588 
2589  RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
2590  this->getMap ()->getComm ();
2591 
2592  // If we need sync to device, then host has the most recent version.
2593  const bool useHostVersion = this->need_sync_device ();
2594  if (useHostVersion) {
2595  // DualView was last modified on host, so run the local kernel there.
2596  auto X_lcl = subview (getLocalViewHost(Access::ReadOnly),
2597  rowRng, Kokkos::ALL ());
2598  // Compute the local sum of each column.
2599  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2600  if (isConstantStride ()) {
2601  KokkosBlas::sum (lclSums, X_lcl);
2602  }
2603  else {
2604  for (size_t j = 0; j < numVecs; ++j) {
2605  const size_t col = whichVectors_[j];
2606  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2607  }
2608  }
2609 
2610  // If there are multiple MPI processes, the all-reduce reads
2611  // from lclSums, and writes to meansOut. Otherwise, we just
2612  // copy lclSums into meansOut.
2613  if (! comm.is_null () && this->isDistributed ()) {
2614  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2615  lclSums.data (), meansOut.data ());
2616  }
2617  else {
2618  // DEEP_COPY REVIEW - NOT TESTED
2619  Kokkos::deep_copy (meansOut, lclSums);
2620  }
2621  }
2622  else {
2623  // DualView was last modified on device, so run the local kernel there.
2624  auto X_lcl = subview (this->getLocalViewDevice(Access::ReadOnly),
2625  rowRng, Kokkos::ALL ());
2626 
2627  // Compute the local sum of each column.
2628  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2629  if (isConstantStride ()) {
2630  KokkosBlas::sum (lclSums, X_lcl);
2631  }
2632  else {
2633  for (size_t j = 0; j < numVecs; ++j) {
2634  const size_t col = whichVectors_[j];
2635  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2636  }
2637  }
2638  // lbv 10 mar 2023: Kokkos Kernels provides non-blocking BLAS
2639  // functions unless they explicitly return a value to Host.
2640  // Here, while the lclSums are on the host, they are not a return
2641  // value, therefore they might not be available to us immediately.
2642  // Adding a fence here guarantees that we will have the lclSums
2643  // ahead of the MPI reduction.
2644  execution_space exec_space_instance = execution_space();
2645  exec_space_instance.fence();
2646 
2647  // If there are multiple MPI processes, the all-reduce reads
2648  // from lclSums, and writes to meansOut. (We assume that MPI
2649  // can read device memory.) Otherwise, we just copy lclSums
2650  // into meansOut.
2651  if (! comm.is_null () && this->isDistributed ()) {
2652  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2653  lclSums.data (), meansOut.data ());
2654  }
2655  else {
2656  // DEEP_COPY REVIEW - HOST-TO-HOST - NOT TESTED FOR MPI BUILD
2657  Kokkos::deep_copy (meansOut, lclSums);
2658  }
2659  }
2660 
2661  // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2662  // to the magnitude type, since operator/ (std::complex<T>, int)
2663  // isn't necessarily defined.
2664  const impl_scalar_type OneOverN =
2665  ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
2666  for (size_t k = 0; k < numMeans; ++k) {
2667  meansOut(k) = meansOut(k) * OneOverN;
2668  }
2669  }
2670 
2671 
2672  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2673  void
2674  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2675  randomize ()
2676  {
2677  typedef impl_scalar_type IST;
2678  typedef Kokkos::ArithTraits<IST> ATS;
2679  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2680  typedef typename pool_type::generator_type generator_type;
2681 
2682  const IST max = Kokkos::rand<generator_type, IST>::max ();
2683  const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();
2684 
2685  this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
2686  }
2687 
2688 
2689  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2690  void
2691  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2692  randomize (const Scalar& minVal, const Scalar& maxVal)
2693  {
2694  typedef impl_scalar_type IST;
2695  typedef Tpetra::Details::Static_Random_XorShift64_Pool<typename device_type::execution_space> tpetra_pool_type;
2696  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2697 
2698  // Seed the pool based on the system RNG and the MPI rank, if needed
2699  if(!tpetra_pool_type::isSet())
2700  tpetra_pool_type::resetPool(this->getMap()->getComm()->getRank());
2701 
2702  pool_type & rand_pool = tpetra_pool_type::getPool();
2703  const IST max = static_cast<IST> (maxVal);
2704  const IST min = static_cast<IST> (minVal);
2705 
2706  auto thisView = this->getLocalViewDevice(Access::OverwriteAll);
2707 
2708  if (isConstantStride ()) {
2709  Kokkos::fill_random (thisView, rand_pool, min, max);
2710  }
2711  else {
2712  const size_t numVecs = getNumVectors ();
2713  for (size_t k = 0; k < numVecs; ++k) {
2714  const size_t col = whichVectors_[k];
2715  auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
2716  Kokkos::fill_random (X_k, rand_pool, min, max);
2717  }
2718  }
2719  }
2720 
2721  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2722  void
2723  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2724  putScalar (const Scalar& alpha)
2725  {
2726  using ::Tpetra::Details::ProfilingRegion;
2727  using ::Tpetra::Details::Blas::fill;
2728  using DES = typename dual_view_type::t_dev::execution_space;
2729  using HES = typename dual_view_type::t_host::execution_space;
2730  using LO = LocalOrdinal;
2731  ProfilingRegion region ("Tpetra::MultiVector::putScalar");
2732 
2733  // We need this cast for cases like Scalar = std::complex<T> but
2734  // impl_scalar_type = Kokkos::complex<T>.
2735  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2736  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
2737  const LO numVecs = static_cast<LO> (this->getNumVectors ());
2738 
2739  // Modify the most recently updated version of the data. This
2740  // avoids sync'ing, which could violate users' expectations.
2741  //
2742  // If we need sync to device, then host has the most recent version.
2743  const bool runOnHost = runKernelOnHost(*this);
2744  if (! runOnHost) {
2745  auto X = this->getLocalViewDevice(Access::OverwriteAll);
2746  if (this->isConstantStride ()) {
2747  fill (DES (), X, theAlpha, lclNumRows, numVecs);
2748  }
2749  else {
2750  fill (DES (), X, theAlpha, lclNumRows, numVecs,
2751  this->whichVectors_.getRawPtr ());
2752  }
2753  }
2754  else { // last modified in host memory, so modify data there.
2755  auto X = this->getLocalViewHost(Access::OverwriteAll);
2756  if (this->isConstantStride ()) {
2757  fill (HES (), X, theAlpha, lclNumRows, numVecs);
2758  }
2759  else {
2760  fill (HES (), X, theAlpha, lclNumRows, numVecs,
2761  this->whichVectors_.getRawPtr ());
2762  }
2763  }
2764  }
2765 
2766 
2767  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2768  void
2769  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2770  replaceMap (const Teuchos::RCP&lt;const map_type&gt;& newMap)
2771  {
2772  using Teuchos::ArrayRCP;
2773  using Teuchos::Comm;
2774  using Teuchos::RCP;
2775  using ST = Scalar;
2776  using LO = LocalOrdinal;
2777  using GO = GlobalOrdinal;
2778 
2779  // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2780  // it might work if the MV is a column view of another MV.
2781  // However, things might go wrong when restoring the original
2782  // Map, so we don't allow this case for now.
2783  TEUCHOS_TEST_FOR_EXCEPTION(
2784  ! this->isConstantStride (), std::logic_error,
2785  "Tpetra::MultiVector::replaceMap: This method does not currently work "
2786  "if the MultiVector is a column view of another MultiVector (that is, if "
2787  "isConstantStride() == false).");
2788 
2789  // Case 1: current Map and new Map are both nonnull on this process.
2790  // Case 2: current Map is nonnull, new Map is null.
2791  // Case 3: current Map is null, new Map is nonnull.
2792  // Case 4: both Maps are null: forbidden.
2793  //
2794  // Case 1 means that we don't have to do anything on this process,
2795  // other than assign the new Map. (We always have to do that.)
2796  // It's an error for the user to supply a Map that requires
2797  // resizing in this case.
2798  //
2799  // Case 2 means that the calling process is in the current Map's
2800  // communicator, but will be excluded from the new Map's
2801  // communicator. We don't have to do anything on the calling
2802  // process; just leave whatever data it may have alone.
2803  //
2804  // Case 3 means that the calling process is excluded from the
2805  // current Map's communicator, but will be included in the new
2806  // Map's communicator. This means we need to (re)allocate the
2807  // local DualView if it does not have the right number of rows.
2808  // If the new number of rows is nonzero, we'll fill the newly
2809  // allocated local data with zeros, as befits a projection
2810  // operation.
2811  //
2812  // The typical use case for Case 3 is that the MultiVector was
2813  // first created with the Map with more processes, then that Map
2814  // was replaced with a Map with fewer processes, and finally the
2815  // original Map was restored on this call to replaceMap.
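 // A hedged usage sketch of that typical sequence (origMap, subsetMap,
 // and X are illustrative names, not part of this file):
 //
 //   MV X (origMap, numVecs);   // every process holds rows of X
 //   X.replaceMap (subsetMap);  // Case 2 on processes excluded from subsetMap
 //   // ... work using only the processes that remain in subsetMap ...
 //   X.replaceMap (origMap);    // Case 3 on the previously excluded processes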
2816 
2817  if (this->getMap ().is_null ()) { // current Map is null
2818  // If this->getMap() is null, that means that this MultiVector
2819  // has already had replaceMap happen to it. In that case, just
2820  // reallocate the DualView with the right size.
2821 
2822  TEUCHOS_TEST_FOR_EXCEPTION(
2823  newMap.is_null (), std::invalid_argument,
2824  "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2825  "This probably means that the input Map is incorrect.");
2826 
2827  // Case 3: current Map is null, new Map is nonnull.
2828  // Reallocate the DualView with the right dimensions.
2829  const size_t newNumRows = newMap->getLocalNumElements ();
2830  const size_t origNumRows = view_.extent (0);
2831  const size_t numCols = this->getNumVectors ();
2832 
2833  if (origNumRows != newNumRows || view_.extent (1) != numCols) {
2834  view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2835  }
2836  }
2837  else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
2838  // I am an excluded process. Reinitialize my data so that I
2839  // have 0 rows. Keep the number of columns as before.
2840  const size_t newNumRows = static_cast<size_t> (0);
2841  const size_t numCols = this->getNumVectors ();
2842  view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2843  }
2844 
2845  this->map_ = newMap;
2846  }
2847 
2848  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2849  void
2850  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2851  scale (const Scalar& alpha)
2852  {
2853  using Kokkos::ALL;
2854  using IST = impl_scalar_type;
2855 
2856  const IST theAlpha = static_cast<IST> (alpha);
2857  if (theAlpha == Kokkos::ArithTraits<IST>::one ()) {
2858  return; // do nothing
2859  }
2860  const size_t lclNumRows = getLocalLength ();
2861  const size_t numVecs = getNumVectors ();
2862  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2863  const std::pair<size_t, size_t> colRng (0, numVecs);
2864 
2865  // We can't substitute putScalar(0.0) for scale(0.0), because the
2866  // former will overwrite NaNs present in the MultiVector. The
2867  // semantics of this call require multiplying them by 0, which
2868  // IEEE 754 requires to be NaN.
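 // For example (illustration only): if X(i,j) holds NaN, then
 // X.scale (0.0) must leave NaN there (0 * NaN is NaN under IEEE 754),
 // whereas X.putScalar (0.0) would overwrite it with 0.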
2869 
2870  // If we need sync to device, then host has the most recent version.
2871  const bool useHostVersion = need_sync_device ();
2872  if (useHostVersion) {
2873  auto Y_lcl = Kokkos::subview (getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2874  if (isConstantStride ()) {
2875  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2876  }
2877  else {
2878  for (size_t k = 0; k < numVecs; ++k) {
2879  const size_t Y_col = whichVectors_[k];
2880  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2881  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2882  }
2883  }
2884  }
2885  else { // work on device
2886  auto Y_lcl = Kokkos::subview (getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2887  if (isConstantStride ()) {
2888  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2889  }
2890  else {
2891  for (size_t k = 0; k < numVecs; ++k) {
2892  const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2893  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2894  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2895  }
2896  }
2897  }
2898  }
2899 
2900 
2901  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2902  void
2903  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2904  scale (const Teuchos::ArrayView&lt;const Scalar&gt;& alphas)
2905  {
2906  const size_t numVecs = this->getNumVectors ();
2907  const size_t numAlphas = static_cast<size_t> (alphas.size ());
2908  TEUCHOS_TEST_FOR_EXCEPTION(
2909  numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2910  "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2911  << numVecs << ".");
2912 
2913  // Use a DualView to copy the scaling constants onto the device.
2914  using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2915  k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2916  k_alphas.modify_host ();
2917  for (size_t i = 0; i < numAlphas; ++i) {
2918  k_alphas.h_view(i) = static_cast<impl_scalar_type> (alphas[i]);
2919  }
2920  k_alphas.sync_device ();
2921  // Invoke the scale() overload that takes a device View of coefficients.
2922  this->scale (k_alphas.view_device ());
2923  }
2924 
2925  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2926  void
2927  MultiVector&lt;Scalar, LocalOrdinal, GlobalOrdinal, Node&gt;::
2928  scale (const Kokkos::View&lt;const impl_scalar_type*, device_type&gt;& alphas)
2929  {
2930  using Kokkos::ALL;
2931  using Kokkos::subview;
2932 
2933  const size_t lclNumRows = this->getLocalLength ();
2934  const size_t numVecs = this->getNumVectors ();
2935  TEUCHOS_TEST_FOR_EXCEPTION(
2936  static_cast<size_t> (alphas.extent (0)) != numVecs,
2937  std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2938  "alphas.extent(0) = " << alphas.extent (0)
2939  << " != this->getNumVectors () = " << numVecs << ".");
2940  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2941  const std::pair<size_t, size_t> colRng (0, numVecs);
2942 
2943  // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2944  // type of the return value of subview. This is because if we
2945  // switch the array layout from LayoutLeft to LayoutRight
2946  // (preferred for performance of block operations), the types
2947  // below won't be valid. (A view of a column of a LayoutRight
2948  // multivector has LayoutStride, not LayoutLeft.)
2949 
2950  // If we need sync to device, then host has the most recent version.
2951  const bool useHostVersion = this->need_sync_device ();
2952  if (useHostVersion) {
2953  // Work in host memory. This means we need to create a host
2954  // mirror of the input View of coefficients.
2955  auto alphas_h = Kokkos::create_mirror_view (alphas);
2956  // DEEP_COPY REVIEW - NOT TESTED
2957  Kokkos::deep_copy (alphas_h, alphas);
2958 
2959  auto Y_lcl = subview (this->getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2960  if (isConstantStride ()) {
2961  KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2962  }
2963  else {
2964  for (size_t k = 0; k < numVecs; ++k) {
2965  const size_t Y_col = this->isConstantStride () ? k :
2966  this->whichVectors_[k];
2967  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2968  // We don't have to use the entire 1-D View here; we can use
2969  // the version that takes a scalar coefficient.
2970  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2971  }
2972  }
2973  }
2974  else { // Work in device memory, using the input View 'alphas' directly.
2975  auto Y_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2976  if (isConstantStride ()) {
2977  KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2978  }
2979  else {
2980  // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2981  // as values on host, so copy them to host. Another approach
2982  // would be to fix scal() so that it takes a 0-D View as the
2983  // second argument.
2984  auto alphas_h = Kokkos::create_mirror_view (alphas);
2985  // DEEP_COPY REVIEW - NOT TESTED
2986  Kokkos::deep_copy (alphas_h, alphas);
2987 
2988  for (size_t k = 0; k < numVecs; ++k) {
2989  const size_t Y_col = this->isConstantStride () ? k :
2990  this->whichVectors_[k];
2991  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2992  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2993  }
2994  }
2995  }
2996  }
2997 
2998  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2999  void
3000  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3001  scale (const Scalar& alpha,
3002  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A)
3003  {
3004  using Kokkos::ALL;
3005  using Kokkos::subview;
3006  const char tfecfFuncName[] = "scale: ";
3007 
3008  const size_t lclNumRows = getLocalLength ();
3009  const size_t numVecs = getNumVectors ();
3010 
3011  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3012  lclNumRows != A.getLocalLength (), std::invalid_argument,
3013  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
3014  << A.getLocalLength () << ".");
3015  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3016  numVecs != A.getNumVectors (), std::invalid_argument,
3017  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
3018  << A.getNumVectors () << ".");
3019 
3020  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3021  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3022  const std::pair<size_t, size_t> colRng (0, numVecs);
3023 
3024  auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
3025  auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
3026  auto Y_lcl = subview (Y_lcl_orig, rowRng, ALL ());
3027  auto X_lcl = subview (X_lcl_orig, rowRng, ALL ());
3028 
3029  if (isConstantStride () && A.isConstantStride ()) {
3030  KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
3031  }
3032  else {
3033  // Make sure that Kokkos only uses the local length for add.
3034  for (size_t k = 0; k < numVecs; ++k) {
3035  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
3036  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
3037  auto Y_k = subview (Y_lcl, ALL (), Y_col);
3038  auto X_k = subview (X_lcl, ALL (), X_col);
3039 
3040  KokkosBlas::scal (Y_k, theAlpha, X_k);
3041  }
3042  }
3043  }
3044 
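// Usage sketch (hypothetical caller code, not part of this file): this
// overload assigns a scaled copy of A, rather than scaling *this in place.
//
//   // Assuming X and Y are MultiVectors with the same Map and column count:
//   Y.scale (3.0, X);   // Y(i,j) := 3.0 * X(i,j); Y's old values are ignored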
3045 
3046 
3047  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3048  void
3049  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3050  reciprocal (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A)
3051  {
3052  const char tfecfFuncName[] = "reciprocal: ";
3053 
3054  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3055  getLocalLength () != A.getLocalLength (), std::runtime_error,
3056  "MultiVectors do not have the same local length. "
3057  "this->getLocalLength() = " << getLocalLength ()
3058  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
3059  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3060  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
3061  ": MultiVectors do not have the same number of columns (vectors). "
3062  "this->getNumVectors() = " << getNumVectors ()
3063  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
3064 
3065  const size_t numVecs = getNumVectors ();
3066 
3067  auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
3068  auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
3069 
3070  if (isConstantStride () && A.isConstantStride ()) {
3071  KokkosBlas::reciprocal (this_view_dev, A_view_dev);
3072  }
3073  else {
3074  using Kokkos::ALL;
3075  using Kokkos::subview;
3076  for (size_t k = 0; k < numVecs; ++k) {
3077  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3078  auto vector_k = subview (this_view_dev, ALL (), this_col);
3079  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
3080  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
3081  KokkosBlas::reciprocal (vector_k, vector_Ak);
3082  }
3083  }
3084  }
3085 
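// Usage sketch (hypothetical caller code, not part of this file):
//
//   // Assuming X and Y are MultiVectors with the same Map and column count:
//   Y.reciprocal (X);   // Y(i,j) := 1 / X(i,j), entrywise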
3086  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3087  void
3088  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3089  abs (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A)
3090  {
3091  const char tfecfFuncName[] = "abs";
3092 
3093  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3094  getLocalLength () != A.getLocalLength (), std::runtime_error,
3095  ": MultiVectors do not have the same local length. "
3096  "this->getLocalLength() = " << getLocalLength ()
3097  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
3098  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3099  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
3100  ": MultiVectors do not have the same number of columns (vectors). "
3101  "this->getNumVectors() = " << getNumVectors ()
3102  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
3103  const size_t numVecs = getNumVectors ();
3104 
3105  auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
3106  auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
3107 
3108  if (isConstantStride () && A.isConstantStride ()) {
3109  KokkosBlas::abs (this_view_dev, A_view_dev);
3110  }
3111  else {
3112  using Kokkos::ALL;
3113  using Kokkos::subview;
3114 
3115  for (size_t k=0; k < numVecs; ++k) {
3116  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3117  auto vector_k = subview (this_view_dev, ALL (), this_col);
3118  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
3119  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
3120  KokkosBlas::abs (vector_k, vector_Ak);
3121  }
3122  }
3123  }
3124 
3125  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3126  void
3127  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3128  update (const Scalar& alpha,
3129  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3130  const Scalar& beta)
3131  {
3132  const char tfecfFuncName[] = "update: ";
3133  using Kokkos::subview;
3134  using Kokkos::ALL;
3135 
3136  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");
3137 
3138  const size_t lclNumRows = getLocalLength ();
3139  const size_t numVecs = getNumVectors ();
3140 
3141  if (::Tpetra::Details::Behavior::debug ()) {
3142  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3143  lclNumRows != A.getLocalLength (), std::invalid_argument,
3144  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
3145  << A.getLocalLength () << ".");
3146  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3147  numVecs != A.getNumVectors (), std::invalid_argument,
3148  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
3149  << A.getNumVectors () << ".");
3150  }
3151 
3152  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3153  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3154  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3155  const std::pair<size_t, size_t> colRng (0, numVecs);
3156 
3157  auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
3158  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
3159  auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
3160  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
3161 
3162  // The device memory of *this is about to be modified
3163  if (isConstantStride () && A.isConstantStride ()) {
3164  KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
3165  }
3166  else {
3167  // Make sure that Kokkos only uses the local length for add.
3168  for (size_t k = 0; k < numVecs; ++k) {
3169  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
3170  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
3171  auto Y_k = subview (Y_lcl, ALL (), Y_col);
3172  auto X_k = subview (X_lcl, ALL (), X_col);
3173 
3174  KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
3175  }
3176  }
3177  }
3178 
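// Usage sketch (hypothetical caller code, not part of this file): the
// two-term update is the usual AXPBY, this := beta*this + alpha*A.
//
//   // Assuming X and Y are MultiVectors with the same Map and column count:
//   Y.update (2.0, X, 1.0);   // Y := 1.0*Y + 2.0*X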
3179  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3180  void
3181  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3182  update (const Scalar& alpha,
3183  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3184  const Scalar& beta,
3185  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
3186  const Scalar& gamma)
3187  {
3188  using Kokkos::ALL;
3189  using Kokkos::subview;
3190 
3191  const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
3192 
3193  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)");
3194 
3195  const size_t lclNumRows = this->getLocalLength ();
3196  const size_t numVecs = getNumVectors ();
3197 
3198  if (::Tpetra::Details::Behavior::debug ()) {
3199  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3200  lclNumRows != A.getLocalLength (), std::invalid_argument,
3201  "The input MultiVector A has " << A.getLocalLength () << " local "
3202  "row(s), but this MultiVector has " << lclNumRows << " local "
3203  "row(s).");
3204  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3205  lclNumRows != B.getLocalLength (), std::invalid_argument,
3206  "The input MultiVector B has " << B.getLocalLength () << " local "
3207  "row(s), but this MultiVector has " << lclNumRows << " local "
3208  "row(s).");
3209  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3210  A.getNumVectors () != numVecs, std::invalid_argument,
3211  "The input MultiVector A has " << A.getNumVectors () << " column(s), "
3212  "but this MultiVector has " << numVecs << " column(s).");
3213  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3214  B.getNumVectors () != numVecs, std::invalid_argument,
3215  "The input MultiVector B has " << B.getNumVectors () << " column(s), "
3216  "but this MultiVector has " << numVecs << " column(s).");
3217  }
3218 
3219  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3220  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3221  const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
3222 
3223  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3224  const std::pair<size_t, size_t> colRng (0, numVecs);
3225 
3226  // Prefer 'auto' over specifying the type explicitly. This avoids
3227  // issues with a subview possibly having a different type than the
3228  // original view.
3229  auto C_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
3230  auto A_lcl = subview (A.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3231  auto B_lcl = subview (B.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3232 
3233  if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
3234  KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
3235  }
3236  else {
3237  // Some input (or *this) is not constant stride,
3238  // so perform the update one column at a time.
3239  for (size_t k = 0; k < numVecs; ++k) {
3240  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3241  const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
3242  const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
3243  KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
3244  theBeta, subview (B_lcl, rowRng, B_col),
3245  theGamma, subview (C_lcl, rowRng, this_col));
3246  }
3247  }
3248  }
3249 
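// Usage sketch (hypothetical caller code, not part of this file): the
// three-term update computes this := gamma*this + alpha*A + beta*B.
//
//   // Assuming A, B, and C are MultiVectors with the same Map and column count:
//   C.update (1.0, A, -1.0, B, 0.5);   // C := 0.5*C + A - B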
3250  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3251  Teuchos::ArrayRCP<const Scalar>
3252  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3253  getData (size_t j) const
3254  {
3255  using Kokkos::ALL;
3256  using IST = impl_scalar_type;
3257  const char tfecfFuncName[] = "getData: ";
3258 
3259  // Any MultiVector method that called the (classic) Kokkos Node's
3260  // viewBuffer or viewBufferNonConst methods always implied a
3261  // device->host synchronization. Thus, we synchronize here as
3262  // well.
3263 
3264  auto hostView = getLocalViewHost(Access::ReadOnly);
3265  const size_t col = isConstantStride () ? j : whichVectors_[j];
3266  auto hostView_j = Kokkos::subview (hostView, ALL (), col);
3267  Teuchos::ArrayRCP<const IST> dataAsArcp =
3268  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3269 
3270  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3271  (static_cast<size_t> (hostView_j.extent (0)) <
3272  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3273  "hostView_j.extent(0) = " << hostView_j.extent (0)
3274  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3275  "Please report this bug to the Tpetra developers.");
3276 
3277  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3278  }
3279 
3280  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3281  Teuchos::ArrayRCP<Scalar>
3282  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3283  getDataNonConst (size_t j)
3284  {
3285  using Kokkos::ALL;
3286  using Kokkos::subview;
3287  using IST = impl_scalar_type;
3288  const char tfecfFuncName[] = "getDataNonConst: ";
3289 
3290  auto hostView = getLocalViewHost(Access::ReadWrite);
3291  const size_t col = isConstantStride () ? j : whichVectors_[j];
3292  auto hostView_j = subview (hostView, ALL (), col);
3293  Teuchos::ArrayRCP<IST> dataAsArcp =
3294  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3295 
3296  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3297  (static_cast<size_t> (hostView_j.extent (0)) <
3298  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3299  "hostView_j.extent(0) = " << hostView_j.extent (0)
3300  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3301  "Please report this bug to the Tpetra developers.");
3302 
3303  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3304  }
3305 
3306  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3307  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3309  subCopy (const Teuchos::ArrayView<const size_t>& cols) const
3310  {
3311  using Teuchos::RCP;
3313 
3314  // Check whether the index set in cols is contiguous. If it is,
3315  // use the more efficient Range1D version of subCopy.
3316  bool contiguous = true;
3317  const size_t numCopyVecs = static_cast<size_t> (cols.size ());
3318  for (size_t j = 1; j < numCopyVecs; ++j) {
3319  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3320  contiguous = false;
3321  break;
3322  }
3323  }
3324  if (contiguous && numCopyVecs > 0) {
3325  return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
3326  }
3327  else {
3328  RCP<const MV> X_sub = this->subView (cols);
3329  RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
3330  Y->assign (*X_sub);
3331  return Y;
3332  }
3333  }
3334 
3335  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3336  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3338  subCopy (const Teuchos::Range1D &colRng) const
3339  {
3340  using Teuchos::RCP;
3342 
3343  RCP<const MV> X_sub = this->subView (colRng);
3344  RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3345  Y->assign (*X_sub);
3346  return Y;
3347  }
3348 
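// Usage sketch (hypothetical caller code, not part of this file): subCopy
// returns a deep copy of the selected columns, while subView (below) returns
// a view that aliases the original data.
//
//   // Assuming X is a Tpetra::MultiVector<> with at least 4 columns:
//   auto X_copy = X.subCopy (Teuchos::Range1D (1, 3)); // deep copy of columns 1..3
//   X_copy->putScalar (0.0);                           // does NOT modify X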
3349  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3350  size_t
3351  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3352  getOrigNumLocalRows () const {
3353  return view_.origExtent(0);
3354  }
3355 
3356  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3357  size_t
3358  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3359  getOrigNumLocalCols () const {
3360  return view_.origExtent(1);
3361  }
3362 
3363  template <class Scalar, class LO, class GO, class Node>
3364  MultiVector<Scalar, LO, GO, Node>::
3365  MultiVector (const MultiVector<Scalar, LO, GO, Node>& X,
3366  const Teuchos::RCP<const map_type>& subMap,
3367  const local_ordinal_type rowOffset) :
3368  base_type (subMap)
3369  {
3370  using Kokkos::ALL;
3371  using Kokkos::subview;
3372  using Teuchos::outArg;
3373  using Teuchos::RCP;
3374  using Teuchos::rcp;
3375  using Teuchos::reduceAll;
3376  using Teuchos::REDUCE_MIN;
3377  using std::endl;
3379  const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3380  const char suffix[] = "Please report this bug to the Tpetra developers.";
3381  int lclGood = 1;
3382  int gblGood = 1;
3383  std::unique_ptr<std::ostringstream> errStrm;
3384  const bool debug = ::Tpetra::Details::Behavior::debug ();
3385  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
3386 
3387  // Be careful to use the input Map's communicator, not X's. The
3388  // idea is that, on return, *this is a subview of X, using the
3389  // input Map.
3390  const auto comm = subMap->getComm ();
3391  TEUCHOS_ASSERT( ! comm.is_null () );
3392  const int myRank = comm->getRank ();
3393 
3394  const LO lclNumRowsBefore = static_cast<LO> (X.getLocalLength ());
3395  const LO numCols = static_cast<LO> (X.getNumVectors ());
3396  const LO newNumRows = static_cast<LO> (subMap->getLocalNumElements ());
3397  if (verbose) {
3398  std::ostringstream os;
3399  os << "Proc " << myRank << ": " << prefix
3400  << "X: {lclNumRows: " << lclNumRowsBefore
3401  << ", origLclNumRows: " << X.getOrigNumLocalRows ()
3402  << ", numCols: " << numCols << "}, "
3403  << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3404  std::cerr << os.str ();
3405  }
3406  // We ask for the _original_ number of rows in X, because X could
3407  // be a shorter (fewer rows) view of a longer MV. (For example, X
3408  // could be a domain Map view of a column Map MV.)
3409  const bool tooManyElts =
3410  newNumRows + rowOffset > static_cast<LO> (X.getOrigNumLocalRows ());
3411  if (tooManyElts) {
3412  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3413  *errStrm << " Proc " << myRank << ": subMap->getLocalNumElements() (="
3414  << newNumRows << ") + rowOffset (=" << rowOffset
3415  << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows ()
3416  << ")." << endl;
3417  lclGood = 0;
3418  TEUCHOS_TEST_FOR_EXCEPTION
3419  (! debug && tooManyElts, std::invalid_argument,
3420  prefix << errStrm->str () << suffix);
3421  }
3422 
3423  if (debug) {
3424  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3425  if (gblGood != 1) {
3426  std::ostringstream gblErrStrm;
3427  const std::string myErrStr =
3428  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3429  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3430  TEUCHOS_TEST_FOR_EXCEPTION
3431  (true, std::invalid_argument, gblErrStrm.str ());
3432  }
3433  }
3434 
3435  using range_type = std::pair<LO, LO>;
3436  const range_type origRowRng
3437  (rowOffset, static_cast<LO> (X.view_.origExtent (0)));
3438  const range_type rowRng
3439  (rowOffset, rowOffset + newNumRows);
3440 
3441  wrapped_dual_view_type newView = takeSubview (X.view_, rowRng, ALL ());
3442 
3443  // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3444  // handling subviews of degenerate Views quite so well. For some
3445  // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3446  // 0. We work around by creating a new empty DualView of the
3447  // desired (degenerate) dimensions.
3448  if (newView.extent (0) == 0 &&
3449  newView.extent (1) != X.view_.extent (1)) {
3450  newView =
3451  allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3452  }
3453 
3454  MV subViewMV = X.isConstantStride () ?
3455  MV (subMap, newView) :
3456  MV (subMap, newView, X.whichVectors_ ());
3457 
3458  if (debug) {
3459  const LO lclNumRowsRet = static_cast<LO> (subViewMV.getLocalLength ());
3460  const LO numColsRet = static_cast<LO> (subViewMV.getNumVectors ());
3461  if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3462  lclGood = 0;
3463  if (errStrm.get () == nullptr) {
3464  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3465  }
3466  *errStrm << " Proc " << myRank <<
3467  ": subMap.getLocalNumElements(): " << newNumRows <<
3468  ", subViewMV.getLocalLength(): " << lclNumRowsRet <<
3469  ", X.getNumVectors(): " << numCols <<
3470  ", subViewMV.getNumVectors(): " << numColsRet << endl;
3471  }
3472  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3473  if (gblGood != 1) {
3474  std::ostringstream gblErrStrm;
3475  if (myRank == 0) {
3476  gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3477  "dimensions on one or more processes:" << endl;
3478  }
3479  const std::string myErrStr =
3480  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3481  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3482  gblErrStrm << suffix << endl;
3483  TEUCHOS_TEST_FOR_EXCEPTION
3484  (true, std::invalid_argument, gblErrStrm.str ());
3485  }
3486  }
3487 
3488  if (verbose) {
3489  std::ostringstream os;
3490  os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3491  std::cerr << os.str ();
3492  }
3493 
3494  *this = subViewMV; // shallow copy
3495 
3496  if (verbose) {
3497  std::ostringstream os;
3498  os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3499  std::cerr << os.str ();
3500  }
3501  }
3502 
3503  template <class Scalar, class LO, class GO, class Node>
3504  MultiVector<Scalar, LO, GO, Node>::
3505  MultiVector (const MultiVector<Scalar, LO, GO, Node>& X,
3506  const map_type& subMap,
3507  const size_t rowOffset) :
3508  MultiVector (X, Teuchos::RCP<const map_type> (new map_type (subMap)),
3509  static_cast<local_ordinal_type> (rowOffset))
3510  {}
3511 
3512  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3513  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3514  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3515  offsetView (const Teuchos::RCP<const map_type>& subMap,
3516  const size_t offset) const
3517  {
3519  return Teuchos::rcp (new MV (*this, *subMap, offset));
3520  }
3521 
3522  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3523  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3524  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3525  offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3526  const size_t offset)
3527  {
3529  return Teuchos::rcp (new MV (*this, *subMap, offset));
3530  }
3531 
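// Usage sketch (hypothetical caller code, not part of this file): offsetView
// makes a view of a contiguous block of rows of *this, starting at the given
// row offset and using subMap as the new row Map.  A typical use is viewing
// the domain-Map rows of a column-Map MultiVector.
//
//   // Assuming X is a Tpetra::MultiVector<>, and mapTop / mapBot are Maps
//   // with n1 and n2 local rows where n1 + n2 <= X.getLocalLength():
//   auto X_top = X.offsetView (mapTop, 0);           // rows [0, n1), read-only
//   auto X_bot = X.offsetViewNonConst (mapBot, n1);  // rows [n1, n1+n2), writable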
3532  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3533  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3534  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3535  subView (const Teuchos::ArrayView<const size_t>& cols) const
3536  {
3537  using Teuchos::Array;
3538  using Teuchos::rcp;
3540 
3541  const size_t numViewCols = static_cast<size_t> (cols.size ());
3542  TEUCHOS_TEST_FOR_EXCEPTION(
3543  numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3544  "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3545  "contain at least one entry, but cols.size() = " << cols.size ()
3546  << " == 0.");
3547 
3548  // Check whether the index set in cols is contiguous. If it is,
3549  // use the more efficient Range1D version of subView.
3550  bool contiguous = true;
3551  for (size_t j = 1; j < numViewCols; ++j) {
3552  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3553  contiguous = false;
3554  break;
3555  }
3556  }
3557  if (contiguous) {
3558  if (numViewCols == 0) {
3559  // The output MV has no columns, so there is nothing to view.
3560  return rcp (new MV (this->getMap (), numViewCols));
3561  } else {
3562  // Use the more efficient contiguous-index-range version.
3563  return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3564  }
3565  }
3566 
3567  if (isConstantStride ()) {
3568  return rcp (new MV (this->getMap (), view_, cols));
3569  }
3570  else {
3571  Array<size_t> newcols (cols.size ());
3572  for (size_t j = 0; j < numViewCols; ++j) {
3573  newcols[j] = whichVectors_[cols[j]];
3574  }
3575  return rcp (new MV (this->getMap (), view_, newcols ()));
3576  }
3577  }
3578 
3579 
3580  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3581  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3582  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3583  subView (const Teuchos::Range1D& colRng) const
3584  {
3585  using ::Tpetra::Details::Behavior;
3586  using Kokkos::ALL;
3587  using Kokkos::subview;
3588  using Teuchos::Array;
3589  using Teuchos::RCP;
3590  using Teuchos::rcp;
3592  const char tfecfFuncName[] = "subView(Range1D): ";
3593 
3594  const size_t lclNumRows = this->getLocalLength ();
3595  const size_t numVecs = this->getNumVectors ();
3596  // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3597  // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3598  // "at least one vector.");
3599  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3600  static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3601  "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3602  << numVecs << ".");
3603  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3604  numVecs != 0 && colRng.size () != 0 &&
3605  (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3606  static_cast<size_t> (colRng.ubound ()) >= numVecs),
3607  std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3608  "," << colRng.ubound () << "] exceeds the valid range of column indices "
3609  "[0, " << numVecs << "].");
3610 
3611  RCP<const MV> X_ret; // the MultiVector subview to return
3612 
3613  // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3614  // broken for the case of views with zero rows. I will brutally
3615  // enforce that the subview has the correct dimensions. In
3616  // particular, in the case of zero rows, I will, if necessary,
3617  // create a new dual_view_type with zero rows and the correct
3618  // number of columns. In a debug build, I will use an all-reduce
3619  // to ensure that it has the correct dimensions on all processes.
3620 
3621  const std::pair<size_t, size_t> rows (0, lclNumRows);
3622  if (colRng.size () == 0) {
3623  const std::pair<size_t, size_t> cols (0, 0); // empty range
3624  wrapped_dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3625  X_ret = rcp (new MV (this->getMap (), X_sub));
3626  }
3627  else {
3628  // Returned MultiVector is constant stride only if *this is.
3629  if (isConstantStride ()) {
3630  const std::pair<size_t, size_t> cols (colRng.lbound (),
3631  colRng.ubound () + 1);
3632  wrapped_dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3633  X_ret = rcp (new MV (this->getMap (), X_sub));
3634  }
3635  else {
3636  if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3637  // We're only asking for one column, so the result does have
3638  // constant stride, even though this MultiVector does not.
3639  const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3640  whichVectors_[0] + colRng.ubound () + 1);
3641  wrapped_dual_view_type X_sub = takeSubview (view_, ALL (), col);
3642  X_ret = rcp (new MV (this->getMap (), X_sub));
3643  }
3644  else {
3645  Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3646  whichVectors_.begin () + colRng.ubound () + 1);
3647  X_ret = rcp (new MV (this->getMap (), view_, which));
3648  }
3649  }
3650  }
3651 
3652  const bool debug = Behavior::debug ();
3653  if (debug) {
3654  using Teuchos::Comm;
3655  using Teuchos::outArg;
3656  using Teuchos::REDUCE_MIN;
3657  using Teuchos::reduceAll;
3658 
3659  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
3660  Teuchos::null : this->getMap ()->getComm ();
3661  if (! comm.is_null ()) {
3662  int lclSuccess = 1;
3663  int gblSuccess = 1;
3664 
3665  if (X_ret.is_null ()) {
3666  lclSuccess = 0;
3667  }
3668  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3669  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3670  (lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3671  "MultiVector; the return value of this method) is null on some MPI "
3672  "process in this MultiVector's communicator. This should never "
3673  "happen. Please report this bug to the Tpetra developers.");
3674  if (! X_ret.is_null () &&
3675  X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3676  lclSuccess = 0;
3677  }
3678  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess,
3679  outArg (gblSuccess));
3680  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3681  (lclSuccess != 1, std::logic_error, "X_ret->getNumVectors() != "
3682  "colRng.size(), on at least one MPI process in this MultiVector's "
3683  "communicator. This should never happen. "
3684  "Please report this bug to the Tpetra developers.");
3685  }
3686  }
3687  return X_ret;
3688  }
3689 
3690 
3691  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3692  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3693  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3694  subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3695  {
3697  return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3698  }
3699 
3700 
3701  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3702  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3703  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3704  subViewNonConst (const Teuchos::Range1D &colRng)
3705  {
3707  return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3708  }
3709 
3710 
3711  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3712  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3713  MultiVector (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& X,
3714  const size_t j)
3715  : base_type (X.getMap ())
3716  {
3717  using Kokkos::subview;
3718  typedef std::pair<size_t, size_t> range_type;
3719  const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3720 
3721  const size_t numCols = X.getNumVectors ();
3722  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3723  (j >= numCols, std::invalid_argument, "Input index j (== " << j
3724  << ") exceeds valid column index range [0, " << numCols << " - 1].");
3725  const size_t jj = X.isConstantStride () ?
3726  static_cast<size_t> (j) :
3727  static_cast<size_t> (X.whichVectors_[j]);
3728  this->view_ = takeSubview (X.view_, Kokkos::ALL (), range_type (jj, jj+1));
3729 
3730  // mfh 31 Jul 2017: It would be unwise to execute concurrent
3731  // Export or Import operations with different subviews of a
3732  // MultiVector. Thus, it is safe to reuse communication buffers.
3733  // See #1560 discussion.
3734  //
3735  // We only need one column's worth of buffer for imports_ and
3736  // exports_. Taking subviews now ensures that their lengths will
3737  // be exactly what we need, so we won't have to resize them later.
3738  {
3739  const size_t newSize = X.imports_.extent (0) / numCols;
3740  const size_t offset = jj*newSize;
3741  auto newImports = X.imports_;
3742  newImports.d_view = subview (X.imports_.d_view,
3743  range_type (offset, offset+newSize));
3744  newImports.h_view = subview (X.imports_.h_view,
3745  range_type (offset, offset+newSize));
3746  this->imports_ = newImports;
3747  }
3748  {
3749  const size_t newSize = X.exports_.extent (0) / numCols;
3750  const size_t offset = jj*newSize;
3751  auto newExports = X.exports_;
3752  newExports.d_view = subview (X.exports_.d_view,
3753  range_type (offset, offset+newSize));
3754  newExports.h_view = subview (X.exports_.h_view,
3755  range_type (offset, offset+newSize));
3756  this->exports_ = newExports;
3757  }
3758  // These two DualViews already either have the right number of
3759  // entries, or zero entries. This means that we don't need to
3760  // resize them.
3763  }
3764 
3765 
3766  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3767  Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3768  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3769  getVector (const size_t j) const
3770  {
3772  return Teuchos::rcp (new V (*this, j));
3773  }
3774 
3775 
3776  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3777  Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3778  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3779  getVectorNonConst (const size_t j)
3780  {
3782  return Teuchos::rcp (new V (*this, j));
3783  }
3784 
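// Usage sketch (hypothetical caller code, not part of this file): getVector
// returns column j as a Tpetra::Vector that views (does not copy) the data.
//
//   // Assuming X is a Tpetra::MultiVector<> with at least 2 columns:
//   Teuchos::RCP<const Tpetra::Vector<> > x1 = X.getVector (1);
//   Teuchos::RCP<Tpetra::Vector<> > x0 = X.getVectorNonConst (0);
//   x0->putScalar (1.0);   // also modifies column 0 of X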
3785 
3786  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3787  void
3788  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3789  get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3790  {
3791  using IST = impl_scalar_type;
3792  using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3793  Kokkos::HostSpace,
3794  Kokkos::MemoryUnmanaged>;
3795  const char tfecfFuncName[] = "get1dCopy: ";
3796 
3797  const size_t numRows = this->getLocalLength ();
3798  const size_t numCols = this->getNumVectors ();
3799 
3800  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3801  (LDA < numRows, std::runtime_error,
3802  "LDA = " << LDA << " < numRows = " << numRows << ".");
3803  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3804  (numRows > size_t (0) && numCols > size_t (0) &&
3805  size_t (A.size ()) < LDA * (numCols - 1) + numRows,
3806  std::runtime_error,
3807  "A.size() = " << A.size () << ", but its size must be at least "
3808  << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3809 
3810  const std::pair<size_t, size_t> rowRange (0, numRows);
3811  const std::pair<size_t, size_t> colRange (0, numCols);
3812 
3813  input_view_type A_view_orig (reinterpret_cast<IST*> (A.getRawPtr ()),
3814  LDA, numCols);
3815  auto A_view = Kokkos::subview (A_view_orig, rowRange, colRange);
3816 
3828  if (this->need_sync_host() && this->need_sync_device()) {
3831  throw std::runtime_error("Tpetra::MultiVector::get1dCopy: a non-const view of this MultiVector is still alive outside, and both the host and device copies are marked as modified, so we cannot tell which copy to return.");
3832  }
3833  else {
3834  const bool useHostView = view_.host_view_use_count() >= view_.device_view_use_count();
3835  if (this->isConstantStride ()) {
3836  if (useHostView) {
3837  auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3838  // DEEP_COPY REVIEW - NOT TESTED
3839  Kokkos::deep_copy (A_view, srcView_host);
3840  } else {
3841  auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3842  // DEEP_COPY REVIEW - NOT TESTED
3843  Kokkos::deep_copy (A_view, srcView_device);
3844  }
3845  }
3846  else {
3847  for (size_t j = 0; j < numCols; ++j) {
3848  const size_t srcCol = this->whichVectors_[j];
3849  auto dstColView = Kokkos::subview (A_view, rowRange, j);
3850 
3851  if (useHostView) {
3852  auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3853  auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
3854  // DEEP_COPY REVIEW - NOT TESTED
3855  Kokkos::deep_copy (dstColView, srcColView_host);
3856  } else {
3857  auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3858  auto srcColView_device = Kokkos::subview (srcView_device, rowRange, srcCol);
3859  // DEEP_COPY REVIEW - NOT TESTED
3860  Kokkos::deep_copy (dstColView, srcColView_device);
3861  }
3862  }
3863  }
3864  }
3865  }
3866 
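// Usage sketch (hypothetical caller code, not part of this file): get1dCopy
// copies into a caller-owned column-major array whose column stride is LDA.
//
//   // Assuming X is a Tpetra::MultiVector<> (Scalar=double) with nRows local
//   // rows and nCols columns, and LDA >= nRows:
//   const size_t LDA = nRows;
//   Teuchos::Array<double> buf (LDA * nCols);
//   X.get1dCopy (buf (), LDA);   // entry (i,j) lands at buf[i + j*LDA]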
3867 
3868  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3869  void
3870  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3871  get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3872  {
3874  const char tfecfFuncName[] = "get2dCopy: ";
3875  const size_t numRows = this->getLocalLength ();
3876  const size_t numCols = this->getNumVectors ();
3877 
3878  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3879  static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3880  std::runtime_error, "Input array of pointers must contain as many "
3881  "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3882  << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3883 
3884  if (numRows != 0 && numCols != 0) {
3885  // No side effects until we've validated the input.
3886  for (size_t j = 0; j < numCols; ++j) {
3887  const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3888  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3889  dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3890  "the input array of arrays is not long enough to fit all entries in "
3891  "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3892  << " < getLocalLength() = " << numRows << ".");
3893  }
3894 
3895  // We've validated the input, so it's safe to start copying.
3896  for (size_t j = 0; j < numCols; ++j) {
3897  Teuchos::RCP<const V> X_j = this->getVector (j);
3898  const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3899  X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3900  }
3901  }
3902  }
3903 
3904 
3905  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3906  Teuchos::ArrayRCP<const Scalar>
3907  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3908  get1dView () const
3909  {
3910  if (getLocalLength () == 0 || getNumVectors () == 0) {
3911  return Teuchos::null;
3912  } else {
3913  TEUCHOS_TEST_FOR_EXCEPTION(
3914  ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3915  "get1dView: This MultiVector does not have constant stride, so it is "
3916  "not possible to view its data as a single array. You may check "
3917  "whether a MultiVector has constant stride by calling "
3918  "isConstantStride().");
3919  // Since get1dView() is and was always marked const, I have to
3920  // cast away const here in order not to break backwards
3921  // compatibility.
3922  auto X_lcl = getLocalViewHost(Access::ReadOnly);
3923  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3924  Kokkos::Compat::persistingView (X_lcl);
3925  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3926  }
3927  }
3928 
3929  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3930  Teuchos::ArrayRCP<Scalar>
3931  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3932  get1dViewNonConst ()
3933  {
3934  if (this->getLocalLength () == 0 || this->getNumVectors () == 0) {
3935  return Teuchos::null;
3936  }
3937  else {
3938  TEUCHOS_TEST_FOR_EXCEPTION
3939  (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3940  "get1dViewNonConst: This MultiVector does not have constant stride, "
3941  "so it is not possible to view its data as a single array. You may "
3942  "check whether a MultiVector has constant stride by calling "
3943  "isConstantStride().");
3944  auto X_lcl = getLocalViewHost(Access::ReadWrite);
3945  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3946  Kokkos::Compat::persistingView (X_lcl);
3947  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3948  }
3949  }
3950 
3951  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3952  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3953  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3954  get2dViewNonConst ()
3955  {
3956  auto X_lcl = getLocalViewHost(Access::ReadWrite);
3957 
3958  // Don't use the row range here on the outside, in order to avoid
3959  // a strided return type (in case Kokkos::subview is conservative
3960  // about that). Instead, use the row range for the column views
3961  // in the loop.
3962  const size_t myNumRows = this->getLocalLength ();
3963  const size_t numCols = this->getNumVectors ();
3964  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3965 
3966  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
3967  for (size_t j = 0; j < numCols; ++j) {
3968  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3969  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3970  Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3971  Kokkos::Compat::persistingView (X_lcl_j);
3972  views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
3973  }
3974  return views;
3975  }
3976 
3977  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3980  getLocalViewHost(Access::ReadOnlyStruct s) const
3981  {
3982  return view_.getHostView(s);
3983  }
3984 
3985  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3988  getLocalViewHost(Access::ReadWriteStruct s)
3989  {
3990  return view_.getHostView(s);
3991  }
3992 
3993  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3996  getLocalViewHost(Access::OverwriteAllStruct s)
3997  {
3998  return view_.getHostView(s);
3999  }
4000 
4001  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4004  getLocalViewDevice(Access::ReadOnlyStruct s) const
4005  {
4006  return view_.getDeviceView(s);
4007  }
4008 
4009  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4012  getLocalViewDevice(Access::ReadWriteStruct s)
4013  {
4014  return view_.getDeviceView(s);
4015  }
4016 
4017  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4020  getLocalViewDevice(Access::OverwriteAllStruct s)
4021  {
4022  return view_.getDeviceView(s);
4023  }
4024 
4025  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4029  return view_;
4030  }
4031 
4032  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4033  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
4034  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4035  get2dView () const
4036  {
4037  // Since get2dView() is and was always marked const, I have to
4038  // cast away const here in order not to break backwards
4039  // compatibility.
4040  auto X_lcl = getLocalViewHost(Access::ReadOnly);
4041 
4042  // Don't use the row range here on the outside, in order to avoid
4043  // a strided return type (in case Kokkos::subview is conservative
4044  // about that). Instead, use the row range for the column views
4045  // in the loop.
4046  const size_t myNumRows = this->getLocalLength ();
4047  const size_t numCols = this->getNumVectors ();
4048  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
4049 
4050  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
4051  for (size_t j = 0; j < numCols; ++j) {
4052  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
4053  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
4054  Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
4055  Kokkos::Compat::persistingView (X_lcl_j);
4056  views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
4057  }
4058  return views;
4059  }
4060 
4061  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4062  void
4063  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4064  multiply (Teuchos::ETransp transA,
4065  Teuchos::ETransp transB,
4066  const Scalar& alpha,
4067  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
4068  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
4069  const Scalar& beta)
4070  {
4071  using ::Tpetra::Details::ProfilingRegion;
4072  using Teuchos::CONJ_TRANS;
4073  using Teuchos::NO_TRANS;
4074  using Teuchos::TRANS;
4075  using Teuchos::RCP;
4076  using Teuchos::rcp;
4077  using Teuchos::rcpFromRef;
4078  using std::endl;
4079  using ATS = Kokkos::ArithTraits<impl_scalar_type>;
4080  using LO = local_ordinal_type;
4081  using STS = Teuchos::ScalarTraits<Scalar>;
4083  const char tfecfFuncName[] = "multiply: ";
4084  ProfilingRegion region ("Tpetra::MV::multiply");
4085 
4086  // This routine performs a variety of matrix-matrix multiply
4087  // operations, interpreting the MultiVector (this-aka C , A and B)
4088  // as 2D matrices. Variations are due to the fact that A, B and C
4089  // can be locally replicated or globally distributed MultiVectors
4090  // and that we may or may not operate with the transpose of A and
4091  // B. Possible cases are:
4092  //
4093  // Operations # Cases Notes
4094  // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
4095  // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
4096  // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
4097  //
4098  // The following operations are not meaningful for 1-D
4099  // distributions:
4100  //
4101  // u1) C(local) = A^T(distr) * B^T(distr) 1
4102  // u2) C(local) = A (distr) * B^X(distr) 2
4103  // u3) C(distr) = A^X(local) * B^X(local) 4
4104  // u4) C(distr) = A^X(local) * B^X(distr) 4
4105  // u5) C(distr) = A^T(distr) * B^X(local) 2
4106  // u6) C(local) = A^X(distr) * B^X(local) 4
4107  // u7) C(distr) = A^X(distr) * B^X(local) 4
4108  // u8) C(local) = A^X(local) * B^X(distr) 4
4109  //
4110  // Total number of cases: 32 (= 2^5).
4111 
4112  impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
4113 
4114  const bool A_is_local = ! A.isDistributed ();
4115  const bool B_is_local = ! B.isDistributed ();
4116  const bool C_is_local = ! this->isDistributed ();
4117 
4118  // In debug mode, check compatibility of local dimensions. We
4119  // only do this in debug mode, since it requires an all-reduce
4120  // to ensure correctness on all processes. It's entirely
4121  // possible that only some processes may have incompatible local
4122  // dimensions. Throwing an exception only on those processes
4123  // could cause this method to hang.
4124  const bool debug = ::Tpetra::Details::Behavior::debug ();
4125  if (debug) {
4126  auto myMap = this->getMap ();
4127  if (! myMap.is_null () && ! myMap->getComm ().is_null ()) {
4128  using Teuchos::REDUCE_MIN;
4129  using Teuchos::reduceAll;
4130  using Teuchos::outArg;
4131 
4132  auto comm = myMap->getComm ();
4133  const size_t A_nrows =
4134  (transA != NO_TRANS) ? A.getNumVectors () : A.getLocalLength ();
4135  const size_t A_ncols =
4136  (transA != NO_TRANS) ? A.getLocalLength () : A.getNumVectors ();
4137  const size_t B_nrows =
4138  (transB != NO_TRANS) ? B.getNumVectors () : B.getLocalLength ();
4139  const size_t B_ncols =
4140  (transB != NO_TRANS) ? B.getLocalLength () : B.getNumVectors ();
4141 
4142  const bool lclBad = this->getLocalLength () != A_nrows ||
4143  this->getNumVectors () != B_ncols || A_ncols != B_nrows;
4144 
4145  const int myRank = comm->getRank ();
4146  std::ostringstream errStrm;
4147  if (this->getLocalLength () != A_nrows) {
4148  errStrm << "Proc " << myRank << ": this->getLocalLength()="
4149  << this->getLocalLength () << " != A_nrows=" << A_nrows
4150  << "." << std::endl;
4151  }
4152  if (this->getNumVectors () != B_ncols) {
4153  errStrm << "Proc " << myRank << ": this->getNumVectors()="
4154  << this->getNumVectors () << " != B_ncols=" << B_ncols
4155  << "." << std::endl;
4156  }
4157  if (A_ncols != B_nrows) {
4158  errStrm << "Proc " << myRank << ": A_ncols="
4159  << A_ncols << " != B_nrows=" << B_nrows
4160  << "." << std::endl;
4161  }
4162 
4163  const int lclGood = lclBad ? 0 : 1;
4164  int gblGood = 0;
4165  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
4166  if (gblGood != 1) {
4167  std::ostringstream os;
4168  ::Tpetra::Details::gathervPrint (os, errStrm.str (), *comm);
4169 
4170  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4171  (true, std::runtime_error, "Inconsistent local dimensions on at "
4172  "least one process in this object's communicator." << std::endl
4173  << "Operation: "
4174  << "C(" << (C_is_local ? "local" : "distr") << ") = "
4175  << alpha << "*A"
4176  << (transA == Teuchos::TRANS ? "^T" :
4177  (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
4178  << "(" << (A_is_local ? "local" : "distr") << ") * "
4179  << beta << "*B"
4180  << (transB == Teuchos::TRANS ? "^T" :
4181  (transB == Teuchos::CONJ_TRANS ? "^H" : ""))
4182  << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
4183  << "Global dimensions: C(" << this->getGlobalLength () << ", "
4184  << this->getNumVectors () << "), A(" << A.getGlobalLength ()
4185  << ", " << A.getNumVectors () << "), B(" << B.getGlobalLength ()
4186  << ", " << B.getNumVectors () << ")." << std::endl
4187  << os.str ());
4188  }
4189  }
4190  }
4191 
4192  // Case 1: C(local) = A^X(local) * B^X(local)
4193  const bool Case1 = C_is_local && A_is_local && B_is_local;
4194  // Case 2: C(local) = A^T(distr) * B (distr)
4195  const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
4196  transA != NO_TRANS &&
4197  transB == NO_TRANS;
4198  // Case 3: C(distr) = A (distr) * B^X(local)
4199  const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
4200  transA == NO_TRANS;
4201 
4202  // Test that we are considering a meaningful case
4203  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4204  (! Case1 && ! Case2 && ! Case3, std::runtime_error,
4205  "Multiplication of op(A) and op(B) into *this is not a "
4206  "supported use case.");
4207 
4208  if (beta != STS::zero () && Case2) {
4209  // If Case2, then C is local and contributions must be summed
4210  // across all processes. However, if beta != 0, then accumulate
4211  // beta*C into the sum. When summing across all processes, we
4212  // only want to accumulate this once, so set beta == 0 on all
4213  // processes except Process 0.
4214  const int myRank = this->getMap ()->getComm ()->getRank ();
4215  if (myRank != 0) {
4216  beta_local = ATS::zero ();
4217  }
4218  }
4219 
4220  // We only know how to do matrix-matrix multiplies if all the
4221  // MultiVectors have constant stride. If not, we have to make
4222  // temporary copies of those MultiVectors (including possibly
4223  // *this) that don't have constant stride.
4224  RCP<MV> C_tmp;
4225  if (! isConstantStride ()) {
4226  C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
4227  } else {
4228  C_tmp = rcp (this, false);
4229  }
4230 
4231  RCP<const MV> A_tmp;
4232  if (! A.isConstantStride ()) {
4233  A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
4234  } else {
4235  A_tmp = rcpFromRef (A);
4236  }
4237 
4238  RCP<const MV> B_tmp;
4239  if (! B.isConstantStride ()) {
4240  B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
4241  } else {
4242  B_tmp = rcpFromRef (B);
4243  }
4244 
4245  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4246  (! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
4247  ! A_tmp->isConstantStride (), std::logic_error,
4248  "Failed to make temporary constant-stride copies of MultiVectors.");
4249 
4250  {
4251  const LO A_lclNumRows = A_tmp->getLocalLength ();
4252  const LO A_numVecs = A_tmp->getNumVectors ();
4253  auto A_lcl = A_tmp->getLocalViewDevice(Access::ReadOnly);
4254  auto A_sub = Kokkos::subview (A_lcl,
4255  std::make_pair (LO (0), A_lclNumRows),
4256  std::make_pair (LO (0), A_numVecs));
4257 
4258 
4259  const LO B_lclNumRows = B_tmp->getLocalLength ();
4260  const LO B_numVecs = B_tmp->getNumVectors ();
4261  auto B_lcl = B_tmp->getLocalViewDevice(Access::ReadOnly);
4262  auto B_sub = Kokkos::subview (B_lcl,
4263  std::make_pair (LO (0), B_lclNumRows),
4264  std::make_pair (LO (0), B_numVecs));
4265 
4266  const LO C_lclNumRows = C_tmp->getLocalLength ();
4267  const LO C_numVecs = C_tmp->getNumVectors ();
4268 
4269  auto C_lcl = C_tmp->getLocalViewDevice(Access::ReadWrite);
4270  auto C_sub = Kokkos::subview (C_lcl,
4271  std::make_pair (LO (0), C_lclNumRows),
4272  std::make_pair (LO (0), C_numVecs));
4273  const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' :
4274  (transA == Teuchos::TRANS ? 'T' : 'C'));
4275  const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' :
4276  (transB == Teuchos::TRANS ? 'T' : 'C'));
4277  const impl_scalar_type alpha_IST (alpha);
4278 
4279  ProfilingRegion regionGemm ("Tpetra::MV::multiply-call-gemm");
4280 
4281  KokkosBlas::gemm (&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
4282  beta_local, C_sub);
4283  }
4284 
4285  if (! isConstantStride ()) {
4286  ::Tpetra::deep_copy (*this, *C_tmp); // Copy the result back into *this.
4287  }
4288 
4289  // Dispose of (possibly) extra copies of A and B.
4290  A_tmp = Teuchos::null;
4291  B_tmp = Teuchos::null;
4292 
4293  // If Case 2 then sum up *this and distribute it to all processes.
4294  if (Case2) {
4295  this->reduce ();
4296  }
4297  }
4298 
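// Usage sketch (hypothetical caller code, not part of this file): each
// supported case reduces to a local GEMM.  Case 2, for example, forms a
// "block dot product" of two distributed MultiVectors into a locally
// replicated result, which reduce() then sums over all processes.
//
//   // Assuming A and B are distributed MultiVectors with the same row Map,
//   // and C is locally replicated with local length A.getNumVectors() and
//   // B.getNumVectors() columns:
//   C.multiply (Teuchos::CONJ_TRANS, Teuchos::NO_TRANS, 1.0, A, B, 0.0);
//   // C := A^H * B, summed over all processes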
4299  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4300  void
4301  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4302  elementWiseMultiply (Scalar scalarAB,
4303  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
4304  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
4305  Scalar scalarThis)
4306  {
4307  using Kokkos::ALL;
4308  using Kokkos::subview;
4309  const char tfecfFuncName[] = "elementWiseMultiply: ";
4310 
4311  const size_t lclNumRows = this->getLocalLength ();
4312  const size_t numVecs = this->getNumVectors ();
4313 
4314  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4315  (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
4316  std::runtime_error, "MultiVectors do not have the same local length.");
4317  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4318  numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
4319  "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
4320  << ".");
4321 
4322  auto this_view = this->getLocalViewDevice(Access::ReadWrite);
4323  auto A_view = A.getLocalViewDevice(Access::ReadOnly);
4324  auto B_view = B.getLocalViewDevice(Access::ReadOnly);
4325 
4326  if (isConstantStride () && B.isConstantStride ()) {
4327  // A is just a Vector; it only has one column, so it always has
4328  // constant stride.
4329  //
4330  // If both *this and B have constant stride, we can do an
4331  // element-wise multiply on all columns at once.
4332  KokkosBlas::mult (scalarThis,
4333  this_view,
4334  scalarAB,
4335  subview (A_view, ALL (), 0),
4336  B_view);
4337  }
4338  else {
4339  for (size_t j = 0; j < numVecs; ++j) {
4340  const size_t C_col = isConstantStride () ? j : whichVectors_[j];
4341  const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
4342  KokkosBlas::mult (scalarThis,
4343  subview (this_view, ALL (), C_col),
4344  scalarAB,
4345  subview (A_view, ALL (), 0),
4346  subview (B_view, ALL (), B_col));
4347  }
4348  }
4349  }
4350 
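// Usage sketch (hypothetical caller code, not part of this file):
// elementWiseMultiply scales each column of B entrywise by the single-column
// Vector A: this(i,j) := scalarThis*this(i,j) + scalarAB*A(i)*B(i,j).
//
//   // Assuming W and B are MultiVectors and d is a Vector, all with the same Map:
//   W.elementWiseMultiply (1.0, d, B, 0.0);   // W := 0*W + (d .* B), entrywise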
4351  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4352  void
4353  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4354  reduce ()
4355  {
4356  using ::Tpetra::Details::allReduceView;
4357  using ::Tpetra::Details::ProfilingRegion;
4358  ProfilingRegion region ("Tpetra::MV::reduce");
4359 
4360  const auto map = this->getMap ();
4361  if (map.get () == nullptr) {
4362  return;
4363  }
4364  const auto comm = map->getComm ();
4365  if (comm.get () == nullptr) {
4366  return;
4367  }
4368 
4369  // Avoid giving device buffers to MPI. Even if MPI handles them
4370  // correctly, doing so may not perform well.
4371  const bool changed_on_device = this->need_sync_host ();
4372  if (changed_on_device) {
4373  // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4374  // and host will be separate allocations. In that case, it may
4375  // pay to do the all-reduce from device to host.
4376  Kokkos::fence("MultiVector::reduce"); // for UVM getLocalViewDevice is UVM which can be read as host by allReduceView, so we must not read until device is fenced
4377  auto X_lcl = this->getLocalViewDevice(Access::ReadWrite);
4378  allReduceView (X_lcl, X_lcl, *comm);
4379  }
4380  else {
4381  auto X_lcl = this->getLocalViewHost(Access::ReadWrite);
4382  allReduceView (X_lcl, X_lcl, *comm);
4383  }
4384  }
4385 
4386  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4387  void
4388  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4389  replaceLocalValue (const LocalOrdinal lclRow,
4390  const size_t col,
4391  const impl_scalar_type& ScalarValue)
4392  {
4393  if (::Tpetra::Details::Behavior::debug ()) {
4394  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4395  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4396  TEUCHOS_TEST_FOR_EXCEPTION(
4397  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4398  std::runtime_error,
4399  "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4400  << " is invalid. The range of valid row indices on this process "
4401  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4402  << ", " << maxLocalIndex << "].");
4403  TEUCHOS_TEST_FOR_EXCEPTION(
4404  vectorIndexOutOfRange(col),
4405  std::runtime_error,
4406  "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4407  << " of the multivector is invalid.");
4408  }
4409 
4410  auto view = this->getLocalViewHost(Access::ReadWrite);
4411  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4412  view (lclRow, colInd) = ScalarValue;
4413  }
4414 
4415 
4416  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4417  void
4418  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4419  sumIntoLocalValue (const LocalOrdinal lclRow,
4420  const size_t col,
4421  const impl_scalar_type& value,
4422  const bool atomic)
4423  {
4424  if (::Tpetra::Details::Behavior::debug ()) {
4425  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4426  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4427  TEUCHOS_TEST_FOR_EXCEPTION(
4428  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4429  std::runtime_error,
4430  "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4431  << " is invalid. The range of valid row indices on this process "
4432  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4433  << ", " << maxLocalIndex << "].");
4434  TEUCHOS_TEST_FOR_EXCEPTION(
4435  vectorIndexOutOfRange(col),
4436  std::runtime_error,
4437  "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4438  << " of the multivector is invalid.");
4439  }
4440 
4441  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4442 
4443  auto view = this->getLocalViewHost(Access::ReadWrite);
4444  if (atomic) {
4445  Kokkos::atomic_add (& (view(lclRow, colInd)), value);
4446  }
4447  else {
4448  view(lclRow, colInd) += value;
4449  }
4450  }
4451 
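// Illustrative usage sketch (not part of the original source): overwrite and
// accumulate into host-side entries by local row index. "X" is assumed to be an
// existing MultiVector with a nonnull Map, at least one owned row, and at least
// one column.
//
//   const auto lclRow = X.getMap ()->getMinLocalIndex ();
//   X.replaceLocalValue (lclRow, 0, 42.0);        // X(lclRow, 0)  = 42
//   X.sumIntoLocalValue (lclRow, 0, 1.0);         // X(lclRow, 0) += 1
//   X.sumIntoLocalValue (lclRow, 0, 1.0, true);   // same, but as an atomic update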
4452 
4453  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4454  void
4455  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4456  replaceGlobalValue (const GlobalOrdinal gblRow,
4457  const size_t col,
4458  const impl_scalar_type& ScalarValue)
4459  {
4460  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4461  // touches the RCP's reference count, which isn't thread safe.
4462  const LocalOrdinal lclRow = this->map_->getLocalElement (gblRow);
4463 
4465  const char tfecfFuncName[] = "replaceGlobalValue: ";
4466  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4467  (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4468  std::runtime_error,
4469  "Global row index " << gblRow << " is not present on this process "
4470  << this->getMap ()->getComm ()->getRank () << ".");
4471  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4472  (this->vectorIndexOutOfRange (col), std::runtime_error,
4473  "Vector index " << col << " of the MultiVector is invalid.");
4474  }
4475 
4476  this->replaceLocalValue (lclRow, col, ScalarValue);
4477  }
4478 
4479  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4480  void
4481  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4482  sumIntoGlobalValue (const GlobalOrdinal globalRow,
4483  const size_t col,
4484  const impl_scalar_type& value,
4485  const bool atomic)
4486  {
4487  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4488  // touches the RCP's reference count, which isn't thread safe.
4489  const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4490 
4491  if (::Tpetra::Details::Behavior::debug ()) {
4492  TEUCHOS_TEST_FOR_EXCEPTION(
4493  lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4494  std::runtime_error,
4495  "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4496  << " is not present on this process "
4497  << this->getMap ()->getComm ()->getRank () << ".");
4498  TEUCHOS_TEST_FOR_EXCEPTION(
4499  vectorIndexOutOfRange(col),
4500  std::runtime_error,
4501  "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4502  << " of the multivector is invalid.");
4503  }
4504 
4505  this->sumIntoLocalValue (lclRow, col, value, atomic);
4506  }
4507 
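// Illustrative usage sketch (not part of the original source): the global-index
// variants translate the global row to a local row first, so the row must be
// owned by the calling process. "X" is assumed to be an existing MultiVector
// with at least one owned row.
//
//   const auto lclRow = X.getMap ()->getMinLocalIndex ();
//   const auto gblRow = X.getMap ()->getGlobalElement (lclRow);
//   X.replaceGlobalValue (gblRow, 0, 3.0);
//   X.sumIntoGlobalValue (gblRow, 0, 0.5);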
4508  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4509  template <class T>
4510  Teuchos::ArrayRCP<T>
4511  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4512  getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
4513  size_t j) const
4514  {
4515  typedef Kokkos::DualView<impl_scalar_type*,
4516  typename dual_view_type::array_layout,
4517  execution_space> col_dual_view_type;
4518  const size_t col = isConstantStride () ? j : whichVectors_[j];
4519  col_dual_view_type X_col =
4520  Kokkos::subview (view_, Kokkos::ALL (), col);
4521  return Kokkos::Compat::persistingView (X_col.d_view);
4522  }
4523 
4524  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4525  bool
4526  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4527  need_sync_host () const {
4528  return view_.need_sync_host ();
4529  }
4530 
4531  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4532  bool
4533  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4534  need_sync_device () const {
4535  return view_.need_sync_device ();
4536  }
4537 
4538  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4539  std::string
4540  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4541  descriptionImpl (const std::string& className) const
4542  {
4543  using Teuchos::TypeNameTraits;
4544 
4545  std::ostringstream out;
4546  out << "\"" << className << "\": {";
4547  out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
4548  << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
4549  << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
4550  << ", Node: " << Node::name ()
4551  << "}, ";
4552  if (this->getObjectLabel () != "") {
4553  out << "Label: \"" << this->getObjectLabel () << "\", ";
4554  }
4555  out << "numRows: " << this->getGlobalLength ();
4556  if (className != "Tpetra::Vector") {
4557  out << ", numCols: " << this->getNumVectors ()
4558  << ", isConstantStride: " << this->isConstantStride ();
4559  }
4560  if (this->isConstantStride ()) {
4561  out << ", columnStride: " << this->getStride ();
4562  }
4563  out << "}";
4564 
4565  return out.str ();
4566  }
4567 
4568  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4569  std::string
4570  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4571  description () const
4572  {
4573  return this->descriptionImpl ("Tpetra::MultiVector");
4574  }
4575 
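// Illustrative usage sketch (not part of the original source): description()
// returns the compact one-line summary built by descriptionImpl above. "map"
// is assumed to be an existing nonnull Teuchos::RCP<const Tpetra::Map<>>.
//
//   Tpetra::MultiVector<> X (map, 3);
//   std::cout << X.description () << std::endl;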
4576  namespace Details
4577  {
4578  template<typename ViewType>
4579  void print_vector(Teuchos::FancyOStream & out, const char* prefix, const ViewType& v)
4580  {
4581  using std::endl;
4582  static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename ViewType::memory_space>::accessible,
4583  "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a host-accessible view.");
4584  static_assert(ViewType::rank == 2,
4585  "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a rank-2 view.");
4586  // The square braces [] and their contents are in Matlab
4587  // format, so users may copy and paste directly into Matlab.
4588  out << "Values("<<prefix<<"): " << std::endl
4589  << "[";
4590  const size_t numRows = v.extent(0);
4591  const size_t numCols = v.extent(1);
4592  if (numCols == 1) {
4593  for (size_t i = 0; i < numRows; ++i) {
4594  out << v(i,0);
4595  if (i + 1 < numRows) {
4596  out << "; ";
4597  }
4598  }
4599  }
4600  else {
4601  for (size_t i = 0; i < numRows; ++i) {
4602  for (size_t j = 0; j < numCols; ++j) {
4603  out << v(i,j);
4604  if (j + 1 < numCols) {
4605  out << ", ";
4606  }
4607  }
4608  if (i + 1 < numRows) {
4609  out << "; ";
4610  }
4611  }
4612  }
4613  out << "]" << endl;
4614  }
4615  }
4616 
4617  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4618  std::string
4619  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4620  localDescribeToString (const Teuchos::EVerbosityLevel vl) const
4621  {
4622  typedef LocalOrdinal LO;
4623  using std::endl;
4624 
4625  if (vl <= Teuchos::VERB_LOW) {
4626  return std::string ();
4627  }
4628  auto map = this->getMap ();
4629  if (map.is_null ()) {
4630  return std::string ();
4631  }
4632  auto outStringP = Teuchos::rcp (new std::ostringstream ());
4633  auto outp = Teuchos::getFancyOStream (outStringP);
4634  Teuchos::FancyOStream& out = *outp;
4635  auto comm = map->getComm ();
4636  const int myRank = comm->getRank ();
4637  const int numProcs = comm->getSize ();
4638 
4639  out << "Process " << myRank << " of " << numProcs << ":" << endl;
4640  Teuchos::OSTab tab1 (out);
4641 
4642  // VERB_MEDIUM and higher prints getLocalLength()
4643  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
4644  out << "Local number of rows: " << lclNumRows << endl;
4645 
4646  if (vl > Teuchos::VERB_MEDIUM) {
4647  // VERB_HIGH and higher prints isConstantStride() and getStride().
4648  // The first is only relevant if the Vector has multiple columns.
4649  if (this->getNumVectors () != static_cast<size_t> (1)) {
4650  out << "isConstantStride: " << this->isConstantStride () << endl;
4651  }
4652  if (this->isConstantStride ()) {
4653  out << "Column stride: " << this->getStride () << endl;
4654  }
4655 
4656  if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4657  // VERB_EXTREME prints values. If we're not doing unified memory, we'll
4658  // potentially have different values on host and device,
4659  // so we can't use our regular accessor functions.
4660 
4661  // NOTE: This is an occasion where we do *not* want the auto-sync stuff
4662  // to trigger (since this function is conceptually const). Thus, we
4663  // get *copies* of the view's data instead.
4664  auto X_dev = view_.getDeviceCopy();
4665  auto X_host = view_.getHostCopy();
4666 
4667  if(X_dev.data() == X_host.data()) {
4668  // One single allocation
4669  Details::print_vector(out,"unified",X_host);
4670  }
4671  else {
4672  Details::print_vector(out,"host",X_host);
4673  Details::print_vector(out,"dev",X_dev);
4674  }
4675  }
4676  }
4677  out.flush (); // make sure the ostringstream got everything
4678  return outStringP->str ();
4679  }
4680 
4681  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4682  void
4683  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4684  describeImpl (Teuchos::FancyOStream& out,
4685  const std::string& className,
4686  const Teuchos::EVerbosityLevel verbLevel) const
4687  {
4688  using Teuchos::TypeNameTraits;
4689  using Teuchos::VERB_DEFAULT;
4690  using Teuchos::VERB_NONE;
4691  using Teuchos::VERB_LOW;
4692  using std::endl;
4693  const Teuchos::EVerbosityLevel vl =
4694  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4695 
4696  if (vl == VERB_NONE) {
4697  return; // don't print anything
4698  }
4699  // If this Vector's Comm is null, then the Vector does not
4700  // participate in collective operations with the other processes.
4701  // In that case, it is not even legal to call this method. The
4702  // reasonable thing to do in that case is nothing.
4703  auto map = this->getMap ();
4704  if (map.is_null ()) {
4705  return;
4706  }
4707  auto comm = map->getComm ();
4708  if (comm.is_null ()) {
4709  return;
4710  }
4711 
4712  const int myRank = comm->getRank ();
4713 
4714  // Only Process 0 should touch the output stream, but this method
4715  // in general may need to do communication. Thus, we may need to
4716  // preserve the current tab level across multiple "if (myRank ==
4717  // 0) { ... }" inner scopes. This is why we sometimes create
4718  // OSTab instances by pointer, instead of by value. We only need
4719  // to create them by pointer if the tab level must persist through
4720  // multiple inner scopes.
4721  Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4722 
4723  // VERB_LOW and higher prints the equivalent of description().
4724  if (myRank == 0) {
4725  tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4726  out << "\"" << className << "\":" << endl;
4727  tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4728  {
4729  out << "Template parameters:" << endl;
4730  Teuchos::OSTab tab2 (out);
4731  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4732  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4733  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4734  << "Node: " << Node::name () << endl;
4735  }
4736  if (this->getObjectLabel () != "") {
4737  out << "Label: \"" << this->getObjectLabel () << "\", ";
4738  }
4739  out << "Global number of rows: " << this->getGlobalLength () << endl;
4740  if (className != "Tpetra::Vector") {
4741  out << "Number of columns: " << this->getNumVectors () << endl;
4742  }
4743  // getStride() may differ on different processes, so it (and
4744  // isConstantStride()) properly belong to per-process data.
4745  }
4746 
4747  // This is collective over the Map's communicator.
4748  if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4749  const std::string lclStr = this->localDescribeToString (vl);
4750  ::Tpetra::Details::gathervPrint (out, lclStr, *comm);
4751  }
4752  }
4753 
4754  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4755  void
4756  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4757  describe (Teuchos::FancyOStream &out,
4758  const Teuchos::EVerbosityLevel verbLevel) const
4759  {
4760  this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4761  }
4762 
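// Illustrative usage sketch (not part of the original source): describe() is
// collective over the Map's communicator; higher verbosity levels print
// per-process data, and VERB_EXTREME also prints the values. "X" is assumed to
// be an existing MultiVector.
//
//   auto out = Teuchos::getFancyOStream (Teuchos::rcpFromRef (std::cout));
//   X.describe (*out, Teuchos::VERB_EXTREME);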
4763  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4764  void
4765  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4766  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4767  {
4768  replaceMap (newMap);
4769  }
4770 
4771  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4772  void
4773  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4774  assign (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& src)
4775  {
4776  using ::Tpetra::Details::localDeepCopy;
4777  const char prefix[] = "Tpetra::MultiVector::assign: ";
4778 
4779  TEUCHOS_TEST_FOR_EXCEPTION
4780  (this->getGlobalLength () != src.getGlobalLength () ||
4781  this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
4782  prefix << "Global dimensions of the two Tpetra::MultiVector "
4783  "objects do not match. src has dimensions [" << src.getGlobalLength ()
4784  << "," << src.getNumVectors () << "], and *this has dimensions ["
4785  << this->getGlobalLength () << "," << this->getNumVectors () << "].");
4786  // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4787  TEUCHOS_TEST_FOR_EXCEPTION
4788  (this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
4789  prefix << "The local row counts of the two Tpetra::MultiVector "
4790  "objects do not match. src has " << src.getLocalLength () << " row(s)"
4791  << " and *this has " << this->getLocalLength () << " row(s).");
4792 
4793 
4794  // See #1510. We're writing to *this, so we don't care about its
4795  // contents in either memory space, and we don't want
4796  // DualView::modify to complain about "concurrent modification" of
4797  // host and device Views.
4798 
4802 
4803  // If need sync to device, then host has most recent version.
4804  const bool src_last_updated_on_host = src.need_sync_device ();
4805 
4806  if (src_last_updated_on_host) {
4807  localDeepCopy (this->getLocalViewHost(Access::ReadWrite),
4808  src.getLocalViewHost(Access::ReadOnly),
4809  this->isConstantStride (),
4810  src.isConstantStride (),
4811  this->whichVectors_ (),
4812  src.whichVectors_ ());
4813  }
4814  else {
4815  localDeepCopy (this->getLocalViewDevice(Access::ReadWrite),
4816  src.getLocalViewDevice(Access::ReadOnly),
4817  this->isConstantStride (),
4818  src.isConstantStride (),
4819  this->whichVectors_ (),
4820  src.whichVectors_ ());
4821  }
4822  }
4823 
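// Illustrative usage sketch (not part of the original source): assign() deep-copies
// values between MultiVectors whose global and local dimensions match, copying
// from whichever side (host or device) holds src's most recent data. "X" and "Y"
// are assumed to have identical local and global sizes.
//
//   Y.assign (X);   // overwrite Y's entries with X's entries
//   // Tpetra::deep_copy (Y, X) performs the same element-wise copy.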
4824  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4825  template<class T>
4826  Teuchos::RCP<MultiVector<T, LocalOrdinal, GlobalOrdinal, Node> >
4827  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4828  convert () const
4829  {
4830  using Teuchos::RCP;
4831 
4832  auto newMV = Teuchos::rcp(new MultiVector<T,LocalOrdinal,GlobalOrdinal,Node>(this->getMap(),
4833  this->getNumVectors(),
4834  /*zeroOut=*/false));
4835  Tpetra::deep_copy(*newMV, *this);
4836  return newMV;
4837  }
4838 
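// Illustrative usage sketch (not part of the original source): convert<T>()
// makes a deep copy with a different Scalar type, e.g. double -> float for a
// mixed-precision algorithm. "X" is assumed to be a Tpetra::MultiVector<double>,
// and the float/double instantiations are assumed to be enabled in this build.
//
//   auto X_float = X.convert<float> ();   // Teuchos::RCP<Tpetra::MultiVector<float>>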
4839  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4840  bool
4841  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4842  isSameSize (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& vec) const
4843  {
4844  using ::Tpetra::Details::PackTraits;
4845 
4846  const size_t l1 = this->getLocalLength();
4847  const size_t l2 = vec.getLocalLength();
4848  if ((l1!=l2) || (this->getNumVectors() != vec.getNumVectors())) {
4849  return false;
4850  }
4851 
4852  return true;
4853  }
4854 
4855  template <class ST, class LO, class GO, class NT>
4856  void
4857  MultiVector<ST, LO, GO, NT>::swap (MultiVector<ST, LO, GO, NT>& mv) {
4858  std::swap(mv.map_, this->map_);
4859  std::swap(mv.view_, this->view_);
4860  std::swap(mv.whichVectors_, this->whichVectors_);
4861  }
4862 
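// Illustrative usage sketch (not part of the original source): swap() exchanges
// the Maps, data views, and column-index lists of two MultiVectors without
// copying any values. "X" and "Y" are assumed to be existing MultiVectors.
//
//   X.swap (Y);   // X now holds Y's former data and Map, and vice versa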
4863 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4864  template <class ST, class LO, class GO, class NT>
4865  void
4866  deep_copy (MultiVector<ST, LO, GO, NT>& dst,
4867  const Teuchos::SerialDenseMatrix<int, ST>& src)
4868  {
4869  using ::Tpetra::Details::localDeepCopy;
4870  using MV = MultiVector<ST, LO, GO, NT>;
4871  using IST = typename MV::impl_scalar_type;
4872  using input_view_type =
4873  Kokkos::View<const IST**, Kokkos::LayoutLeft,
4874  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4875  using pair_type = std::pair<LO, LO>;
4876 
4877  const LO numRows = static_cast<LO> (src.numRows ());
4878  const LO numCols = static_cast<LO> (src.numCols ());
4879 
4880  TEUCHOS_TEST_FOR_EXCEPTION
4881  (numRows != static_cast<LO> (dst.getLocalLength ()) ||
4882  numCols != static_cast<LO> (dst.getNumVectors ()),
4883  std::invalid_argument, "Tpetra::deep_copy: On Process "
4884  << dst.getMap ()->getComm ()->getRank () << ", dst is "
4885  << dst.getLocalLength () << " x " << dst.getNumVectors ()
4886  << ", but src is " << numRows << " x " << numCols << ".");
4887 
4888  const IST* src_raw = reinterpret_cast<const IST*> (src.values ());
4889  input_view_type src_orig (src_raw, src.stride (), numCols);
4890  input_view_type src_view =
4891  Kokkos::subview (src_orig, pair_type (0, numRows), Kokkos::ALL ());
4892 
4893  constexpr bool src_isConstantStride = true;
4894  Teuchos::ArrayView<const size_t> srcWhichVectors (nullptr, 0);
4895  localDeepCopy (dst.getLocalViewHost(Access::ReadWrite),
4896  src_view,
4897  dst.isConstantStride (),
4898  src_isConstantStride,
4899  getMultiVectorWhichVectors (dst),
4900  srcWhichVectors);
4901  }
4902 
4903  template <class ST, class LO, class GO, class NT>
4904  void
4905  deep_copy (Teuchos::SerialDenseMatrix<int, ST>& dst,
4906  const MultiVector<ST, LO, GO, NT>& src)
4907  {
4908  using ::Tpetra::Details::localDeepCopy;
4909  using MV = MultiVector<ST, LO, GO, NT>;
4910  using IST = typename MV::impl_scalar_type;
4911  using output_view_type =
4912  Kokkos::View<IST**, Kokkos::LayoutLeft,
4913  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4914  using pair_type = std::pair<LO, LO>;
4915 
4916  const LO numRows = static_cast<LO> (dst.numRows ());
4917  const LO numCols = static_cast<LO> (dst.numCols ());
4918 
4919  TEUCHOS_TEST_FOR_EXCEPTION
4920  (numRows != static_cast<LO> (src.getLocalLength ()) ||
4921  numCols != static_cast<LO> (src.getNumVectors ()),
4922  std::invalid_argument, "Tpetra::deep_copy: On Process "
4923  << src.getMap ()->getComm ()->getRank () << ", src is "
4924  << src.getLocalLength () << " x " << src.getNumVectors ()
4925  << ", but dst is " << numRows << " x " << numCols << ".");
4926 
4927  IST* dst_raw = reinterpret_cast<IST*> (dst.values ());
4928  output_view_type dst_orig (dst_raw, dst.stride (), numCols);
4929  auto dst_view =
4930  Kokkos::subview (dst_orig, pair_type (0, numRows), Kokkos::ALL ());
4931 
4932  constexpr bool dst_isConstantStride = true;
4933  Teuchos::ArrayView<const size_t> dstWhichVectors (nullptr, 0);
4934 
4935  // Prefer the host version of src's data.
4936  localDeepCopy (dst_view,
4937  src.getLocalViewHost(Access::ReadOnly),
4938  dst_isConstantStride,
4939  src.isConstantStride (),
4940  dstWhichVectors,
4941  getMultiVectorWhichVectors (src));
4942  }
4943 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4944 
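// Illustrative usage sketch (not part of the original source): round-trip a local
// dense block through a MultiVector using the SerialDenseMatrix overloads above.
// Assumes Tpetra was built with TeuchosNumerics support and that "X" has exactly
// 4 local rows and 2 columns on the calling process.
//
//   Teuchos::SerialDenseMatrix<int, double> block (4, 2);
//   block (0, 0) = 1.0;
//   Tpetra::deep_copy (X, block);   // copy the host matrix into X
//   Tpetra::deep_copy (block, X);   // copy X's host data back into the matrix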
4945  template <class Scalar, class LO, class GO, class NT>
4946  Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4947  createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4948  size_t numVectors)
4949  {
4950  typedef MultiVector<Scalar, LO, GO, NT> MV;
4951  return Teuchos::rcp (new MV (map, numVectors));
4952  }
4953 
4954  template <class ST, class LO, class GO, class NT>
4955  MultiVector<ST, LO, GO, NT>
4956  createCopy (const MultiVector<ST, LO, GO, NT>& src)
4957  {
4958  typedef MultiVector<ST, LO, GO, NT> MV;
4959  MV cpy (src.getMap (), src.getNumVectors (), false);
4960  cpy.assign (src);
4961  return cpy;
4962  }
4963 
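// Illustrative usage sketch (not part of the original source): the nonmember
// "constructors" above. "map" is assumed to be a nonnull
// Teuchos::RCP<const Tpetra::Map<>>.
//
//   auto X = Tpetra::createMultiVector<double> (map, 3);   // 3 columns, zero-initialized
//   auto Y = Tpetra::createCopy (*X);                      // deep copy, returned by value
//   Y.putScalar (1.0);                                     // does not affect *X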
4964 } // namespace Tpetra
4965 
4966 //
4967 // Explicit instantiation macro
4968 //
4969 // Must be expanded from within the Tpetra namespace!
4970 //
4971 
4972 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4973 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4974  template class MultiVector< SCALAR , LO , GO , NODE >; \
4975  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
4976  template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4977  template void deep_copy (MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4978  template void deep_copy (Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4979 
4980 #else
4981 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4982  template class MultiVector< SCALAR , LO , GO , NODE >; \
4983  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src);
4984 
4985 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4986 
4987 
4988 #define TPETRA_MULTIVECTOR_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
4989  \
4990  template Teuchos::RCP< MultiVector< SO , LO , GO , NODE > > \
4991  MultiVector< SI , LO , GO , NODE >::convert< SO > () const;
4992 
4993 
4994 #endif // TPETRA_MULTIVECTOR_DEF_HPP