Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_MULTIVECTOR_DEF_HPP
43 #define TPETRA_MULTIVECTOR_DEF_HPP
44 
52 
53 #include "Tpetra_Util.hpp"
54 #include "Tpetra_Vector.hpp"
57 #include "Tpetra_Details_fill.hpp"
58 #include "Tpetra_Details_gathervPrint.hpp"
61 #include "Tpetra_Details_normImpl.hpp"
65 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
66 # include "Teuchos_SerialDenseMatrix.hpp"
67 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
68 #include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
69 #include "KokkosCompat_View.hpp"
70 #include "KokkosBlas.hpp"
71 #include "KokkosKernels_Utils.hpp"
72 #include "Kokkos_Random.hpp"
73 #include "Kokkos_ArithTraits.hpp"
74 #include <memory>
75 #include <sstream>
76 
77 #ifdef HAVE_TPETRA_INST_FLOAT128
78 namespace Kokkos {
79  // FIXME (mfh 04 Sep 2015) Just a stub for now!
80  template<class Generator>
81  struct rand<Generator, __float128> {
82  static KOKKOS_INLINE_FUNCTION __float128 max ()
83  {
84  return static_cast<__float128> (1.0);
85  }
86  static KOKKOS_INLINE_FUNCTION __float128
87  draw (Generator& gen)
88  {
89  // Half the smallest normalized double, is the scaling factor of
90  // the lower-order term in the double-double representation.
91  const __float128 scalingFactor =
92  static_cast<__float128> (std::numeric_limits<double>::min ()) /
93  static_cast<__float128> (2.0);
94  const __float128 higherOrderTerm = static_cast<__float128> (gen.drand ());
95  const __float128 lowerOrderTerm =
96  static_cast<__float128> (gen.drand ()) * scalingFactor;
97  return higherOrderTerm + lowerOrderTerm;
98  }
99  static KOKKOS_INLINE_FUNCTION __float128
100  draw (Generator& gen, const __float128& range)
101  {
102  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
103  const __float128 scalingFactor =
104  static_cast<__float128> (std::numeric_limits<double>::min ()) /
105  static_cast<__float128> (2.0);
106  const __float128 higherOrderTerm =
107  static_cast<__float128> (gen.drand (range));
108  const __float128 lowerOrderTerm =
109  static_cast<__float128> (gen.drand (range)) * scalingFactor;
110  return higherOrderTerm + lowerOrderTerm;
111  }
112  static KOKKOS_INLINE_FUNCTION __float128
113  draw (Generator& gen, const __float128& start, const __float128& end)
114  {
115  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
116  const __float128 scalingFactor =
117  static_cast<__float128> (std::numeric_limits<double>::min ()) /
118  static_cast<__float128> (2.0);
119  const __float128 higherOrderTerm =
120  static_cast<__float128> (gen.drand (start, end));
121  const __float128 lowerOrderTerm =
122  static_cast<__float128> (gen.drand (start, end)) * scalingFactor;
123  return higherOrderTerm + lowerOrderTerm;
124  }
125  };
126 } // namespace Kokkos
127 #endif // HAVE_TPETRA_INST_FLOAT128
128 
129 namespace { // (anonymous)
130 
145  template<class ST, class LO, class GO, class NT>
147  allocDualView (const size_t lclNumRows,
148  const size_t numCols,
149  const bool zeroOut = true,
150  const bool allowPadding = false)
151  {
152  using ::Tpetra::Details::Behavior;
153  using Kokkos::AllowPadding;
154  using Kokkos::view_alloc;
155  using Kokkos::WithoutInitializing;
156  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
157  typedef typename dual_view_type::t_dev dev_view_type;
158  // This needs to be a string and not a char*, if given as an
159  // argument to Kokkos::view_alloc. This is because view_alloc
160  // also allows a raw pointer as its first argument. See
161  // https://github.com/kokkos/kokkos/issues/434.
162  const std::string label ("MV::DualView");
163  const bool debug = Behavior::debug ();
164 
165  // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
166  // creation of the DualView's Views works around
167  // Kokkos::DualView's current inability to accept an
168  // AllocationProperties initial argument (as Kokkos::View does).
169  // However, the work-around is harmless, since it does what the
170  // (currently nonexistent) equivalent DualView constructor would
171  // have done anyway.
172 
173  dev_view_type d_view;
174  if (zeroOut) {
175  if (allowPadding) {
176  d_view = dev_view_type (view_alloc (label, AllowPadding),
177  lclNumRows, numCols);
178  }
179  else {
180  d_view = dev_view_type (view_alloc (label),
181  lclNumRows, numCols);
182  }
183  }
184  else {
185  if (allowPadding) {
186  d_view = dev_view_type (view_alloc (label,
187  WithoutInitializing,
188  AllowPadding),
189  lclNumRows, numCols);
190  }
191  else {
192  d_view = dev_view_type (view_alloc (label, WithoutInitializing),
193  lclNumRows, numCols);
194  }
195  if (debug) {
196  // Filling with NaN is a cheap and effective way to tell if
197  // downstream code is trying to use a MultiVector's data
198  // without them having been initialized. ArithTraits lets us
199  // call nan() even if the scalar type doesn't define it; it
200  // just returns some undefined value in the latter case. This
201  // won't hurt anything because by setting zeroOut=false, users
202  // already agreed that they don't care about the contents of
203  // the MultiVector.
204  const ST nan = Kokkos::Details::ArithTraits<ST>::nan ();
205  KokkosBlas::fill (d_view, nan);
206  }
207  }
208  if (debug) {
209  TEUCHOS_TEST_FOR_EXCEPTION
210  (static_cast<size_t> (d_view.extent (0)) != lclNumRows ||
211  static_cast<size_t> (d_view.extent (1)) != numCols, std::logic_error,
212  "allocDualView: d_view's dimensions actual dimensions do not match "
213  "requested dimensions. d_view is " << d_view.extent (0) << " x " <<
214  d_view.extent (1) << "; requested " << lclNumRows << " x " << numCols
215  << ". Please report this bug to the Tpetra developers.");
216  }
217 
218  dual_view_type dv (d_view, Kokkos::create_mirror_view (d_view));
219  // Whether or not the user cares about the initial contents of the
220  // MultiVector, the device and host views are out of sync. We
221  // prefer to work in device memory. The way to ensure this
222  // happens is to mark the device view as modified.
223  dv.modify_device ();
224 
225  return dv;
226  }
227 
228  // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
229  //
230  // T: The type of the entries of the View.
231  // ExecSpace: The Kokkos execution space.
232  template<class T, class ExecSpace>
233  struct MakeUnmanagedView {
234  // The 'false' part of the branch carefully ensures that this
235  // won't attempt to use a host execution space that hasn't been
236  // initialized. For example, if Kokkos::OpenMP is disabled and
237  // Kokkos::Threads is enabled, the latter is always the default
238  // execution space of Kokkos::HostSpace, even when ExecSpace is
239  // Kokkos::Serial. That's why we go through the trouble of asking
240  // Kokkos::DualView what _its_ space is. That seems to work
241  // around this default execution space issue.
242  //
243  // NOTE (mfh 29 Jan 2016): See kokkos/kokkos#178 for why we use
244  // a memory space, rather than an execution space, as the first
245  // argument of VerifyExecutionCanAccessMemorySpace.
246  typedef typename Kokkos::Impl::if_c<
247  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
248  typename ExecSpace::memory_space,
249  Kokkos::HostSpace>::value,
250  typename ExecSpace::device_type,
251  typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
252  typedef Kokkos::LayoutLeft array_layout;
253  typedef Kokkos::View<T*, array_layout, host_exec_space,
254  Kokkos::MemoryUnmanaged> view_type;
255 
256  static view_type getView (const Teuchos::ArrayView<T>& x_in)
257  {
258  const size_t numEnt = static_cast<size_t> (x_in.size ());
259  if (numEnt == 0) {
260  return view_type ();
261  } else {
262  return view_type (x_in.getRawPtr (), numEnt);
263  }
264  }
265  };
266 
267  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
268  // taking a subview of a 0 x N DualView incorrectly always results
269  // in a 0 x 0 DualView.
270  template<class DualViewType>
271  DualViewType
272  takeSubview (const DualViewType& X,
273  const Kokkos::Impl::ALL_t&,
274  const std::pair<size_t, size_t>& colRng)
275  {
276  if (X.extent (0) == 0 && X.extent (1) != 0) {
277  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
278  }
279  else {
280  return subview (X, Kokkos::ALL (), colRng);
281  }
282  }
283 
284  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
285  // taking a subview of a 0 x N DualView incorrectly always results
286  // in a 0 x 0 DualView.
287  template<class DualViewType>
288  DualViewType
289  takeSubview (const DualViewType& X,
290  const std::pair<size_t, size_t>& rowRng,
291  const std::pair<size_t, size_t>& colRng)
292  {
293  if (X.extent (0) == 0 && X.extent (1) != 0) {
294  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
295  }
296  else {
297  return subview (X, rowRng, colRng);
298  }
299  }
300 
301  template<class DualViewType>
302  size_t
303  getDualViewStride (const DualViewType& dv)
304  {
305  // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
306  // method yet, but its Views do.
307  const size_t LDA = dv.d_view.stride (1);
308  const size_t numRows = dv.extent (0);
309 
310  if (LDA == 0) {
311  return (numRows == 0) ? size_t (1) : numRows;
312  }
313  else {
314  return LDA;
315  }
316  }
317 
318  template<class ViewType>
319  size_t
320  getViewStride (const ViewType& view)
321  {
322  const size_t LDA = view.stride (1);
323  const size_t numRows = view.extent (0);
324 
325  if (LDA == 0) {
326  return (numRows == 0) ? size_t (1) : numRows;
327  }
328  else {
329  return LDA;
330  }
331  }
332 
333  template <class SC, class LO, class GO, class NT>
334  bool
335  multiVectorRunNormOnHost (const ::Tpetra::MultiVector<SC, LO, GO, NT>& X)
336  {
337  if (! X.need_sync_device ()) {
338  return false; // most up-to-date on device
339  }
340  else { // most up-to-date on host
341  constexpr size_t localLengthThreshold = 10000;
342  return X.getLocalLength () <= localLengthThreshold;
343  }
344  }
345 
346  template <class SC, class LO, class GO, class NT>
347  void
348  multiVectorNormImpl (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
350  const ::Tpetra::Details::EWhichNorm whichNorm)
351  {
352  using ::Tpetra::Details::normImpl;
354  using val_type = typename MV::impl_scalar_type;
355  using mag_type = typename MV::mag_type;
356  using dual_view_type = typename MV::dual_view_type;
357 
358  auto map = X.getMap ();
359  auto comm = map.is_null () ? nullptr : map->getComm ().getRawPtr ();
360  auto whichVecs = getMultiVectorWhichVectors (X);
361  const bool isConstantStride = X.isConstantStride ();
362  const bool isDistributed = X.isDistributed ();
363 
364  const bool runOnHost = multiVectorRunNormOnHost (X);
365  if (runOnHost) {
366  using view_type = typename dual_view_type::t_host;
367  using array_layout = typename view_type::array_layout;
368  using device_type = typename view_type::device_type;
369 
370  if (X.need_sync_host ()) {
371  X.sync_host ();
372  }
373  view_type X_lcl = X.getLocalViewHost ();
374  normImpl<val_type, array_layout, device_type,
375  mag_type> (norms, X_lcl, whichNorm, whichVecs,
376  isConstantStride, isDistributed, comm);
377  }
378  else {
379  using view_type = typename dual_view_type::t_dev;
380  using array_layout = typename view_type::array_layout;
381  using device_type = typename view_type::device_type;
382 
383  if (X.need_sync_device ()) {
384  X.sync_device ();
385  }
386  view_type X_lcl = X.getLocalViewDevice ();
387  normImpl<val_type, array_layout, device_type,
388  mag_type> (norms, X_lcl, whichNorm, whichVecs,
389  isConstantStride, isDistributed, comm);
390  }
391  }
392 } // namespace (anonymous)
393 
394 
395 namespace Tpetra {
396 
397  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
398  bool
399  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
400  vectorIndexOutOfRange (const size_t VectorIndex) const {
401  return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
402  }
403 
404  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
407  base_type (Teuchos::rcp (new map_type ()))
408  {}
409 
410  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
412  MultiVector (const Teuchos::RCP<const map_type>& map,
413  const size_t numVecs,
414  const bool zeroOut) : /* default is true */
415  base_type (map)
416  {
417  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,numVecs,zeroOut)");
418 
419  const size_t lclNumRows = this->getLocalLength ();
420  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
421  origView_ = view_;
422  }
423 
424  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
427  const Teuchos::DataAccess copyOrView) :
428  base_type (source),
429  view_ (source.view_),
430  origView_ (source.origView_),
431  whichVectors_ (source.whichVectors_)
432  {
434  const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
435  "const Teuchos::DataAccess): ";
436 
437  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV 2-arg \"copy\" ctor");
438 
439  if (copyOrView == Teuchos::Copy) {
440  // Reuse the conveniently already existing function that creates
441  // a deep copy.
442  MV cpy = createCopy (source);
443  this->view_ = cpy.view_;
444  this->origView_ = cpy.origView_;
445  this->whichVectors_ = cpy.whichVectors_;
446  }
447  else if (copyOrView == Teuchos::View) {
448  }
449  else {
450  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
451  true, std::invalid_argument, "Second argument 'copyOrView' has an "
452  "invalid value " << copyOrView << ". Valid values include "
453  "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
454  << Teuchos::View << ".");
455  }
456  }
457 
458  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
460  MultiVector (const Teuchos::RCP<const map_type>& map,
461  const dual_view_type& view) :
462  base_type (map),
463  view_ (view),
464  origView_ (view)
465  {
466  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
467 
468  const size_t lclNumRows_map = map->getNodeNumElements ();
469  const size_t lclNumRows_view = view.extent (0);
470  const size_t LDA = getDualViewStride (view);
471 
472  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
473  (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
474  std::invalid_argument, "Kokkos::DualView does not match Map. "
475  "map->getNodeNumElements() = " << lclNumRows_map
476  << ", view.extent(0) = " << lclNumRows_view
477  << ", and getStride() = " << LDA << ".");
478  }
479 
480 
481  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
483  MultiVector (const Teuchos::RCP<const map_type>& map,
484  const typename dual_view_type::t_dev& d_view) :
485  base_type (map)
486  {
487  using Teuchos::ArrayRCP;
488  using Teuchos::RCP;
489  const char tfecfFuncName[] = "MultiVector(map,d_view): ";
490 
491  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,d_view)");
492 
493  const size_t LDA = getViewStride (d_view);
494  const size_t lclNumRows = map->getNodeNumElements ();
495  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
496  (LDA < lclNumRows, std::invalid_argument, "Map does not match "
497  "Kokkos::View. map->getNodeNumElements() = " << lclNumRows
498  << ", View's column stride = " << LDA
499  << ", and View's extent(0) = " << d_view.extent (0) << ".");
500 
501  auto h_view = Kokkos::create_mirror_view (d_view);
502  view_ = dual_view_type (d_view, h_view);
503  // The user gave us a device view. In order to respect its
504  // initial contents, we mark the DualView as "modified on device."
505  // That way, the next sync will synchronize it with the host view.
506  this->modify_device ();
507  origView_ = view_;
508  }
509 
510  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
512  MultiVector (const Teuchos::RCP<const map_type>& map,
513  const dual_view_type& view,
514  const dual_view_type& origView) :
515  base_type (map),
516  view_ (view),
517  origView_ (origView)
518  {
519  const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
520 
521  const size_t LDA = getDualViewStride (origView);
522  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
523  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
524  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
525  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
526  << ". This may also mean that the input origView's first dimension (number "
527  "of rows = " << origView.extent (0) << ") does not not match the number "
528  "of entries in the Map on the calling process.");
529  }
530 
531 
532  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
534  MultiVector (const Teuchos::RCP<const map_type>& map,
535  const dual_view_type& view,
536  const Teuchos::ArrayView<const size_t>& whichVectors) :
537  base_type (map),
538  view_ (view),
539  origView_ (view),
540  whichVectors_ (whichVectors.begin (), whichVectors.end ())
541  {
542  using Kokkos::ALL;
543  using Kokkos::subview;
544  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
545 
546  const size_t lclNumRows = map.is_null () ? size_t (0) :
547  map->getNodeNumElements ();
548  // Check dimensions of the input DualView. We accept that Kokkos
549  // might not allow construction of a 0 x m (Dual)View with m > 0,
550  // so we only require the number of rows to match if the
551  // (Dual)View has more than zero columns. Likewise, we only
552  // require the number of columns to match if the (Dual)View has
553  // more than zero rows.
554  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
555  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
556  std::invalid_argument, "view.extent(0) = " << view.extent (0)
557  << " < map->getNodeNumElements() = " << lclNumRows << ".");
558  if (whichVectors.size () != 0) {
559  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
560  view.extent (1) != 0 && view.extent (1) == 0,
561  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
562  " = " << whichVectors.size () << " > 0.");
563  size_t maxColInd = 0;
564  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
565  for (size_type k = 0; k < whichVectors.size (); ++k) {
566  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
567  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
568  std::invalid_argument, "whichVectors[" << k << "] = "
569  "Teuchos::OrdinalTraits<size_t>::invalid().");
570  maxColInd = std::max (maxColInd, whichVectors[k]);
571  }
572  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
573  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
574  std::invalid_argument, "view.extent(1) = " << view.extent (1)
575  << " <= max(whichVectors) = " << maxColInd << ".");
576  }
577 
578  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
579  // zero strides, so modify in that case.
580  const size_t LDA = getDualViewStride (view);
581  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
582  (LDA < lclNumRows, std::invalid_argument,
583  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
584 
585  if (whichVectors.size () == 1) {
586  // If whichVectors has only one entry, we don't need to bother
587  // with nonconstant stride. Just shift the view over so it
588  // points to the desired column.
589  //
590  // NOTE (mfh 10 May 2014) This is a special case where we set
591  // origView_ just to view that one column, not all of the
592  // original columns. This ensures that the use of origView_ in
593  // offsetView works correctly.
594  const std::pair<size_t, size_t> colRng (whichVectors[0],
595  whichVectors[0] + 1);
596  view_ = takeSubview (view_, ALL (), colRng);
597  origView_ = takeSubview (origView_, ALL (), colRng);
598  // whichVectors_.size() == 0 means "constant stride."
599  whichVectors_.clear ();
600  }
601  }
602 
603  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
605  MultiVector (const Teuchos::RCP<const map_type>& map,
606  const dual_view_type& view,
607  const dual_view_type& origView,
608  const Teuchos::ArrayView<const size_t>& whichVectors) :
609  base_type (map),
610  view_ (view),
611  origView_ (origView),
612  whichVectors_ (whichVectors.begin (), whichVectors.end ())
613  {
614  using Kokkos::ALL;
615  using Kokkos::subview;
616  const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
617 
618  const size_t lclNumRows = this->getLocalLength ();
619  // Check dimensions of the input DualView. We accept that Kokkos
620  // might not allow construction of a 0 x m (Dual)View with m > 0,
621  // so we only require the number of rows to match if the
622  // (Dual)View has more than zero columns. Likewise, we only
623  // require the number of columns to match if the (Dual)View has
624  // more than zero rows.
625  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
626  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
627  std::invalid_argument, "view.extent(0) = " << view.extent (0)
628  << " < map->getNodeNumElements() = " << lclNumRows << ".");
629  if (whichVectors.size () != 0) {
630  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
631  view.extent (1) != 0 && view.extent (1) == 0,
632  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
633  " = " << whichVectors.size () << " > 0.");
634  size_t maxColInd = 0;
635  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
636  for (size_type k = 0; k < whichVectors.size (); ++k) {
637  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
638  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
639  std::invalid_argument, "whichVectors[" << k << "] = "
640  "Teuchos::OrdinalTraits<size_t>::invalid().");
641  maxColInd = std::max (maxColInd, whichVectors[k]);
642  }
643  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
644  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
645  std::invalid_argument, "view.extent(1) = " << view.extent (1)
646  << " <= max(whichVectors) = " << maxColInd << ".");
647  }
648 
649  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
650  // zero strides, so modify in that case.
651  const size_t LDA = getDualViewStride (origView);
652  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
653  (LDA < lclNumRows, std::invalid_argument, "Map and DualView origView "
654  "do not match. LDA = " << LDA << " < this->getLocalLength() = " <<
655  lclNumRows << ". origView.extent(0) = " << origView.extent(0)
656  << ", origView.stride(1) = " << origView.d_view.stride(1) << ".");
657 
658  if (whichVectors.size () == 1) {
659  // If whichVectors has only one entry, we don't need to bother
660  // with nonconstant stride. Just shift the view over so it
661  // points to the desired column.
662  //
663  // NOTE (mfh 10 May 2014) This is a special case where we set
664  // origView_ just to view that one column, not all of the
665  // original columns. This ensures that the use of origView_ in
666  // offsetView works correctly.
667  const std::pair<size_t, size_t> colRng (whichVectors[0],
668  whichVectors[0] + 1);
669  view_ = takeSubview (view_, ALL (), colRng);
670  origView_ = takeSubview (origView_, ALL (), colRng);
671  // whichVectors_.size() == 0 means "constant stride."
672  whichVectors_.clear ();
673  }
674  }
675 
676  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
678  MultiVector (const Teuchos::RCP<const map_type>& map,
679  const Teuchos::ArrayView<const Scalar>& data,
680  const size_t LDA,
681  const size_t numVecs) :
682  base_type (map)
683  {
684  typedef LocalOrdinal LO;
685  typedef GlobalOrdinal GO;
686  const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
687  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
688 
689  // Deep copy constructor, constant stride (NO whichVectors_).
690  // There is no need for a deep copy constructor with nonconstant stride.
691 
692  const size_t lclNumRows =
693  map.is_null () ? size_t (0) : map->getNodeNumElements ();
694  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
695  (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
696  "map->getNodeNumElements() = " << lclNumRows << ".");
697  if (numVecs != 0) {
698  const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
699  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
700  (static_cast<size_t> (data.size ()) < minNumEntries,
701  std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
702  "entries, given the input Map and number of vectors in the MultiVector."
703  " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
704  "map->getNodeNumElements () = " << minNumEntries << ".");
705  }
706 
707  this->view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
708  this->modify_device ();
709  auto X_out = this->getLocalViewDevice ();
710  origView_ = view_;
711 
712  // Make an unmanaged host Kokkos::View of the input data. First
713  // create a View (X_in_orig) with the original stride. Then,
714  // create a subview (X_in) with the right number of columns.
715  const impl_scalar_type* const X_in_raw =
716  reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
717  Kokkos::View<const impl_scalar_type**,
718  Kokkos::LayoutLeft,
719  Kokkos::HostSpace,
720  Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
721  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
722  auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
723 
724  // If LDA != X_out's column stride, then we need to copy one
725  // column at a time; Kokkos::deep_copy refuses to work in that
726  // case.
727  const size_t outStride =
728  X_out.extent (1) == 0 ? size_t (1) : X_out.stride (1);
729  if (LDA == outStride) { // strides are the same; deep_copy once
730  // This only works because MultiVector uses LayoutLeft.
731  // We would need a custom copy functor otherwise.
732  Kokkos::deep_copy (X_out, X_in);
733  }
734  else { // strides differ; copy one column at a time
735  typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
736  out_col_view_type;
737  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
738  in_col_view_type;
739  for (size_t j = 0; j < numVecs; ++j) {
740  out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
741  in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
742  Kokkos::deep_copy (X_out_j, X_in_j);
743  }
744  }
745  }
746 
747  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
749  MultiVector (const Teuchos::RCP<const map_type>& map,
750  const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
751  const size_t numVecs) :
752  base_type (map)
753  {
754  typedef impl_scalar_type IST;
755  typedef LocalOrdinal LO;
756  typedef GlobalOrdinal GO;
757  const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
758  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
759 
760  const size_t lclNumRows =
761  map.is_null () ? size_t (0) : map->getNodeNumElements ();
762  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
763  (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
764  std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
765  "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
766  for (size_t j = 0; j < numVecs; ++j) {
767  Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
768  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
769  static_cast<size_t> (X_j_av.size ()) < lclNumRows,
770  std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
771  << X_j_av.size () << " < map->getNodeNumElements() = " << lclNumRows
772  << ".");
773  }
774 
775  view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
776  this->modify_device ();
777  auto X_out = this->getLocalViewDevice ();
778 
779  // Make sure that the type of a single input column has the same
780  // array layout as each output column does, so we can deep_copy.
781  using array_layout = typename decltype (X_out)::array_layout;
782  using input_col_view_type = typename Kokkos::View<const IST*,
783  array_layout,
784  Kokkos::HostSpace,
785  Kokkos::MemoryUnmanaged>;
786 
787  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
788  for (size_t j = 0; j < numVecs; ++j) {
789  Teuchos::ArrayView<const IST> X_j_av =
790  Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
791  input_col_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
792  auto X_j_out = Kokkos::subview (X_out, rowRng, j);
793  Kokkos::deep_copy (X_j_out, X_j_in);
794  }
795  origView_ = view_;
796  }
797 
798  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
801  return whichVectors_.empty ();
802  }
803 
804  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
805  size_t
808  {
809  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
810  return static_cast<size_t> (0);
811  } else {
812  return this->getMap ()->getNodeNumElements ();
813  }
814  }
815 
816  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
820  {
821  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
822  return static_cast<size_t> (0);
823  } else {
824  return this->getMap ()->getGlobalNumElements ();
825  }
826  }
827 
828  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
829  size_t
831  getStride () const
832  {
833  return isConstantStride () ? getDualViewStride (origView_) : size_t (0);
834  }
835 
836  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
837  bool
839  checkSizes (const SrcDistObject& sourceObj)
840  {
841  // Check whether the source object is a MultiVector. If not, then
842  // we can't even compare sizes, so it's definitely not OK to
843  // Import or Export from it.
845  const this_type* src = dynamic_cast<const this_type*> (&sourceObj);
846  if (src == nullptr) {
847  return false;
848  }
849  else {
850  // The target of the Import or Export calls checkSizes() in
851  // DistObject::doTransfer(). By that point, we've already
852  // constructed an Import or Export object using the two
853  // multivectors' Maps, which means that (hopefully) we've
854  // already checked other attributes of the multivectors. Thus,
855  // all we need to do here is check the number of columns.
856  return src->getNumVectors () == this->getNumVectors ();
857  }
858  }
859 
860  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
861  size_t
864  return this->getNumVectors ();
865  }
866 
867  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
868  void
870 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
871  copyAndPermuteNew
872 #else // TPETRA_ENABLE_DEPRECATED_CODE
873  copyAndPermute
874 #endif // TPETRA_ENABLE_DEPRECATED_CODE
875  (const SrcDistObject& sourceObj,
876  const size_t numSameIDs,
877  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
878  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs)
879  {
880  using ::Tpetra::Details::Behavior;
882  using ::Tpetra::Details::ProfilingRegion;
883  using std::endl;
884  using KokkosRefactor::Details::permute_array_multi_column;
885  using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
886  using Kokkos::Compat::create_const_view;
888  const char tfecfFuncName[] = "copyAndPermute: ";
889  ProfilingRegion regionCAP ("Tpetra::MultiVector::copyAndPermute");
890 
891  const bool verbose = Behavior::verbose ();
892  std::unique_ptr<std::string> prefix;
893  if (verbose) {
894  auto map = this->getMap ();
895  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
896  const int myRank = comm.is_null () ? -1 : comm->getRank ();
897  std::ostringstream os;
898  os << "Proc " << myRank << ": MV::copyAndPermute: ";
899  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
900  os << "Start" << endl;
901  std::cerr << os.str ();
902  }
903 
904  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
905  (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0),
906  std::logic_error, "permuteToLIDs.extent(0) = "
907  << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = "
908  << permuteFromLIDs.extent (0) << ".");
909 
910  // We've already called checkSizes(), so this cast must succeed.
911  const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
912  const size_t numCols = this->getNumVectors ();
913 
914  // sourceMV doesn't belong to us, so we can't sync it. Do the
915  // copying where it's currently sync'd.
916  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
917  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
918  std::logic_error, "Input MultiVector needs sync to both host "
919  "and device.");
920  const bool copyOnHost = sourceMV.need_sync_device ();
921  if (verbose) {
922  std::ostringstream os;
923  os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
924  std::cerr << os.str ();
925  }
926 
927  if (copyOnHost) {
928  if (this->need_sync_host ()) {
929  this->sync_host ();
930  }
931  this->modify_host ();
932  }
933  else {
934  if (this->need_sync_device ()) {
935  this->sync_device ();
936  }
937  this->modify_device ();
938  }
939 
940  if (verbose) {
941  std::ostringstream os;
942  os << *prefix << "Copy" << endl;
943  std::cerr << os.str ();
944  }
945 
946  // TODO (mfh 15 Sep 2013) When we replace
947  // KokkosClassic::MultiVector with a Kokkos::View, there are two
948  // ways to copy the data:
949  //
950  // 1. Get a (sub)view of each column and call deep_copy on that.
951  // 2. Write a custom kernel to copy the data.
952  //
953  // The first is easier, but the second might be more performant in
954  // case we decide to use layouts other than LayoutLeft. It might
955  // even make sense to hide whichVectors_ in an entirely new layout
956  // for Kokkos Views.
957 
958  // Copy rows [0, numSameIDs-1] of the local multivectors.
959  //
960  // Note (ETP 2 Jul 2014) We need to always copy one column at a
961  // time, even when both multivectors are constant-stride, since
962  // deep_copy between strided subviews with more than one column
963  // doesn't currently work.
964 
965  // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
966  // both source and target are constant stride and have multiple
967  // columns.
968  if (numSameIDs > 0) {
969  const std::pair<size_t, size_t> rows (0, numSameIDs);
970  if (copyOnHost) {
971  auto tgt_h = this->getLocalViewHost ();
972  auto src_h = create_const_view (sourceMV.getLocalViewHost ());
973 
974  for (size_t j = 0; j < numCols; ++j) {
975  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
976  const size_t srcCol =
977  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
978 
979  auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
980  auto src_j = Kokkos::subview (src_h, rows, srcCol);
981  Kokkos::deep_copy (tgt_j, src_j); // Copy src_j into tgt_j
982  }
983  }
984  else { // copy on device
985  auto tgt_d = this->getLocalViewDevice ();
986  auto src_d = create_const_view (sourceMV.getLocalViewDevice ());
987 
988  for (size_t j = 0; j < numCols; ++j) {
989  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
990  const size_t srcCol =
991  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
992 
993  auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
994  auto src_j = Kokkos::subview (src_d, rows, srcCol);
995  Kokkos::deep_copy (tgt_j, src_j); // Copy src_j into tgt_j
996  }
997  }
998  }
999 
1000  // For the remaining GIDs, execute the permutations. This may
1001  // involve noncontiguous access of both source and destination
1002  // vectors, depending on the LID lists.
1003  //
1004  // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1005  // the same process, this merges their values by replacement of
1006  // the last encountered GID, not by the specified merge rule
1007  // (such as ADD).
1008 
1009  // If there are no permutations, we are done
1010  if (permuteFromLIDs.extent (0) == 0 ||
1011  permuteToLIDs.extent (0) == 0) {
1012  if (verbose) {
1013  std::ostringstream os;
1014  os << *prefix << "No permutations. Done!" << endl;
1015  std::cerr << os.str ();
1016  }
1017  return;
1018  }
1019 
1020  if (verbose) {
1021  std::ostringstream os;
1022  os << *prefix << "Permute" << endl;
1023  std::cerr << os.str ();
1024  }
1025 
1026  // We could in theory optimize for the case where exactly one of
1027  // them is constant stride, but we don't currently do that.
1028  const bool nonConstStride =
1029  ! this->isConstantStride () || ! sourceMV.isConstantStride ();
1030 
1031  if (verbose) {
1032  std::ostringstream os;
1033  os << *prefix << "nonConstStride="
1034  << (nonConstStride ? "true" : "false") << endl;
1035  std::cerr << os.str ();
1036  }
1037 
1038  // We only need the "which vectors" arrays if either the source or
1039  // target MV is not constant stride. Since we only have one
1040  // kernel that must do double-duty, we have to create a "which
1041  // vectors" array for the MV that _is_ constant stride.
1042  Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1043  Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1044  if (nonConstStride) {
1045  if (this->whichVectors_.size () == 0) {
1046  Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
1047  tmpTgt.modify_host ();
1048  for (size_t j = 0; j < numCols; ++j) {
1049  tmpTgt.h_view(j) = j;
1050  }
1051  if (! copyOnHost) {
1052  tmpTgt.sync_device ();
1053  }
1054  tgtWhichVecs = tmpTgt;
1055  }
1056  else {
1057  Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
1058  tgtWhichVecs =
1059  getDualViewCopyFromArrayView<size_t, device_type> (tgtWhichVecsT,
1060  "tgtWhichVecs",
1061  copyOnHost);
1062  }
1063  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1064  (static_cast<size_t> (tgtWhichVecs.extent (0)) !=
1065  this->getNumVectors (),
1066  std::logic_error, "tgtWhichVecs.extent(0) = " <<
1067  tgtWhichVecs.extent (0) << " != this->getNumVectors() = " <<
1068  this->getNumVectors () << ".");
1069 
1070  if (sourceMV.whichVectors_.size () == 0) {
1071  Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
1072  tmpSrc.modify_host ();
1073  for (size_t j = 0; j < numCols; ++j) {
1074  tmpSrc.h_view(j) = j;
1075  }
1076  if (! copyOnHost) {
1077  tmpSrc.sync_device ();
1078  }
1079  srcWhichVecs = tmpSrc;
1080  }
1081  else {
1082  Teuchos::ArrayView<const size_t> srcWhichVecsT =
1083  sourceMV.whichVectors_ ();
1084  srcWhichVecs =
1085  getDualViewCopyFromArrayView<size_t, device_type> (srcWhichVecsT,
1086  "srcWhichVecs",
1087  copyOnHost);
1088  }
1089  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1090  (static_cast<size_t> (srcWhichVecs.extent (0)) !=
1091  sourceMV.getNumVectors (), std::logic_error,
1092  "srcWhichVecs.extent(0) = " << srcWhichVecs.extent (0)
1093  << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
1094  << ".");
1095  }
1096 
1097  if (copyOnHost) { // permute on host too
1098  if (verbose) {
1099  std::ostringstream os;
1100  os << *prefix << "Get permute LIDs on host" << std::endl;
1101  std::cerr << os.str ();
1102  }
1103  auto tgt_h = this->getLocalViewHost ();
1104  auto src_h = create_const_view (sourceMV.getLocalViewHost ());
1105 
1106  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
1107  auto permuteToLIDs_h = create_const_view (permuteToLIDs.view_host ());
1108  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
1109  auto permuteFromLIDs_h =
1110  create_const_view (permuteFromLIDs.view_host ());
1111 
1112  if (verbose) {
1113  std::ostringstream os;
1114  os << *prefix << "Permute on host" << endl;
1115  std::cerr << os.str ();
1116  }
1117  if (nonConstStride) {
1118  // No need to sync first, because copyOnHost argument to
1119  // getDualViewCopyFromArrayView puts them in the right place.
1120  auto tgtWhichVecs_h =
1121  create_const_view (tgtWhichVecs.view_host ());
1122  auto srcWhichVecs_h =
1123  create_const_view (srcWhichVecs.view_host ());
1124  permute_array_multi_column_variable_stride (tgt_h, src_h,
1125  permuteToLIDs_h,
1126  permuteFromLIDs_h,
1127  tgtWhichVecs_h,
1128  srcWhichVecs_h, numCols);
1129  }
1130  else {
1131  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1132  permuteFromLIDs_h, numCols);
1133  }
1134  }
1135  else { // permute on device
1136  if (verbose) {
1137  std::ostringstream os;
1138  os << *prefix << "Get permute LIDs on device" << endl;
1139  std::cerr << os.str ();
1140  }
1141  auto tgt_d = this->getLocalViewDevice ();
1142  auto src_d = create_const_view (sourceMV.getLocalViewDevice ());
1143 
1144  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () );
1145  auto permuteToLIDs_d = create_const_view (permuteToLIDs.view_device ());
1146  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () );
1147  auto permuteFromLIDs_d =
1148  create_const_view (permuteFromLIDs.view_device ());
1149 
1150  if (verbose) {
1151  std::ostringstream os;
1152  os << *prefix << "Permute on device" << endl;
1153  std::cerr << os.str ();
1154  }
1155  if (nonConstStride) {
1156  // No need to sync first, because copyOnHost argument to
1157  // getDualViewCopyFromArrayView puts them in the right place.
1158  auto tgtWhichVecs_d = create_const_view (tgtWhichVecs.view_device ());
1159  auto srcWhichVecs_d = create_const_view (srcWhichVecs.view_device ());
1160  permute_array_multi_column_variable_stride (tgt_d, src_d,
1161  permuteToLIDs_d,
1162  permuteFromLIDs_d,
1163  tgtWhichVecs_d,
1164  srcWhichVecs_d, numCols);
1165  }
1166  else {
1167  permute_array_multi_column (tgt_d, src_d, permuteToLIDs_d,
1168  permuteFromLIDs_d, numCols);
1169  }
1170  }
1171 
1172  if (verbose) {
1173  std::ostringstream os;
1174  os << *prefix << "Done!" << endl;
1175  std::cerr << os.str ();
1176  }
1177  }
1178 
1179  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1180  void
1181  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1182 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1183  packAndPrepareNew
1184 #else // TPETRA_ENABLE_DEPRECATED_CODE
1185  packAndPrepare
1186 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1187  (const SrcDistObject& sourceObj,
1188  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1189  Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1190  Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1191  size_t& constantNumPackets,
1192  Distributor & /* distor */ )
1193  {
1194  using ::Tpetra::Details::Behavior;
1195  using ::Tpetra::Details::ProfilingRegion;
1197  using Kokkos::Compat::create_const_view;
1198  using Kokkos::Compat::getKokkosViewDeepCopy;
1199  using std::endl;
1200  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
1201  const char tfecfFuncName[] = "packAndPrepare: ";
1202  ProfilingRegion regionPAP ("Tpetra::MultiVector::packAndPrepare");
1203 
1204  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1205  // have the option to check indices. We do so when Tpetra is in
1206  // debug mode. It is in debug mode by default in a debug build,
1207  // but you may control this at run time, before launching the
1208  // executable, by setting the TPETRA_DEBUG environment variable to
1209  // "1" (or "TRUE").
1210  const bool debugCheckIndices = Behavior::debug ();
1211  // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1212  // environment variable to "1" (or "TRUE") for copious debug
1213  // output to std::cerr on every MPI process. This is unwise for
1214  // runs with large numbers of MPI processes.
1215  const bool printDebugOutput = Behavior::verbose ();
1216 
1217  std::unique_ptr<std::string> prefix;
1218  if (printDebugOutput) {
1219  auto map = this->getMap ();
1220  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1221  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1222  std::ostringstream os;
1223  os << "Proc " << myRank << ": MV::packAndPrepare: ";
1224  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1225  os << "Start" << endl;
1226  std::cerr << os.str ();
1227  }
1228 
1229  // We've already called checkSizes(), so this cast must succeed.
1230  const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
1231 
1232  const size_t numCols = sourceMV.getNumVectors ();
1233 
1234  // This spares us from needing to fill numExportPacketsPerLID.
1235  // Setting constantNumPackets to a nonzero value signals that
1236  // all packets have the same number of entries.
1237  constantNumPackets = numCols;
1238 
1239  // If we have no exports, there is nothing to do. Make sure this
1240  // goes _after_ setting constantNumPackets correctly.
1241  if (exportLIDs.extent (0) == 0) {
1242  if (printDebugOutput) {
1243  std::ostringstream os;
1244  os << *prefix << "No exports on this proc, DONE" << std::endl;
1245  std::cerr << os.str ();
1246  }
1247  return;
1248  }
1249 
1250  /* The layout in the export for MultiVectors is as follows:
1251  exports = { all of the data from row exportLIDs.front() ;
1252  ....
1253  all of the data from row exportLIDs.back() }
1254  This doesn't have the best locality, but is necessary because
1255  the data for a Packet (all data associated with an LID) is
1256  required to be contiguous. */
1257 
1258  // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1259  // packing scheme in the above comment? The data going to a
1260  // particular process must be contiguous, of course, but those
1261  // data could include entries from multiple LIDs. DistObject just
1262  // needs to know how to index into that data. Kokkos is good at
1263  // decoupling storage intent from data layout choice.
1264 
1265  const size_t numExportLIDs = exportLIDs.extent (0);
1266  const size_t newExportsSize = numCols * numExportLIDs;
1267  if (printDebugOutput) {
1268  std::ostringstream os;
1269  os << *prefix << "realloc: "
1270  << "numExportLIDs: " << numExportLIDs
1271  << ", exports.extent(0): " << exports.extent (0)
1272  << ", newExportsSize: " << newExportsSize << std::endl;
1273  std::cerr << os.str ();
1274  }
1275  reallocDualViewIfNeeded (exports, newExportsSize, "exports");
1276 
1277  // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1278  // sync it. Pack it where it's currently sync'd.
1279  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1280  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1281  std::logic_error, "Input MultiVector needs sync to both host "
1282  "and device.");
1283  const bool packOnHost = sourceMV.need_sync_device ();
1284  auto src_dev = sourceMV.getLocalViewHost ();
1285  auto src_host = sourceMV.getLocalViewDevice ();
1286  if (printDebugOutput) {
1287  std::ostringstream os;
1288  os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1289  std::cerr << os.str ();
1290  }
1291 
1292  // Mark 'exports' here, since we might have resized it above.
1293  // Resizing currently requires calling the constructor, which
1294  // clears out the 'modified' flags.
1295  if (packOnHost) {
1296  exports.modify_host ();
1297  }
1298  else {
1299  exports.modify_device ();
1300  }
1301 
1302  if (numCols == 1) { // special case for one column only
1303  // MultiVector always represents a single column with constant
1304  // stride, but it doesn't hurt to implement both cases anyway.
1305  //
1306  // ETP: I'm not sure I agree with the above statement. Can't a single-
1307  // column multivector be a subview of another multi-vector, in which case
1308  // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1309  // separately.
1310  //
1311  // mfh 18 Jan 2016: In answer to ETP's comment above:
1312  // MultiVector treats single-column MultiVectors created using a
1313  // "nonconstant stride constructor" as a special case, and makes
1314  // them constant stride (by making whichVectors_ have length 0).
1315  if (sourceMV.isConstantStride ()) {
1316  using KokkosRefactor::Details::pack_array_single_column;
1317  if (printDebugOutput) {
1318  std::ostringstream os;
1319  os << *prefix << "Pack numCols=1 const stride" << endl;
1320  std::cerr << os.str ();
1321  }
1322  if (packOnHost) {
1323  pack_array_single_column (exports.view_host (),
1324  create_const_view (src_host),
1325  exportLIDs.view_host (),
1326  0,
1327  debugCheckIndices);
1328  }
1329  else { // pack on device
1330  pack_array_single_column (exports.view_device (),
1331  create_const_view (src_dev),
1332  exportLIDs.view_device (),
1333  0,
1334  debugCheckIndices);
1335  }
1336  }
1337  else {
1338  using KokkosRefactor::Details::pack_array_single_column;
1339  if (printDebugOutput) {
1340  std::ostringstream os;
1341  os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1342  std::cerr << os.str ();
1343  }
1344  if (packOnHost) {
1345  pack_array_single_column (exports.view_host (),
1346  create_const_view (src_host),
1347  exportLIDs.view_host (),
1348  sourceMV.whichVectors_[0],
1349  debugCheckIndices);
1350  }
1351  else { // pack on device
1352  pack_array_single_column (exports.view_device (),
1353  create_const_view (src_dev),
1354  exportLIDs.view_device (),
1355  sourceMV.whichVectors_[0],
1356  debugCheckIndices);
1357  }
1358  }
1359  }
1360  else { // the source MultiVector has multiple columns
1361  if (sourceMV.isConstantStride ()) {
1362  using KokkosRefactor::Details::pack_array_multi_column;
1363  if (printDebugOutput) {
1364  std::ostringstream os;
1365  os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1366  std::cerr << os.str ();
1367  }
1368  if (packOnHost) {
1369  pack_array_multi_column (exports.view_host (),
1370  create_const_view (src_host),
1371  exportLIDs.view_host (),
1372  numCols,
1373  debugCheckIndices);
1374  }
1375  else { // pack on device
1376  pack_array_multi_column (exports.view_device (),
1377  create_const_view (src_dev),
1378  exportLIDs.view_device (),
1379  numCols,
1380  debugCheckIndices);
1381  }
1382  }
1383  else {
1384  using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1385  if (printDebugOutput) {
1386  std::ostringstream os;
1387  os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1388  << endl;
1389  std::cerr << os.str ();
1390  }
1391  // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1392  // whichVectors_ can be expensive, but pack and unpack for
1393  // nonconstant-stride MultiVectors is slower anyway.
1394  using IST = impl_scalar_type;
1395  using DV = Kokkos::DualView<IST*, device_type>;
1396  using HES = typename DV::t_host::execution_space;
1397  using DES = typename DV::t_dev::execution_space;
1398  Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_ ();
1399  if (packOnHost) {
1400  pack_array_multi_column_variable_stride
1401  (exports.view_host (),
1402  create_const_view (src_host),
1403  exportLIDs.view_host (),
1404  getKokkosViewDeepCopy<HES> (whichVecs),
1405  numCols,
1406  debugCheckIndices);
1407  }
1408  else { // pack on device
1409  pack_array_multi_column_variable_stride
1410  (exports.view_device (),
1411  create_const_view (src_dev),
1412  exportLIDs.view_device (),
1413  getKokkosViewDeepCopy<DES> (whichVecs),
1414  numCols,
1415  debugCheckIndices);
1416  }
1417  }
1418  }
1419 
1420  if (printDebugOutput) {
1421  std::ostringstream os;
1422  os << *prefix << "Done!" << endl;
1423  std::cerr << os.str ();
1424  }
1425  }
1426 
1427 
1428  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1429  void
1430  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1431 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1432  unpackAndCombineNew
1433 #else // TPETRA_ENABLE_DEPRECATED_CODE
1434  unpackAndCombine
1435 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1436  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1437  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1438  Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1439  const size_t constantNumPackets,
1440  Distributor& /* distor */,
1441  const CombineMode CM)
1442  {
1443  using ::Tpetra::Details::Behavior;
1444  using ::Tpetra::Details::ProfilingRegion;
1445  using KokkosRefactor::Details::unpack_array_multi_column;
1446  using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1447  using Kokkos::Compat::getKokkosViewDeepCopy;
1448  using std::endl;
1449  using IST = impl_scalar_type;
1450  const char tfecfFuncName[] = "unpackAndCombine: ";
1451  ProfilingRegion regionUAC ("Tpetra::MultiVector::unpackAndCombine");
1452 
1453  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1454  // have the option to check indices. We do so when Tpetra is in
1455  // debug mode. It is in debug mode by default in a debug build,
1456  // but you may control this at run time, before launching the
1457  // executable, by setting the TPETRA_DEBUG environment variable to
1458  // "1" (or "TRUE").
1459  const bool debugCheckIndices = Behavior::debug ();
1460 
1461  const bool printDebugOutput = Behavior::verbose ();
1462  std::unique_ptr<std::string> prefix;
1463  if (printDebugOutput) {
1464  auto map = this->getMap ();
1465  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1466  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1467  std::ostringstream os;
1468  os << "Proc " << myRank << ": MV::packAndPrepare: ";
1469  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1470  os << "Start" << endl;
1471  std::cerr << os.str ();
1472  }
1473 
1474  // If we have no imports, there is nothing to do
1475  if (importLIDs.extent (0) == 0) {
1476  if (printDebugOutput) {
1477  std::ostringstream os;
1478  os << *prefix << "No imports. Done!" << endl;
1479  }
1480  return;
1481  }
1482 
1483  const size_t numVecs = getNumVectors ();
1484  if (debugCheckIndices) {
1485  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1486  (static_cast<size_t> (imports.extent (0)) !=
1487  numVecs * importLIDs.extent (0),
1488  std::runtime_error,
1489  "imports.extent(0) = " << imports.extent (0)
1490  << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1491  << " * " << importLIDs.extent (0) << " = "
1492  << numVecs * importLIDs.extent (0) << ".");
1493 
1494  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1495  (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1496  "constantNumPackets input argument must be nonzero.");
1497 
1498  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1499  (static_cast<size_t> (numVecs) !=
1500  static_cast<size_t> (constantNumPackets),
1501  std::runtime_error, "constantNumPackets must equal numVecs.");
1502  }
1503 
1504  // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1505  // the memory space in which the imports buffer was last modified.
1506  // DistObject::doTransferNew gets to decide this.
1507  const bool unpackOnHost = imports.need_sync_device ();
1508 
1509  if (printDebugOutput) {
1510  std::ostringstream os;
1511  os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1512  << endl;
1513  std::cerr << os.str ();
1514  }
1515 
1516  // We have to sync before modifying, because this method may read
1517  // as well as write (depending on the CombineMode).
1518  if (unpackOnHost) {
1519  if (this->need_sync_host ()) {
1520  this->sync_host ();
1521  }
1522  this->modify_host ();
1523  }
1524  else {
1525  if (this->need_sync_device ()) {
1526  this->sync_device ();
1527  }
1528  this->modify_device ();
1529  }
1530  auto X_d = this->getLocalViewDevice ();
1531  auto X_h = this->getLocalViewHost ();
1532  auto imports_d = imports.view_device ();
1533  auto imports_h = imports.view_host ();
1534  auto importLIDs_d = importLIDs.view_device ();
1535  auto importLIDs_h = importLIDs.view_host ();
1536 
1537  Kokkos::DualView<size_t*, device_type> whichVecs;
1538  if (! isConstantStride ()) {
1539  Kokkos::View<const size_t*, Kokkos::HostSpace,
1540  Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
1541  numVecs);
1542  whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
1543  if (unpackOnHost) {
1544  whichVecs.modify_host ();
1545  Kokkos::deep_copy (whichVecs.view_host (), whichVecsIn);
1546  }
1547  else {
1548  whichVecs.modify_device ();
1549  Kokkos::deep_copy (whichVecs.view_device (), whichVecsIn);
1550  }
1551  }
1552  auto whichVecs_d = whichVecs.view_device ();
1553  auto whichVecs_h = whichVecs.view_host ();
1554 
1555  /* The layout in the export for MultiVectors is as follows:
1556  imports = { all of the data from row exportLIDs.front() ;
1557  ....
1558  all of the data from row exportLIDs.back() }
1559  This doesn't have the best locality, but is necessary because
1560  the data for a Packet (all data associated with an LID) is
1561  required to be contiguous. */
1562 
1563  if (numVecs > 0 && importLIDs.extent (0) > 0) {
1564  using dev_exec_space = typename dual_view_type::t_dev::execution_space;
1565  using host_exec_space = typename dual_view_type::t_host::execution_space;
1566 
1567  // This fixes GitHub Issue #4418.
1568  const bool use_atomic_updates = unpackOnHost ?
1569  host_exec_space::concurrency () != 1 :
1570  dev_exec_space::concurrency () != 1;
1571 
1572  if (printDebugOutput) {
1573  std::ostringstream os;
1574  os << *prefix << "Unpack: " << combineModeToString (CM) << endl;
1575  std::cerr << os.str ();
1576  }
1577 
1578  // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
1579  // custom combine modes, start editing here.
1580 
1581  if (CM == INSERT || CM == REPLACE) {
1582  using op_type = KokkosRefactor::Details::InsertOp<IST>;
1583  if (isConstantStride ()) {
1584  if (unpackOnHost) {
1585  unpack_array_multi_column (host_exec_space (),
1586  X_h, imports_h, importLIDs_h,
1587  op_type (), numVecs,
1588  use_atomic_updates,
1589  debugCheckIndices);
1590 
1591  }
1592  else { // unpack on device
1593  unpack_array_multi_column (dev_exec_space (),
1594  X_d, imports_d, importLIDs_d,
1595  op_type (), numVecs,
1596  use_atomic_updates,
1597  debugCheckIndices);
1598  }
1599  }
1600  else { // not constant stride
1601  if (unpackOnHost) {
1602  unpack_array_multi_column_variable_stride (host_exec_space (),
1603  X_h, imports_h,
1604  importLIDs_h,
1605  whichVecs_h,
1606  op_type (),
1607  numVecs,
1608  use_atomic_updates,
1609  debugCheckIndices);
1610  }
1611  else { // unpack on device
1612  unpack_array_multi_column_variable_stride (dev_exec_space (),
1613  X_d, imports_d,
1614  importLIDs_d,
1615  whichVecs_d,
1616  op_type (),
1617  numVecs,
1618  use_atomic_updates,
1619  debugCheckIndices);
1620  }
1621  }
1622  }
1623  else if (CM == ADD) {
1624  using op_type = KokkosRefactor::Details::AddOp<IST>;
1625  if (isConstantStride ()) {
1626  if (unpackOnHost) {
1627  unpack_array_multi_column (host_exec_space (),
1628  X_h, imports_h, importLIDs_h,
1629  op_type (), numVecs,
1630  use_atomic_updates,
1631  debugCheckIndices);
1632  }
1633  else { // unpack on device
1634  unpack_array_multi_column (dev_exec_space (),
1635  X_d, imports_d, importLIDs_d,
1636  op_type (), numVecs,
1637  use_atomic_updates,
1638  debugCheckIndices);
1639  }
1640  }
1641  else { // not constant stride
1642  if (unpackOnHost) {
1643  unpack_array_multi_column_variable_stride (host_exec_space (),
1644  X_h, imports_h,
1645  importLIDs_h,
1646  whichVecs_h,
1647  op_type (),
1648  numVecs,
1649  use_atomic_updates,
1650  debugCheckIndices);
1651  }
1652  else { // unpack on device
1653  unpack_array_multi_column_variable_stride (dev_exec_space (),
1654  X_d, imports_d,
1655  importLIDs_d,
1656  whichVecs_d,
1657  op_type (),
1658  numVecs,
1659  use_atomic_updates,
1660  debugCheckIndices);
1661  }
1662  }
1663  }
1664  else if (CM == ABSMAX) {
1665  using op_type = KokkosRefactor::Details::AbsMaxOp<IST>;
1666  if (isConstantStride ()) {
1667  if (unpackOnHost) {
1668  unpack_array_multi_column (host_exec_space (),
1669  X_h, imports_h, importLIDs_h,
1670  op_type (), numVecs,
1671  use_atomic_updates,
1672  debugCheckIndices);
1673  }
1674  else { // unpack on device
1675  unpack_array_multi_column (dev_exec_space (),
1676  X_d, imports_d, importLIDs_d,
1677  op_type (), numVecs,
1678  use_atomic_updates,
1679  debugCheckIndices);
1680  }
1681  }
1682  else {
1683  if (unpackOnHost) {
1684  unpack_array_multi_column_variable_stride (host_exec_space (),
1685  X_h, imports_h,
1686  importLIDs_h,
1687  whichVecs_h,
1688  op_type (),
1689  numVecs,
1690  use_atomic_updates,
1691  debugCheckIndices);
1692  }
1693  else { // unpack on device
1694  unpack_array_multi_column_variable_stride (dev_exec_space (),
1695  X_d, imports_d,
1696  importLIDs_d,
1697  whichVecs_d,
1698  op_type (),
1699  numVecs,
1700  use_atomic_updates,
1701  debugCheckIndices);
1702  }
1703  }
1704  }
1705  else {
1706  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1707  (true, std::logic_error, "Invalid CombineMode");
1708  }
1709  }
1710  else {
1711  if (printDebugOutput) {
1712  std::ostringstream os;
1713  os << *prefix << "Nothing to unpack" << endl;
1714  std::cerr << os.str ();
1715  }
1716  }
1717 
1718  if (printDebugOutput) {
1719  std::ostringstream os;
1720  os << *prefix << "Done!" << endl;
1721  std::cerr << os.str ();
1722  }
1723  }
1724 
1725  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1726  size_t
1729  {
1730  if (isConstantStride ()) {
1731  return static_cast<size_t> (view_.extent (1));
1732  } else {
1733  return static_cast<size_t> (whichVectors_.size ());
1734  }
1735  }
1736 
1737  namespace { // (anonymous)
1738 
1739  template<class RV>
1740  void
1741  gblDotImpl (const RV& dotsOut,
1742  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
1743  const bool distributed)
1744  {
1745  using Teuchos::REDUCE_MAX;
1746  using Teuchos::REDUCE_SUM;
1747  using Teuchos::reduceAll;
1748  typedef typename RV::non_const_value_type dot_type;
1749 
1750  const size_t numVecs = dotsOut.extent (0);
1751 
1752  // If the MultiVector is distributed over multiple processes, do
1753  // the distributed (interprocess) part of the dot product. We
1754  // assume that the MPI implementation can read from and write to
1755  // device memory.
1756  //
1757  // replaceMap() may have removed some processes. Those
1758  // processes have a null Map. They must not participate in any
1759  // collective operations. We ask first whether the Map is null,
1760  // because isDistributed() defers that question to the Map. We
1761  // still compute and return local dots for processes not
1762  // participating in collective operations; those probably don't
1763  // make any sense, but it doesn't hurt to do them, since it's
1764  // illegal to call dot() on those processes anyway.
1765  if (distributed && ! comm.is_null ()) {
1766  // The calling process only participates in the collective if
1767  // both the Map and its Comm on that process are nonnull.
1768  const int nv = static_cast<int> (numVecs);
1769  const bool commIsInterComm = ::Tpetra::Details::isInterComm (*comm);
1770 
1771  if (commIsInterComm) {
1772  // If comm is an intercomm, then we may not alias input and
1773  // output buffers, so we have to make a copy of the local
1774  // sum.
1775  typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
1776  Kokkos::deep_copy (lclDots, dotsOut);
1777  const dot_type* const lclSum = lclDots.data ();
1778  dot_type* const gblSum = dotsOut.data ();
1779  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
1780  }
1781  else {
1782  dot_type* const inout = dotsOut.data ();
1783  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, inout, inout);
1784  }
1785  }
1786  }
1787  } // namespace (anonymous)
1788 
1789  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1790  void
1793  const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const
1794  {
1795  using ::Tpetra::Details::Behavior;
1796  using Kokkos::subview;
1797  using Teuchos::Comm;
1798  using Teuchos::null;
1799  using Teuchos::RCP;
1800  // View of all the dot product results.
1801  typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
1803  typedef typename dual_view_type::t_dev XMV;
1804  const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
1805 
1806  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Kokkos::View)");
1807 
1808  const size_t numVecs = this->getNumVectors ();
1809  if (numVecs == 0) {
1810  return; // nothing to do
1811  }
1812  const size_t lclNumRows = this->getLocalLength ();
1813  const size_t numDots = static_cast<size_t> (dots.extent (0));
1814  const bool debug = Behavior::debug ();
1815 
1816  if (debug) {
1817  const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
1818  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1819  (! compat, std::invalid_argument, "'*this' MultiVector is not "
1820  "compatible with the input MultiVector A. We only test for this "
1821  "in debug mode.");
1822  }
1823 
1824  // FIXME (mfh 11 Jul 2014) These exception tests may not
1825  // necessarily be thrown on all processes consistently. We should
1826  // instead pass along error state with the inner product. We
1827  // could do this by setting an extra slot to
1828  // Kokkos::Details::ArithTraits<dot_type>::one() on error. The
1829  // final sum should be
1830  // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
1831  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1832  lclNumRows != A.getLocalLength (), std::runtime_error,
1833  "MultiVectors do not have the same local length. "
1834  "this->getLocalLength() = " << lclNumRows << " != "
1835  "A.getLocalLength() = " << A.getLocalLength () << ".");
1836  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1837  numVecs != A.getNumVectors (), std::runtime_error,
1838  "MultiVectors must have the same number of columns (vectors). "
1839  "this->getNumVectors() = " << numVecs << " != "
1840  "A.getNumVectors() = " << A.getNumVectors () << ".");
1841  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1842  numDots != numVecs, std::runtime_error,
1843  "The output array 'dots' must have the same number of entries as the "
1844  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
1845  numDots << " != this->getNumVectors() = " << numVecs << ".");
1846 
1847  const std::pair<size_t, size_t> colRng (0, numVecs);
1848  RV dotsOut = subview (dots, colRng);
1849  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
1850  this->getMap ()->getComm ();
1851 
1852  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
1853  if (this->need_sync_device ()) {
1854  const_cast<MV*>(this)->sync_device ();
1855  }
1856  if (A.need_sync_device ()) {
1857  const_cast<MV&>(A).sync_device ();
1858  }
1859 
1860  auto thisView = this->getLocalViewDevice ();
1861  auto A_view = A.getLocalViewDevice ();
1862 
1863  ::Tpetra::Details::lclDot<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
1864  this->whichVectors_.getRawPtr (),
1865  A.whichVectors_.getRawPtr (),
1866  this->isConstantStride (), A.isConstantStride ());
1867  gblDotImpl (dotsOut, comm, this->isDistributed ());
1868  }
1869 
1870  namespace { // (anonymous)
1871  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1873  multiVectorSingleColumnDot (MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& x,
1875  {
1876  using ::Tpetra::Details::ProfilingRegion;
1878  using dot_type = typename MV::dot_type;
1879  ProfilingRegion region ("Tpetra::multiVectorSingleColumnDot");
1880 
1881  auto map = x.getMap ();
1882  Teuchos::RCP<const Teuchos::Comm<int> > comm =
1883  map.is_null () ? Teuchos::null : map->getComm ();
1884  if (comm.is_null ()) {
1885  return Kokkos::ArithTraits<dot_type>::zero ();
1886  }
1887  else {
1888  using LO = LocalOrdinal;
1889  // The min just ensures that we don't overwrite memory that
1890  // doesn't belong to us, in the erroneous input case where x
1891  // and y have different numbers of rows.
1892  const LO lclNumRows = static_cast<LO> (std::min (x.getLocalLength (),
1893  y.getLocalLength ()));
1894  const Kokkos::pair<LO, LO> rowRng (0, lclNumRows);
1895  dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero ();
1896  dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();
1897 
1898  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
1899  if (x.need_sync_device ()) {
1900  x.sync_device ();
1901  }
1902  if (y.need_sync_device ()) {
1903  const_cast<MV&>(y).sync_device ();
1904  }
1905 
1906  x.modify_device ();
1907  auto x_2d = x.getLocalViewDevice ();
1908  auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
1909  auto y_2d = y.getLocalViewDevice ();
1910  auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
1911  lclDot = KokkosBlas::dot (x_1d, y_1d);
1912 
1913  if (x.isDistributed ()) {
1914  using Teuchos::outArg;
1915  using Teuchos::REDUCE_SUM;
1916  using Teuchos::reduceAll;
1917  reduceAll<int, dot_type> (*comm, REDUCE_SUM, lclDot, outArg (gblDot));
1918  }
1919  else {
1920  gblDot = lclDot;
1921  }
1922  return gblDot;
1923  }
1924  }
1925  } // namespace (anonymous)
1926 
1927 
1928 
1929  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1930  void
1933  const Teuchos::ArrayView<dot_type>& dots) const
1934  {
1936  const char tfecfFuncName[] = "dot: ";
1937  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Teuchos::ArrayView)");
1938 
1939  const size_t numVecs = this->getNumVectors ();
1940  const size_t lclNumRows = this->getLocalLength ();
1941  const size_t numDots = static_cast<size_t> (dots.size ());
1942 
1943  // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
1944  // not necessarily be thrown on all processes consistently. We
1945  // keep them for now, because MultiVector's unit tests insist on
1946  // them. In the future, we should instead pass along error state
1947  // with the inner product. We could do this by setting an extra
1948  // slot to Kokkos::Details::ArithTraits<dot_type>::one() on error.
1949  // The final sum should be
1950  // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
1951  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1952  (lclNumRows != A.getLocalLength (), std::runtime_error,
1953  "MultiVectors do not have the same local length. "
1954  "this->getLocalLength() = " << lclNumRows << " != "
1955  "A.getLocalLength() = " << A.getLocalLength () << ".");
1956  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1957  (numVecs != A.getNumVectors (), std::runtime_error,
1958  "MultiVectors must have the same number of columns (vectors). "
1959  "this->getNumVectors() = " << numVecs << " != "
1960  "A.getNumVectors() = " << A.getNumVectors () << ".");
1961  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1962  (numDots != numVecs, std::runtime_error,
1963  "The output array 'dots' must have the same number of entries as the "
1964  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
1965  numDots << " != this->getNumVectors() = " << numVecs << ".");
1966 
1967  if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
1968  const dot_type gblDot = multiVectorSingleColumnDot (const_cast<MV&> (*this), A);
1969  dots[0] = gblDot;
1970  }
1971  else {
1972  this->dot (A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr (), numDots));
1973  }
1974  }
1975 
1976  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1977  void
1979  norm2 (const Teuchos::ArrayView<mag_type>& norms) const
1980  {
1982  using ::Tpetra::Details::NORM_TWO;
1983  using ::Tpetra::Details::ProfilingRegion;
1984  ProfilingRegion region ("Tpetra::MV::norm2 (host output)");
1985 
1986  // The function needs to be able to sync X.
1987  MV& X = const_cast<MV&> (*this);
1988  multiVectorNormImpl (norms.getRawPtr (), X, NORM_TWO);
1989  }
1990 
1991  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1992  void
1994  norm2 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
1995  {
1996  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
1997  this->norm2 (norms_av);
1998  }
1999 
2000 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
2001  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2002  void TPETRA_DEPRECATED
2005  const Teuchos::ArrayView<mag_type> &norms) const
2006  {
2007  using Kokkos::ALL;
2008  using Kokkos::subview;
2009  using Teuchos::Comm;
2010  using Teuchos::null;
2011  using Teuchos::RCP;
2012  using Teuchos::reduceAll;
2013  using Teuchos::REDUCE_SUM;
2014  typedef Kokkos::ArithTraits<impl_scalar_type> ATS;
2015  typedef Kokkos::ArithTraits<mag_type> ATM;
2016  typedef Kokkos::View<mag_type*, Kokkos::HostSpace> norms_view_type;
2018  const char tfecfFuncName[] = "normWeighted: ";
2019 
2020  const size_t numVecs = this->getNumVectors ();
2021  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2022  static_cast<size_t> (norms.size ()) != numVecs, std::runtime_error,
2023  "norms.size() = " << norms.size () << " != this->getNumVectors() = "
2024  << numVecs << ".");
2025 
2026  const bool OneW = (weights.getNumVectors () == 1);
2027  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2028  ! OneW && weights.getNumVectors () != numVecs, std::runtime_error,
2029  "The input MultiVector of weights must contain either one column, "
2030  "or must have the same number of columns as *this. "
2031  "weights.getNumVectors() = " << weights.getNumVectors ()
2032  << " and this->getNumVectors() = " << numVecs << ".");
2033 
2034  const bool debug = ::Tpetra::Details::Behavior::debug ();
2035 
2036  if (debug) {
2037  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2038  (! this->getMap ()->isCompatible (*weights.getMap ()), std::runtime_error,
2039  "MultiVectors do not have compatible Maps:" << std::endl
2040  << "this->getMap(): " << std::endl << *this->getMap()
2041  << "weights.getMap(): " << std::endl << *weights.getMap() << std::endl);
2042  }
2043  else {
2044  const size_t lclNumRows = this->getLocalLength ();
2045  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2046  (lclNumRows != weights.getLocalLength (), std::runtime_error,
2047  "MultiVectors do not have the same local length.");
2048  }
2049 
2050  norms_view_type lclNrms ("Tpetra::MV::lclNrms", numVecs);
2051 
2052  // FIXME (mfh 18 May 2016) Yes, I know "const" is a lie.
2053  const_cast<MV*> (this)->sync_device ();
2054  const_cast<MV&> (weights).sync_device ();
2055 
2056  auto X_lcl = this->getLocalViewDevice ();
2057  auto W_lcl = weights.getLocalViewDevice ();
2058 
2059  if (isConstantStride () && ! OneW) {
2060  KokkosBlas::nrm2w_squared (lclNrms, X_lcl, W_lcl);
2061  }
2062  else {
2063  for (size_t j = 0; j < numVecs; ++j) {
2064  const size_t X_col = this->isConstantStride () ? j :
2065  this->whichVectors_[j];
2066  const size_t W_col = OneW ? static_cast<size_t> (0) :
2067  (weights.isConstantStride () ? j : weights.whichVectors_[j]);
2068  KokkosBlas::nrm2w_squared (subview (lclNrms, j),
2069  subview (X_lcl, ALL (), X_col),
2070  subview (W_lcl, ALL (), W_col));
2071  }
2072  }
2073 
2074  const mag_type OneOverN =
2075  ATM::one () / static_cast<mag_type> (this->getGlobalLength ());
2076  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
2077  Teuchos::null : this->getMap ()->getComm ();
2078 
2079  if (! comm.is_null () && this->isDistributed ()) {
2080  // Assume that MPI can access device memory.
2081  reduceAll<int, mag_type> (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2082  lclNrms.data (), norms.getRawPtr ());
2083  for (size_t k = 0; k < numVecs; ++k) {
2084  norms[k] = ATM::sqrt (norms[k] * OneOverN);
2085  }
2086  }
2087  else {
2088  for (size_t k = 0; k < numVecs; ++k) {
2089  norms[k] = ATM::sqrt (ATS::magnitude (lclNrms(k)) * OneOverN);
2090  }
2091  }
2092  }
2093 #endif // TPETRA_ENABLE_DEPRECATED_CODE
2094 
2095  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2096  void
2098  norm1 (const Teuchos::ArrayView<mag_type>& norms) const
2099  {
2101  using ::Tpetra::Details::NORM_ONE;
2102  using ::Tpetra::Details::ProfilingRegion;
2103  ProfilingRegion region ("Tpetra::MV::norm1 (host output)");
2104 
2105  // The function needs to be able to sync X.
2106  MV& X = const_cast<MV&> (*this);
2107  multiVectorNormImpl (norms.data (), X, NORM_ONE);
2108  }
2109 
2110  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2111  void
2113  norm1 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2114  {
2115  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2116  this->norm1 (norms_av);
2117  }
2118 
2119  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2120  void
2122  normInf (const Teuchos::ArrayView<mag_type>& norms) const
2123  {
2125  using ::Tpetra::Details::NORM_INF;
2126  using ::Tpetra::Details::ProfilingRegion;
2127  ProfilingRegion region ("Tpetra::MV::normInf (host output)");
2128 
2129  // The function needs to be able to sync X.
2130  MV& X = const_cast<MV&> (*this);
2131  multiVectorNormImpl (norms.getRawPtr (), X, NORM_INF);
2132  }
2133 
2134  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2135  void
2137  normInf (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2138  {
2139  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2140  this->normInf (norms_av);
2141  }
2142 
2143  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2144  void
2146  meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
2147  {
2148  // KR FIXME Overload this method to take a View.
2149  using Kokkos::ALL;
2150  using Kokkos::subview;
2151  using Teuchos::Comm;
2152  using Teuchos::RCP;
2153  using Teuchos::reduceAll;
2154  using Teuchos::REDUCE_SUM;
2155  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
2156 
2157  const size_t lclNumRows = this->getLocalLength ();
2158  const size_t numVecs = this->getNumVectors ();
2159  const size_t numMeans = static_cast<size_t> (means.size ());
2160 
2161  TEUCHOS_TEST_FOR_EXCEPTION(
2162  numMeans != numVecs, std::runtime_error,
2163  "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2164  << " != this->getNumVectors() = " << numVecs << ".");
2165 
2166  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2167  const std::pair<size_t, size_t> colRng (0, numVecs);
2168 
2169  // Make sure that the final output view has the same layout as the
2170  // temporary view's HostMirror. Left or Right doesn't matter for
2171  // a 1-D array anyway; this is just to placate the compiler.
2172  typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2173  typedef Kokkos::View<impl_scalar_type*,
2174  typename local_view_type::HostMirror::array_layout,
2175  Kokkos::HostSpace,
2176  Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
2177  host_local_view_type meansOut (means.getRawPtr (), numMeans);
2178 
2179  RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
2180  this->getMap ()->getComm ();
2181 
2182  // If we need sync to device, then host has the most recent version.
2183  const bool useHostVersion = this->need_sync_device ();
2184  if (useHostVersion) {
2185  // DualView was last modified on host, so run the local kernel there.
2186  auto X_lcl = subview (getLocalViewHost (),
2187  rowRng, Kokkos::ALL ());
2188  // Compute the local sum of each column.
2189  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2190  if (isConstantStride ()) {
2191  KokkosBlas::sum (lclSums, X_lcl);
2192  }
2193  else {
2194  for (size_t j = 0; j < numVecs; ++j) {
2195  const size_t col = whichVectors_[j];
2196  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2197  }
2198  }
2199 
2200  // If there are multiple MPI processes, the all-reduce reads
2201  // from lclSums, and writes to meansOut. Otherwise, we just
2202  // copy lclSums into meansOut.
2203  if (! comm.is_null () && this->isDistributed ()) {
2204  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2205  lclSums.data (), meansOut.data ());
2206  }
2207  else {
2208  Kokkos::deep_copy (meansOut, lclSums);
2209  }
2210  }
2211  else {
2212  // DualView was last modified on device, so run the local kernel there.
2213  auto X_lcl = subview (this->getLocalViewDevice (),
2214  rowRng, Kokkos::ALL ());
2215 
2216  // Compute the local sum of each column.
2217  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2218  if (isConstantStride ()) {
2219  KokkosBlas::sum (lclSums, X_lcl);
2220  }
2221  else {
2222  for (size_t j = 0; j < numVecs; ++j) {
2223  const size_t col = whichVectors_[j];
2224  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2225  }
2226  }
2227 
2228  // If there are multiple MPI processes, the all-reduce reads
2229  // from lclSums, and writes to meansOut. (We assume that MPI
2230  // can read device memory.) Otherwise, we just copy lclSums
2231  // into meansOut.
2232  if (! comm.is_null () && this->isDistributed ()) {
2233  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2234  lclSums.data (), meansOut.data ());
2235  }
2236  else {
2237  Kokkos::deep_copy (meansOut, lclSums);
2238  }
2239  }
2240 
2241  // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2242  // to the magnitude type, since operator/ (std::complex<T>, int)
2243  // isn't necessarily defined.
2244  const impl_scalar_type OneOverN =
2245  ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
2246  for (size_t k = 0; k < numMeans; ++k) {
2247  meansOut(k) = meansOut(k) * OneOverN;
2248  }
2249  }
2250 
2251 
2252  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2253  void
2256  {
2257  typedef impl_scalar_type IST;
2258  typedef Kokkos::Details::ArithTraits<IST> ATS;
2259  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2260  typedef typename pool_type::generator_type generator_type;
2261 
2262  const IST max = Kokkos::rand<generator_type, IST>::max ();
2263  const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();
2264 
2265  this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
2266  }
2267 
2268 
2269  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2270  void
2272  randomize (const Scalar& minVal, const Scalar& maxVal)
2273  {
2274  typedef impl_scalar_type IST;
2275  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2276 
2277  // Seed the pseudorandom number generator using the calling
2278  // process' rank. This helps decorrelate different process'
2279  // pseudorandom streams. It's not perfect but it's effective and
2280  // doesn't require MPI communication. The seed also includes bits
2281  // from the standard library's rand().
2282  //
2283  // FIXME (mfh 07 Jan 2015) Should we save the seed for later use?
2284  // The code below just makes a new seed each time.
2285 
2286  const uint64_t myRank =
2287  static_cast<uint64_t> (this->getMap ()->getComm ()->getRank ());
2288  uint64_t seed64 = static_cast<uint64_t> (std::rand ()) + myRank + 17311uLL;
2289  unsigned int seed = static_cast<unsigned int> (seed64&0xffffffff);
2290 
2291  pool_type rand_pool (seed);
2292  const IST max = static_cast<IST> (maxVal);
2293  const IST min = static_cast<IST> (minVal);
2294 
2295  // See #1510. In case diag has already been marked modified on
2296  // host or device, we need to clear those flags, since the code
2297  // below works on device.
2298  this->view_.clear_sync_state();
2299 
2300  this->modify_device ();
2301  auto thisView = this->getLocalViewDevice ();
2302 
2303  if (isConstantStride ()) {
2304  Kokkos::fill_random (thisView, rand_pool, min, max);
2305  }
2306  else {
2307  const size_t numVecs = getNumVectors ();
2308  for (size_t k = 0; k < numVecs; ++k) {
2309  const size_t col = whichVectors_[k];
2310  auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
2311  Kokkos::fill_random (X_k, rand_pool, min, max);
2312  }
2313  }
2314  }
2315 
2316  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2317  void
2319  putScalar (const Scalar& alpha)
2320  {
2321  using ::Tpetra::Details::ProfilingRegion;
2322  using ::Tpetra::Details::Blas::fill;
2323  using DES = typename dual_view_type::t_dev::execution_space;
2324  using HES = typename dual_view_type::t_host::execution_space;
2325  using LO = LocalOrdinal;
2326  ProfilingRegion region ("Tpetra::MultiVector::putScalar");
2327 
2328  // We need this cast for cases like Scalar = std::complex<T> but
2329  // impl_scalar_type = Kokkos::complex<T>.
2330  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2331  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
2332  const LO numVecs = static_cast<LO> (this->getNumVectors ());
2333 
2334  // Modify the most recently updated version of the data. This
2335  // avoids sync'ing, which could violate users' expectations.
2336  //
2337  // If we need sync to device, then host has the most recent version.
2338  const bool runOnHost = this->need_sync_device ();
2339 
2340  if (! runOnHost) { // last modified in device memory
2341  this->modify_device ();
2342  auto X = this->getLocalViewDevice ();
2343  if (this->isConstantStride ()) {
2344  fill (DES (), X, theAlpha, lclNumRows, numVecs);
2345  }
2346  else {
2347  fill (DES (), X, theAlpha, lclNumRows, numVecs,
2348  this->whichVectors_.getRawPtr ());
2349  }
2350  }
2351  else { // last modified in host memory, so modify data there.
2352  this->modify_host ();
2353  auto X = this->getLocalViewHost ();
2354  if (this->isConstantStride ()) {
2355  fill (HES (), X, theAlpha, lclNumRows, numVecs);
2356  }
2357  else {
2358  fill (HES (), X, theAlpha, lclNumRows, numVecs,
2359  this->whichVectors_.getRawPtr ());
2360  }
2361  }
2362  }
2363 
2364 
2365  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2366  void
2368  replaceMap (const Teuchos::RCP<const map_type>& newMap)
2369  {
2370  using Teuchos::ArrayRCP;
2371  using Teuchos::Comm;
2372  using Teuchos::RCP;
2373  using ST = Scalar;
2374  using LO = LocalOrdinal;
2375  using GO = GlobalOrdinal;
2376 
2377  // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2378  // it might work if the MV is a column view of another MV.
2379  // However, things might go wrong when restoring the original
2380  // Map, so we don't allow this case for now.
2381  TEUCHOS_TEST_FOR_EXCEPTION(
2382  ! this->isConstantStride (), std::logic_error,
2383  "Tpetra::MultiVector::replaceMap: This method does not currently work "
2384  "if the MultiVector is a column view of another MultiVector (that is, if "
2385  "isConstantStride() == false).");
2386 
2387  // Case 1: current Map and new Map are both nonnull on this process.
2388  // Case 2: current Map is nonnull, new Map is null.
2389  // Case 3: current Map is null, new Map is nonnull.
2390  // Case 4: both Maps are null: forbidden.
2391  //
2392  // Case 1 means that we don't have to do anything on this process,
2393  // other than assign the new Map. (We always have to do that.)
2394  // It's an error for the user to supply a Map that requires
2395  // resizing in this case.
2396  //
2397  // Case 2 means that the calling process is in the current Map's
2398  // communicator, but will be excluded from the new Map's
2399  // communicator. We don't have to do anything on the calling
2400  // process; just leave whatever data it may have alone.
2401  //
2402  // Case 3 means that the calling process is excluded from the
2403  // current Map's communicator, but will be included in the new
2404  // Map's communicator. This means we need to (re)allocate the
2405  // local DualView if it does not have the right number of rows.
2406  // If the new number of rows is nonzero, we'll fill the newly
2407  // allocated local data with zeros, as befits a projection
2408  // operation.
2409  //
2410  // The typical use case for Case 3 is that the MultiVector was
2411  // first created with the Map with more processes, then that Map
2412  // was replaced with a Map with fewer processes, and finally the
2413  // original Map was restored on this call to replaceMap.
2414 
2415 #ifdef HAVE_TEUCHOS_DEBUG
2416  // mfh 28 Mar 2013: We can't check for compatibility across the
2417  // whole communicator, unless we know that the current and new
2418  // Maps are nonnull on _all_ participating processes.
2419  // TEUCHOS_TEST_FOR_EXCEPTION(
2420  // origNumProcs == newNumProcs && ! this->getMap ()->isCompatible (*map),
2421  // std::invalid_argument, "Tpetra::MultiVector::project: "
2422  // "If the input Map's communicator is compatible (has the same number of "
2423  // "processes as) the current Map's communicator, then the two Maps must be "
2424  // "compatible. The replaceMap() method is not for data redistribution; "
2425  // "use Import or Export for that purpose.");
2426 
2427  // TODO (mfh 28 Mar 2013) Add compatibility checks for projections
2428  // of the Map, in case the process counts don't match.
2429 #endif // HAVE_TEUCHOS_DEBUG
2430 
2431  if (this->getMap ().is_null ()) { // current Map is null
2432  // If this->getMap() is null, that means that this MultiVector
2433  // has already had replaceMap happen to it. In that case, just
2434  // reallocate the DualView with the right size.
2435 
2436  TEUCHOS_TEST_FOR_EXCEPTION(
2437  newMap.is_null (), std::invalid_argument,
2438  "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2439  "This probably means that the input Map is incorrect.");
2440 
2441  // Case 3: current Map is null, new Map is nonnull.
2442  // Reallocate the DualView with the right dimensions.
2443  const size_t newNumRows = newMap->getNodeNumElements ();
2444  const size_t origNumRows = view_.extent (0);
2445  const size_t numCols = this->getNumVectors ();
2446 
2447  if (origNumRows != newNumRows || view_.extent (1) != numCols) {
2448  view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2449  }
2450  }
2451  else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
2452  // I am an excluded process. Reinitialize my data so that I
2453  // have 0 rows. Keep the number of columns as before.
2454  const size_t newNumRows = static_cast<size_t> (0);
2455  const size_t numCols = this->getNumVectors ();
2456  view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2457  }
2458 
2459  this->map_ = newMap;
2460  }
2461 
2462  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2463  void
2465  scale (const Scalar& alpha)
2466  {
2467  using Kokkos::ALL;
2468  using IST = impl_scalar_type;
2469 
2470  const IST theAlpha = static_cast<IST> (alpha);
2471  if (theAlpha == Kokkos::ArithTraits<IST>::one ()) {
2472  return; // do nothing
2473  }
2474  const size_t lclNumRows = getLocalLength ();
2475  const size_t numVecs = getNumVectors ();
2476  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2477  const std::pair<size_t, size_t> colRng (0, numVecs);
2478 
2479  // We can't substitute putScalar(0.0) for scale(0.0), because the
2480  // former will overwrite NaNs present in the MultiVector. The
2481  // semantics of this call require multiplying them by 0, which
2482  // IEEE 754 requires to be NaN.
2483 
2484  // If we need sync to device, then host has the most recent version.
2485  const bool useHostVersion = need_sync_device ();
2486  if (useHostVersion) {
2487  auto Y_lcl = Kokkos::subview (getLocalViewHost (), rowRng, ALL ());
2488  if (isConstantStride ()) {
2489  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2490  }
2491  else {
2492  for (size_t k = 0; k < numVecs; ++k) {
2493  const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2494  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2495  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2496  }
2497  }
2498  }
2499  else { // work on device
2500  auto Y_lcl = Kokkos::subview (getLocalViewDevice (), rowRng, ALL ());
2501  if (isConstantStride ()) {
2502  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2503  }
2504  else {
2505  for (size_t k = 0; k < numVecs; ++k) {
2506  const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2507  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2508  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2509  }
2510  }
2511  }
2512  }
2513 
2514 
2515  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2516  void
2518  scale (const Teuchos::ArrayView<const Scalar>& alphas)
2519  {
2520  const size_t numVecs = this->getNumVectors ();
2521  const size_t numAlphas = static_cast<size_t> (alphas.size ());
2522  TEUCHOS_TEST_FOR_EXCEPTION(
2523  numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2524  "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2525  << numVecs << ".");
2526 
2527  // Use a DualView to copy the scaling constants onto the device.
2528  using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2529  k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2530  k_alphas.modify_host ();
2531  for (size_t i = 0; i < numAlphas; ++i) {
2532  k_alphas.h_view(i) = static_cast<impl_scalar_type> (alphas[i]);
2533  }
2534  k_alphas.sync_device ();
2535  // Invoke the scale() overload that takes a device View of coefficients.
2536  this->scale (k_alphas.view_device ());
2537  }
2538 
2539  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2540  void
2542  scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
2543  {
2544  using Kokkos::ALL;
2545  using Kokkos::subview;
2546 
2547  const size_t lclNumRows = this->getLocalLength ();
2548  const size_t numVecs = this->getNumVectors ();
2549  TEUCHOS_TEST_FOR_EXCEPTION(
2550  static_cast<size_t> (alphas.extent (0)) != numVecs,
2551  std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2552  "alphas.extent(0) = " << alphas.extent (0)
2553  << " != this->getNumVectors () = " << numVecs << ".");
2554  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2555  const std::pair<size_t, size_t> colRng (0, numVecs);
2556 
2557  // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2558  // type of the return value of subview. This is because if we
2559  // switch the array layout from LayoutLeft to LayoutRight
2560  // (preferred for performance of block operations), the types
2561  // below won't be valid. (A view of a column of a LayoutRight
2562  // multivector has LayoutStride, not LayoutLeft.)
2563 
2564  // If we need sync to device, then host has the most recent version.
2565  const bool useHostVersion = this->need_sync_device ();
2566  if (useHostVersion) {
2567  // Work in host memory. This means we need to create a host
2568  // mirror of the input View of coefficients.
2569  auto alphas_h = Kokkos::create_mirror_view (alphas);
2570  Kokkos::deep_copy (alphas_h, alphas);
2571 
2572  auto Y_lcl = subview (this->getLocalViewHost (), rowRng, ALL ());
2573  if (isConstantStride ()) {
2574  KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2575  }
2576  else {
2577  for (size_t k = 0; k < numVecs; ++k) {
2578  const size_t Y_col = this->isConstantStride () ? k :
2579  this->whichVectors_[k];
2580  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2581  // We don't have to use the entire 1-D View here; we can use
2582  // the version that takes a scalar coefficient.
2583  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2584  }
2585  }
2586  }
2587  else { // Work in device memory, using the input View 'alphas' directly.
2588  auto Y_lcl = subview (this->getLocalViewDevice (), rowRng, ALL ());
2589  if (isConstantStride ()) {
2590  KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2591  }
2592  else {
2593  // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2594  // as values on host, so copy them to host. Another approach
2595  // would be to fix scal() so that it takes a 0-D View as the
2596  // second argument.
2597  auto alphas_h = Kokkos::create_mirror_view (alphas);
2598  Kokkos::deep_copy (alphas_h, alphas);
2599 
2600  for (size_t k = 0; k < numVecs; ++k) {
2601  const size_t Y_col = this->isConstantStride () ? k :
2602  this->whichVectors_[k];
2603  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2604  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2605  }
2606  }
2607  }
2608  }
2609 
2610  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2611  void
2613  scale (const Scalar& alpha,
2615  {
2616  using Kokkos::ALL;
2617  using Kokkos::subview;
2619  const char tfecfFuncName[] = "scale: ";
2620 
2621  const size_t lclNumRows = getLocalLength ();
2622  const size_t numVecs = getNumVectors ();
2623 
2624  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2625  lclNumRows != A.getLocalLength (), std::invalid_argument,
2626  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2627  << A.getLocalLength () << ".");
2628  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2629  numVecs != A.getNumVectors (), std::invalid_argument,
2630  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2631  << A.getNumVectors () << ".");
2632 
2633  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2634  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2635  const std::pair<size_t, size_t> colRng (0, numVecs);
2636 
2637  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2638  if (this->need_sync_device ()) {
2639  this->sync_device ();
2640  }
2641  if (A.need_sync_device ()) {
2642  const_cast<MV&>(A).sync_device ();
2643  }
2644 
2645  this->modify_device ();
2646  auto Y_lcl_orig = this->getLocalViewDevice ();
2647  auto X_lcl_orig = A.getLocalViewDevice ();
2648  auto Y_lcl = subview (Y_lcl_orig, rowRng, ALL ());
2649  auto X_lcl = subview (X_lcl_orig, rowRng, ALL ());
2650 
2651  if (isConstantStride () && A.isConstantStride ()) {
2652  KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
2653  }
2654  else {
2655  // Make sure that Kokkos only uses the local length for add.
2656  for (size_t k = 0; k < numVecs; ++k) {
2657  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2658  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2659  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2660  auto X_k = subview (X_lcl, ALL (), X_col);
2661 
2662  KokkosBlas::scal (Y_k, theAlpha, X_k);
2663  }
2664  }
2665  }
2666 
2667 
2668 
2669  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2670  void
2673  {
2675  const char tfecfFuncName[] = "reciprocal: ";
2676 
2677  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2678  getLocalLength () != A.getLocalLength (), std::runtime_error,
2679  "MultiVectors do not have the same local length. "
2680  "this->getLocalLength() = " << getLocalLength ()
2681  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2682  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2683  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2684  ": MultiVectors do not have the same number of columns (vectors). "
2685  "this->getNumVectors() = " << getNumVectors ()
2686  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2687 
2688  const size_t numVecs = getNumVectors ();
2689 
2690  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2691  if (this->need_sync_device ()) {
2692  this->sync_device ();
2693  }
2694  if (A.need_sync_device ()) {
2695  const_cast<MV&>(A).sync_device ();
2696  }
2697  this->modify_device ();
2698 
2699  auto this_view_dev = this->getLocalViewDevice ();
2700  auto A_view_dev = A.getLocalViewDevice ();
2701 
2702  if (isConstantStride () && A.isConstantStride ()) {
2703  KokkosBlas::reciprocal (this_view_dev, A_view_dev);
2704  }
2705  else {
2706  using Kokkos::ALL;
2707  using Kokkos::subview;
2708  for (size_t k = 0; k < numVecs; ++k) {
2709  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2710  auto vector_k = subview (this_view_dev, ALL (), this_col);
2711  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2712  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
2713  KokkosBlas::reciprocal (vector_k, vector_Ak);
2714  }
2715  }
2716  }
2717 
2718  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2719  void
2722  {
2724  const char tfecfFuncName[] = "abs";
2725 
2726  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2727  getLocalLength () != A.getLocalLength (), std::runtime_error,
2728  ": MultiVectors do not have the same local length. "
2729  "this->getLocalLength() = " << getLocalLength ()
2730  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2731  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2732  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2733  ": MultiVectors do not have the same number of columns (vectors). "
2734  "this->getNumVectors() = " << getNumVectors ()
2735  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2736  const size_t numVecs = getNumVectors ();
2737 
2738  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2739  if (this->need_sync_device ()) {
2740  this->sync_device ();
2741  }
2742  if (A.need_sync_device ()) {
2743  const_cast<MV&>(A).sync_device ();
2744  }
2745  this->modify_device ();
2746 
2747  auto this_view_dev = this->getLocalViewDevice ();
2748  auto A_view_dev = A.getLocalViewDevice ();
2749 
2750  if (isConstantStride () && A.isConstantStride ()) {
2751  KokkosBlas::abs (this_view_dev, A_view_dev);
2752  }
2753  else {
2754  using Kokkos::ALL;
2755  using Kokkos::subview;
2756 
2757  for (size_t k=0; k < numVecs; ++k) {
2758  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2759  auto vector_k = subview (this_view_dev, ALL (), this_col);
2760  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2761  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
2762  KokkosBlas::abs (vector_k, vector_Ak);
2763  }
2764  }
2765  }
2766 
2767  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2768  void
2770  update (const Scalar& alpha,
2772  const Scalar& beta)
2773  {
2774  const char tfecfFuncName[] = "update: ";
2775  using Kokkos::subview;
2776  using Kokkos::ALL;
2778 
2779  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");
2780 
2781  const size_t lclNumRows = getLocalLength ();
2782  const size_t numVecs = getNumVectors ();
2783 
2784  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2785  lclNumRows != A.getLocalLength (), std::invalid_argument,
2786  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2787  << A.getLocalLength () << ".");
2788  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2789  numVecs != A.getNumVectors (), std::invalid_argument,
2790  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2791  << A.getNumVectors () << ".");
2792 
2793  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2794  if (this->need_sync_device ()) {
2795  this->sync_device ();
2796  }
2797  if (A.need_sync_device ()) {
2798  const_cast<MV&>(A).sync_device ();
2799  }
2800 
2801  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2802  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
2803  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2804  const std::pair<size_t, size_t> colRng (0, numVecs);
2805 
2806  auto Y_lcl_orig = this->getLocalViewDevice ();
2807  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
2808  auto X_lcl_orig = A.getLocalViewDevice ();
2809  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
2810 
2811  // The device memory of *this is about to be modified
2812  this->modify_device ();
2813  if (isConstantStride () && A.isConstantStride ()) {
2814  KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
2815  }
2816  else {
2817  // Make sure that Kokkos only uses the local length for add.
2818  for (size_t k = 0; k < numVecs; ++k) {
2819  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2820  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2821  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2822  auto X_k = subview (X_lcl, ALL (), X_col);
2823 
2824  KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
2825  }
2826  }
2827  }
2828 
2829  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2830  void
2832  update (const Scalar& alpha,
2834  const Scalar& beta,
2836  const Scalar& gamma)
2837  {
2838  using Kokkos::ALL;
2839  using Kokkos::subview;
2841 
2842  const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
2843 
2844  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)");
2845 
2846  const size_t lclNumRows = this->getLocalLength ();
2847  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2848  lclNumRows != A.getLocalLength (), std::invalid_argument,
2849  "The input MultiVector A has " << A.getLocalLength () << " local "
2850  "row(s), but this MultiVector has " << lclNumRows << " local "
2851  "row(s).");
2852  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2853  lclNumRows != B.getLocalLength (), std::invalid_argument,
2854  "The input MultiVector B has " << B.getLocalLength () << " local "
2855  "row(s), but this MultiVector has " << lclNumRows << " local "
2856  "row(s).");
2857  const size_t numVecs = getNumVectors ();
2858  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2859  A.getNumVectors () != numVecs, std::invalid_argument,
2860  "The input MultiVector A has " << A.getNumVectors () << " column(s), "
2861  "but this MultiVector has " << numVecs << " column(s).");
2862  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2863  B.getNumVectors () != numVecs, std::invalid_argument,
2864  "The input MultiVector B has " << B.getNumVectors () << " column(s), "
2865  "but this MultiVector has " << numVecs << " column(s).");
2866 
2867  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2868  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
2869  const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
2870 
2871  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2872  if (this->need_sync_device ()) this->sync_device ();
2873  if (A.need_sync_device ()) const_cast<MV&>(A).sync_device ();
2874  if (B.need_sync_device ()) const_cast<MV&>(B).sync_device ();
2875 
2876  // This method modifies *this.
2877  this->modify_device ();
2878 
2879  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2880  const std::pair<size_t, size_t> colRng (0, numVecs);
2881 
2882  // Prefer 'auto' over specifying the type explicitly. This avoids
2883  // issues with a subview possibly having a different type than the
2884  // original view.
2885  auto C_lcl = subview (this->getLocalViewDevice (), rowRng, ALL ());
2886  auto A_lcl = subview (A.getLocalViewDevice (), rowRng, ALL ());
2887  auto B_lcl = subview (B.getLocalViewDevice (), rowRng, ALL ());
2888 
2889  if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
2890  KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
2891  }
2892  else {
2893  // Some input (or *this) is not constant stride,
2894  // so perform the update one column at a time.
2895  for (size_t k = 0; k < numVecs; ++k) {
2896  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2897  const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
2898  const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
2899  KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
2900  theBeta, subview (B_lcl, rowRng, B_col),
2901  theGamma, subview (C_lcl, rowRng, this_col));
2902  }
2903  }
2904  }
2905 
2906  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2907  Teuchos::ArrayRCP<const Scalar>
2909  getData (size_t j) const
2910  {
2911  using Kokkos::ALL;
2913  using IST = impl_scalar_type;
2914  const char tfecfFuncName[] = "getData: ";
2915 
2916  // Any MultiVector method that called the (classic) Kokkos Node's
2917  // viewBuffer or viewBufferNonConst methods always implied a
2918  // device->host synchronization. Thus, we synchronize here as
2919  // well.
2920  const_cast<MV*> (this)->sync_host ();
2921 
2922  auto hostView = getLocalViewHost ();
2923  const size_t col = isConstantStride () ? j : whichVectors_[j];
2924  auto hostView_j = Kokkos::subview (hostView, ALL (), col);
2925  Teuchos::ArrayRCP<const IST> dataAsArcp =
2926  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
2927 
2928  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2929  (static_cast<size_t> (hostView_j.extent (0)) <
2930  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
2931  "hostView_j.extent(0) = " << hostView_j.extent (0)
2932  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
2933  "Please report this bug to the Tpetra developers.");
2934 
2935  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
2936  }
2937 
2938  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2939  Teuchos::ArrayRCP<Scalar>
2942  {
2943  using Kokkos::ALL;
2944  using Kokkos::subview;
2946  using IST = impl_scalar_type;
2947  const char tfecfFuncName[] = "getDataNonConst: ";
2948 
2949  // Any MultiVector method that called the (classic) Kokkos Node's
2950  // viewBuffer or viewBufferNonConst methods always implied a
2951  // device->host synchronization. Thus, we synchronize here as
2952  // well.
2953  const_cast<MV*> (this)->sync_host ();
2954  // Calling getDataNonConst() implies that the user plans to modify
2955  // the values in the MultiVector, so we mark the host data as modified.
2956  modify_host ();
2957 
2958  auto hostView = getLocalViewHost ();
2959  const size_t col = isConstantStride () ? j : whichVectors_[j];
2960  auto hostView_j = subview (hostView, ALL (), col);
2961  Teuchos::ArrayRCP<IST> dataAsArcp =
2962  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
2963 
2964  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2965  (static_cast<size_t> (hostView_j.extent (0)) <
2966  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
2967  "hostView_j.extent(0) = " << hostView_j.extent (0)
2968  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
2969  "Please report this bug to the Tpetra developers.");
2970 
2971  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
2972  }
2973 
2974  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2975  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
2977  subCopy (const Teuchos::ArrayView<const size_t>& cols) const
2978  {
2979  using Teuchos::RCP;
2981 
2982  // Check whether the index set in cols is contiguous. If it is,
2983  // use the more efficient Range1D version of subCopy.
2984  bool contiguous = true;
2985  const size_t numCopyVecs = static_cast<size_t> (cols.size ());
2986  for (size_t j = 1; j < numCopyVecs; ++j) {
2987  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
2988  contiguous = false;
2989  break;
2990  }
2991  }
2992  if (contiguous && numCopyVecs > 0) {
2993  return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
2994  }
2995  else {
2996  RCP<const MV> X_sub = this->subView (cols);
2997  RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
2998  Y->assign (*X_sub);
2999  return Y;
3000  }
3001  }
3002 
3003  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3004  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3006  subCopy (const Teuchos::Range1D &colRng) const
3007  {
3008  using Teuchos::RCP;
3010 
3011  RCP<const MV> X_sub = this->subView (colRng);
3012  RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3013  Y->assign (*X_sub);
3014  return Y;
3015  }
3016 
3017  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3018  size_t
3021  return origView_.extent (0);
3022  }
3023 
3024  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3025  size_t
3028  return origView_.extent (1);
3029  }
3030 
3031  template <class Scalar, class LO, class GO, class Node>
3034  const Teuchos::RCP<const map_type>& subMap,
3035  const local_ordinal_type rowOffset) :
3036  base_type (subMap)
3037  {
3038  using Kokkos::ALL;
3039  using Kokkos::subview;
3040  using Teuchos::outArg;
3041  using Teuchos::RCP;
3042  using Teuchos::rcp;
3043  using Teuchos::reduceAll;
3044  using Teuchos::REDUCE_MIN;
3045  using std::endl;
3047  const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3048  const char suffix[] = "Please report this bug to the Tpetra developers.";
3049  int lclGood = 1;
3050  int gblGood = 1;
3051  std::unique_ptr<std::ostringstream> errStrm;
3052  const bool debug = ::Tpetra::Details::Behavior::debug ();
3053  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
3054 
3055  // Be careful to use the input Map's communicator, not X's. The
3056  // idea is that, on return, *this is a subview of X, using the
3057  // input Map.
3058  const auto comm = subMap->getComm ();
3059  TEUCHOS_ASSERT( ! comm.is_null () );
3060  const int myRank = comm->getRank ();
3061 
3062  const LO lclNumRowsBefore = static_cast<LO> (X.getLocalLength ());
3063  const LO numCols = static_cast<LO> (X.getNumVectors ());
3064  const LO newNumRows = static_cast<LO> (subMap->getNodeNumElements ());
3065  if (verbose) {
3066  std::ostringstream os;
3067  os << "Proc " << myRank << ": " << prefix
3068  << "X: {lclNumRows: " << lclNumRowsBefore
3069  << ", origLclNumRows: " << X.getOrigNumLocalRows ()
3070  << ", numCols: " << numCols << "}, "
3071  << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3072  std::cerr << os.str ();
3073  }
3074  // We ask for the _original_ number of rows in X, because X could
3075  // be a shorter (fewer rows) view of a longer MV. (For example, X
3076  // could be a domain Map view of a column Map MV.)
3077  const bool tooManyElts =
3078  newNumRows + rowOffset > static_cast<LO> (X.getOrigNumLocalRows ());
3079  if (tooManyElts) {
3080  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3081  *errStrm << " Proc " << myRank << ": subMap->getNodeNumElements() (="
3082  << newNumRows << ") + rowOffset (=" << rowOffset
3083  << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows ()
3084  << ")." << endl;
3085  lclGood = 0;
3086  TEUCHOS_TEST_FOR_EXCEPTION
3087  (! debug && tooManyElts, std::invalid_argument,
3088  prefix << errStrm->str () << suffix);
3089  }
3090 
3091  if (debug) {
3092  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3093  if (gblGood != 1) {
3094  std::ostringstream gblErrStrm;
3095  const std::string myErrStr =
3096  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3097  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3098  TEUCHOS_TEST_FOR_EXCEPTION
3099  (true, std::invalid_argument, gblErrStrm.str ());
3100  }
3101  }
3102 
3103  using range_type = std::pair<LO, LO>;
3104  const range_type origRowRng
3105  (rowOffset, static_cast<LO> (X.origView_.extent (0)));
3106  const range_type rowRng
3107  (rowOffset, rowOffset + newNumRows);
3108 
3109  dual_view_type newOrigView = subview (X.origView_, origRowRng, ALL ());
3110  // FIXME (mfh 29 Sep 2016) If we just use X.view_ here, it breaks
3111  // CrsMatrix's Gauss-Seidel implementation (which assumes the
3112  // ability to create domain Map views of column Map MultiVectors,
3113  // and then get the original column Map MultiVector out again).
3114  // If we just use X.origView_ here, it breaks the fix for #46.
3115  // The test for rowOffset == 0 is a hack that makes both tests
3116  // pass, but doesn't actually fix the more general issue. In
3117  // particular, the right way to fix Gauss-Seidel would be to fix
3118  // #385; that would make "getting the original column Map
3119  // MultiVector out again" unnecessary.
3120  dual_view_type newView =
3121  subview (rowOffset == 0 ? X.origView_ : X.view_, rowRng, ALL ());
3122 
3123  // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3124  // handling subviews of degenerate Views quite so well. For some
3125  // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3126  // 0. We work around by creating a new empty DualView of the
3127  // desired (degenerate) dimensions.
3128  if (newOrigView.extent (0) == 0 &&
3129  newOrigView.extent (1) != X.origView_.extent (1)) {
3130  newOrigView =
3131  allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3132  }
3133  if (newView.extent (0) == 0 &&
3134  newView.extent (1) != X.view_.extent (1)) {
3135  newView =
3136  allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3137  }
3138 
3139  MV subViewMV = X.isConstantStride () ?
3140  MV (subMap, newView, newOrigView) :
3141  MV (subMap, newView, newOrigView, X.whichVectors_ ());
3142 
3143  if (debug) {
3144  const LO lclNumRowsRet = static_cast<LO> (subViewMV.getLocalLength ());
3145  const LO numColsRet = static_cast<LO> (subViewMV.getNumVectors ());
3146  if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3147  lclGood = 0;
3148  if (errStrm.get () == nullptr) {
3149  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3150  }
3151  *errStrm << " Proc " << myRank <<
3152  ": subMap.getNodeNumElements(): " << newNumRows <<
3153  ", subViewMV.getLocalLength(): " << lclNumRowsRet <<
3154  ", X.getNumVectors(): " << numCols <<
3155  ", subViewMV.getNumVectors(): " << numColsRet << endl;
3156  }
3157  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3158  if (gblGood != 1) {
3159  std::ostringstream gblErrStrm;
3160  if (myRank == 0) {
3161  gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3162  "dimensions on one or more processes:" << endl;
3163  }
3164  const std::string myErrStr =
3165  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3166  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3167  gblErrStrm << suffix << endl;
3168  TEUCHOS_TEST_FOR_EXCEPTION
3169  (true, std::invalid_argument, gblErrStrm.str ());
3170  }
3171  }
3172 
3173  if (verbose) {
3174  std::ostringstream os;
3175  os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3176  std::cerr << os.str ();
3177  }
3178 
3179  *this = subViewMV; // shallow copy
3180 
3181  if (verbose) {
3182  std::ostringstream os;
3183  os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3184  std::cerr << os.str ();
3185  }
3186  }
3187 
3188  template <class Scalar, class LO, class GO, class Node>
3191  const map_type& subMap,
3192  const size_t rowOffset) :
3193  MultiVector (X, Teuchos::RCP<const map_type> (new map_type (subMap)),
3194  static_cast<local_ordinal_type> (rowOffset))
3195  {}
3196 
3197  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3198  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3200  offsetView (const Teuchos::RCP<const map_type>& subMap,
3201  const size_t offset) const
3202  {
3204  return Teuchos::rcp (new MV (*this, *subMap, offset));
3205  }
3206 
3207  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3208  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3210  offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3211  const size_t offset)
3212  {
3214  return Teuchos::rcp (new MV (*this, *subMap, offset));
3215  }
3216 
3217  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3218  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3220  subView (const Teuchos::ArrayView<const size_t>& cols) const
3221  {
3222  using Teuchos::Array;
3223  using Teuchos::rcp;
3225 
3226  const size_t numViewCols = static_cast<size_t> (cols.size ());
3227  TEUCHOS_TEST_FOR_EXCEPTION(
3228  numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3229  "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3230  "contain at least one entry, but cols.size() = " << cols.size ()
3231  << " == 0.");
3232 
3233  // Check whether the index set in cols is contiguous. If it is,
3234  // use the more efficient Range1D version of subView.
3235  bool contiguous = true;
3236  for (size_t j = 1; j < numViewCols; ++j) {
3237  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3238  contiguous = false;
3239  break;
3240  }
3241  }
3242  if (contiguous) {
3243  if (numViewCols == 0) {
3244  // The output MV has no columns, so there is nothing to view.
3245  return rcp (new MV (this->getMap (), numViewCols));
3246  } else {
3247  // Use the more efficient contiguous-index-range version.
3248  return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3249  }
3250  }
3251 
3252  if (isConstantStride ()) {
3253  return rcp (new MV (this->getMap (), view_, origView_, cols));
3254  }
3255  else {
3256  Array<size_t> newcols (cols.size ());
3257  for (size_t j = 0; j < numViewCols; ++j) {
3258  newcols[j] = whichVectors_[cols[j]];
3259  }
3260  return rcp (new MV (this->getMap (), view_, origView_, newcols ()));
3261  }
3262  }
3263 
3264 
3265  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3266  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3268  subView (const Teuchos::Range1D& colRng) const
3269  {
3270  using ::Tpetra::Details::Behavior;
3271  using Kokkos::ALL;
3272  using Kokkos::subview;
3273  using Teuchos::Array;
3274  using Teuchos::RCP;
3275  using Teuchos::rcp;
3277  const char tfecfFuncName[] = "subView(Range1D): ";
3278 
3279  const size_t lclNumRows = this->getLocalLength ();
3280  const size_t numVecs = this->getNumVectors ();
3281  // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3282  // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3283  // "at least one vector.");
3284  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3285  static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3286  "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3287  << numVecs << ".");
3288  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3289  numVecs != 0 && colRng.size () != 0 &&
3290  (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3291  static_cast<size_t> (colRng.ubound ()) >= numVecs),
3292  std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3293  "," << colRng.ubound () << "] exceeds the valid range of column indices "
3294  "[0, " << numVecs << "].");
3295 
3296  RCP<const MV> X_ret; // the MultiVector subview to return
3297 
3298  // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3299  // broken for the case of views with zero rows. I will brutally
3300  // enforce that the subview has the correct dimensions. In
3301  // particular, in the case of zero rows, I will, if necessary,
3302  // create a new dual_view_type with zero rows and the correct
3303  // number of columns. In a debug build, I will use an all-reduce
3304  // to ensure that it has the correct dimensions on all processes.
3305 
3306  const std::pair<size_t, size_t> rows (0, lclNumRows);
3307  if (colRng.size () == 0) {
3308  const std::pair<size_t, size_t> cols (0, 0); // empty range
3309  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3310  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3311  }
3312  else {
3313  // Returned MultiVector is constant stride only if *this is.
3314  if (isConstantStride ()) {
3315  const std::pair<size_t, size_t> cols (colRng.lbound (),
3316  colRng.ubound () + 1);
3317  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3318  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3319  }
3320  else {
3321  if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3322  // We're only asking for one column, so the result does have
3323  // constant stride, even though this MultiVector does not.
3324  const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3325  whichVectors_[0] + colRng.ubound () + 1);
3326  dual_view_type X_sub = takeSubview (view_, ALL (), col);
3327  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3328  }
3329  else {
3330  Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3331  whichVectors_.begin () + colRng.ubound () + 1);
3332  X_ret = rcp (new MV (this->getMap (), view_, origView_, which));
3333  }
3334  }
3335  }
3336 
3337  const bool debug = Behavior::debug ();
3338  if (debug) {
3339  using Teuchos::Comm;
3340  using Teuchos::outArg;
3341  using Teuchos::REDUCE_MIN;
3342  using Teuchos::reduceAll;
3343 
3344  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
3345  Teuchos::null : this->getMap ()->getComm ();
3346  if (! comm.is_null ()) {
3347  int lclSuccess = 1;
3348  int gblSuccess = 1;
3349 
3350  if (X_ret.is_null ()) {
3351  lclSuccess = 0;
3352  }
3353  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3354  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3355  (lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3356  "MultiVector; the return value of this method) is null on some MPI "
3357  "process in this MultiVector's communicator. This should never "
3358  "happen. Please report this bug to the Tpetra developers.");
3359  if (! X_ret.is_null () &&
3360  X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3361  lclSuccess = 0;
3362  }
3363  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess,
3364  outArg (gblSuccess));
3365  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3366  (lclSuccess != 1, std::logic_error, "X_ret->getNumVectors() != "
3367  "colRng.size(), on at least one MPI process in this MultiVector's "
3368  "communicator. This should never happen. "
3369  "Please report this bug to the Tpetra developers.");
3370  }
3371  }
3372  return X_ret;
3373  }
3374 
3375 
3376  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3377  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3379  subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3380  {
3382  return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3383  }
3384 
3385 
3386  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3387  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3389  subViewNonConst (const Teuchos::Range1D &colRng)
3390  {
3392  return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3393  }
3394 
3395 
3396  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3399  const size_t j)
3400  : base_type (X.getMap ())
3401  {
3402  using Kokkos::subview;
3403  typedef std::pair<size_t, size_t> range_type;
3404  const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3405 
3406  const size_t numCols = X.getNumVectors ();
3407  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3408  (j >= numCols, std::invalid_argument, "Input index j (== " << j
3409  << ") exceeds valid column index range [0, " << numCols << " - 1].");
3410  const size_t jj = X.isConstantStride () ?
3411  static_cast<size_t> (j) :
3412  static_cast<size_t> (X.whichVectors_[j]);
3413  this->view_ = takeSubview (X.view_, Kokkos::ALL (), range_type (jj, jj+1));
3414  this->origView_ = X.origView_;
3415 
3416  // mfh 31 Jul 2017: It would be unwise to execute concurrent
3417  // Export or Import operations with different subviews of a
3418  // MultiVector. Thus, it is safe to reuse communication buffers.
3419  // See #1560 discussion.
3420  //
3421  // We only need one column's worth of buffer for imports_ and
3422  // exports_. Taking subviews now ensures that their lengths will
3423  // be exactly what we need, so we won't have to resize them later.
3424  {
3425  const size_t newSize = X.imports_.extent (0) / numCols;
3426  auto newImports = X.imports_;
3427  newImports.d_view = subview (X.imports_.d_view, range_type (0, newSize));
3428  newImports.h_view = subview (X.imports_.h_view, range_type (0, newSize));
3429  }
3430  {
3431  const size_t newSize = X.exports_.extent (0) / numCols;
3432  auto newExports = X.exports_;
3433  newExports.d_view = subview (X.exports_.d_view, range_type (0, newSize));
3434  newExports.h_view = subview (X.exports_.h_view, range_type (0, newSize));
3435  }
3436  // These two DualViews already either have the right number of
3437  // entries, or zero entries. This means that we don't need to
3438  // resize them.
3441  }
3442 
3443 
3444  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3445  Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3447  getVector (const size_t j) const
3448  {
3450  return Teuchos::rcp (new V (*this, j));
3451  }
3452 
3453 
3454  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3455  Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3457  getVectorNonConst (const size_t j)
3458  {
3460  return Teuchos::rcp (new V (*this, j));
3461  }
3462 
3463 
3464  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3465  void
3467  get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3468  {
3469  using dev_view_type = typename dual_view_type::t_dev;
3470  using host_view_type = typename dual_view_type::t_host;
3471  using IST = impl_scalar_type;
3472  using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3473  Kokkos::HostSpace,
3474  Kokkos::MemoryUnmanaged>;
3475  const char tfecfFuncName[] = "get1dCopy: ";
3476 
3477  const size_t numRows = this->getLocalLength ();
3478  const size_t numCols = this->getNumVectors ();
3479 
3480  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3481  (LDA < numRows, std::runtime_error,
3482  "LDA = " << LDA << " < numRows = " << numRows << ".");
3483  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3484  (numRows > size_t (0) && numCols > size_t (0) &&
3485  size_t (A.size ()) < LDA * (numCols - 1) + numRows,
3486  std::runtime_error,
3487  "A.size() = " << A.size () << ", but its size must be at least "
3488  << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3489 
3490  const std::pair<size_t, size_t> rowRange (0, numRows);
3491  const std::pair<size_t, size_t> colRange (0, numCols);
3492 
3493  input_view_type A_view_orig (reinterpret_cast<IST*> (A.getRawPtr ()),
3494  LDA, numCols);
3495  auto A_view = Kokkos::subview (A_view_orig, rowRange, colRange);
3496 
3497  // Use the most recently updated version of this MultiVector's
3498  // data. This avoids sync'ing, which could violate users'
3499  // expectations.
3500  //
3501  // If we need sync to device, then host has the most recent version.
3502  const bool useHostVersion = this->need_sync_device ();
3503 
3504  dev_view_type srcView_dev;
3505  host_view_type srcView_host;
3506  if (useHostVersion) {
3507  srcView_host = this->getLocalViewHost ();
3508  }
3509  else {
3510  srcView_dev = this->getLocalViewDevice ();
3511  }
3512 
3513  if (this->isConstantStride ()) {
3514  if (useHostVersion) {
3515  Kokkos::deep_copy (A_view, srcView_host);
3516  }
3517  else {
3518  Kokkos::deep_copy (A_view, srcView_dev);
3519  }
3520  }
3521  else {
3522  for (size_t j = 0; j < numCols; ++j) {
3523  const size_t srcCol = this->whichVectors_[j];
3524  auto dstColView = Kokkos::subview (A_view, rowRange, j);
3525 
3526  if (useHostVersion) {
3527  auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
3528  Kokkos::deep_copy (dstColView, srcColView_host);
3529  }
3530  else {
3531  auto srcColView_dev = Kokkos::subview (srcView_dev, rowRange, srcCol);
3532  Kokkos::deep_copy (dstColView, srcColView_dev);
3533  }
3534  }
3535  }
3536  }
3537 
3538 
3539  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3540  void
3542  get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3543  {
3545  const char tfecfFuncName[] = "get2dCopy: ";
3546  const size_t numRows = this->getLocalLength ();
3547  const size_t numCols = this->getNumVectors ();
3548 
3549  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3550  static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3551  std::runtime_error, "Input array of pointers must contain as many "
3552  "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3553  << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3554 
3555  if (numRows != 0 && numCols != 0) {
3556  // No side effects until we've validated the input.
3557  for (size_t j = 0; j < numCols; ++j) {
3558  const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3559  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3560  dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3561  "the input array of arrays is not long enough to fit all entries in "
3562  "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3563  << " < getLocalLength() = " << numRows << ".");
3564  }
3565 
3566  // We've validated the input, so it's safe to start copying.
3567  for (size_t j = 0; j < numCols; ++j) {
3568  Teuchos::RCP<const V> X_j = this->getVector (j);
3569  const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3570  X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3571  }
3572  }
3573  }
3574 
3575  namespace { // (anonymous)
3576  template <class SC, class LO, class GO, class NT>
3578  syncMVToHostIfNeededAndGetHostView (MultiVector<SC, LO, GO, NT>& X,
3579  const bool markModified)
3580  {
3581  // NOTE (mfh 16 May 2016) get?dView() and get?dViewNonConst()
3582  // (replace ? with 1 or 2) have always been device->host
3583  // synchronization points, since <= 2012. We retain this
3584  // behavior for backwards compatibility.
3585  if (X.need_sync_host ()) {
3586  X.sync_host ();
3587  }
3588  if (markModified) {
3589  X.modify_host ();
3590  }
3591  return X.getLocalViewHost ();
3592  }
3593  } // namespace (anonymous)
3594 
3595 
3596  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3597  Teuchos::ArrayRCP<const Scalar>
3599  get1dView () const
3600  {
3601  if (getLocalLength () == 0 || getNumVectors () == 0) {
3602  return Teuchos::null;
3603  } else {
3604  TEUCHOS_TEST_FOR_EXCEPTION(
3605  ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3606  "get1dView: This MultiVector does not have constant stride, so it is "
3607  "not possible to view its data as a single array. You may check "
3608  "whether a MultiVector has constant stride by calling "
3609  "isConstantStride().");
3610  // Since get1dView() is and was always marked const, I have to
3611  // cast away const here in order not to break backwards
3612  // compatibility.
3613  constexpr bool markModified = false;
3615  auto X_lcl = syncMVToHostIfNeededAndGetHostView (const_cast<MV&> (*this),
3616  markModified);
3617  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3618  Kokkos::Compat::persistingView (X_lcl);
3619  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3620  }
3621  }
3622 
3623  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3624  Teuchos::ArrayRCP<Scalar>
3627  {
3628  if (this->getLocalLength () == 0 || this->getNumVectors () == 0) {
3629  return Teuchos::null;
3630  }
3631  else {
3632  TEUCHOS_TEST_FOR_EXCEPTION
3633  (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3634  "get1dViewNonConst: This MultiVector does not have constant stride, "
3635  "so it is not possible to view its data as a single array. You may "
3636  "check whether a MultiVector has constant stride by calling "
3637  "isConstantStride().");
3638  constexpr bool markModified = true;
3639  auto X_lcl = syncMVToHostIfNeededAndGetHostView (*this, markModified);
3640  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3641  Kokkos::Compat::persistingView (X_lcl);
3642  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3643  }
3644  }
3645 
3646  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3647  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3650  {
3651  constexpr bool markModified = true;
3652  auto X_lcl = syncMVToHostIfNeededAndGetHostView (*this, markModified);
3653 
3654  // Don't use the row range here on the outside, in order to avoid
3655  // a strided return type (in case Kokkos::subview is conservative
3656  // about that). Instead, use the row range for the column views
3657  // in the loop.
3658  const size_t myNumRows = this->getLocalLength ();
3659  const size_t numCols = this->getNumVectors ();
3660  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3661 
3662  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
3663  for (size_t j = 0; j < numCols; ++j) {
3664  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3665  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3666  Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3667  Kokkos::Compat::persistingView (X_lcl_j);
3668  views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
3669  }
3670  return views;
3671  }
3672 
3673  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3674  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
3676  get2dView () const
3677  {
3678  // Since get2dView() is and was always marked const, I have to
3679  // cast away const here in order not to break backwards
3680  // compatibility.
3681  constexpr bool markModified = false;
3683  auto X_lcl = syncMVToHostIfNeededAndGetHostView (const_cast<MV&> (*this),
3684  markModified);
3685  // Don't use the row range here on the outside, in order to avoid
3686  // a strided return type (in case Kokkos::subview is conservative
3687  // about that). Instead, use the row range for the column views
3688  // in the loop.
3689  const size_t myNumRows = this->getLocalLength ();
3690  const size_t numCols = this->getNumVectors ();
3691  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3692 
3693  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
3694  for (size_t j = 0; j < numCols; ++j) {
3695  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3696  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3697  Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
3698  Kokkos::Compat::persistingView (X_lcl_j);
3699  views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
3700  }
3701  return views;
3702  }
3703 
3704  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3705  void
3707  multiply (Teuchos::ETransp transA,
3708  Teuchos::ETransp transB,
3709  const Scalar& alpha,
3712  const Scalar& beta)
3713  {
3714  using ::Tpetra::Details::ProfilingRegion;
3715  using Teuchos::CONJ_TRANS;
3716  using Teuchos::NO_TRANS;
3717  using Teuchos::TRANS;
3718  using Teuchos::RCP;
3719  using Teuchos::rcp;
3720  using Teuchos::rcpFromRef;
3721  using std::endl;
3722  using ATS = Kokkos::ArithTraits<impl_scalar_type>;
3723  using LO = local_ordinal_type;
3724  using STS = Teuchos::ScalarTraits<Scalar>;
3726  const char tfecfFuncName[] = "multiply: ";
3727  ProfilingRegion region ("Tpetra::MV::multiply");
3728 
3729  // This routine performs a variety of matrix-matrix multiply
3730  // operations, interpreting the MultiVector (this-aka C , A and B)
3731  // as 2D matrices. Variations are due to the fact that A, B and C
3732  // can be locally replicated or globally distributed MultiVectors
3733  // and that we may or may not operate with the transpose of A and
3734  // B. Possible cases are:
3735  //
3736  // Operations # Cases Notes
3737  // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
3738  // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
3739  // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
3740  //
3741  // The following operations are not meaningful for 1-D
3742  // distributions:
3743  //
3744  // u1) C(local) = A^T(distr) * B^T(distr) 1
3745  // u2) C(local) = A (distr) * B^X(distr) 2
3746  // u3) C(distr) = A^X(local) * B^X(local) 4
3747  // u4) C(distr) = A^X(local) * B^X(distr) 4
3748  // u5) C(distr) = A^T(distr) * B^X(local) 2
3749  // u6) C(local) = A^X(distr) * B^X(local) 4
3750  // u7) C(distr) = A^X(distr) * B^X(local) 4
3751  // u8) C(local) = A^X(local) * B^X(distr) 4
3752  //
3753  // Total number of cases: 32 (= 2^5).
3754 
3755  impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
3756 
3757  const bool A_is_local = ! A.isDistributed ();
3758  const bool B_is_local = ! B.isDistributed ();
3759  const bool C_is_local = ! this->isDistributed ();
3760 
3761  // In debug mode, check compatibility of local dimensions. We
3762  // only do this in debug mode, since it requires an all-reduce
3763  // to ensure correctness on all processses. It's entirely
3764  // possible that only some processes may have incompatible local
3765  // dimensions. Throwing an exception only on those processes
3766  // could cause this method to hang.
3767  const bool debug = ::Tpetra::Details::Behavior::debug ();
3768  if (debug) {
3769  auto myMap = this->getMap ();
3770  if (! myMap.is_null () && ! myMap->getComm ().is_null ()) {
3771  using Teuchos::REDUCE_MIN;
3772  using Teuchos::reduceAll;
3773  using Teuchos::outArg;
3774 
3775  auto comm = myMap->getComm ();
3776  const size_t A_nrows =
3777  (transA != NO_TRANS) ? A.getNumVectors () : A.getLocalLength ();
3778  const size_t A_ncols =
3779  (transA != NO_TRANS) ? A.getLocalLength () : A.getNumVectors ();
3780  const size_t B_nrows =
3781  (transB != NO_TRANS) ? B.getNumVectors () : B.getLocalLength ();
3782  const size_t B_ncols =
3783  (transB != NO_TRANS) ? B.getLocalLength () : B.getNumVectors ();
3784 
3785  const bool lclBad = this->getLocalLength () != A_nrows ||
3786  this->getNumVectors () != B_ncols || A_ncols != B_nrows;
3787 
3788  const int myRank = comm->getRank ();
3789  std::ostringstream errStrm;
3790  if (this->getLocalLength () != A_nrows) {
3791  errStrm << "Proc " << myRank << ": this->getLocalLength()="
3792  << this->getLocalLength () << " != A_nrows=" << A_nrows
3793  << "." << std::endl;
3794  }
3795  if (this->getNumVectors () != B_ncols) {
3796  errStrm << "Proc " << myRank << ": this->getNumVectors()="
3797  << this->getNumVectors () << " != B_ncols=" << B_ncols
3798  << "." << std::endl;
3799  }
3800  if (A_ncols != B_nrows) {
3801  errStrm << "Proc " << myRank << ": A_ncols="
3802  << A_ncols << " != B_nrows=" << B_nrows
3803  << "." << std::endl;
3804  }
3805 
3806  const int lclGood = lclBad ? 0 : 1;
3807  int gblGood = 0;
3808  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3809  if (gblGood != 1) {
3810  std::ostringstream os;
3811  ::Tpetra::Details::gathervPrint (os, errStrm.str (), *comm);
3812 
3813  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3814  (true, std::runtime_error, "Inconsistent local dimensions on at "
3815  "least one process in this object's communicator." << std::endl
3816  << "Operation: "
3817  << "C(" << (C_is_local ? "local" : "distr") << ") = "
3818  << alpha << "*A"
3819  << (transA == Teuchos::TRANS ? "^T" :
3820  (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3821  << "(" << (A_is_local ? "local" : "distr") << ") * "
3822  << beta << "*B"
3823  << (transA == Teuchos::TRANS ? "^T" :
3824  (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3825  << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
3826  << "Global dimensions: C(" << this->getGlobalLength () << ", "
3827  << this->getNumVectors () << "), A(" << A.getGlobalLength ()
3828  << ", " << A.getNumVectors () << "), B(" << B.getGlobalLength ()
3829  << ", " << B.getNumVectors () << ")." << std::endl
3830  << os.str ());
3831  }
3832  }
3833  }
3834 
3835  // Case 1: C(local) = A^X(local) * B^X(local)
3836  const bool Case1 = C_is_local && A_is_local && B_is_local;
3837  // Case 2: C(local) = A^T(distr) * B (distr)
3838  const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
3839  transA != NO_TRANS &&
3840  transB == NO_TRANS;
3841  // Case 3: C(distr) = A (distr) * B^X(local)
3842  const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
3843  transA == NO_TRANS;
3844 
3845  // Test that we are considering a meaningful case
3846  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3847  (! Case1 && ! Case2 && ! Case3, std::runtime_error,
3848  "Multiplication of op(A) and op(B) into *this is not a "
3849  "supported use case.");
3850 
3851  if (beta != STS::zero () && Case2) {
3852  // If Case2, then C is local and contributions must be summed
3853  // across all processes. However, if beta != 0, then accumulate
3854  // beta*C into the sum. When summing across all processes, we
3855  // only want to accumulate this once, so set beta == 0 on all
3856  // processes except Process 0.
3857  const int myRank = this->getMap ()->getComm ()->getRank ();
3858  if (myRank != 0) {
3859  beta_local = ATS::zero ();
3860  }
3861  }
3862 
3863  // We only know how to do matrix-matrix multiplies if all the
3864  // MultiVectors have constant stride. If not, we have to make
3865  // temporary copies of those MultiVectors (including possibly
3866  // *this) that don't have constant stride.
3867  RCP<MV> C_tmp;
3868  if (! isConstantStride ()) {
3869  C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
3870  } else {
3871  C_tmp = rcp (this, false);
3872  }
3873 
3874  RCP<const MV> A_tmp;
3875  if (! A.isConstantStride ()) {
3876  A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
3877  } else {
3878  A_tmp = rcpFromRef (A);
3879  }
3880 
3881  RCP<const MV> B_tmp;
3882  if (! B.isConstantStride ()) {
3883  B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
3884  } else {
3885  B_tmp = rcpFromRef (B);
3886  }
3887 
3888  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3889  (! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
3890  ! A_tmp->isConstantStride (), std::logic_error,
3891  "Failed to make temporary constant-stride copies of MultiVectors.");
3892 
3893  {
3894  if (A_tmp->need_sync_device ()) {
3895  const_cast<MV&> (*A_tmp).sync_device (); // const is a lie
3896  }
3897  const LO A_lclNumRows = A_tmp->getLocalLength ();
3898  const LO A_numVecs = A_tmp->getNumVectors ();
3899  auto A_lcl = A_tmp->getLocalViewDevice ();
3900  auto A_sub = Kokkos::subview (A_lcl,
3901  std::make_pair (LO (0), A_lclNumRows),
3902  std::make_pair (LO (0), A_numVecs));
3903 
3904  if (B_tmp->need_sync_device ()) {
3905  const_cast<MV&> (*B_tmp).sync_device (); // const is a lie
3906  }
3907  const LO B_lclNumRows = B_tmp->getLocalLength ();
3908  const LO B_numVecs = B_tmp->getNumVectors ();
3909  auto B_lcl = B_tmp->getLocalViewDevice ();
3910  auto B_sub = Kokkos::subview (B_lcl,
3911  std::make_pair (LO (0), B_lclNumRows),
3912  std::make_pair (LO (0), B_numVecs));
3913 
3914  if (C_tmp->need_sync_device ()) {
3915  const_cast<MV&> (*C_tmp).sync_device (); // const is a lie
3916  }
3917  const LO C_lclNumRows = C_tmp->getLocalLength ();
3918  const LO C_numVecs = C_tmp->getNumVectors ();
3919  auto C_lcl = C_tmp->getLocalViewDevice ();
3920  auto C_sub = Kokkos::subview (C_lcl,
3921  std::make_pair (LO (0), C_lclNumRows),
3922  std::make_pair (LO (0), C_numVecs));
3923  const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' :
3924  (transA == Teuchos::TRANS ? 'T' : 'C'));
3925  const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' :
3926  (transB == Teuchos::TRANS ? 'T' : 'C'));
3927  const impl_scalar_type alpha_IST (alpha);
3928 
3929  ProfilingRegion regionGemm ("Tpetra::MV::multiply-call-gemm");
3930  KokkosBlas::gemm (&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
3931  beta_local, C_sub);
3932  }
3933 
3934  if (! isConstantStride ()) {
3935  ::Tpetra::deep_copy (*this, *C_tmp); // Copy the result back into *this.
3936  }
3937 
3938  // Dispose of (possibly) extra copies of A and B.
3939  A_tmp = Teuchos::null;
3940  B_tmp = Teuchos::null;
3941 
3942  // If Case 2 then sum up *this and distribute it to all processes.
3943  if (Case2) {
3944  this->reduce ();
3945  }
3946  }
3947 
3948  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3949  void
3951  elementWiseMultiply (Scalar scalarAB,
3954  Scalar scalarThis)
3955  {
3956  using Kokkos::ALL;
3957  using Kokkos::subview;
3960  const char tfecfFuncName[] = "elementWiseMultiply: ";
3961 
3962  const size_t lclNumRows = this->getLocalLength ();
3963  const size_t numVecs = this->getNumVectors ();
3964 
3965  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3966  (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
3967  std::runtime_error, "MultiVectors do not have the same local length.");
3968  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3969  numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
3970  "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
3971  << ".");
3972 
3973  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
3974  if (this->need_sync_device ()) {
3975  this->sync_device ();
3976  }
3977  if (A.need_sync_device ()) {
3978  const_cast<V&>(A).sync_device ();
3979  }
3980  if (B.need_sync_device ()) {
3981  const_cast<MV&>(B).sync_device ();
3982  }
3983  this->modify_device ();
3984 
3985  auto this_view = this->getLocalViewDevice ();
3986  auto A_view = A.getLocalViewDevice ();
3987  auto B_view = B.getLocalViewDevice ();
3988 
3989  if (isConstantStride () && B.isConstantStride ()) {
3990  // A is just a Vector; it only has one column, so it always has
3991  // constant stride.
3992  //
3993  // If both *this and B have constant stride, we can do an
3994  // element-wise multiply on all columns at once.
3995  KokkosBlas::mult (scalarThis,
3996  this_view,
3997  scalarAB,
3998  subview (A_view, ALL (), 0),
3999  B_view);
4000  }
4001  else {
4002  for (size_t j = 0; j < numVecs; ++j) {
4003  const size_t C_col = isConstantStride () ? j : whichVectors_[j];
4004  const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
4005  KokkosBlas::mult (scalarThis,
4006  subview (this_view, ALL (), C_col),
4007  scalarAB,
4008  subview (A_view, ALL (), 0),
4009  subview (B_view, ALL (), B_col));
4010  }
4011  }
4012  }
4013 
4014  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4015  void
4018  {
4020  using ::Tpetra::Details::ProfilingRegion;
4021  ProfilingRegion region ("Tpetra::MV::reduce");
4022 
4023  const auto map = this->getMap ();
4024  if (map.get () == nullptr) {
4025  return;
4026  }
4027  const auto comm = map->getComm ();
4028  if (comm.get () == nullptr) {
4029  return;
4030  }
4031 
4032  // Avoid giving device buffers to MPI. Even if MPI handles them
4033  // correctly, doing so may not perform well.
4034  const bool changed_on_device = this->need_sync_host ();
4035  if (changed_on_device) {
4036  // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4037  // and host will be separate allocations. In that case, it may
4038  // pay to do the all-reduce from device to host.
4039  this->modify_device ();
4040  auto X_lcl = this->getLocalViewDevice ();
4041  allReduceView (X_lcl, X_lcl, *comm);
4042  }
4043  else {
4044  this->modify_host ();
4045  auto X_lcl = this->getLocalViewHost ();
4046  allReduceView (X_lcl, X_lcl, *comm);
4047  }
4048  }
4049 
4050  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4051  void
4053  replaceLocalValue (const LocalOrdinal lclRow,
4054  const size_t col,
4055  const impl_scalar_type& ScalarValue) const
4056  {
4057 #ifdef HAVE_TPETRA_DEBUG
4058  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4059  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4060  TEUCHOS_TEST_FOR_EXCEPTION(
4061  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4062  std::runtime_error,
4063  "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4064  << " is invalid. The range of valid row indices on this process "
4065  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4066  << ", " << maxLocalIndex << "].");
4067  TEUCHOS_TEST_FOR_EXCEPTION(
4068  vectorIndexOutOfRange(col),
4069  std::runtime_error,
4070  "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4071  << " of the multivector is invalid.");
4072 #endif
4073  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4074  view_.h_view (lclRow, colInd) = ScalarValue;
4075  }
4076 
4077 
4078  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4079  void
4081  sumIntoLocalValue (const LocalOrdinal lclRow,
4082  const size_t col,
4083  const impl_scalar_type& value,
4084  const bool atomic) const
4085  {
4086 #ifdef HAVE_TPETRA_DEBUG
4087  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4088  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4089  TEUCHOS_TEST_FOR_EXCEPTION(
4090  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4091  std::runtime_error,
4092  "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4093  << " is invalid. The range of valid row indices on this process "
4094  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4095  << ", " << maxLocalIndex << "].");
4096  TEUCHOS_TEST_FOR_EXCEPTION(
4097  vectorIndexOutOfRange(col),
4098  std::runtime_error,
4099  "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4100  << " of the multivector is invalid.");
4101 #endif
4102  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4103  if (atomic) {
4104  Kokkos::atomic_add (& (view_.h_view(lclRow, colInd)), value);
4105  }
4106  else {
4107  view_.h_view (lclRow, colInd) += value;
4108  }
4109  }
4110 
4111 
4112  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4113  void
4115  replaceGlobalValue (const GlobalOrdinal gblRow,
4116  const size_t col,
4117  const impl_scalar_type& ScalarValue) const
4118  {
4119  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4120  // touches the RCP's reference count, which isn't thread safe.
4121  const LocalOrdinal lclRow = this->map_->getLocalElement (gblRow);
4122 #ifdef HAVE_TPETRA_DEBUG
4123  const char tfecfFuncName[] = "replaceGlobalValue: ";
4124  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4125  (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4126  std::runtime_error,
4127  "Global row index " << gblRow << " is not present on this process "
4128  << this->getMap ()->getComm ()->getRank () << ".");
4129  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4130  (this->vectorIndexOutOfRange (col), std::runtime_error,
4131  "Vector index " << col << " of the MultiVector is invalid.");
4132 #endif // HAVE_TPETRA_DEBUG
4133  this->replaceLocalValue (lclRow, col, ScalarValue);
4134  }
4135 
4136  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4137  void
4139  sumIntoGlobalValue (const GlobalOrdinal globalRow,
4140  const size_t col,
4141  const impl_scalar_type& value,
4142  const bool atomic) const
4143  {
4144  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4145  // touches the RCP's reference count, which isn't thread safe.
4146  const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4147 #ifdef HAVE_TEUCHOS_DEBUG
4148  TEUCHOS_TEST_FOR_EXCEPTION(
4149  lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4150  std::runtime_error,
4151  "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4152  << " is not present on this process "
4153  << this->getMap ()->getComm ()->getRank () << ".");
4154  TEUCHOS_TEST_FOR_EXCEPTION(
4155  vectorIndexOutOfRange(col),
4156  std::runtime_error,
4157  "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4158  << " of the multivector is invalid.");
4159 #endif
4160  this->sumIntoLocalValue (lclRow, col, value, atomic);
4161  }
4162 
4163  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4164  template <class T>
4165  Teuchos::ArrayRCP<T>
4167  getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
4168  size_t j) const
4169  {
4170  typedef Kokkos::DualView<impl_scalar_type*,
4171  typename dual_view_type::array_layout,
4172  execution_space> col_dual_view_type;
4173  const size_t col = isConstantStride () ? j : whichVectors_[j];
4174  col_dual_view_type X_col =
4175  Kokkos::subview (view_, Kokkos::ALL (), col);
4176  return Kokkos::Compat::persistingView (X_col.d_view);
4177  }
4178 
4179 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
4180  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4182  TPETRA_DEPRECATED
4184  getDualView () const {
4185  return view_;
4186  }
4187 #endif // TPETRA_ENABLE_DEPRECATED_CODE
4188 
4189  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4190  void
4193  view_.clear_sync_state ();
4194  }
4195 
4196  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4197  void
4200  view_.sync_host ();
4201  }
4202 
4203  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4204  void
4207  view_.sync_device ();
4208  }
4209 
4210  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4211  bool
4214  return view_.need_sync_host ();
4215  }
4216 
4217  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4218  bool
4221  return view_.need_sync_device ();
4222  }
4223 
4224  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4225  void
4228  view_.modify_device ();
4229  }
4230 
4231  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4232  void
4235  view_.modify_host ();
4236  }
4237 
4238  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4242  return view_.view_device ();
4243  }
4244 
4245  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4249  return view_.view_host ();
4250  }
4251 
4252  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4253  std::string
4255  descriptionImpl (const std::string& className) const
4256  {
4257  using Teuchos::TypeNameTraits;
4258 
4259  std::ostringstream out;
4260  out << "\"" << className << "\": {";
4261  out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
4262  << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
4263  << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
4264  << ", Node" << Node::name ()
4265  << "}, ";
4266  if (this->getObjectLabel () != "") {
4267  out << "Label: \"" << this->getObjectLabel () << "\", ";
4268  }
4269  out << ", numRows: " << this->getGlobalLength ();
4270  if (className != "Tpetra::Vector") {
4271  out << ", numCols: " << this->getNumVectors ()
4272  << ", isConstantStride: " << this->isConstantStride ();
4273  }
4274  if (this->isConstantStride ()) {
4275  out << ", columnStride: " << this->getStride ();
4276  }
4277  out << "}";
4278 
4279  return out.str ();
4280  }
4281 
4282  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4283  std::string
4286  {
4287  return this->descriptionImpl ("Tpetra::MultiVector");
4288  }
4289 
4290  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4291  std::string
4293  localDescribeToString (const Teuchos::EVerbosityLevel vl) const
4294  {
4295  typedef LocalOrdinal LO;
4296  using std::endl;
4297 
4298  if (vl <= Teuchos::VERB_LOW) {
4299  return std::string ();
4300  }
4301  auto map = this->getMap ();
4302  if (map.is_null ()) {
4303  return std::string ();
4304  }
4305  auto outStringP = Teuchos::rcp (new std::ostringstream ());
4306  auto outp = Teuchos::getFancyOStream (outStringP);
4307  Teuchos::FancyOStream& out = *outp;
4308  auto comm = map->getComm ();
4309  const int myRank = comm->getRank ();
4310  const int numProcs = comm->getSize ();
4311 
4312  out << "Process " << myRank << " of " << numProcs << ":" << endl;
4313  Teuchos::OSTab tab1 (out);
4314 
4315  // VERB_MEDIUM and higher prints getLocalLength()
4316  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
4317  out << "Local number of rows: " << lclNumRows << endl;
4318 
4319  if (vl > Teuchos::VERB_MEDIUM) {
4320  // VERB_HIGH and higher prints isConstantStride() and getStride().
4321  // The first is only relevant if the Vector has multiple columns.
4322  if (this->getNumVectors () != static_cast<size_t> (1)) {
4323  out << "isConstantStride: " << this->isConstantStride () << endl;
4324  }
4325  if (this->isConstantStride ()) {
4326  out << "Column stride: " << this->getStride () << endl;
4327  }
4328 
4329  if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4330  // VERB_EXTREME prints values. Get a host View of the
4331  // Vector's local data, so we can print it. (Don't assume
4332  // that we can access device data directly in host code.)
4333  typename dual_view_type::t_host X_host;
4334  if (need_sync_host ()) {
4335  // Device memory has the latest version. Don't actually
4336  // sync to host; that changes the Vector's state, and may
4337  // change future computations (that use the data's current
4338  // place to decide where to compute). Instead, create a
4339  // temporary host copy and print that.
4340  auto X_dev = getLocalViewDevice ();
4341  auto X_host_copy = Kokkos::create_mirror_view (X_dev);
4342  Kokkos::deep_copy (X_host_copy, X_dev);
4343  X_host = X_host_copy;
4344  }
4345  else {
4346  // Either host and device are in sync, or host has the
4347  // latest version of the Vector's data. Thus, we can use
4348  // the host version directly.
4349  X_host = getLocalViewHost ();
4350  }
4351  // The square braces [] and their contents are in Matlab
4352  // format, so users may copy and paste directly into Matlab.
4353  out << "Values: " << endl
4354  << "[";
4355  const LO numCols = static_cast<LO> (this->getNumVectors ());
4356  if (numCols == 1) {
4357  for (LO i = 0; i < lclNumRows; ++i) {
4358  out << X_host(i,0);
4359  if (i + 1 < lclNumRows) {
4360  out << "; ";
4361  }
4362  }
4363  }
4364  else {
4365  for (LO i = 0; i < lclNumRows; ++i) {
4366  for (LO j = 0; j < numCols; ++j) {
4367  out << X_host(i,j);
4368  if (j + 1 < numCols) {
4369  out << ", ";
4370  }
4371  }
4372  if (i + 1 < lclNumRows) {
4373  out << "; ";
4374  }
4375  }
4376  }
4377  out << "]" << endl;
4378  }
4379  }
4380 
4381  out.flush (); // make sure the ostringstream got everything
4382  return outStringP->str ();
4383  }
4384 
4385  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4386  void
4388  describeImpl (Teuchos::FancyOStream& out,
4389  const std::string& className,
4390  const Teuchos::EVerbosityLevel verbLevel) const
4391  {
4392  using Teuchos::TypeNameTraits;
4393  using Teuchos::VERB_DEFAULT;
4394  using Teuchos::VERB_NONE;
4395  using Teuchos::VERB_LOW;
4396  using std::endl;
4397  const Teuchos::EVerbosityLevel vl =
4398  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4399 
4400  if (vl == VERB_NONE) {
4401  return; // don't print anything
4402  }
4403  // If this Vector's Comm is null, then the Vector does not
4404  // participate in collective operations with the other processes.
4405  // In that case, it is not even legal to call this method. The
4406  // reasonable thing to do in that case is nothing.
4407  auto map = this->getMap ();
4408  if (map.is_null ()) {
4409  return;
4410  }
4411  auto comm = map->getComm ();
4412  if (comm.is_null ()) {
4413  return;
4414  }
4415 
4416  const int myRank = comm->getRank ();
4417 
4418  // Only Process 0 should touch the output stream, but this method
4419  // in general may need to do communication. Thus, we may need to
4420  // preserve the current tab level across multiple "if (myRank ==
4421  // 0) { ... }" inner scopes. This is why we sometimes create
4422  // OSTab instances by pointer, instead of by value. We only need
4423  // to create them by pointer if the tab level must persist through
4424  // multiple inner scopes.
4425  Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4426 
4427  // VERB_LOW and higher prints the equivalent of description().
4428  if (myRank == 0) {
4429  tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4430  out << "\"" << className << "\":" << endl;
4431  tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4432  {
4433  out << "Template parameters:" << endl;
4434  Teuchos::OSTab tab2 (out);
4435  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4436  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4437  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4438  << "Node: " << Node::name () << endl;
4439  }
4440  if (this->getObjectLabel () != "") {
4441  out << "Label: \"" << this->getObjectLabel () << "\", ";
4442  }
4443  out << "Global number of rows: " << this->getGlobalLength () << endl;
4444  if (className != "Tpetra::Vector") {
4445  out << "Number of columns: " << this->getNumVectors () << endl;
4446  }
4447  // getStride() may differ on different processes, so it (and
4448  // isConstantStride()) properly belong to per-process data.
4449  }
4450 
4451  // This is collective over the Map's communicator.
4452  if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4453  const std::string lclStr = this->localDescribeToString (vl);
4454  ::Tpetra::Details::gathervPrint (out, lclStr, *comm);
4455  }
4456  }
4457 
4458  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4459  void
4461  describe (Teuchos::FancyOStream &out,
4462  const Teuchos::EVerbosityLevel verbLevel) const
4463  {
4464  this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4465  }
4466 
4467  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4468  void
4470  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4471  {
4472  replaceMap (newMap);
4473  }
4474 
4475  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4476  void
4479  {
4480  using ::Tpetra::Details::localDeepCopy;
4481  const char prefix[] = "Tpetra::MultiVector::assign: ";
4482 
4483  TEUCHOS_TEST_FOR_EXCEPTION
4484  (this->getGlobalLength () != src.getGlobalLength () ||
4485  this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
4486  prefix << "Global dimensions of the two Tpetra::MultiVector "
4487  "objects do not match. src has dimensions [" << src.getGlobalLength ()
4488  << "," << src.getNumVectors () << "], and *this has dimensions ["
4489  << this->getGlobalLength () << "," << this->getNumVectors () << "].");
4490  // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4491  TEUCHOS_TEST_FOR_EXCEPTION
4492  (this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
4493  prefix << "The local row counts of the two Tpetra::MultiVector "
4494  "objects do not match. src has " << src.getLocalLength () << " row(s) "
4495  << " and *this has " << this->getLocalLength () << " row(s).");
4496 
4497  // See #1510. We're writing to *this, so we don't care about its
4498  // contents in either memory space, and we don't want
4499  // DualView::modify to complain about "concurrent modification" of
4500  // host and device Views.
4501  this->clear_sync_state();
4502  this->modify_device ();
4503 
4504  // If need sync to device, then host has most recent version.
4505  const bool src_last_updated_on_host = src.need_sync_device ();
4506 
4507  if (src_last_updated_on_host) {
4508  localDeepCopy (this->getLocalViewDevice (),
4509  src.getLocalViewHost (),
4510  this->isConstantStride (),
4511  src.isConstantStride (),
4512  this->whichVectors_ (),
4513  src.whichVectors_ ());
4514  }
4515  else {
4516  localDeepCopy (this->getLocalViewDevice (),
4517  src.getLocalViewDevice (),
4518  this->isConstantStride (),
4519  src.isConstantStride (),
4520  this->whichVectors_ (),
4521  src.whichVectors_ ());
4522  }
4523  }
4524 
4525  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4526  bool
4529  using ::Tpetra::Details::PackTraits;
4530  using ST = impl_scalar_type;
4531  using HES =
4532  typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
4533 
4534  const size_t l1 = this->getLocalLength();
4535  const size_t l2 = vec.getLocalLength();
4536  if ((l1!=l2) || (this->getNumVectors() != vec.getNumVectors())) {
4537  return false;
4538  }
4539  if (l1==0) {
4540  return true;
4541  }
4542 
4543  auto v1 = this->getLocalViewHost ();
4544  auto v2 = vec.getLocalViewHost ();
4545  if (PackTraits<ST, HES>::packValueCount (v1(0,0)) !=
4546  PackTraits<ST, HES>::packValueCount (v2(0,0))) {
4547  return false;
4548  }
4549 
4550  return true;
4551  }
4552 
4553  template <class ST, class LO, class GO, class NT>
4556  std::swap(mv.map_, this->map_);
4557  std::swap(mv.view_, this->view_);
4558  std::swap(mv.origView_, this->origView_);
4559  std::swap(mv.whichVectors_, this->whichVectors_);
4560  }
4561 
4562 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4563  template <class ST, class LO, class GO, class NT>
4564  void
4566  const Teuchos::SerialDenseMatrix<int, ST>& src)
4567  {
4568  using ::Tpetra::Details::localDeepCopy;
4569  using MV = MultiVector<ST, LO, GO, NT>;
4570  using IST = typename MV::impl_scalar_type;
4571  using input_view_type =
4572  Kokkos::View<const IST**, Kokkos::LayoutLeft,
4573  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4574  using pair_type = std::pair<LO, LO>;
4575 
4576  const LO numRows = static_cast<LO> (src.numRows ());
4577  const LO numCols = static_cast<LO> (src.numCols ());
4578 
4579  TEUCHOS_TEST_FOR_EXCEPTION
4580  (numRows != static_cast<LO> (dst.getLocalLength ()) ||
4581  numCols != static_cast<LO> (dst.getNumVectors ()),
4582  std::invalid_argument, "Tpetra::deep_copy: On Process "
4583  << dst.getMap ()->getComm ()->getRank () << ", dst is "
4584  << dst.getLocalLength () << " x " << dst.getNumVectors ()
4585  << ", but src is " << numRows << " x " << numCols << ".");
4586 
4587  const IST* src_raw = reinterpret_cast<const IST*> (src.values ());
4588  input_view_type src_orig (src_raw, src.stride (), numCols);
4589  input_view_type src_view =
4590  Kokkos::subview (src_orig, pair_type (0, numRows), Kokkos::ALL ());
4591 
4592  dst.clear_sync_state ();
4593  dst.modify_device ();
4594  constexpr bool src_isConstantStride = true;
4595  Teuchos::ArrayView<const size_t> srcWhichVectors (nullptr, 0);
4596  localDeepCopy (dst.getLocalViewDevice (),
4597  src_view,
4598  dst.isConstantStride (),
4599  src_isConstantStride,
4600  getMultiVectorWhichVectors (dst),
4601  srcWhichVectors);
4602  }
4603 
4604  template <class ST, class LO, class GO, class NT>
4605  void
4606  deep_copy (Teuchos::SerialDenseMatrix<int, ST>& dst,
4607  const MultiVector<ST, LO, GO, NT>& src)
4608  {
4609  using ::Tpetra::Details::localDeepCopy;
4610  using MV = MultiVector<ST, LO, GO, NT>;
4611  using IST = typename MV::impl_scalar_type;
4612  using output_view_type =
4613  Kokkos::View<IST**, Kokkos::LayoutLeft,
4614  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4615  using pair_type = std::pair<LO, LO>;
4616 
4617  const LO numRows = static_cast<LO> (dst.numRows ());
4618  const LO numCols = static_cast<LO> (dst.numCols ());
4619 
4620  TEUCHOS_TEST_FOR_EXCEPTION
4621  (numRows != static_cast<LO> (src.getLocalLength ()) ||
4622  numCols != static_cast<LO> (src.getNumVectors ()),
4623  std::invalid_argument, "Tpetra::deep_copy: On Process "
4624  << src.getMap ()->getComm ()->getRank () << ", src is "
4625  << src.getLocalLength () << " x " << src.getNumVectors ()
4626  << ", but dst is " << numRows << " x " << numCols << ".");
4627 
4628  IST* dst_raw = reinterpret_cast<IST*> (dst.values ());
4629  output_view_type dst_orig (dst_raw, dst.stride (), numCols);
4630  auto dst_view =
4631  Kokkos::subview (dst_orig, pair_type (0, numRows), Kokkos::ALL ());
4632 
4633  constexpr bool dst_isConstantStride = true;
4634  Teuchos::ArrayView<const size_t> dstWhichVectors (nullptr, 0);
4635 
4636  // Prefer the host version of src's data.
4637  if (src.need_sync_host ()) { // last modified on device
4638  localDeepCopy (dst_view,
4639  src.getLocalViewDevice (),
4640  dst_isConstantStride,
4641  src.isConstantStride (),
4642  dstWhichVectors,
4643  getMultiVectorWhichVectors (src));
4644  }
4645  else {
4646  localDeepCopy (dst_view,
4647  src.getLocalViewHost (),
4648  dst_isConstantStride,
4649  src.isConstantStride (),
4650  dstWhichVectors,
4651  getMultiVectorWhichVectors (src));
4652  }
4653  }
4654 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4655 
4656  template <class Scalar, class LO, class GO, class NT>
4657  Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4658  createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4659  size_t numVectors)
4660  {
4661  typedef MultiVector<Scalar, LO, GO, NT> MV;
4662  return Teuchos::rcp (new MV (map, numVectors));
4663  }
4664 
4665  template <class ST, class LO, class GO, class NT>
4666  MultiVector<ST, LO, GO, NT>
4668  {
4669  typedef MultiVector<ST, LO, GO, NT> MV;
4670  MV cpy (src.getMap (), src.getNumVectors (), false);
4671  cpy.assign (src);
4672  return cpy;
4673  }
4674 
4675 } // namespace Tpetra
4676 
4677 //
4678 // Explicit instantiation macro
4679 //
4680 // Must be expanded from within the Tpetra namespace!
4681 //
4682 
4683 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4684 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4685  template class MultiVector< SCALAR , LO , GO , NODE >; \
4686  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
4687  template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4688  template void deep_copy (MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4689  template void deep_copy (Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4690 
4691 #else
4692 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4693  template class MultiVector< SCALAR , LO , GO , NODE >; \
4694  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src);
4695 
4696 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4697 
4698 #endif // TPETRA_MULTIVECTOR_DEF_HPP
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector &quot;constructor&quot;: Create a MultiVector from a given Map.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subView(const Teuchos::Range1D &colRng) const
Return a const MultiVector with const views of selected columns.
dual_view_type::t_host getLocalViewHost() const
A local Kokkos::View of host memory.
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Print the object with the given verbosity level to a FancyOStream.
dual_view_type view_
The Kokkos::DualView containing the MultiVector&#39;s data.
Kokkos::DualView< size_t *, buffer_device_type > numExportPacketsPerLID_
Number of packets to send for each send operation.
Teuchos::ArrayRCP< Scalar > get1dViewNonConst()
Nonconst persisting (1-D) view of this multivector&#39;s local values.
void meanValue(const Teuchos::ArrayView< impl_scalar_type > &means) const
Compute mean (average) value of each column.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetView(const Teuchos::RCP< const map_type > &subMap, const size_t offset) const
Return a const view of a subset of rows.
void get2dCopy(const Teuchos::ArrayView< const Teuchos::ArrayView< Scalar > > &ArrayOfPtrs) const
Fill the given array with a copy of this multivector&#39;s local values.
Declaration and definition of Tpetra::Details::Blas::fill, an implementation detail of Tpetra::MultiV...
void clear_sync_state()
Clear &quot;modified&quot; flags on both host and device sides.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
typename device_type::execution_space execution_space
Type of the (new) Kokkos execution space.
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, execution_space > dual_view_type
Kokkos::DualView specialization used by this class.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
size_t getNumVectors() const
Number of columns in the multivector.
size_t getLocalLength() const
Local number of rows on the calling process.
void replaceMap(const Teuchos::RCP< const map_type > &map)
Replace the underlying Map in place.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< Scalar > > get2dViewNonConst()
Return non-const persisting pointers to values.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
std::string localDescribeToString(const Teuchos::EVerbosityLevel vl) const
Print the calling process&#39; verbose describe() information to the returned string. ...
One or more distributed dense vectors.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subCopy(const Teuchos::Range1D &colRng) const
Return a MultiVector with copies of selected columns.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
void sumIntoGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value, const bool atomic=useAtomicUpdatesByDefault) const
Update (+=) a value in host memory, using global row index.
bool isDistributed() const
Whether this is a globally distributed object.
void get1dCopy(const Teuchos::ArrayView< Scalar > &A, const size_t LDA) const
Fill the given array with a copy of this multivector&#39;s local values.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
static bool debug()
Whether Tpetra is in debug mode.
bool need_sync_device() const
Whether this MultiVector needs synchronization to the device.
bool isSameSize(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &vec) const
size_t getOrigNumLocalCols() const
&quot;Original&quot; number of columns in the (local) data.
Teuchos::ArrayRCP< const Scalar > getData(size_t j) const
Const view of the local values in a particular vector of this multivector.
size_t global_size_t
Global size_t object.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Kokkos::DualView< size_t *, buffer_device_type > numImportPacketsPerLID_
Number of packets to receive for each receive operation.
Insert new values that don&#39;t currently exist.
Kokkos::DualView< packet_type *, buffer_device_type > imports_
Buffer into which packed data are imported (received from other processes).
Kokkos::DualView< packet_type *, buffer_device_type > exports_
Buffer from which packed data are exported (sent to other processes).
void sync_host()
Synchronize to Host.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
void multiply(Teuchos::ETransp transA, Teuchos::ETransp transB, const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Scalar &beta)
Matrix-matrix multiplication: this = beta*this + alpha*op(A)*op(B).
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
void update(const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta)
Update: this = beta*this + alpha*A.
void elementWiseMultiply(Scalar scalarAB, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarThis)
Multiply a Vector A elementwise by a MultiVector B.
Teuchos::RCP< Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVectorNonConst(const size_t j)
Return a Vector which is a nonconst view of column j.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &src)
Copy the contents of src into *this (deep copy).
static void allReduceView(const OutputViewType &output, const InputViewType &input, const Teuchos::Comm< int > &comm)
All-reduce from input Kokkos::View to output Kokkos::View.
dual_view_type origView_
The &quot;original view&quot; of the MultiVector&#39;s data.
std::string descriptionImpl(const std::string &className) const
Implementation of description() for this class, and its subclass Vector.
void normInf(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the infinity-norm of each vector (column), storing the result in a host View.
void replaceGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value) const
Replace value in host memory, using global row index.
typename Kokkos::Details::InnerProductSpaceTraits< impl_scalar_type >::dot_type dot_type
Type of an inner (&quot;dot&quot;) product result.
static bool verbose()
Whether Tpetra is in verbose mode.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
Teuchos::RCP< const map_type > map_
The Map over which this object is distributed.
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
Declaration of Tpetra::Details::isInterComm.
void norm2(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the two-norm of each vector (column), storing the result in a host View.
void dot(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Teuchos::ArrayView< dot_type > &dots) const
Compute the dot product of each corresponding pair of vectors (columns) in A and B.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subViewNonConst(const Teuchos::Range1D &colRng)
Return a MultiVector with views of selected columns.
virtual size_t constantNumberOfPackets() const
Number of packets to send per LID.
bool need_sync_host() const
Whether this MultiVector needs synchronization to the host.
void modify_device()
Mark data as modified on the device side.
Replace old value with maximum of magnitudes of old and new values.
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
Abstract base class for objects that can be the source of an Import or Export operation.
Teuchos::ArrayRCP< Scalar > getDataNonConst(size_t j)
View of the local values in a particular vector of this multivector.
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
Replace existing values with new values.
void swap(MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &mv)
Return a deep copy of this MultiVector, with a different Node type.
void randomize()
Set all values in the multivector to pseudorandom numbers.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetViewNonConst(const Teuchos::RCP< const map_type > &subMap, const size_t offset)
Return a nonconst view of a subset of rows.
Teuchos::ArrayRCP< T > getSubArrayRCP(Teuchos::ArrayRCP< T > arr, size_t j) const
Persisting view of j-th column in the given ArrayRCP.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
Teuchos::RCP< const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVector(const size_t j) const
Return a Vector which is a const view of column j.
void modify_host()
Mark data as modified on the host side.
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
void replaceLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &value) const
Replace value in host memory, using local (row) index.
typename Kokkos::Details::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
void norm1(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the one-norm of each vector (column), storing the result in a host view.
A parallel distribution of indices over processes.
size_t getStride() const
Stride between columns in the multivector.
A distributed dense vector.
Stand-alone utility functions and macros.
virtual std::string description() const
A simple one-line description of this object.
void reduce()
Sum values of a locally replicated multivector across all processes.
void describeImpl(Teuchos::FancyOStream &out, const std::string &className, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Implementation of describe() for this class, and its subclass Vector.
typename map_type::local_ordinal_type local_ordinal_type
The type of local indices that this class uses.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
virtual bool checkSizes(const SrcDistObject &sourceObj)
Whether data redistribution between sourceObj and this object is legal.
dual_view_type::t_dev getLocalViewDevice() const
A local Kokkos::View of device memory.
void sumIntoLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &val, const bool atomic=useAtomicUpdatesByDefault) const
Update (+=) a value in host memory, using local row index.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< const Scalar > > get2dView() const
Return const persisting pointers to values.
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
void reciprocal(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise reciprocal values of input Multi-vector in target, this(i,j) = 1/A(i,j).
Base class for distributed Tpetra objects that support data redistribution.
void localDeepCopy(const DstViewType &dst, const SrcViewType &src, const bool dstConstStride, const bool srcConstStride, const DstWhichVecsType &dstWhichVecs, const SrcWhichVecsType &srcWhichVecs)
Implementation of Tpetra::MultiVector deep copy of local data.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
void abs(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise absolute values of input Multi-vector in target: A = abs(this)
Teuchos::ArrayRCP< const Scalar > get1dView() const
Const persisting (1-D) view of this multivector&#39;s local values.
size_t getOrigNumLocalRows() const
&quot;Original&quot; number of rows in the (local) data.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap)
Remove processes owning zero rows from the Map and their communicator.
All-reduce a 1-D or 2-D Kokkos::View.
Declaration and definition of Tpetra::Details::lclDot, an implementation detail of Tpetra::MultiVecto...
void sync_device()
Synchronize to Device.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.