Tpetra_MultiVector_def.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_MULTIVECTOR_DEF_HPP
43 #define TPETRA_MULTIVECTOR_DEF_HPP
44 
52 
53 #include "Tpetra_Util.hpp"
54 #include "Tpetra_Vector.hpp"
57 #include "Tpetra_Details_checkView.hpp"
58 #include "Tpetra_Details_fill.hpp"
59 #include "Tpetra_Details_gathervPrint.hpp"
60 #include "Tpetra_Details_Behavior.hpp"
61 #include "Tpetra_Details_Profiling.hpp"
62 #include "Tpetra_Details_normImpl.hpp"
63 #include "Tpetra_Details_reallocDualViewIfNeeded.hpp"
66 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
67 # include "Teuchos_SerialDenseMatrix.hpp"
68 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
69 #include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
70 #include "KokkosCompat_View.hpp"
71 #include "KokkosBlas.hpp"
72 #include "KokkosKernels_Utils.hpp"
73 #include "Kokkos_Random.hpp"
74 #include "Kokkos_ArithTraits.hpp"
75 #include <memory>
76 #include <sstream>
77 
78 #ifdef HAVE_TPETRA_INST_FLOAT128
79 namespace Kokkos {
80  // FIXME (mfh 04 Sep 2015) Just a stub for now!
81  template<class Generator>
82  struct rand<Generator, __float128> {
83  static KOKKOS_INLINE_FUNCTION __float128 max ()
84  {
85  return static_cast<__float128> (1.0);
86  }
87  static KOKKOS_INLINE_FUNCTION __float128
88  draw (Generator& gen)
89  {
90  // Half the smallest normalized double is the scaling factor of
91  // the lower-order term in the double-double representation.
92  const __float128 scalingFactor =
93  static_cast<__float128> (std::numeric_limits<double>::min ()) /
94  static_cast<__float128> (2.0);
95  const __float128 higherOrderTerm = static_cast<__float128> (gen.drand ());
96  const __float128 lowerOrderTerm =
97  static_cast<__float128> (gen.drand ()) * scalingFactor;
98  return higherOrderTerm + lowerOrderTerm;
99  }
100  static KOKKOS_INLINE_FUNCTION __float128
101  draw (Generator& gen, const __float128& range)
102  {
103  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
104  const __float128 scalingFactor =
105  static_cast<__float128> (std::numeric_limits<double>::min ()) /
106  static_cast<__float128> (2.0);
107  const __float128 higherOrderTerm =
108  static_cast<__float128> (gen.drand (range));
109  const __float128 lowerOrderTerm =
110  static_cast<__float128> (gen.drand (range)) * scalingFactor;
111  return higherOrderTerm + lowerOrderTerm;
112  }
113  static KOKKOS_INLINE_FUNCTION __float128
114  draw (Generator& gen, const __float128& start, const __float128& end)
115  {
116  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
117  const __float128 scalingFactor =
118  static_cast<__float128> (std::numeric_limits<double>::min ()) /
119  static_cast<__float128> (2.0);
120  const __float128 higherOrderTerm =
121  static_cast<__float128> (gen.drand (start, end));
122  const __float128 lowerOrderTerm =
123  static_cast<__float128> (gen.drand (start, end)) * scalingFactor;
124  return higherOrderTerm + lowerOrderTerm;
125  }
126  };
127 } // namespace Kokkos
128 #endif // HAVE_TPETRA_INST_FLOAT128
129 
130 namespace { // (anonymous)
131 
146  template<class ST, class LO, class GO, class NT>
147  typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type
148  allocDualView (const size_t lclNumRows,
149  const size_t numCols,
150  const bool zeroOut = true,
151  const bool allowPadding = false)
152  {
153  using ::Tpetra::Details::Behavior;
154  using Kokkos::AllowPadding;
155  using Kokkos::view_alloc;
156  using Kokkos::WithoutInitializing;
157  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
158  typedef typename dual_view_type::t_dev dev_view_type;
159  // This needs to be a string and not a char*, if given as an
160  // argument to Kokkos::view_alloc. This is because view_alloc
161  // also allows a raw pointer as its first argument. See
162  // https://github.com/kokkos/kokkos/issues/434.
163  const std::string label ("MV::DualView");
164  const bool debug = Behavior::debug ();
165 
166  // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
167  // creation of the DualView's Views works around
168  // Kokkos::DualView's current inability to accept an
169  // AllocationProperties initial argument (as Kokkos::View does).
170  // However, the work-around is harmless, since it does what the
171  // (currently nonexistent) equivalent DualView constructor would
172  // have done anyway.
173 
174  dev_view_type d_view;
175  if (zeroOut) {
176  if (allowPadding) {
177  d_view = dev_view_type (view_alloc (label, AllowPadding),
178  lclNumRows, numCols);
179  }
180  else {
181  d_view = dev_view_type (view_alloc (label),
182  lclNumRows, numCols);
183  }
184  }
185  else {
186  if (allowPadding) {
187  d_view = dev_view_type (view_alloc (label,
188  WithoutInitializing,
189  AllowPadding),
190  lclNumRows, numCols);
191  }
192  else {
193  d_view = dev_view_type (view_alloc (label, WithoutInitializing),
194  lclNumRows, numCols);
195  }
196  if (debug) {
197  // Filling with NaN is a cheap and effective way to tell if
198  // downstream code is trying to use a MultiVector's data
199  // without them having been initialized. ArithTraits lets us
200  // call nan() even if the scalar type doesn't define it; it
201  // just returns some undefined value in the latter case. This
202  // won't hurt anything because by setting zeroOut=false, users
203  // already agreed that they don't care about the contents of
204  // the MultiVector.
205  const ST nan = Kokkos::Details::ArithTraits<ST>::nan ();
206  KokkosBlas::fill (d_view, nan);
207  }
208  }
209  if (debug) {
210  TEUCHOS_TEST_FOR_EXCEPTION
211  (static_cast<size_t> (d_view.extent (0)) != lclNumRows ||
212  static_cast<size_t> (d_view.extent (1)) != numCols, std::logic_error,
213  "allocDualView: d_view's actual dimensions do not match "
214  "requested dimensions. d_view is " << d_view.extent (0) << " x " <<
215  d_view.extent (1) << "; requested " << lclNumRows << " x " << numCols
216  << ". Please report this bug to the Tpetra developers.");
217  }
218 
219  dual_view_type dv (d_view, Kokkos::create_mirror_view (d_view));
220  // Whether or not the user cares about the initial contents of the
221  // MultiVector, the device and host views are out of sync. We
222  // prefer to work in device memory. The way to ensure this
223  // happens is to mark the device view as modified.
224  dv.modify_device ();
225 
226  return dv;
227  }
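  // Editorial illustration (not part of the original file): a minimal sketch,
  // in plain Kokkos, of what allocDualView does when zeroOut == false and
  // debug mode is on. The names d, h, and dv are local to this sketch, and
  // double stands in for impl_scalar_type.
  //
  //   Kokkos::View<double**, Kokkos::LayoutLeft> d
  //     (Kokkos::view_alloc ("MV::DualView", Kokkos::WithoutInitializing),
  //      lclNumRows, numCols);
  //   KokkosBlas::fill (d, Kokkos::Details::ArithTraits<double>::nan ()); // debug-only NaN fill
  //   auto h = Kokkos::create_mirror_view (d);
  //   Kokkos::DualView<double**, Kokkos::LayoutLeft> dv (d, h);
  //   dv.modify_device (); // the device copy counts as the most recent one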
228 
229  // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
230  //
231  // T: The type of the entries of the View.
232  // ExecSpace: The Kokkos execution space.
233  template<class T, class ExecSpace>
234  struct MakeUnmanagedView {
235  // The 'false' part of the branch carefully ensures that this
236  // won't attempt to use a host execution space that hasn't been
237  // initialized. For example, if Kokkos::OpenMP is disabled and
238  // Kokkos::Threads is enabled, the latter is always the default
239  // execution space of Kokkos::HostSpace, even when ExecSpace is
240  // Kokkos::Serial. That's why we go through the trouble of asking
241  // Kokkos::DualView what _its_ space is. That seems to work
242  // around this default execution space issue.
243  //
244  // NOTE (mfh 29 Jan 2016): See kokkos/kokkos#178 for why we use
245  // a memory space, rather than an execution space, as the first
246  // argument of VerifyExecutionCanAccessMemorySpace.
247  typedef typename Kokkos::Impl::if_c<
248  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
249  typename ExecSpace::memory_space,
250  Kokkos::HostSpace>::value,
251  typename ExecSpace::device_type,
252  typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
253  typedef Kokkos::LayoutLeft array_layout;
254  typedef Kokkos::View<T*, array_layout, host_exec_space,
255  Kokkos::MemoryUnmanaged> view_type;
256 
257  static view_type getView (const Teuchos::ArrayView<T>& x_in)
258  {
259  const size_t numEnt = static_cast<size_t> (x_in.size ());
260  if (numEnt == 0) {
261  return view_type ();
262  } else {
263  return view_type (x_in.getRawPtr (), numEnt);
264  }
265  }
266  };
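  // Editorial illustration (not part of the original file): a sketch of how a
  // caller might wrap user-owned host data with MakeUnmanagedView. The
  // ArrayView x is an assumption of the sketch; no copy is made, and Kokkos
  // never deallocates the wrapped memory.
  //
  //   Teuchos::ArrayView<double> x = ...; // user data, length n
  //   auto x_view =
  //     MakeUnmanagedView<double, Kokkos::DefaultHostExecutionSpace>::getView (x);
  //   // x_view(i) aliases x[i]; writing through x_view writes the user's array.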
267 
268  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
269  // taking a subview of a 0 x N DualView incorrectly always results
270  // in a 0 x 0 DualView.
271  template<class DualViewType>
272  DualViewType
273  takeSubview (const DualViewType& X,
274  const Kokkos::Impl::ALL_t&,
275  const std::pair<size_t, size_t>& colRng)
276  {
277  if (X.extent (0) == 0 && X.extent (1) != 0) {
278  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
279  }
280  else {
281  return subview (X, Kokkos::ALL (), colRng);
282  }
283  }
284 
285  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
286  // taking a subview of a 0 x N DualView incorrectly always results
287  // in a 0 x 0 DualView.
288  template<class DualViewType>
289  DualViewType
290  takeSubview (const DualViewType& X,
291  const std::pair<size_t, size_t>& rowRng,
292  const std::pair<size_t, size_t>& colRng)
293  {
294  if (X.extent (0) == 0 && X.extent (1) != 0) {
295  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
296  }
297  else {
298  return subview (X, rowRng, colRng);
299  }
300  }
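  // Editorial illustration (not part of the original file): why the 0-row
  // special case above matters. A plain Kokkos::subview of a 0 x 3 DualView
  // would come back 0 x 0; takeSubview keeps the requested column count.
  //
  //   Kokkos::DualView<double**, Kokkos::LayoutLeft> X ("X", 0, 3);
  //   auto Y = takeSubview (X, Kokkos::ALL (), std::make_pair (size_t (1), size_t (3)));
  //   // Y.extent(0) == 0 and Y.extent(1) == 2, even though X has no rows.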
301 
302  template<class DualViewType>
303  size_t
304  getDualViewStride (const DualViewType& dv)
305  {
306  // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
307  // method yet, but its Views do.
308  const size_t LDA = dv.d_view.stride (1);
309  const size_t numRows = dv.extent (0);
310 
311  if (LDA == 0) {
312  return (numRows == 0) ? size_t (1) : numRows;
313  }
314  else {
315  return LDA;
316  }
317  }
318 
319  template<class ViewType>
320  size_t
321  getViewStride (const ViewType& view)
322  {
323  const size_t LDA = view.stride (1);
324  const size_t numRows = view.extent (0);
325 
326  if (LDA == 0) {
327  return (numRows == 0) ? size_t (1) : numRows;
328  }
329  else {
330  return LDA;
331  }
332  }
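  // Editorial illustration (not part of the original file): the fallback above
  // keeps the leading dimension (LDA) handed to BLAS nonzero, matching the
  // usual LAPACK convention LDA >= max(1, numRows). A rough sketch:
  //
  //   Kokkos::View<double**, Kokkos::LayoutLeft> A ("A", 5, 0);
  //   getViewStride (A); // stride(1) is typically 5 here, so 5 is returned
  //   Kokkos::View<double**, Kokkos::LayoutLeft> B ("B", 0, 0);
  //   getViewStride (B); // stride(1) == 0 and numRows == 0, so 1 is returned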
333 
334  template <class SC, class LO, class GO, class NT>
335  bool
336  multiVectorRunNormOnHost (const ::Tpetra::MultiVector<SC, LO, GO, NT>& X)
337  {
338  if (! X.need_sync_device ()) {
339  return false; // most up-to-date on device
340  }
341  else { // most up-to-date on host
342  constexpr size_t localLengthThreshold = 10000;
343  return X.getLocalLength () <= localLengthThreshold;
344  }
345  }
346 
347  template <class SC, class LO, class GO, class NT>
348  void
349  multiVectorNormImpl (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
350  const ::Tpetra::MultiVector<SC, LO, GO, NT>& X,
351  const ::Tpetra::Details::EWhichNorm whichNorm)
352  {
353  using ::Tpetra::Details::normImpl;
354  using MV = ::Tpetra::MultiVector<SC, LO, GO, NT>;
355  using val_type = typename MV::impl_scalar_type;
356  using mag_type = typename MV::mag_type;
357  using dual_view_type = typename MV::dual_view_type;
358 
359  auto map = X.getMap ();
360  auto comm = map.is_null () ? nullptr : map->getComm ().getRawPtr ();
361  auto whichVecs = getMultiVectorWhichVectors (X);
362  const bool isConstantStride = X.isConstantStride ();
363  const bool isDistributed = X.isDistributed ();
364 
365  const bool runOnHost = multiVectorRunNormOnHost (X);
366  if (runOnHost) {
367  using view_type = typename dual_view_type::t_host;
368  using array_layout = typename view_type::array_layout;
369  using device_type = typename view_type::device_type;
370 
371  if (X.need_sync_host ()) {
372  X.sync_host ();
373  }
374  view_type X_lcl = X.getLocalViewHost ();
375  normImpl<val_type, array_layout, device_type,
376  mag_type> (norms, X_lcl, whichNorm, whichVecs,
377  isConstantStride, isDistributed, comm);
378  }
379  else {
380  using view_type = typename dual_view_type::t_dev;
381  using array_layout = typename view_type::array_layout;
382  using device_type = typename view_type::device_type;
383 
384  if (X.need_sync_device ()) {
385  X.sync_device ();
386  }
387  view_type X_lcl = X.getLocalViewDevice ();
388  normImpl<val_type, array_layout, device_type,
389  mag_type> (norms, X_lcl, whichNorm, whichVecs,
390  isConstantStride, isDistributed, comm);
391  }
392  }
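  // Editorial illustration (not part of the original file): from the caller's
  // point of view the host/device choice made above is invisible. A typical
  // norm computation looks like the sketch below; X is assumed to be a
  // Tpetra::MultiVector<double>, so mag_type is double.
  //
  //   std::vector<double> norms (X.getNumVectors ());
  //   Teuchos::ArrayView<double> normView (norms.data (), norms.size ());
  //   X.norm2 (normView); // small host-resident data is normed on host, otherwise on device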
393 } // namespace (anonymous)
394 
395 
396 namespace Tpetra {
397 
398  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
399  bool
400  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
401  vectorIndexOutOfRange (const size_t VectorIndex) const {
402  return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
403  }
404 
405  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
406  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
407  MultiVector () :
408  base_type (Teuchos::rcp (new map_type ()))
409  {}
410 
411  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
412  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
413  MultiVector (const Teuchos::RCP<const map_type>& map,
414  const size_t numVecs,
415  const bool zeroOut) : /* default is true */
416  base_type (map)
417  {
418  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,numVecs,zeroOut)");
419 
420  const size_t lclNumRows = this->getLocalLength ();
421  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
422  origView_ = view_;
423  }
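  // Editorial illustration (not part of the original file): typical use of the
  // constructor above. This is a sketch; it assumes Tpetra has been
  // initialized (e.g. via Tpetra::ScopeGuard) and uses the default
  // communicator and a contiguous Map of 1000 global rows.
  //
  //   using MV = Tpetra::MultiVector<double>;
  //   auto comm = Tpetra::getDefaultComm ();
  //   auto map = Teuchos::rcp (new Tpetra::Map<> (1000, 0, comm));
  //   MV X (map, 3);        // 3 columns, initialized to zero
  //   MV Y (map, 3, false); // 3 columns, uninitialized (NaN-filled in debug builds)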
424 
425  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
426  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
427  MultiVector (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& source,
428  const Teuchos::DataAccess copyOrView) :
429  base_type (source),
430  view_ (source.view_),
431  origView_ (source.origView_),
432  whichVectors_ (source.whichVectors_)
433  {
434  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
435  const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
436  "const Teuchos::DataAccess): ";
437 
438  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV 2-arg \"copy\" ctor");
439 
440  if (copyOrView == Teuchos::Copy) {
441  // Reuse the conveniently already existing function that creates
442  // a deep copy.
443  MV cpy = createCopy (source);
444  this->view_ = cpy.view_;
445  this->origView_ = cpy.origView_;
446  this->whichVectors_ = cpy.whichVectors_;
447  }
448  else if (copyOrView == Teuchos::View) {
449  }
450  else {
451  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
452  true, std::invalid_argument, "Second argument 'copyOrView' has an "
453  "invalid value " << copyOrView << ". Valid values include "
454  "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
455  << Teuchos::View << ".");
456  }
457  }
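  // Editorial illustration (not part of the original file): the two
  // Teuchos::DataAccess modes accepted above, sketched for an existing
  // MultiVector X of type MV (as in the previous sketch).
  //
  //   MV X_view (X, Teuchos::View); // shares X's data; changes show up in both
  //   MV X_copy (X, Teuchos::Copy); // deep copy; independent storage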
458 
459  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
460  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
461  MultiVector (const Teuchos::RCP<const map_type>& map,
462  const dual_view_type& view) :
463  base_type (map),
464  view_ (view),
465  origView_ (view)
466  {
467  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
468  const size_t lclNumRows_map = map.is_null () ? size_t (0) :
469  map->getNodeNumElements ();
470  const size_t lclNumRows_view = view.extent (0);
471  const size_t LDA = getDualViewStride (view);
472 
473  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
474  (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
475  std::invalid_argument, "Kokkos::DualView does not match Map. "
476  "map->getNodeNumElements() = " << lclNumRows_map
477  << ", view.extent(0) = " << lclNumRows_view
478  << ", and getStride() = " << LDA << ".");
479 
480  using ::Tpetra::Details::Behavior;
481  const bool debug = Behavior::debug ();
482  if (debug) {
483  using ::Tpetra::Details::checkGlobalDualViewValidity;
484  std::ostringstream gblErrStrm;
485  const bool verbose = Behavior::verbose ();
486  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
487  const bool gblValid =
488  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
489  comm.getRawPtr ());
490  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
491  (! gblValid, std::runtime_error, gblErrStrm.str ());
492  }
493  }
494 
495 
496  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
497  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
498  MultiVector (const Teuchos::RCP<const map_type>& map,
499  const typename dual_view_type::t_dev& d_view) :
500  base_type (map)
501  {
502  using Teuchos::ArrayRCP;
503  using Teuchos::RCP;
504  const char tfecfFuncName[] = "MultiVector(map,d_view): ";
505 
506  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,d_view)");
507 
508  const size_t LDA = getViewStride (d_view);
509  const size_t lclNumRows = map->getNodeNumElements ();
510  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
511  (LDA < lclNumRows, std::invalid_argument, "Map does not match "
512  "Kokkos::View. map->getNodeNumElements() = " << lclNumRows
513  << ", View's column stride = " << LDA
514  << ", and View's extent(0) = " << d_view.extent (0) << ".");
515 
516  auto h_view = Kokkos::create_mirror_view (d_view);
517  view_ = dual_view_type (d_view, h_view);
518 
519  using ::Tpetra::Details::Behavior;
520  const bool debug = Behavior::debug ();
521  if (debug) {
522  using ::Tpetra::Details::checkGlobalDualViewValidity;
523  std::ostringstream gblErrStrm;
524  const bool verbose = Behavior::verbose ();
525  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
526  const bool gblValid =
527  checkGlobalDualViewValidity (&gblErrStrm, view_, verbose,
528  comm.getRawPtr ());
529  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
530  (! gblValid, std::runtime_error, gblErrStrm.str ());
531  }
532  // The user gave us a device view. In order to respect its
533  // initial contents, we mark the DualView as "modified on device."
534  // That way, the next sync will synchronize it with the host view.
535  this->modify_device ();
536  origView_ = view_;
537  }
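  // Editorial illustration (not part of the original file): wrapping
  // caller-owned device data with the constructor above (a sketch; map is an
  // assumption, as in the earlier sketches).
  //
  //   using MV = Tpetra::MultiVector<double>;
  //   using dev_view_type = typename MV::dual_view_type::t_dev;
  //   dev_view_type d ("user data", map->getNodeNumElements (), 2);
  //   MV X (map, d); // X views d; the next sync_host() copies device -> host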
538 
539  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
540  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
541  MultiVector (const Teuchos::RCP<const map_type>& map,
542  const dual_view_type& view,
543  const dual_view_type& origView) :
544  base_type (map),
545  view_ (view),
546  origView_ (origView)
547  {
548  const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
549 
550  const size_t LDA = getDualViewStride (origView);
551  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
552  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
553  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
554  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
555  << ". This may also mean that the input origView's first dimension (number "
556  "of rows = " << origView.extent (0) << ") does not match the number "
557  "of entries in the Map on the calling process.");
558 
559  using ::Tpetra::Details::Behavior;
560  const bool debug = Behavior::debug ();
561  if (debug) {
562  using ::Tpetra::Details::checkGlobalDualViewValidity;
563  std::ostringstream gblErrStrm;
564  const bool verbose = Behavior::verbose ();
565  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
566  const bool gblValid_0 =
567  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
568  comm.getRawPtr ());
569  const bool gblValid_1 =
570  checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
571  comm.getRawPtr ());
572  const bool gblValid = gblValid_0 && gblValid_1;
573  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
574  (! gblValid, std::runtime_error, gblErrStrm.str ());
575  }
576  }
577 
578 
579  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
580  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
581  MultiVector (const Teuchos::RCP<const map_type>& map,
582  const dual_view_type& view,
583  const Teuchos::ArrayView<const size_t>& whichVectors) :
584  base_type (map),
585  view_ (view),
586  origView_ (view),
587  whichVectors_ (whichVectors.begin (), whichVectors.end ())
588  {
589  using Kokkos::ALL;
590  using Kokkos::subview;
591  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
592 
593  using ::Tpetra::Details::Behavior;
594  const bool debug = Behavior::debug ();
595  if (debug) {
596  using ::Tpetra::Details::checkGlobalDualViewValidity;
597  std::ostringstream gblErrStrm;
598  const bool verbose = Behavior::verbose ();
599  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
600  const bool gblValid =
601  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
602  comm.getRawPtr ());
603  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
604  (! gblValid, std::runtime_error, gblErrStrm.str ());
605  }
606 
607  const size_t lclNumRows = map.is_null () ? size_t (0) :
608  map->getNodeNumElements ();
609  // Check dimensions of the input DualView. We accept that Kokkos
610  // might not allow construction of a 0 x m (Dual)View with m > 0,
611  // so we only require the number of rows to match if the
612  // (Dual)View has more than zero columns. Likewise, we only
613  // require the number of columns to match if the (Dual)View has
614  // more than zero rows.
615  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
616  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
617  std::invalid_argument, "view.extent(0) = " << view.extent (0)
618  << " < map->getNodeNumElements() = " << lclNumRows << ".");
619  if (whichVectors.size () != 0) {
620  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
621  view.extent (1) == 0,
622  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
623  " = " << whichVectors.size () << " > 0.");
624  size_t maxColInd = 0;
625  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
626  for (size_type k = 0; k < whichVectors.size (); ++k) {
627  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
628  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
629  std::invalid_argument, "whichVectors[" << k << "] = "
630  "Teuchos::OrdinalTraits<size_t>::invalid().");
631  maxColInd = std::max (maxColInd, whichVectors[k]);
632  }
633  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
634  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
635  std::invalid_argument, "view.extent(1) = " << view.extent (1)
636  << " <= max(whichVectors) = " << maxColInd << ".");
637  }
638 
639  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
640  // zero strides, so modify in that case.
641  const size_t LDA = getDualViewStride (view);
642  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
643  (LDA < lclNumRows, std::invalid_argument,
644  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
645 
646  if (whichVectors.size () == 1) {
647  // If whichVectors has only one entry, we don't need to bother
648  // with nonconstant stride. Just shift the view over so it
649  // points to the desired column.
650  //
651  // NOTE (mfh 10 May 2014) This is a special case where we set
652  // origView_ just to view that one column, not all of the
653  // original columns. This ensures that the use of origView_ in
654  // offsetView works correctly.
655  const std::pair<size_t, size_t> colRng (whichVectors[0],
656  whichVectors[0] + 1);
657  view_ = takeSubview (view_, ALL (), colRng);
658  origView_ = takeSubview (origView_, ALL (), colRng);
659  // whichVectors_.size() == 0 means "constant stride."
660  whichVectors_.clear ();
661  }
662  }
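  // Editorial illustration (not part of the original file): selecting a subset
  // of columns with the constructor above generally yields a
  // non-constant-stride MultiVector (a sketch; map and the dual view dv, with
  // at least four columns, are assumptions).
  //
  //   std::vector<size_t> cols {0, 2, 3};
  //   Teuchos::ArrayView<const size_t> colsView (cols.data (), cols.size ());
  //   MV X (map, dv, colsView);
  //   // X.isConstantStride() == false: columns 0, 2, 3 are not contiguous.
  //   std::vector<size_t> oneCol {2};
  //   MV Y (map, dv, Teuchos::ArrayView<const size_t> (oneCol.data (), 1));
  //   // Y.isConstantStride() == true: per the special case above, a single
  //   // selected column collapses back to constant stride.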
663 
664  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
665  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
666  MultiVector (const Teuchos::RCP<const map_type>& map,
667  const dual_view_type& view,
668  const dual_view_type& origView,
669  const Teuchos::ArrayView<const size_t>& whichVectors) :
670  base_type (map),
671  view_ (view),
672  origView_ (origView),
673  whichVectors_ (whichVectors.begin (), whichVectors.end ())
674  {
675  using Kokkos::ALL;
676  using Kokkos::subview;
677  const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
678 
679  using ::Tpetra::Details::Behavior;
680  const bool debug = Behavior::debug ();
681  if (debug) {
682  using ::Tpetra::Details::checkGlobalDualViewValidity;
683  std::ostringstream gblErrStrm;
684  const bool verbose = Behavior::verbose ();
685  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
686  const bool gblValid_0 =
687  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
688  comm.getRawPtr ());
689  const bool gblValid_1 =
690  checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
691  comm.getRawPtr ());
692  const bool gblValid = gblValid_0 && gblValid_1;
693  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
694  (! gblValid, std::runtime_error, gblErrStrm.str ());
695  }
696 
697  const size_t lclNumRows = this->getLocalLength ();
698  // Check dimensions of the input DualView. We accept that Kokkos
699  // might not allow construction of a 0 x m (Dual)View with m > 0,
700  // so we only require the number of rows to match if the
701  // (Dual)View has more than zero columns. Likewise, we only
702  // require the number of columns to match if the (Dual)View has
703  // more than zero rows.
704  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
705  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
706  std::invalid_argument, "view.extent(0) = " << view.extent (0)
707  << " < map->getNodeNumElements() = " << lclNumRows << ".");
708  if (whichVectors.size () != 0) {
709  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
710  view.extent (1) == 0,
711  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
712  " = " << whichVectors.size () << " > 0.");
713  size_t maxColInd = 0;
714  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
715  for (size_type k = 0; k < whichVectors.size (); ++k) {
716  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
717  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
718  std::invalid_argument, "whichVectors[" << k << "] = "
719  "Teuchos::OrdinalTraits<size_t>::invalid().");
720  maxColInd = std::max (maxColInd, whichVectors[k]);
721  }
722  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
723  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
724  std::invalid_argument, "view.extent(1) = " << view.extent (1)
725  << " <= max(whichVectors) = " << maxColInd << ".");
726  }
727 
728  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
729  // zero strides, so modify in that case.
730  const size_t LDA = getDualViewStride (origView);
731  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
732  (LDA < lclNumRows, std::invalid_argument, "Map and DualView origView "
733  "do not match. LDA = " << LDA << " < this->getLocalLength() = " <<
734  lclNumRows << ". origView.extent(0) = " << origView.extent(0)
735  << ", origView.stride(1) = " << origView.d_view.stride(1) << ".");
736 
737  if (whichVectors.size () == 1) {
738  // If whichVectors has only one entry, we don't need to bother
739  // with nonconstant stride. Just shift the view over so it
740  // points to the desired column.
741  //
742  // NOTE (mfh 10 May 2014) This is a special case where we set
743  // origView_ just to view that one column, not all of the
744  // original columns. This ensures that the use of origView_ in
745  // offsetView works correctly.
746  const std::pair<size_t, size_t> colRng (whichVectors[0],
747  whichVectors[0] + 1);
748  view_ = takeSubview (view_, ALL (), colRng);
749  origView_ = takeSubview (origView_, ALL (), colRng);
750  // whichVectors_.size() == 0 means "constant stride."
751  whichVectors_.clear ();
752  }
753  }
754 
755  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
756  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
757  MultiVector (const Teuchos::RCP<const map_type>& map,
758  const Teuchos::ArrayView<const Scalar>& data,
759  const size_t LDA,
760  const size_t numVecs) :
761  base_type (map)
762  {
763  typedef LocalOrdinal LO;
764  typedef GlobalOrdinal GO;
765  const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
766  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
767 
768  // Deep copy constructor, constant stride (NO whichVectors_).
769  // There is no need for a deep copy constructor with nonconstant stride.
770 
771  const size_t lclNumRows =
772  map.is_null () ? size_t (0) : map->getNodeNumElements ();
773  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
774  (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
775  "map->getNodeNumElements() = " << lclNumRows << ".");
776  if (numVecs != 0) {
777  const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
778  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
779  (static_cast<size_t> (data.size ()) < minNumEntries,
780  std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
781  "entries, given the input Map and number of vectors in the MultiVector."
782  " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
783  "map->getNodeNumElements () = " << minNumEntries << ".");
784  }
785 
786  this->view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
787  this->modify_device ();
788  auto X_out = this->getLocalViewDevice ();
789  origView_ = view_;
790 
791  // Make an unmanaged host Kokkos::View of the input data. First
792  // create a View (X_in_orig) with the original stride. Then,
793  // create a subview (X_in) with the right number of columns.
794  const impl_scalar_type* const X_in_raw =
795  reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
796  Kokkos::View<const impl_scalar_type**,
797  Kokkos::LayoutLeft,
798  Kokkos::HostSpace,
799  Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
800  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
801  auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
802 
803  // If LDA != X_out's column stride, then we need to copy one
804  // column at a time; Kokkos::deep_copy refuses to work in that
805  // case.
806  const size_t outStride =
807  X_out.extent (1) == 0 ? size_t (1) : X_out.stride (1);
808  if (LDA == outStride) { // strides are the same; deep_copy once
809  // This only works because MultiVector uses LayoutLeft.
810  // We would need a custom copy functor otherwise.
811  Kokkos::deep_copy (X_out, X_in);
812  }
813  else { // strides differ; copy one column at a time
814  typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
815  out_col_view_type;
816  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
817  in_col_view_type;
818  for (size_t j = 0; j < numVecs; ++j) {
819  out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
820  in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
821  Kokkos::deep_copy (X_out_j, X_in_j);
822  }
823  }
824  }
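  // Editorial illustration (not part of the original file): deep-copy
  // construction from column-major host data with a leading dimension LDA
  // that may exceed the local row count (a sketch; map, lclNumRows, and
  // numVecs are assumptions).
  //
  //   const size_t LDA = lclNumRows + 2; // padding is fine as long as LDA >= lclNumRows
  //   std::vector<double> data (LDA * numVecs, 1.0);
  //   Teuchos::ArrayView<const double> dataView (data.data (), data.size ());
  //   Tpetra::MultiVector<double> X (map, dataView, LDA, numVecs);
  //   // X owns its own storage; later changes to 'data' do not affect X.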
825 
826  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
827  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
828  MultiVector (const Teuchos::RCP<const map_type>& map,
829  const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
830  const size_t numVecs) :
831  base_type (map)
832  {
833  typedef impl_scalar_type IST;
834  typedef LocalOrdinal LO;
835  typedef GlobalOrdinal GO;
836  const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
837  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
838 
839  const size_t lclNumRows =
840  map.is_null () ? size_t (0) : map->getNodeNumElements ();
841  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
842  (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
843  std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
844  "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
845  for (size_t j = 0; j < numVecs; ++j) {
846  Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
847  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
848  static_cast<size_t> (X_j_av.size ()) < lclNumRows,
849  std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
850  << X_j_av.size () << " < map->getNodeNumElements() = " << lclNumRows
851  << ".");
852  }
853 
854  view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
855  this->modify_device ();
856  auto X_out = this->getLocalViewDevice ();
857 
858  // Make sure that the type of a single input column has the same
859  // array layout as each output column does, so we can deep_copy.
860  using array_layout = typename decltype (X_out)::array_layout;
861  using input_col_view_type = typename Kokkos::View<const IST*,
862  array_layout,
863  Kokkos::HostSpace,
864  Kokkos::MemoryUnmanaged>;
865 
866  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
867  for (size_t j = 0; j < numVecs; ++j) {
868  Teuchos::ArrayView<const IST> X_j_av =
869  Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
870  input_col_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
871  auto X_j_out = Kokkos::subview (X_out, rowRng, j);
872  Kokkos::deep_copy (X_j_out, X_j_in);
873  }
874  origView_ = view_;
875  }
876 
877  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
878  bool
879  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::isConstantStride () const {
880  return whichVectors_.empty ();
881  }
882 
883  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
884  size_t
885  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
886  getLocalLength () const
887  {
888  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
889  return static_cast<size_t> (0);
890  } else {
891  return this->getMap ()->getNodeNumElements ();
892  }
893  }
894 
895  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
896  global_size_t
897  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
898  getGlobalLength () const
899  {
900  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
901  return static_cast<size_t> (0);
902  } else {
903  return this->getMap ()->getGlobalNumElements ();
904  }
905  }
906 
907  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
908  size_t
909  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
910  getStride () const
911  {
912  return isConstantStride () ? getDualViewStride (origView_) : size_t (0);
913  }
914 
915  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
916  bool
917  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
918  checkSizes (const SrcDistObject& sourceObj)
919  {
920  // Check whether the source object is a MultiVector. If not, then
921  // we can't even compare sizes, so it's definitely not OK to
922  // Import or Export from it.
923  typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> this_type;
924  const this_type* src = dynamic_cast<const this_type*> (&sourceObj);
925  if (src == nullptr) {
926  return false;
927  }
928  else {
929  // The target of the Import or Export calls checkSizes() in
930  // DistObject::doTransfer(). By that point, we've already
931  // constructed an Import or Export object using the two
932  // multivectors' Maps, which means that (hopefully) we've
933  // already checked other attributes of the multivectors. Thus,
934  // all we need to do here is check the number of columns.
935  return src->getNumVectors () == this->getNumVectors ();
936  }
937  }
938 
939  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
940  size_t
941  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
942  constantNumberOfPackets () const {
943  return this->getNumVectors ();
944  }
945 
946  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
947  void
948  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
949 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
950  copyAndPermuteNew
951 #else // TPETRA_ENABLE_DEPRECATED_CODE
952  copyAndPermute
953 #endif // TPETRA_ENABLE_DEPRECATED_CODE
954  (const SrcDistObject& sourceObj,
955  const size_t numSameIDs,
956  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
957  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs)
958  {
959  using ::Tpetra::Details::Behavior;
960  using ::Tpetra::Details::getDualViewCopyFromArrayView;
961  using ::Tpetra::Details::ProfilingRegion;
962  using std::endl;
963  using KokkosRefactor::Details::permute_array_multi_column;
964  using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
965  using Kokkos::Compat::create_const_view;
966  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
967  const char tfecfFuncName[] = "copyAndPermute: ";
968  ProfilingRegion regionCAP ("Tpetra::MultiVector::copyAndPermute");
969 
970  const bool verbose = Behavior::verbose ();
971  std::unique_ptr<std::string> prefix;
972  if (verbose) {
973  auto map = this->getMap ();
974  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
975  const int myRank = comm.is_null () ? -1 : comm->getRank ();
976  std::ostringstream os;
977  os << "Proc " << myRank << ": MV::copyAndPermute: ";
978  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
979  os << "Start" << endl;
980  std::cerr << os.str ();
981  }
982 
983  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
984  (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0),
985  std::logic_error, "permuteToLIDs.extent(0) = "
986  << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = "
987  << permuteFromLIDs.extent (0) << ".");
988 
989  // We've already called checkSizes(), so this cast must succeed.
990  const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
991  const size_t numCols = this->getNumVectors ();
992 
993  // sourceMV doesn't belong to us, so we can't sync it. Do the
994  // copying where it's currently sync'd.
995  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
996  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
997  std::logic_error, "Input MultiVector needs sync to both host "
998  "and device.");
999  const bool copyOnHost = sourceMV.need_sync_device ();
1000  if (verbose) {
1001  std::ostringstream os;
1002  os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1003  std::cerr << os.str ();
1004  }
1005 
1006  if (copyOnHost) {
1007  if (this->need_sync_host ()) {
1008  this->sync_host ();
1009  }
1010  this->modify_host ();
1011  }
1012  else {
1013  if (this->need_sync_device ()) {
1014  this->sync_device ();
1015  }
1016  this->modify_device ();
1017  }
1018 
1019  if (verbose) {
1020  std::ostringstream os;
1021  os << *prefix << "Copy" << endl;
1022  std::cerr << os.str ();
1023  }
1024 
1025  // TODO (mfh 15 Sep 2013) When we replace
1026  // KokkosClassic::MultiVector with a Kokkos::View, there are two
1027  // ways to copy the data:
1028  //
1029  // 1. Get a (sub)view of each column and call deep_copy on that.
1030  // 2. Write a custom kernel to copy the data.
1031  //
1032  // The first is easier, but the second might be more performant in
1033  // case we decide to use layouts other than LayoutLeft. It might
1034  // even make sense to hide whichVectors_ in an entirely new layout
1035  // for Kokkos Views.
1036 
1037  // Copy rows [0, numSameIDs-1] of the local multivectors.
1038  //
1039  // Note (ETP 2 Jul 2014) We need to always copy one column at a
1040  // time, even when both multivectors are constant-stride, since
1041  // deep_copy between strided subviews with more than one column
1042  // doesn't currently work.
1043 
1044  // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1045  // both source and target are constant stride and have multiple
1046  // columns.
1047  if (numSameIDs > 0) {
1048  const std::pair<size_t, size_t> rows (0, numSameIDs);
1049  if (copyOnHost) {
1050  auto tgt_h = this->getLocalViewHost ();
1051  auto src_h = create_const_view (sourceMV.getLocalViewHost ());
1052 
1053  for (size_t j = 0; j < numCols; ++j) {
1054  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1055  const size_t srcCol =
1056  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1057 
1058  auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
1059  auto src_j = Kokkos::subview (src_h, rows, srcCol);
1060  Kokkos::deep_copy (tgt_j, src_j); // Copy src_j into tgt_j
1061  }
1062  }
1063  else { // copy on device
1064  auto tgt_d = this->getLocalViewDevice ();
1065  auto src_d = create_const_view (sourceMV.getLocalViewDevice ());
1066 
1067  for (size_t j = 0; j < numCols; ++j) {
1068  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1069  const size_t srcCol =
1070  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1071 
1072  auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
1073  auto src_j = Kokkos::subview (src_d, rows, srcCol);
1074  Kokkos::deep_copy (tgt_j, src_j); // Copy src_j into tgt_j
1075  }
1076  }
1077  }
1078 
1079  // For the remaining GIDs, execute the permutations. This may
1080  // involve noncontiguous access of both source and destination
1081  // vectors, depending on the LID lists.
1082  //
1083  // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1084  // the same process, this merges their values by replacement of
1085  // the last encountered GID, not by the specified merge rule
1086  // (such as ADD).
1087 
1088  // If there are no permutations, we are done
1089  if (permuteFromLIDs.extent (0) == 0 ||
1090  permuteToLIDs.extent (0) == 0) {
1091  if (verbose) {
1092  std::ostringstream os;
1093  os << *prefix << "No permutations. Done!" << endl;
1094  std::cerr << os.str ();
1095  }
1096  return;
1097  }
1098 
1099  if (verbose) {
1100  std::ostringstream os;
1101  os << *prefix << "Permute" << endl;
1102  std::cerr << os.str ();
1103  }
1104 
1105  // We could in theory optimize for the case where exactly one of
1106  // them is constant stride, but we don't currently do that.
1107  const bool nonConstStride =
1108  ! this->isConstantStride () || ! sourceMV.isConstantStride ();
1109 
1110  if (verbose) {
1111  std::ostringstream os;
1112  os << *prefix << "nonConstStride="
1113  << (nonConstStride ? "true" : "false") << endl;
1114  std::cerr << os.str ();
1115  }
1116 
1117  // We only need the "which vectors" arrays if either the source or
1118  // target MV is not constant stride. Since we only have one
1119  // kernel that must do double-duty, we have to create a "which
1120  // vectors" array for the MV that _is_ constant stride.
1121  Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1122  Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1123  if (nonConstStride) {
1124  if (this->whichVectors_.size () == 0) {
1125  Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
1126  tmpTgt.modify_host ();
1127  for (size_t j = 0; j < numCols; ++j) {
1128  tmpTgt.h_view(j) = j;
1129  }
1130  if (! copyOnHost) {
1131  tmpTgt.sync_device ();
1132  }
1133  tgtWhichVecs = tmpTgt;
1134  }
1135  else {
1136  Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
1137  tgtWhichVecs =
1138  getDualViewCopyFromArrayView<size_t, device_type> (tgtWhichVecsT,
1139  "tgtWhichVecs",
1140  copyOnHost);
1141  }
1142  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1143  (static_cast<size_t> (tgtWhichVecs.extent (0)) !=
1144  this->getNumVectors (),
1145  std::logic_error, "tgtWhichVecs.extent(0) = " <<
1146  tgtWhichVecs.extent (0) << " != this->getNumVectors() = " <<
1147  this->getNumVectors () << ".");
1148 
1149  if (sourceMV.whichVectors_.size () == 0) {
1150  Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
1151  tmpSrc.modify_host ();
1152  for (size_t j = 0; j < numCols; ++j) {
1153  tmpSrc.h_view(j) = j;
1154  }
1155  if (! copyOnHost) {
1156  tmpSrc.sync_device ();
1157  }
1158  srcWhichVecs = tmpSrc;
1159  }
1160  else {
1161  Teuchos::ArrayView<const size_t> srcWhichVecsT =
1162  sourceMV.whichVectors_ ();
1163  srcWhichVecs =
1164  getDualViewCopyFromArrayView<size_t, device_type> (srcWhichVecsT,
1165  "srcWhichVecs",
1166  copyOnHost);
1167  }
1168  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1169  (static_cast<size_t> (srcWhichVecs.extent (0)) !=
1170  sourceMV.getNumVectors (), std::logic_error,
1171  "srcWhichVecs.extent(0) = " << srcWhichVecs.extent (0)
1172  << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
1173  << ".");
1174  }
1175 
1176  if (copyOnHost) { // permute on host too
1177  if (verbose) {
1178  std::ostringstream os;
1179  os << *prefix << "Get permute LIDs on host" << std::endl;
1180  std::cerr << os.str ();
1181  }
1182  auto tgt_h = this->getLocalViewHost ();
1183  auto src_h = create_const_view (sourceMV.getLocalViewHost ());
1184 
1185  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
1186  auto permuteToLIDs_h = create_const_view (permuteToLIDs.view_host ());
1187  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
1188  auto permuteFromLIDs_h =
1189  create_const_view (permuteFromLIDs.view_host ());
1190 
1191  if (verbose) {
1192  std::ostringstream os;
1193  os << *prefix << "Permute on host" << endl;
1194  std::cerr << os.str ();
1195  }
1196  if (nonConstStride) {
1197  // No need to sync first, because copyOnHost argument to
1198  // getDualViewCopyFromArrayView puts them in the right place.
1199  auto tgtWhichVecs_h =
1200  create_const_view (tgtWhichVecs.view_host ());
1201  auto srcWhichVecs_h =
1202  create_const_view (srcWhichVecs.view_host ());
1203  permute_array_multi_column_variable_stride (tgt_h, src_h,
1204  permuteToLIDs_h,
1205  permuteFromLIDs_h,
1206  tgtWhichVecs_h,
1207  srcWhichVecs_h, numCols);
1208  }
1209  else {
1210  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1211  permuteFromLIDs_h, numCols);
1212  }
1213  }
1214  else { // permute on device
1215  if (verbose) {
1216  std::ostringstream os;
1217  os << *prefix << "Get permute LIDs on device" << endl;
1218  std::cerr << os.str ();
1219  }
1220  auto tgt_d = this->getLocalViewDevice ();
1221  auto src_d = create_const_view (sourceMV.getLocalViewDevice ());
1222 
1223  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () );
1224  auto permuteToLIDs_d = create_const_view (permuteToLIDs.view_device ());
1225  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () );
1226  auto permuteFromLIDs_d =
1227  create_const_view (permuteFromLIDs.view_device ());
1228 
1229  if (verbose) {
1230  std::ostringstream os;
1231  os << *prefix << "Permute on device" << endl;
1232  std::cerr << os.str ();
1233  }
1234  if (nonConstStride) {
1235  // No need to sync first, because copyOnHost argument to
1236  // getDualViewCopyFromArrayView puts them in the right place.
1237  auto tgtWhichVecs_d = create_const_view (tgtWhichVecs.view_device ());
1238  auto srcWhichVecs_d = create_const_view (srcWhichVecs.view_device ());
1239  permute_array_multi_column_variable_stride (tgt_d, src_d,
1240  permuteToLIDs_d,
1241  permuteFromLIDs_d,
1242  tgtWhichVecs_d,
1243  srcWhichVecs_d, numCols);
1244  }
1245  else {
1246  permute_array_multi_column (tgt_d, src_d, permuteToLIDs_d,
1247  permuteFromLIDs_d, numCols);
1248  }
1249  }
1250 
1251  if (verbose) {
1252  std::ostringstream os;
1253  os << *prefix << "Done!" << endl;
1254  std::cerr << os.str ();
1255  }
1256  }
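  // Editorial illustration (not part of the original file): copyAndPermute,
  // packAndPrepare, and unpackAndCombine are not called directly by users;
  // DistObject::doImport / doExport invoke them. A sketch, assuming srcMap and
  // tgtMap are RCPs to Maps that share some global IDs:
  //
  //   Tpetra::Import<> importer (srcMap, tgtMap);
  //   Tpetra::MultiVector<double> src (srcMap, 3), tgt (tgtMap, 3);
  //   tgt.doImport (src, importer, Tpetra::INSERT);
  //   // On-process shared GIDs take the copyAndPermute path; off-process
  //   // rows go through packAndPrepare / unpackAndCombine.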
1257 
1258  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1259  void
1260  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1261 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1262  packAndPrepareNew
1263 #else // TPETRA_ENABLE_DEPRECATED_CODE
1264  packAndPrepare
1265 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1266  (const SrcDistObject& sourceObj,
1267  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1268  Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1269  Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1270  size_t& constantNumPackets,
1271  Distributor & /* distor */ )
1272  {
1273  using ::Tpetra::Details::Behavior;
1274  using ::Tpetra::Details::ProfilingRegion;
1275  using ::Tpetra::Details::reallocDualViewIfNeeded;
1276  using Kokkos::Compat::create_const_view;
1277  using Kokkos::Compat::getKokkosViewDeepCopy;
1278  using std::endl;
1279  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
1280  const char tfecfFuncName[] = "packAndPrepare: ";
1281  ProfilingRegion regionPAP ("Tpetra::MultiVector::packAndPrepare");
1282 
1283  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1284  // have the option to check indices. We do so when Tpetra is in
1285  // debug mode. It is in debug mode by default in a debug build,
1286  // but you may control this at run time, before launching the
1287  // executable, by setting the TPETRA_DEBUG environment variable to
1288  // "1" (or "TRUE").
1289  const bool debugCheckIndices = Behavior::debug ();
1290  // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1291  // environment variable to "1" (or "TRUE") for copious debug
1292  // output to std::cerr on every MPI process. This is unwise for
1293  // runs with large numbers of MPI processes.
1294  const bool printDebugOutput = Behavior::verbose ();
1295 
1296  std::unique_ptr<std::string> prefix;
1297  if (printDebugOutput) {
1298  auto map = this->getMap ();
1299  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1300  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1301  std::ostringstream os;
1302  os << "Proc " << myRank << ": MV::packAndPrepare: ";
1303  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1304  os << "Start" << endl;
1305  std::cerr << os.str ();
1306  }
1307 
1308  // We've already called checkSizes(), so this cast must succeed.
1309  const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
1310 
1311  const size_t numCols = sourceMV.getNumVectors ();
1312 
1313  // This spares us from needing to fill numExportPacketsPerLID.
1314  // Setting constantNumPackets to a nonzero value signals that
1315  // all packets have the same number of entries.
1316  constantNumPackets = numCols;
1317 
1318  // If we have no exports, there is nothing to do. Make sure this
1319  // goes _after_ setting constantNumPackets correctly.
1320  if (exportLIDs.extent (0) == 0) {
1321  if (printDebugOutput) {
1322  std::ostringstream os;
1323  os << *prefix << "No exports on this proc, DONE" << std::endl;
1324  std::cerr << os.str ();
1325  }
1326  return;
1327  }
1328 
1329  /* The layout in the export for MultiVectors is as follows:
1330  exports = { all of the data from row exportLIDs.front() ;
1331  ....
1332  all of the data from row exportLIDs.back() }
1333  This doesn't have the best locality, but is necessary because
1334  the data for a Packet (all data associated with an LID) is
1335  required to be contiguous. */
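  // Editorial illustration (not part of the original file): for a 3-column
  // MultiVector with exportLIDs = {5, 9}, the exports buffer laid out as
  // described above holds
  //
  //   { X(5,0), X(5,1), X(5,2), X(9,0), X(9,1), X(9,2) }
  //
  // i.e. constantNumPackets = 3 values per exported row, with each row's
  // values contiguous.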
1336 
1337  // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1338  // packing scheme in the above comment? The data going to a
1339  // particular process must be contiguous, of course, but those
1340  // data could include entries from multiple LIDs. DistObject just
1341  // needs to know how to index into that data. Kokkos is good at
1342  // decoupling storage intent from data layout choice.
1343 
1344  const size_t numExportLIDs = exportLIDs.extent (0);
1345  const size_t newExportsSize = numCols * numExportLIDs;
1346  if (printDebugOutput) {
1347  std::ostringstream os;
1348  os << *prefix << "realloc: "
1349  << "numExportLIDs: " << numExportLIDs
1350  << ", exports.extent(0): " << exports.extent (0)
1351  << ", newExportsSize: " << newExportsSize << std::endl;
1352  std::cerr << os.str ();
1353  }
1354  reallocDualViewIfNeeded (exports, newExportsSize, "exports");
1355 
1356  // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1357  // sync it. Pack it where it's currently sync'd.
1358  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1359  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1360  std::logic_error, "Input MultiVector needs sync to both host "
1361  "and device.");
1362  const bool packOnHost = sourceMV.need_sync_device ();
1363  auto src_dev = sourceMV.getLocalViewDevice ();
1364  auto src_host = sourceMV.getLocalViewHost ();
1365  if (printDebugOutput) {
1366  std::ostringstream os;
1367  os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1368  std::cerr << os.str ();
1369  }
1370 
1371  // Mark 'exports' here, since we might have resized it above.
1372  // Resizing currently requires calling the constructor, which
1373  // clears out the 'modified' flags.
1374  if (packOnHost) {
1375  exports.modify_host ();
1376  }
1377  else {
1378  exports.modify_device ();
1379  }
1380 
1381  if (numCols == 1) { // special case for one column only
1382  // MultiVector always represents a single column with constant
1383  // stride, but it doesn't hurt to implement both cases anyway.
1384  //
1385  // ETP: I'm not sure I agree with the above statement. Can't a single-
1386  // column multivector be a subview of another multi-vector, in which case
1387  // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1388  // separately.
1389  //
1390  // mfh 18 Jan 2016: In answer to ETP's comment above:
1391  // MultiVector treats single-column MultiVectors created using a
1392  // "nonconstant stride constructor" as a special case, and makes
1393  // them constant stride (by making whichVectors_ have length 0).
1394  if (sourceMV.isConstantStride ()) {
1395  using KokkosRefactor::Details::pack_array_single_column;
1396  if (printDebugOutput) {
1397  std::ostringstream os;
1398  os << *prefix << "Pack numCols=1 const stride" << endl;
1399  std::cerr << os.str ();
1400  }
1401  if (packOnHost) {
1402  pack_array_single_column (exports.view_host (),
1403  create_const_view (src_host),
1404  exportLIDs.view_host (),
1405  0,
1406  debugCheckIndices);
1407  }
1408  else { // pack on device
1409  pack_array_single_column (exports.view_device (),
1410  create_const_view (src_dev),
1411  exportLIDs.view_device (),
1412  0,
1413  debugCheckIndices);
1414  }
1415  }
1416  else {
1417  using KokkosRefactor::Details::pack_array_single_column;
1418  if (printDebugOutput) {
1419  std::ostringstream os;
1420  os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1421  std::cerr << os.str ();
1422  }
1423  if (packOnHost) {
1424  pack_array_single_column (exports.view_host (),
1425  create_const_view (src_host),
1426  exportLIDs.view_host (),
1427  sourceMV.whichVectors_[0],
1428  debugCheckIndices);
1429  }
1430  else { // pack on device
1431  pack_array_single_column (exports.view_device (),
1432  create_const_view (src_dev),
1433  exportLIDs.view_device (),
1434  sourceMV.whichVectors_[0],
1435  debugCheckIndices);
1436  }
1437  }
1438  }
1439  else { // the source MultiVector has multiple columns
1440  if (sourceMV.isConstantStride ()) {
1441  using KokkosRefactor::Details::pack_array_multi_column;
1442  if (printDebugOutput) {
1443  std::ostringstream os;
1444  os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1445  std::cerr << os.str ();
1446  }
1447  if (packOnHost) {
1448  pack_array_multi_column (exports.view_host (),
1449  create_const_view (src_host),
1450  exportLIDs.view_host (),
1451  numCols,
1452  debugCheckIndices);
1453  }
1454  else { // pack on device
1455  pack_array_multi_column (exports.view_device (),
1456  create_const_view (src_dev),
1457  exportLIDs.view_device (),
1458  numCols,
1459  debugCheckIndices);
1460  }
1461  }
1462  else {
1463  using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1464  if (printDebugOutput) {
1465  std::ostringstream os;
1466  os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1467  << endl;
1468  std::cerr << os.str ();
1469  }
1470  // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1471  // whichVectors_ can be expensive, but pack and unpack for
1472  // nonconstant-stride MultiVectors is slower anyway.
1473  using IST = impl_scalar_type;
1474  using DV = Kokkos::DualView<IST*, device_type>;
1475  using HES = typename DV::t_host::execution_space;
1476  using DES = typename DV::t_dev::execution_space;
1477  Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_ ();
1478  if (packOnHost) {
1479  pack_array_multi_column_variable_stride
1480  (exports.view_host (),
1481  create_const_view (src_host),
1482  exportLIDs.view_host (),
1483  getKokkosViewDeepCopy<HES> (whichVecs),
1484  numCols,
1485  debugCheckIndices);
1486  }
1487  else { // pack on device
1488  pack_array_multi_column_variable_stride
1489  (exports.view_device (),
1490  create_const_view (src_dev),
1491  exportLIDs.view_device (),
1492  getKokkosViewDeepCopy<DES> (whichVecs),
1493  numCols,
1494  debugCheckIndices);
1495  }
1496  }
1497  }
1498 
1499  if (printDebugOutput) {
1500  std::ostringstream os;
1501  os << *prefix << "Done!" << endl;
1502  std::cerr << os.str ();
1503  }
1504  }
1505 
1506 
1507  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1508  void
1509  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1510 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1511  unpackAndCombineNew
1512 #else // TPETRA_ENABLE_DEPRECATED_CODE
1513  unpackAndCombine
1514 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1515  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1516  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1517  Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1518  const size_t constantNumPackets,
1519  Distributor& /* distor */,
1520  const CombineMode CM)
1521  {
1522  using ::Tpetra::Details::Behavior;
1523  using ::Tpetra::Details::ProfilingRegion;
1524  using KokkosRefactor::Details::unpack_array_multi_column;
1525  using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1526  using Kokkos::Compat::getKokkosViewDeepCopy;
1527  using std::endl;
1528  using IST = impl_scalar_type;
1529  const char tfecfFuncName[] = "unpackAndCombine: ";
1530  ProfilingRegion regionUAC ("Tpetra::MultiVector::unpackAndCombine");
1531 
1532  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1533  // have the option to check indices. We do so when Tpetra is in
1534  // debug mode. It is in debug mode by default in a debug build,
1535  // but you may control this at run time, before launching the
1536  // executable, by setting the TPETRA_DEBUG environment variable to
1537  // "1" (or "TRUE").
1538  const bool debugCheckIndices = Behavior::debug ();
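      // [Editorial aside, not in the original source] Illustrative usage,
      // with a hypothetical launcher and executable name: running
      //   TPETRA_DEBUG=1 mpiexec -np 4 ./myApp
      // (or building Tpetra in debug mode) makes debugCheckIndices true,
      // so the unpack kernels below also check the local indices they are given.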
1539 
1540  const bool printDebugOutput = Behavior::verbose ();
1541  std::unique_ptr<std::string> prefix;
1542  if (printDebugOutput) {
1543  auto map = this->getMap ();
1544  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1545  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1546  std::ostringstream os;
 1547  os << "Proc " << myRank << ": MV::unpackAndCombine: ";
1548  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1549  os << "Start" << endl;
1550  std::cerr << os.str ();
1551  }
1552 
1553  // If we have no imports, there is nothing to do
1554  if (importLIDs.extent (0) == 0) {
1555  if (printDebugOutput) {
 1556  std::ostringstream os;
 1557  os << *prefix << "No imports. Done!" << endl;
       std::cerr << os.str ();
 1558  }
1559  return;
1560  }
1561 
1562  const size_t numVecs = getNumVectors ();
1563  if (debugCheckIndices) {
1564  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1565  (static_cast<size_t> (imports.extent (0)) !=
1566  numVecs * importLIDs.extent (0),
1567  std::runtime_error,
1568  "imports.extent(0) = " << imports.extent (0)
1569  << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1570  << " * " << importLIDs.extent (0) << " = "
1571  << numVecs * importLIDs.extent (0) << ".");
1572 
1573  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1574  (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1575  "constantNumPackets input argument must be nonzero.");
1576 
1577  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1578  (static_cast<size_t> (numVecs) !=
1579  static_cast<size_t> (constantNumPackets),
1580  std::runtime_error, "constantNumPackets must equal numVecs.");
1581  }
1582 
1583  // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1584  // the memory space in which the imports buffer was last modified.
1585  // DistObject::doTransferNew gets to decide this.
1586  const bool unpackOnHost = imports.need_sync_device ();
1587 
1588  if (printDebugOutput) {
1589  std::ostringstream os;
1590  os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1591  << endl;
1592  std::cerr << os.str ();
1593  }
1594 
1595  // We have to sync before modifying, because this method may read
1596  // as well as write (depending on the CombineMode).
1597  if (unpackOnHost) {
1598  if (this->need_sync_host ()) {
1599  this->sync_host ();
1600  }
1601  this->modify_host ();
1602  }
1603  else {
1604  if (this->need_sync_device ()) {
1605  this->sync_device ();
1606  }
1607  this->modify_device ();
1608  }
1609  auto X_d = this->getLocalViewDevice ();
1610  auto X_h = this->getLocalViewHost ();
1611  auto imports_d = imports.view_device ();
1612  auto imports_h = imports.view_host ();
1613  auto importLIDs_d = importLIDs.view_device ();
1614  auto importLIDs_h = importLIDs.view_host ();
1615 
1616  Kokkos::DualView<size_t*, device_type> whichVecs;
1617  if (! isConstantStride ()) {
1618  Kokkos::View<const size_t*, Kokkos::HostSpace,
1619  Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
1620  numVecs);
1621  whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
1622  if (unpackOnHost) {
1623  whichVecs.modify_host ();
1624  Kokkos::deep_copy (whichVecs.view_host (), whichVecsIn);
1625  }
1626  else {
1627  whichVecs.modify_device ();
1628  Kokkos::deep_copy (whichVecs.view_device (), whichVecsIn);
1629  }
1630  }
1631  auto whichVecs_d = whichVecs.view_device ();
1632  auto whichVecs_h = whichVecs.view_host ();
1633 
1634  /* The layout in the export for MultiVectors is as follows:
1635  imports = { all of the data from row exportLIDs.front() ;
1636  ....
1637  all of the data from row exportLIDs.back() }
1638  This doesn't have the best locality, but is necessary because
1639  the data for a Packet (all data associated with an LID) is
1640  required to be contiguous. */
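      /* [Editorial illustration, hypothetical values] For numVecs = 2 and
         importLIDs = {5, 9} on the unpack side, the buffer laid out as above is
           imports = { X(5,0), X(5,1),     // all columns of local row 5
                       X(9,0), X(9,1) };   // all columns of local row 9
         that is, one contiguous Packet of numVecs values per local row
         (column order follows whichVectors_ if the source MultiVector is
         not constant stride). */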
1641 
1642  if (numVecs > 0 && importLIDs.extent (0) > 0) {
1643  using dev_exec_space = typename dual_view_type::t_dev::execution_space;
1644  using host_exec_space = typename dual_view_type::t_host::execution_space;
1645 
1646  // This fixes GitHub Issue #4418.
1647  const bool use_atomic_updates = unpackOnHost ?
1648  host_exec_space::concurrency () != 1 :
1649  dev_exec_space::concurrency () != 1;
1650 
1651  if (printDebugOutput) {
1652  std::ostringstream os;
1653  os << *prefix << "Unpack: " << combineModeToString (CM) << endl;
1654  std::cerr << os.str ();
1655  }
1656 
1657  // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
1658  // custom combine modes, start editing here.
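      // [Editorial sketch, not part of the original source] What the dispatch
      // below does to one unpacked entry, where lid = importLIDs(i), j is the
      // column index, and k indexes the imports buffer:
      //   INSERT / REPLACE:  X(lid, j)  = imports(k);
      //   ADD:               X(lid, j) += imports(k);
      //   ABSMAX:            X(lid, j)  = max (|X(lid, j)|, |imports(k)|);
      // The actual work is done by the unpack_array_* kernels with InsertOp,
      // AddOp, and AbsMaxOp.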
1659 
1660  if (CM == INSERT || CM == REPLACE) {
1661  using op_type = KokkosRefactor::Details::InsertOp<IST>;
1662  if (isConstantStride ()) {
1663  if (unpackOnHost) {
1664  unpack_array_multi_column (host_exec_space (),
1665  X_h, imports_h, importLIDs_h,
1666  op_type (), numVecs,
1667  use_atomic_updates,
1668  debugCheckIndices);
1669 
1670  }
1671  else { // unpack on device
1672  unpack_array_multi_column (dev_exec_space (),
1673  X_d, imports_d, importLIDs_d,
1674  op_type (), numVecs,
1675  use_atomic_updates,
1676  debugCheckIndices);
1677  }
1678  }
1679  else { // not constant stride
1680  if (unpackOnHost) {
1681  unpack_array_multi_column_variable_stride (host_exec_space (),
1682  X_h, imports_h,
1683  importLIDs_h,
1684  whichVecs_h,
1685  op_type (),
1686  numVecs,
1687  use_atomic_updates,
1688  debugCheckIndices);
1689  }
1690  else { // unpack on device
1691  unpack_array_multi_column_variable_stride (dev_exec_space (),
1692  X_d, imports_d,
1693  importLIDs_d,
1694  whichVecs_d,
1695  op_type (),
1696  numVecs,
1697  use_atomic_updates,
1698  debugCheckIndices);
1699  }
1700  }
1701  }
1702  else if (CM == ADD) {
1703  using op_type = KokkosRefactor::Details::AddOp<IST>;
1704  if (isConstantStride ()) {
1705  if (unpackOnHost) {
1706  unpack_array_multi_column (host_exec_space (),
1707  X_h, imports_h, importLIDs_h,
1708  op_type (), numVecs,
1709  use_atomic_updates,
1710  debugCheckIndices);
1711  }
1712  else { // unpack on device
1713  unpack_array_multi_column (dev_exec_space (),
1714  X_d, imports_d, importLIDs_d,
1715  op_type (), numVecs,
1716  use_atomic_updates,
1717  debugCheckIndices);
1718  }
1719  }
1720  else { // not constant stride
1721  if (unpackOnHost) {
1722  unpack_array_multi_column_variable_stride (host_exec_space (),
1723  X_h, imports_h,
1724  importLIDs_h,
1725  whichVecs_h,
1726  op_type (),
1727  numVecs,
1728  use_atomic_updates,
1729  debugCheckIndices);
1730  }
1731  else { // unpack on device
1732  unpack_array_multi_column_variable_stride (dev_exec_space (),
1733  X_d, imports_d,
1734  importLIDs_d,
1735  whichVecs_d,
1736  op_type (),
1737  numVecs,
1738  use_atomic_updates,
1739  debugCheckIndices);
1740  }
1741  }
1742  }
1743  else if (CM == ABSMAX) {
1744  using op_type = KokkosRefactor::Details::AbsMaxOp<IST>;
1745  if (isConstantStride ()) {
1746  if (unpackOnHost) {
1747  unpack_array_multi_column (host_exec_space (),
1748  X_h, imports_h, importLIDs_h,
1749  op_type (), numVecs,
1750  use_atomic_updates,
1751  debugCheckIndices);
1752  }
1753  else { // unpack on device
1754  unpack_array_multi_column (dev_exec_space (),
1755  X_d, imports_d, importLIDs_d,
1756  op_type (), numVecs,
1757  use_atomic_updates,
1758  debugCheckIndices);
1759  }
1760  }
1761  else {
1762  if (unpackOnHost) {
1763  unpack_array_multi_column_variable_stride (host_exec_space (),
1764  X_h, imports_h,
1765  importLIDs_h,
1766  whichVecs_h,
1767  op_type (),
1768  numVecs,
1769  use_atomic_updates,
1770  debugCheckIndices);
1771  }
1772  else { // unpack on device
1773  unpack_array_multi_column_variable_stride (dev_exec_space (),
1774  X_d, imports_d,
1775  importLIDs_d,
1776  whichVecs_d,
1777  op_type (),
1778  numVecs,
1779  use_atomic_updates,
1780  debugCheckIndices);
1781  }
1782  }
1783  }
1784  else {
1785  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
 1786  (true, std::logic_error, "Invalid CombineMode; this method only knows "
        "how to handle INSERT, REPLACE, ADD, and ABSMAX.");
1787  }
1788  }
1789  else {
1790  if (printDebugOutput) {
1791  std::ostringstream os;
1792  os << *prefix << "Nothing to unpack" << endl;
1793  std::cerr << os.str ();
1794  }
1795  }
1796 
1797  if (printDebugOutput) {
1798  std::ostringstream os;
1799  os << *prefix << "Done!" << endl;
1800  std::cerr << os.str ();
1801  }
1802  }
1803 
1804  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 1805  size_t
 1806  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1807  getNumVectors () const
 1808  {
1809  if (isConstantStride ()) {
1810  return static_cast<size_t> (view_.extent (1));
1811  } else {
1812  return static_cast<size_t> (whichVectors_.size ());
1813  }
1814  }
1815 
1816  namespace { // (anonymous)
1817 
1818  template<class RV>
1819  void
1820  gblDotImpl (const RV& dotsOut,
1821  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
1822  const bool distributed)
1823  {
1824  using Teuchos::REDUCE_MAX;
1825  using Teuchos::REDUCE_SUM;
1826  using Teuchos::reduceAll;
1827  typedef typename RV::non_const_value_type dot_type;
1828 
1829  const size_t numVecs = dotsOut.extent (0);
1830 
1831  // If the MultiVector is distributed over multiple processes, do
1832  // the distributed (interprocess) part of the dot product. We
1833  // assume that the MPI implementation can read from and write to
1834  // device memory.
1835  //
1836  // replaceMap() may have removed some processes. Those
1837  // processes have a null Map. They must not participate in any
1838  // collective operations. We ask first whether the Map is null,
1839  // because isDistributed() defers that question to the Map. We
1840  // still compute and return local dots for processes not
1841  // participating in collective operations; those probably don't
1842  // make any sense, but it doesn't hurt to do them, since it's
1843  // illegal to call dot() on those processes anyway.
1844  if (distributed && ! comm.is_null ()) {
1845  // The calling process only participates in the collective if
1846  // both the Map and its Comm on that process are nonnull.
1847  const int nv = static_cast<int> (numVecs);
1848  const bool commIsInterComm = ::Tpetra::Details::isInterComm (*comm);
1849 
1850  if (commIsInterComm) {
1851  // If comm is an intercomm, then we may not alias input and
1852  // output buffers, so we have to make a copy of the local
1853  // sum.
1854  typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
1855  Kokkos::deep_copy (lclDots, dotsOut);
1856  const dot_type* const lclSum = lclDots.data ();
1857  dot_type* const gblSum = dotsOut.data ();
1858  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
1859  }
1860  else {
1861  dot_type* const inout = dotsOut.data ();
1862  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, inout, inout);
1863  }
1864  }
1865  }
1866  } // namespace (anonymous)
1867 
1868  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 1869  void
 1870  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 1871  dot (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
 1872  const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const
1873  {
1874  using ::Tpetra::Details::Behavior;
1875  using Kokkos::subview;
1876  using Teuchos::Comm;
1877  using Teuchos::null;
1878  using Teuchos::RCP;
1879  // View of all the dot product results.
 1880  typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
 1881  typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> MV;
 1882  typedef typename dual_view_type::t_dev XMV;
1883  const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
1884 
1885  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Kokkos::View)");
1886 
1887  const size_t numVecs = this->getNumVectors ();
1888  if (numVecs == 0) {
1889  return; // nothing to do
1890  }
1891  const size_t lclNumRows = this->getLocalLength ();
1892  const size_t numDots = static_cast<size_t> (dots.extent (0));
1893  const bool debug = Behavior::debug ();
1894 
1895  if (debug) {
1896  const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
1897  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1898  (! compat, std::invalid_argument, "'*this' MultiVector is not "
1899  "compatible with the input MultiVector A. We only test for this "
1900  "in debug mode.");
1901  }
1902 
1903  // FIXME (mfh 11 Jul 2014) These exception tests may not
1904  // necessarily be thrown on all processes consistently. We should
1905  // instead pass along error state with the inner product. We
1906  // could do this by setting an extra slot to
1907  // Kokkos::Details::ArithTraits<dot_type>::one() on error. The
1908  // final sum should be
1909  // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
1910  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1911  lclNumRows != A.getLocalLength (), std::runtime_error,
1912  "MultiVectors do not have the same local length. "
1913  "this->getLocalLength() = " << lclNumRows << " != "
1914  "A.getLocalLength() = " << A.getLocalLength () << ".");
1915  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1916  numVecs != A.getNumVectors (), std::runtime_error,
1917  "MultiVectors must have the same number of columns (vectors). "
1918  "this->getNumVectors() = " << numVecs << " != "
1919  "A.getNumVectors() = " << A.getNumVectors () << ".");
1920  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1921  numDots != numVecs, std::runtime_error,
1922  "The output array 'dots' must have the same number of entries as the "
1923  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
1924  numDots << " != this->getNumVectors() = " << numVecs << ".");
1925 
1926  const std::pair<size_t, size_t> colRng (0, numVecs);
1927  RV dotsOut = subview (dots, colRng);
1928  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
1929  this->getMap ()->getComm ();
1930 
1931  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
1932  if (this->need_sync_device ()) {
1933  const_cast<MV*>(this)->sync_device ();
1934  }
1935  if (A.need_sync_device ()) {
1936  const_cast<MV&>(A).sync_device ();
1937  }
1938 
1939  auto thisView = this->getLocalViewDevice ();
1940  auto A_view = A.getLocalViewDevice ();
1941 
1942  ::Tpetra::Details::lclDot<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
1943  this->whichVectors_.getRawPtr (),
1944  A.whichVectors_.getRawPtr (),
1945  this->isConstantStride (), A.isConstantStride ());
1946  gblDotImpl (dotsOut, comm, this->isDistributed ());
1947  }
1948 
1949  namespace { // (anonymous)
 1950  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 1951  typename MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::dot_type
 1952  multiVectorSingleColumnDot (MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& x,
 1953  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& y)
 1954  {
 1955  using ::Tpetra::Details::ProfilingRegion;
 1956  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
 1957  using dot_type = typename MV::dot_type;
1958  ProfilingRegion region ("Tpetra::multiVectorSingleColumnDot");
1959 
1960  auto map = x.getMap ();
1961  Teuchos::RCP<const Teuchos::Comm<int> > comm =
1962  map.is_null () ? Teuchos::null : map->getComm ();
1963  if (comm.is_null ()) {
1964  return Kokkos::ArithTraits<dot_type>::zero ();
1965  }
1966  else {
1967  using LO = LocalOrdinal;
1968  // The min just ensures that we don't overwrite memory that
1969  // doesn't belong to us, in the erroneous input case where x
1970  // and y have different numbers of rows.
1971  const LO lclNumRows = static_cast<LO> (std::min (x.getLocalLength (),
1972  y.getLocalLength ()));
1973  const Kokkos::pair<LO, LO> rowRng (0, lclNumRows);
1974  dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero ();
1975  dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();
1976 
1977  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
1978  if (x.need_sync_device ()) {
1979  x.sync_device ();
1980  }
1981  if (y.need_sync_device ()) {
1982  const_cast<MV&>(y).sync_device ();
1983  }
1984 
1985  x.modify_device ();
1986  auto x_2d = x.getLocalViewDevice ();
1987  auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
1988  auto y_2d = y.getLocalViewDevice ();
1989  auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
1990  lclDot = KokkosBlas::dot (x_1d, y_1d);
1991 
1992  if (x.isDistributed ()) {
1993  using Teuchos::outArg;
1994  using Teuchos::REDUCE_SUM;
1995  using Teuchos::reduceAll;
1996  reduceAll<int, dot_type> (*comm, REDUCE_SUM, lclDot, outArg (gblDot));
1997  }
1998  else {
1999  gblDot = lclDot;
2000  }
2001  return gblDot;
2002  }
2003  }
2004  } // namespace (anonymous)
2005 
2006 
2007 
 2008  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2009  void
 2010  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2011  dot (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
 2012  const Teuchos::ArrayView<dot_type>& dots) const
 2013  {
 2014  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
 2015  const char tfecfFuncName[] = "dot: ";
2016  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Teuchos::ArrayView)");
2017 
2018  const size_t numVecs = this->getNumVectors ();
2019  const size_t lclNumRows = this->getLocalLength ();
2020  const size_t numDots = static_cast<size_t> (dots.size ());
2021 
2022  // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
2023  // not necessarily be thrown on all processes consistently. We
2024  // keep them for now, because MultiVector's unit tests insist on
2025  // them. In the future, we should instead pass along error state
2026  // with the inner product. We could do this by setting an extra
2027  // slot to Kokkos::Details::ArithTraits<dot_type>::one() on error.
2028  // The final sum should be
2029  // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
2030  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2031  (lclNumRows != A.getLocalLength (), std::runtime_error,
2032  "MultiVectors do not have the same local length. "
2033  "this->getLocalLength() = " << lclNumRows << " != "
2034  "A.getLocalLength() = " << A.getLocalLength () << ".");
2035  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2036  (numVecs != A.getNumVectors (), std::runtime_error,
2037  "MultiVectors must have the same number of columns (vectors). "
2038  "this->getNumVectors() = " << numVecs << " != "
2039  "A.getNumVectors() = " << A.getNumVectors () << ".");
2040  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2041  (numDots != numVecs, std::runtime_error,
2042  "The output array 'dots' must have the same number of entries as the "
 2043  "number of columns (vectors) in *this and A. dots.size() = " <<
2044  numDots << " != this->getNumVectors() = " << numVecs << ".");
2045 
2046  if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
2047  const dot_type gblDot = multiVectorSingleColumnDot (const_cast<MV&> (*this), A);
2048  dots[0] = gblDot;
2049  }
2050  else {
2051  this->dot (A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr (), numDots));
2052  }
2053  }
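  // [Editorial usage sketch, hypothetical names] Assuming X and Y are
  // MultiVectors with the same Map and the same number of columns, and MV
  // is their MultiVector specialization:
  //   Teuchos::Array<typename MV::dot_type> dots (X.getNumVectors ());
  //   X.dot (Y, dots ());  // dots[j] = globally reduced dot product of
  //                        // column j of X with column j of Y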
2054 
2055  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2056  void
 2057  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2058  norm2 (const Teuchos::ArrayView<mag_type>& norms) const
2059  {
2061  using ::Tpetra::Details::NORM_TWO;
2062  using ::Tpetra::Details::ProfilingRegion;
2063  ProfilingRegion region ("Tpetra::MV::norm2 (host output)");
2064 
2065  // The function needs to be able to sync X.
2066  MV& X = const_cast<MV&> (*this);
2067  multiVectorNormImpl (norms.getRawPtr (), X, NORM_TWO);
2068  }
2069 
2070  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2071  void
 2072  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2073  norm2 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2074  {
2075  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2076  this->norm2 (norms_av);
2077  }
2078 
2079 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
2080  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2081  void TPETRA_DEPRECATED
 2082  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2083  normWeighted (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& weights,
 2084  const Teuchos::ArrayView<mag_type> &norms) const
2085  {
2086  using Kokkos::ALL;
2087  using Kokkos::subview;
2088  using Teuchos::Comm;
2089  using Teuchos::null;
2090  using Teuchos::RCP;
2091  using Teuchos::reduceAll;
2092  using Teuchos::REDUCE_SUM;
2093  typedef Kokkos::ArithTraits<impl_scalar_type> ATS;
2094  typedef Kokkos::ArithTraits<mag_type> ATM;
 2095  typedef Kokkos::View<mag_type*, Kokkos::HostSpace> norms_view_type;
 2096  typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> MV;
 2097  const char tfecfFuncName[] = "normWeighted: ";
2098 
2099  const size_t numVecs = this->getNumVectors ();
2100  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2101  static_cast<size_t> (norms.size ()) != numVecs, std::runtime_error,
2102  "norms.size() = " << norms.size () << " != this->getNumVectors() = "
2103  << numVecs << ".");
2104 
2105  const bool OneW = (weights.getNumVectors () == 1);
2106  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2107  ! OneW && weights.getNumVectors () != numVecs, std::runtime_error,
2108  "The input MultiVector of weights must contain either one column, "
2109  "or must have the same number of columns as *this. "
2110  "weights.getNumVectors() = " << weights.getNumVectors ()
2111  << " and this->getNumVectors() = " << numVecs << ".");
2112 
2113  const bool debug = ::Tpetra::Details::Behavior::debug ();
2114 
2115  if (debug) {
2116  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2117  (! this->getMap ()->isCompatible (*weights.getMap ()), std::runtime_error,
2118  "MultiVectors do not have compatible Maps:" << std::endl
2119  << "this->getMap(): " << std::endl << *this->getMap()
2120  << "weights.getMap(): " << std::endl << *weights.getMap() << std::endl);
2121  }
2122  else {
2123  const size_t lclNumRows = this->getLocalLength ();
2124  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2125  (lclNumRows != weights.getLocalLength (), std::runtime_error,
2126  "MultiVectors do not have the same local length.");
2127  }
2128 
2129  norms_view_type lclNrms ("Tpetra::MV::lclNrms", numVecs);
2130 
2131  // FIXME (mfh 18 May 2016) Yes, I know "const" is a lie.
2132  const_cast<MV*> (this)->sync_device ();
2133  const_cast<MV&> (weights).sync_device ();
2134 
2135  auto X_lcl = this->getLocalViewDevice ();
2136  auto W_lcl = weights.getLocalViewDevice ();
2137 
2138  if (isConstantStride () && ! OneW) {
2139  KokkosBlas::nrm2w_squared (lclNrms, X_lcl, W_lcl);
2140  }
2141  else {
2142  for (size_t j = 0; j < numVecs; ++j) {
2143  const size_t X_col = this->isConstantStride () ? j :
2144  this->whichVectors_[j];
2145  const size_t W_col = OneW ? static_cast<size_t> (0) :
2146  (weights.isConstantStride () ? j : weights.whichVectors_[j]);
2147  KokkosBlas::nrm2w_squared (subview (lclNrms, j),
2148  subview (X_lcl, ALL (), X_col),
2149  subview (W_lcl, ALL (), W_col));
2150  }
2151  }
2152 
2153  const mag_type OneOverN =
2154  ATM::one () / static_cast<mag_type> (this->getGlobalLength ());
2155  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
2156  Teuchos::null : this->getMap ()->getComm ();
2157 
2158  if (! comm.is_null () && this->isDistributed ()) {
2159  // Assume that MPI can access device memory.
2160  reduceAll<int, mag_type> (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2161  lclNrms.data (), norms.getRawPtr ());
2162  for (size_t k = 0; k < numVecs; ++k) {
2163  norms[k] = ATM::sqrt (norms[k] * OneOverN);
2164  }
2165  }
2166  else {
2167  for (size_t k = 0; k < numVecs; ++k) {
2168  norms[k] = ATM::sqrt (ATS::magnitude (lclNrms(k)) * OneOverN);
2169  }
2170  }
2171  }
2172 #endif // TPETRA_ENABLE_DEPRECATED_CODE
2173 
2174  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2175  void
 2176  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2177  norm1 (const Teuchos::ArrayView<mag_type>& norms) const
2178  {
2180  using ::Tpetra::Details::NORM_ONE;
2181  using ::Tpetra::Details::ProfilingRegion;
2182  ProfilingRegion region ("Tpetra::MV::norm1 (host output)");
2183 
2184  // The function needs to be able to sync X.
2185  MV& X = const_cast<MV&> (*this);
2186  multiVectorNormImpl (norms.data (), X, NORM_ONE);
2187  }
2188 
2189  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2190  void
 2191  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2192  norm1 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2193  {
2194  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2195  this->norm1 (norms_av);
2196  }
2197 
2198  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2199  void
 2200  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2201  normInf (const Teuchos::ArrayView<mag_type>& norms) const
2202  {
2204  using ::Tpetra::Details::NORM_INF;
2205  using ::Tpetra::Details::ProfilingRegion;
2206  ProfilingRegion region ("Tpetra::MV::normInf (host output)");
2207 
2208  // The function needs to be able to sync X.
2209  MV& X = const_cast<MV&> (*this);
2210  multiVectorNormImpl (norms.getRawPtr (), X, NORM_INF);
2211  }
2212 
2213  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2214  void
 2215  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2216  normInf (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2217  {
2218  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2219  this->normInf (norms_av);
2220  }
2221 
2222  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2223  void
 2224  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2225  meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
2226  {
2227  // KR FIXME Overload this method to take a View.
2228  using Kokkos::ALL;
2229  using Kokkos::subview;
2230  using Teuchos::Comm;
2231  using Teuchos::RCP;
2232  using Teuchos::reduceAll;
2233  using Teuchos::REDUCE_SUM;
2234  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
2235 
2236  const size_t lclNumRows = this->getLocalLength ();
2237  const size_t numVecs = this->getNumVectors ();
2238  const size_t numMeans = static_cast<size_t> (means.size ());
2239 
2240  TEUCHOS_TEST_FOR_EXCEPTION(
2241  numMeans != numVecs, std::runtime_error,
2242  "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2243  << " != this->getNumVectors() = " << numVecs << ".");
2244 
2245  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2246  const std::pair<size_t, size_t> colRng (0, numVecs);
2247 
2248  // Make sure that the final output view has the same layout as the
2249  // temporary view's HostMirror. Left or Right doesn't matter for
2250  // a 1-D array anyway; this is just to placate the compiler.
2251  typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2252  typedef Kokkos::View<impl_scalar_type*,
2253  typename local_view_type::HostMirror::array_layout,
2254  Kokkos::HostSpace,
2255  Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
2256  host_local_view_type meansOut (means.getRawPtr (), numMeans);
2257 
2258  RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
2259  this->getMap ()->getComm ();
2260 
2261  // If we need sync to device, then host has the most recent version.
2262  const bool useHostVersion = this->need_sync_device ();
2263  if (useHostVersion) {
2264  // DualView was last modified on host, so run the local kernel there.
2265  auto X_lcl = subview (getLocalViewHost (),
2266  rowRng, Kokkos::ALL ());
2267  // Compute the local sum of each column.
2268  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2269  if (isConstantStride ()) {
2270  KokkosBlas::sum (lclSums, X_lcl);
2271  }
2272  else {
2273  for (size_t j = 0; j < numVecs; ++j) {
2274  const size_t col = whichVectors_[j];
2275  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2276  }
2277  }
2278 
2279  // If there are multiple MPI processes, the all-reduce reads
2280  // from lclSums, and writes to meansOut. Otherwise, we just
2281  // copy lclSums into meansOut.
2282  if (! comm.is_null () && this->isDistributed ()) {
2283  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2284  lclSums.data (), meansOut.data ());
2285  }
2286  else {
2287  Kokkos::deep_copy (meansOut, lclSums);
2288  }
2289  }
2290  else {
2291  // DualView was last modified on device, so run the local kernel there.
2292  auto X_lcl = subview (this->getLocalViewDevice (),
2293  rowRng, Kokkos::ALL ());
2294 
2295  // Compute the local sum of each column.
2296  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2297  if (isConstantStride ()) {
2298  KokkosBlas::sum (lclSums, X_lcl);
2299  }
2300  else {
2301  for (size_t j = 0; j < numVecs; ++j) {
2302  const size_t col = whichVectors_[j];
2303  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2304  }
2305  }
2306 
2307  // If there are multiple MPI processes, the all-reduce reads
2308  // from lclSums, and writes to meansOut. (We assume that MPI
2309  // can read device memory.) Otherwise, we just copy lclSums
2310  // into meansOut.
2311  if (! comm.is_null () && this->isDistributed ()) {
2312  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2313  lclSums.data (), meansOut.data ());
2314  }
2315  else {
2316  Kokkos::deep_copy (meansOut, lclSums);
2317  }
2318  }
2319 
2320  // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2321  // to the magnitude type, since operator/ (std::complex<T>, int)
2322  // isn't necessarily defined.
2323  const impl_scalar_type OneOverN =
2324  ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
2325  for (size_t k = 0; k < numMeans; ++k) {
2326  meansOut(k) = meansOut(k) * OneOverN;
2327  }
2328  }
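  // [Editorial usage sketch, hypothetical names] For the common case of
  // Scalar = double (so impl_scalar_type is just double):
  //   Teuchos::Array<double> means (X.getNumVectors ());
  //   X.meanValue (means ());  // means[j] = (global sum of column j) / global length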
2329 
2330 
2331  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2332  void
 2333  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2334  randomize ()
 2335  {
2336  typedef impl_scalar_type IST;
2337  typedef Kokkos::Details::ArithTraits<IST> ATS;
2338  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2339  typedef typename pool_type::generator_type generator_type;
2340 
2341  const IST max = Kokkos::rand<generator_type, IST>::max ();
2342  const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();
2343 
2344  this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
2345  }
2346 
2347 
2348  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2349  void
 2350  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2351  randomize (const Scalar& minVal, const Scalar& maxVal)
2352  {
2353  typedef impl_scalar_type IST;
2354  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2355 
2356  // Seed the pseudorandom number generator using the calling
2357  // process' rank. This helps decorrelate different process'
2358  // pseudorandom streams. It's not perfect but it's effective and
2359  // doesn't require MPI communication. The seed also includes bits
2360  // from the standard library's rand().
2361  //
2362  // FIXME (mfh 07 Jan 2015) Should we save the seed for later use?
2363  // The code below just makes a new seed each time.
2364 
2365  const uint64_t myRank =
2366  static_cast<uint64_t> (this->getMap ()->getComm ()->getRank ());
2367  uint64_t seed64 = static_cast<uint64_t> (std::rand ()) + myRank + 17311uLL;
2368  unsigned int seed = static_cast<unsigned int> (seed64&0xffffffff);
2369 
2370  pool_type rand_pool (seed);
2371  const IST max = static_cast<IST> (maxVal);
2372  const IST min = static_cast<IST> (minVal);
2373 
 2374  // See #1510. In case this MultiVector has already been marked modified on
2375  // host or device, we need to clear those flags, since the code
2376  // below works on device.
2377  this->view_.clear_sync_state();
2378 
2379  this->modify_device ();
2380  auto thisView = this->getLocalViewDevice ();
2381 
2382  if (isConstantStride ()) {
2383  Kokkos::fill_random (thisView, rand_pool, min, max);
2384  }
2385  else {
2386  const size_t numVecs = getNumVectors ();
2387  for (size_t k = 0; k < numVecs; ++k) {
2388  const size_t col = whichVectors_[k];
2389  auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
2390  Kokkos::fill_random (X_k, rand_pool, min, max);
2391  }
2392  }
2393  }
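  // [Editorial usage sketch, hypothetical] Because the seed above mixes in
  // std::rand (), seeding the C library generator first makes the per-process
  // streams reproducible:
  //   std::srand (42);
  //   X.randomize (-1.0, 1.0);  // pseudorandom entries between minVal and maxVal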
2394 
2395  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2396  void
 2397  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2398  putScalar (const Scalar& alpha)
2399  {
2400  using ::Tpetra::Details::ProfilingRegion;
2401  using ::Tpetra::Details::Blas::fill;
2402  using DES = typename dual_view_type::t_dev::execution_space;
2403  using HES = typename dual_view_type::t_host::execution_space;
2404  using LO = LocalOrdinal;
2405  ProfilingRegion region ("Tpetra::MultiVector::putScalar");
2406 
2407  // We need this cast for cases like Scalar = std::complex<T> but
2408  // impl_scalar_type = Kokkos::complex<T>.
2409  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2410  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
2411  const LO numVecs = static_cast<LO> (this->getNumVectors ());
2412 
2413  // Modify the most recently updated version of the data. This
2414  // avoids sync'ing, which could violate users' expectations.
2415  //
2416  // If we need sync to device, then host has the most recent version.
2417  const bool runOnHost = this->need_sync_device ();
2418 
2419  if (! runOnHost) { // last modified in device memory
2420  this->modify_device ();
2421  auto X = this->getLocalViewDevice ();
2422  if (this->isConstantStride ()) {
2423  fill (DES (), X, theAlpha, lclNumRows, numVecs);
2424  }
2425  else {
2426  fill (DES (), X, theAlpha, lclNumRows, numVecs,
2427  this->whichVectors_.getRawPtr ());
2428  }
2429  }
2430  else { // last modified in host memory, so modify data there.
2431  this->modify_host ();
2432  auto X = this->getLocalViewHost ();
2433  if (this->isConstantStride ()) {
2434  fill (HES (), X, theAlpha, lclNumRows, numVecs);
2435  }
2436  else {
2437  fill (HES (), X, theAlpha, lclNumRows, numVecs,
2438  this->whichVectors_.getRawPtr ());
2439  }
2440  }
2441  }
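  // [Editorial usage sketch, hypothetical] putScalar overwrites every entry
  // in whichever memory space (host or device) was modified most recently:
  //   X.putScalar (Teuchos::ScalarTraits<Scalar>::zero ());  // set all entries to zero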
2442 
2443 
2444  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2445  void
 2446  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2447  replaceMap (const Teuchos::RCP<const map_type>& newMap)
2448  {
2449  using Teuchos::ArrayRCP;
2450  using Teuchos::Comm;
2451  using Teuchos::RCP;
2452  using ST = Scalar;
2453  using LO = LocalOrdinal;
2454  using GO = GlobalOrdinal;
2455 
2456  // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2457  // it might work if the MV is a column view of another MV.
2458  // However, things might go wrong when restoring the original
2459  // Map, so we don't allow this case for now.
2460  TEUCHOS_TEST_FOR_EXCEPTION(
2461  ! this->isConstantStride (), std::logic_error,
2462  "Tpetra::MultiVector::replaceMap: This method does not currently work "
2463  "if the MultiVector is a column view of another MultiVector (that is, if "
2464  "isConstantStride() == false).");
2465 
2466  // Case 1: current Map and new Map are both nonnull on this process.
2467  // Case 2: current Map is nonnull, new Map is null.
2468  // Case 3: current Map is null, new Map is nonnull.
2469  // Case 4: both Maps are null: forbidden.
2470  //
2471  // Case 1 means that we don't have to do anything on this process,
2472  // other than assign the new Map. (We always have to do that.)
2473  // It's an error for the user to supply a Map that requires
2474  // resizing in this case.
2475  //
2476  // Case 2 means that the calling process is in the current Map's
2477  // communicator, but will be excluded from the new Map's
2478  // communicator. We don't have to do anything on the calling
2479  // process; just leave whatever data it may have alone.
2480  //
2481  // Case 3 means that the calling process is excluded from the
2482  // current Map's communicator, but will be included in the new
2483  // Map's communicator. This means we need to (re)allocate the
2484  // local DualView if it does not have the right number of rows.
2485  // If the new number of rows is nonzero, we'll fill the newly
2486  // allocated local data with zeros, as befits a projection
2487  // operation.
2488  //
2489  // The typical use case for Case 3 is that the MultiVector was
2490  // first created with the Map with more processes, then that Map
2491  // was replaced with a Map with fewer processes, and finally the
2492  // original Map was restored on this call to replaceMap.
2493 
2494 #ifdef HAVE_TEUCHOS_DEBUG
2495  // mfh 28 Mar 2013: We can't check for compatibility across the
2496  // whole communicator, unless we know that the current and new
2497  // Maps are nonnull on _all_ participating processes.
2498  // TEUCHOS_TEST_FOR_EXCEPTION(
2499  // origNumProcs == newNumProcs && ! this->getMap ()->isCompatible (*map),
2500  // std::invalid_argument, "Tpetra::MultiVector::project: "
2501  // "If the input Map's communicator is compatible (has the same number of "
2502  // "processes as) the current Map's communicator, then the two Maps must be "
2503  // "compatible. The replaceMap() method is not for data redistribution; "
2504  // "use Import or Export for that purpose.");
2505 
2506  // TODO (mfh 28 Mar 2013) Add compatibility checks for projections
2507  // of the Map, in case the process counts don't match.
2508 #endif // HAVE_TEUCHOS_DEBUG
2509 
2510  if (this->getMap ().is_null ()) { // current Map is null
2511  // If this->getMap() is null, that means that this MultiVector
2512  // has already had replaceMap happen to it. In that case, just
2513  // reallocate the DualView with the right size.
2514 
2515  TEUCHOS_TEST_FOR_EXCEPTION(
2516  newMap.is_null (), std::invalid_argument,
2517  "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2518  "This probably means that the input Map is incorrect.");
2519 
2520  // Case 3: current Map is null, new Map is nonnull.
2521  // Reallocate the DualView with the right dimensions.
2522  const size_t newNumRows = newMap->getNodeNumElements ();
2523  const size_t origNumRows = view_.extent (0);
2524  const size_t numCols = this->getNumVectors ();
2525 
2526  if (origNumRows != newNumRows || view_.extent (1) != numCols) {
2527  view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2528  }
2529  }
2530  else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
2531  // I am an excluded process. Reinitialize my data so that I
2532  // have 0 rows. Keep the number of columns as before.
2533  const size_t newNumRows = static_cast<size_t> (0);
2534  const size_t numCols = this->getNumVectors ();
2535  view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2536  }
2537 
2538  this->map_ = newMap;
2539  }
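  // [Editorial sketch of the typical "Case 3" round trip described above;
  //  hypothetical names]
  //   Teuchos::RCP<const map_type> origMap = X.getMap ();
  //   X.replaceMap (subsetMap);    // subsetMap is null on excluded processes
  //   if (! X.getMap ().is_null ()) {
  //     // ... only the processes in subsetMap's communicator do work here ...
  //   }
  //   X.replaceMap (origMap);      // restore the original Map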
2540 
2541  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2542  void
 2543  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2544  scale (const Scalar& alpha)
2545  {
2546  using Kokkos::ALL;
2547  using IST = impl_scalar_type;
2548 
2549  const IST theAlpha = static_cast<IST> (alpha);
2550  if (theAlpha == Kokkos::ArithTraits<IST>::one ()) {
2551  return; // do nothing
2552  }
2553  const size_t lclNumRows = getLocalLength ();
2554  const size_t numVecs = getNumVectors ();
2555  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2556  const std::pair<size_t, size_t> colRng (0, numVecs);
2557 
2558  // We can't substitute putScalar(0.0) for scale(0.0), because the
2559  // former will overwrite NaNs present in the MultiVector. The
2560  // semantics of this call require multiplying them by 0, which
2561  // IEEE 754 requires to be NaN.
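  // [Editorial illustration] If X(i,j) is NaN, then after X.scale (0.0) it is
  // still NaN (0 * NaN == NaN under IEEE 754), whereas X.putScalar (0.0)
  // would overwrite it with zero.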
2562 
2563  // If we need sync to device, then host has the most recent version.
2564  const bool useHostVersion = need_sync_device ();
2565  if (useHostVersion) {
2566  auto Y_lcl = Kokkos::subview (getLocalViewHost (), rowRng, ALL ());
2567  if (isConstantStride ()) {
2568  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2569  }
2570  else {
2571  for (size_t k = 0; k < numVecs; ++k) {
2572  const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2573  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2574  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2575  }
2576  }
2577  }
2578  else { // work on device
2579  auto Y_lcl = Kokkos::subview (getLocalViewDevice (), rowRng, ALL ());
2580  if (isConstantStride ()) {
2581  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2582  }
2583  else {
2584  for (size_t k = 0; k < numVecs; ++k) {
2585  const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2586  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2587  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2588  }
2589  }
2590  }
2591  }
2592 
2593 
2594  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2595  void
 2596  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2597  scale (const Teuchos::ArrayView<const Scalar>& alphas)
2598  {
2599  const size_t numVecs = this->getNumVectors ();
2600  const size_t numAlphas = static_cast<size_t> (alphas.size ());
2601  TEUCHOS_TEST_FOR_EXCEPTION(
2602  numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2603  "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2604  << numVecs << ".");
2605 
2606  // Use a DualView to copy the scaling constants onto the device.
2607  using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2608  k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2609  k_alphas.modify_host ();
2610  for (size_t i = 0; i < numAlphas; ++i) {
2611  k_alphas.h_view(i) = static_cast<impl_scalar_type> (alphas[i]);
2612  }
2613  k_alphas.sync_device ();
2614  // Invoke the scale() overload that takes a device View of coefficients.
2615  this->scale (k_alphas.view_device ());
2616  }
2617 
2618  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2619  void
 2620  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2621  scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
2622  {
2623  using Kokkos::ALL;
2624  using Kokkos::subview;
2625 
2626  const size_t lclNumRows = this->getLocalLength ();
2627  const size_t numVecs = this->getNumVectors ();
2628  TEUCHOS_TEST_FOR_EXCEPTION(
2629  static_cast<size_t> (alphas.extent (0)) != numVecs,
2630  std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2631  "alphas.extent(0) = " << alphas.extent (0)
2632  << " != this->getNumVectors () = " << numVecs << ".");
2633  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2634  const std::pair<size_t, size_t> colRng (0, numVecs);
2635 
2636  // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2637  // type of the return value of subview. This is because if we
2638  // switch the array layout from LayoutLeft to LayoutRight
2639  // (preferred for performance of block operations), the types
2640  // below won't be valid. (A view of a column of a LayoutRight
2641  // multivector has LayoutStride, not LayoutLeft.)
2642 
2643  // If we need sync to device, then host has the most recent version.
2644  const bool useHostVersion = this->need_sync_device ();
2645  if (useHostVersion) {
2646  // Work in host memory. This means we need to create a host
2647  // mirror of the input View of coefficients.
2648  auto alphas_h = Kokkos::create_mirror_view (alphas);
2649  Kokkos::deep_copy (alphas_h, alphas);
2650 
2651  auto Y_lcl = subview (this->getLocalViewHost (), rowRng, ALL ());
2652  if (isConstantStride ()) {
2653  KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2654  }
2655  else {
2656  for (size_t k = 0; k < numVecs; ++k) {
2657  const size_t Y_col = this->isConstantStride () ? k :
2658  this->whichVectors_[k];
2659  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2660  // We don't have to use the entire 1-D View here; we can use
2661  // the version that takes a scalar coefficient.
2662  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2663  }
2664  }
2665  }
2666  else { // Work in device memory, using the input View 'alphas' directly.
2667  auto Y_lcl = subview (this->getLocalViewDevice (), rowRng, ALL ());
2668  if (isConstantStride ()) {
2669  KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2670  }
2671  else {
2672  // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2673  // as values on host, so copy them to host. Another approach
2674  // would be to fix scal() so that it takes a 0-D View as the
2675  // second argument.
2676  auto alphas_h = Kokkos::create_mirror_view (alphas);
2677  Kokkos::deep_copy (alphas_h, alphas);
2678 
2679  for (size_t k = 0; k < numVecs; ++k) {
2680  const size_t Y_col = this->isConstantStride () ? k :
2681  this->whichVectors_[k];
2682  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2683  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2684  }
2685  }
2686  }
2687  }
2688 
2689  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2690  void
 2691  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2692  scale (const Scalar& alpha,
 2693  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A)
 2694  {
2695  using Kokkos::ALL;
2696  using Kokkos::subview;
2698  const char tfecfFuncName[] = "scale: ";
2699 
2700  const size_t lclNumRows = getLocalLength ();
2701  const size_t numVecs = getNumVectors ();
2702 
2703  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2704  lclNumRows != A.getLocalLength (), std::invalid_argument,
2705  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2706  << A.getLocalLength () << ".");
2707  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2708  numVecs != A.getNumVectors (), std::invalid_argument,
2709  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2710  << A.getNumVectors () << ".");
2711 
2712  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2713  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2714  const std::pair<size_t, size_t> colRng (0, numVecs);
2715 
2716  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2717  if (this->need_sync_device ()) {
2718  this->sync_device ();
2719  }
2720  if (A.need_sync_device ()) {
2721  const_cast<MV&>(A).sync_device ();
2722  }
2723 
2724  this->modify_device ();
2725  auto Y_lcl_orig = this->getLocalViewDevice ();
2726  auto X_lcl_orig = A.getLocalViewDevice ();
2727  auto Y_lcl = subview (Y_lcl_orig, rowRng, ALL ());
2728  auto X_lcl = subview (X_lcl_orig, rowRng, ALL ());
2729 
2730  if (isConstantStride () && A.isConstantStride ()) {
2731  KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
2732  }
2733  else {
 2734  // Make sure that Kokkos only uses the local length for the scale operation.
2735  for (size_t k = 0; k < numVecs; ++k) {
2736  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2737  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2738  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2739  auto X_k = subview (X_lcl, ALL (), X_col);
2740 
2741  KokkosBlas::scal (Y_k, theAlpha, X_k);
2742  }
2743  }
2744  }
2745 
2746 
2747 
2748  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2749  void
 2750  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2751  reciprocal (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A)
 2752  {
 2753  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
 2754  const char tfecfFuncName[] = "reciprocal: ";
2755 
2756  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2757  getLocalLength () != A.getLocalLength (), std::runtime_error,
2758  "MultiVectors do not have the same local length. "
2759  "this->getLocalLength() = " << getLocalLength ()
2760  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2761  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2762  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
 2763  "MultiVectors do not have the same number of columns (vectors). "
2764  "this->getNumVectors() = " << getNumVectors ()
2765  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2766 
2767  const size_t numVecs = getNumVectors ();
2768 
2769  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2770  if (this->need_sync_device ()) {
2771  this->sync_device ();
2772  }
2773  if (A.need_sync_device ()) {
2774  const_cast<MV&>(A).sync_device ();
2775  }
2776  this->modify_device ();
2777 
2778  auto this_view_dev = this->getLocalViewDevice ();
2779  auto A_view_dev = A.getLocalViewDevice ();
2780 
2781  if (isConstantStride () && A.isConstantStride ()) {
2782  KokkosBlas::reciprocal (this_view_dev, A_view_dev);
2783  }
2784  else {
2785  using Kokkos::ALL;
2786  using Kokkos::subview;
2787  for (size_t k = 0; k < numVecs; ++k) {
2788  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2789  auto vector_k = subview (this_view_dev, ALL (), this_col);
2790  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2791  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
2792  KokkosBlas::reciprocal (vector_k, vector_Ak);
2793  }
2794  }
2795  }
2796 
2797  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2798  void
 2799  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2800  abs (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A)
 2801  {
 2802  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
 2803  const char tfecfFuncName[] = "abs";
2804 
2805  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2806  getLocalLength () != A.getLocalLength (), std::runtime_error,
2807  ": MultiVectors do not have the same local length. "
2808  "this->getLocalLength() = " << getLocalLength ()
2809  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2810  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2811  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2812  ": MultiVectors do not have the same number of columns (vectors). "
2813  "this->getNumVectors() = " << getNumVectors ()
2814  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2815  const size_t numVecs = getNumVectors ();
2816 
2817  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2818  if (this->need_sync_device ()) {
2819  this->sync_device ();
2820  }
2821  if (A.need_sync_device ()) {
2822  const_cast<MV&>(A).sync_device ();
2823  }
2824  this->modify_device ();
2825 
2826  auto this_view_dev = this->getLocalViewDevice ();
2827  auto A_view_dev = A.getLocalViewDevice ();
2828 
2829  if (isConstantStride () && A.isConstantStride ()) {
2830  KokkosBlas::abs (this_view_dev, A_view_dev);
2831  }
2832  else {
2833  using Kokkos::ALL;
2834  using Kokkos::subview;
2835 
2836  for (size_t k=0; k < numVecs; ++k) {
2837  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2838  auto vector_k = subview (this_view_dev, ALL (), this_col);
2839  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2840  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
2841  KokkosBlas::abs (vector_k, vector_Ak);
2842  }
2843  }
2844  }
2845 
2846  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2847  void
 2848  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2849  update (const Scalar& alpha,
 2850  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
 2851  const Scalar& beta)
2852  {
2853  const char tfecfFuncName[] = "update: ";
2854  using Kokkos::subview;
2855  using Kokkos::ALL;
2857 
2858  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");
2859 
2860  const size_t lclNumRows = getLocalLength ();
2861  const size_t numVecs = getNumVectors ();
2862 
2863  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2864  lclNumRows != A.getLocalLength (), std::invalid_argument,
2865  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2866  << A.getLocalLength () << ".");
2867  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2868  numVecs != A.getNumVectors (), std::invalid_argument,
2869  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2870  << A.getNumVectors () << ".");
2871 
2872  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2873  if (this->need_sync_device ()) {
2874  this->sync_device ();
2875  }
2876  if (A.need_sync_device ()) {
2877  const_cast<MV&>(A).sync_device ();
2878  }
2879 
2880  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2881  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
2882  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2883  const std::pair<size_t, size_t> colRng (0, numVecs);
2884 
2885  auto Y_lcl_orig = this->getLocalViewDevice ();
2886  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
2887  auto X_lcl_orig = A.getLocalViewDevice ();
2888  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
2889 
2890  // The device memory of *this is about to be modified
2891  this->modify_device ();
2892  if (isConstantStride () && A.isConstantStride ()) {
2893  KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
2894  }
2895  else {
2896  // Make sure that Kokkos only uses the local length for add.
2897  for (size_t k = 0; k < numVecs; ++k) {
2898  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2899  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2900  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2901  auto X_k = subview (X_lcl, ALL (), X_col);
2902 
2903  KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
2904  }
2905  }
2906  }
2907 
2908  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 2909  void
 2910  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 2911  update (const Scalar& alpha,
 2912  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
 2913  const Scalar& beta,
 2914  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
 2915  const Scalar& gamma)
2916  {
2917  using Kokkos::ALL;
2918  using Kokkos::subview;
2920 
2921  const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
2922 
2923  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)");
2924 
2925  const size_t lclNumRows = this->getLocalLength ();
2926  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2927  lclNumRows != A.getLocalLength (), std::invalid_argument,
2928  "The input MultiVector A has " << A.getLocalLength () << " local "
2929  "row(s), but this MultiVector has " << lclNumRows << " local "
2930  "row(s).");
2931  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2932  lclNumRows != B.getLocalLength (), std::invalid_argument,
2933  "The input MultiVector B has " << B.getLocalLength () << " local "
2934  "row(s), but this MultiVector has " << lclNumRows << " local "
2935  "row(s).");
2936  const size_t numVecs = getNumVectors ();
2937  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2938  A.getNumVectors () != numVecs, std::invalid_argument,
2939  "The input MultiVector A has " << A.getNumVectors () << " column(s), "
2940  "but this MultiVector has " << numVecs << " column(s).");
2941  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2942  B.getNumVectors () != numVecs, std::invalid_argument,
2943  "The input MultiVector B has " << B.getNumVectors () << " column(s), "
2944  "but this MultiVector has " << numVecs << " column(s).");
2945 
2946  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2947  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
2948  const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
2949 
2950  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2951  if (this->need_sync_device ()) this->sync_device ();
2952  if (A.need_sync_device ()) const_cast<MV&>(A).sync_device ();
2953  if (B.need_sync_device ()) const_cast<MV&>(B).sync_device ();
2954 
2955  // This method modifies *this.
2956  this->modify_device ();
2957 
2958  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2959  const std::pair<size_t, size_t> colRng (0, numVecs);
2960 
2961  // Prefer 'auto' over specifying the type explicitly. This avoids
2962  // issues with a subview possibly having a different type than the
2963  // original view.
2964  auto C_lcl = subview (this->getLocalViewDevice (), rowRng, ALL ());
2965  auto A_lcl = subview (A.getLocalViewDevice (), rowRng, ALL ());
2966  auto B_lcl = subview (B.getLocalViewDevice (), rowRng, ALL ());
2967 
2968  if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
2969  KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
2970  }
2971  else {
2972  // Some input (or *this) is not constant stride,
2973  // so perform the update one column at a time.
2974  for (size_t k = 0; k < numVecs; ++k) {
2975  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2976  const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
2977  const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
2978  KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
2979  theBeta, subview (B_lcl, rowRng, B_col),
2980  theGamma, subview (C_lcl, rowRng, this_col));
2981  }
2982  }
2983  }
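 // Usage sketch for the three-argument update above (illustrative only;
 // 'A', 'B', and 'C' are hypothetical conforming MultiVectors). It
 // computes, entry-wise, C = gamma*C + alpha*A + beta*B, falling back to
 // one KokkosBlas::update call per column when any operand is a
 // non-constant-stride view:
 //
 //   C.update (1.0, A, -1.0, B, 0.0);   // C = A - B; gamma == 0 overwrites C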
2984 
2985  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2986  Teuchos::ArrayRCP<const Scalar>
2988  getData (size_t j) const
2989  {
2990  using Kokkos::ALL;
2992  using IST = impl_scalar_type;
2993  const char tfecfFuncName[] = "getData: ";
2994 
2995  // Any MultiVector method that called the (classic) Kokkos Node's
2996  // viewBuffer or viewBufferNonConst methods always implied a
2997  // device->host synchronization. Thus, we synchronize here as
2998  // well.
2999  const_cast<MV*> (this)->sync_host ();
3000 
3001  auto hostView = getLocalViewHost ();
3002  const size_t col = isConstantStride () ? j : whichVectors_[j];
3003  auto hostView_j = Kokkos::subview (hostView, ALL (), col);
3004  Teuchos::ArrayRCP<const IST> dataAsArcp =
3005  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3006 
3007  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3008  (static_cast<size_t> (hostView_j.extent (0)) <
3009  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3010  "hostView_j.extent(0) = " << hostView_j.extent (0)
3011  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3012  "Please report this bug to the Tpetra developers.");
3013 
3014  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3015  }
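 // Usage sketch (illustrative; assumes Scalar = double and a MultiVector
 // 'X'). getData(j) returns a persisting host view of column j; because
 // it calls sync_host() first, it may trigger a device-to-host copy:
 //
 //   Teuchos::ArrayRCP<const double> col0 = X.getData (0);
 //   const double firstEntry = (col0.size () > 0) ? col0[0] : 0.0;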
3016 
3017  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3018  Teuchos::ArrayRCP<Scalar>
3019  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3020  getDataNonConst (size_t j)
3021  {
3022  using Kokkos::ALL;
3023  using Kokkos::subview;
3025  using IST = impl_scalar_type;
3026  const char tfecfFuncName[] = "getDataNonConst: ";
3027 
3028  // Any MultiVector method that called the (classic) Kokkos Node's
3029  // viewBuffer or viewBufferNonConst methods always implied a
3030  // device->host synchronization. Thus, we synchronize here as
3031  // well.
3032  const_cast<MV*> (this)->sync_host ();
3033  // Calling getDataNonConst() implies that the user plans to modify
3034  // the values in the MultiVector, so we mark the host data as modified.
3035  modify_host ();
3036 
3037  auto hostView = getLocalViewHost ();
3038  const size_t col = isConstantStride () ? j : whichVectors_[j];
3039  auto hostView_j = subview (hostView, ALL (), col);
3040  Teuchos::ArrayRCP<IST> dataAsArcp =
3041  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3042 
3043  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3044  (static_cast<size_t> (hostView_j.extent (0)) <
3045  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3046  "hostView_j.extent(0) = " << hostView_j.extent (0)
3047  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3048  "Please report this bug to the Tpetra developers.");
3049 
3050  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3051  }
3052 
3053  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3054  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3056  subCopy (const Teuchos::ArrayView<const size_t>& cols) const
3057  {
3058  using Teuchos::RCP;
3060 
3061  // Check whether the index set in cols is contiguous. If it is,
3062  // use the more efficient Range1D version of subCopy.
3063  bool contiguous = true;
3064  const size_t numCopyVecs = static_cast<size_t> (cols.size ());
3065  for (size_t j = 1; j < numCopyVecs; ++j) {
3066  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3067  contiguous = false;
3068  break;
3069  }
3070  }
3071  if (contiguous && numCopyVecs > 0) {
3072  return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
3073  }
3074  else {
3075  RCP<const MV> X_sub = this->subView (cols);
3076  RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
3077  Y->assign (*X_sub);
3078  return Y;
3079  }
3080  }
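 // Usage sketch (illustrative; 'X' is hypothetical). subCopy deep-copies
 // the selected columns into a new MultiVector, unlike subView below,
 // which aliases them:
 //
 //   Teuchos::Array<size_t> cols (2); cols[0] = 0; cols[1] = 2;
 //   auto Xcopy = X.subCopy (cols ());   // owns its own storage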
3081 
3082  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3083  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3085  subCopy (const Teuchos::Range1D &colRng) const
3086  {
3087  using Teuchos::RCP;
3089 
3090  RCP<const MV> X_sub = this->subView (colRng);
3091  RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3092  Y->assign (*X_sub);
3093  return Y;
3094  }
3095 
3096  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3097  size_t
3098  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3099  getOrigNumLocalRows () const {
3100  return origView_.extent (0);
3101  }
3102 
3103  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3104  size_t
3105  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3106  getOrigNumLocalCols () const {
3107  return origView_.extent (1);
3108  }
3109 
3110  template <class Scalar, class LO, class GO, class Node>
3111  MultiVector<Scalar, LO, GO, Node>::
3112  MultiVector (const MultiVector<Scalar, LO, GO, Node>& X,
3113  const Teuchos::RCP<const map_type>& subMap,
3114  const local_ordinal_type rowOffset) :
3115  base_type (subMap)
3116  {
3117  using Kokkos::ALL;
3118  using Kokkos::subview;
3119  using Teuchos::outArg;
3120  using Teuchos::RCP;
3121  using Teuchos::rcp;
3122  using Teuchos::reduceAll;
3123  using Teuchos::REDUCE_MIN;
3124  using std::endl;
3126  const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3127  const char suffix[] = "Please report this bug to the Tpetra developers.";
3128  int lclGood = 1;
3129  int gblGood = 1;
3130  std::unique_ptr<std::ostringstream> errStrm;
3131  const bool debug = ::Tpetra::Details::Behavior::debug ();
3132  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
3133 
3134  // Be careful to use the input Map's communicator, not X's. The
3135  // idea is that, on return, *this is a subview of X, using the
3136  // input Map.
3137  const auto comm = subMap->getComm ();
3138  TEUCHOS_ASSERT( ! comm.is_null () );
3139  const int myRank = comm->getRank ();
3140 
3141  const LO lclNumRowsBefore = static_cast<LO> (X.getLocalLength ());
3142  const LO numCols = static_cast<LO> (X.getNumVectors ());
3143  const LO newNumRows = static_cast<LO> (subMap->getNodeNumElements ());
3144  if (verbose) {
3145  std::ostringstream os;
3146  os << "Proc " << myRank << ": " << prefix
3147  << "X: {lclNumRows: " << lclNumRowsBefore
3148  << ", origLclNumRows: " << X.getOrigNumLocalRows ()
3149  << ", numCols: " << numCols << "}, "
3150  << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3151  std::cerr << os.str ();
3152  }
3153  // We ask for the _original_ number of rows in X, because X could
3154  // be a shorter (fewer rows) view of a longer MV. (For example, X
3155  // could be a domain Map view of a column Map MV.)
3156  const bool tooManyElts =
3157  newNumRows + rowOffset > static_cast<LO> (X.getOrigNumLocalRows ());
3158  if (tooManyElts) {
3159  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3160  *errStrm << " Proc " << myRank << ": subMap->getNodeNumElements() (="
3161  << newNumRows << ") + rowOffset (=" << rowOffset
3162  << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows ()
3163  << ")." << endl;
3164  lclGood = 0;
3165  TEUCHOS_TEST_FOR_EXCEPTION
3166  (! debug && tooManyElts, std::invalid_argument,
3167  prefix << errStrm->str () << suffix);
3168  }
3169 
3170  if (debug) {
3171  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3172  if (gblGood != 1) {
3173  std::ostringstream gblErrStrm;
3174  const std::string myErrStr =
3175  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3176  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3177  TEUCHOS_TEST_FOR_EXCEPTION
3178  (true, std::invalid_argument, gblErrStrm.str ());
3179  }
3180  }
3181 
3182  using range_type = std::pair<LO, LO>;
3183  const range_type origRowRng
3184  (rowOffset, static_cast<LO> (X.origView_.extent (0)));
3185  const range_type rowRng
3186  (rowOffset, rowOffset + newNumRows);
3187 
3188  dual_view_type newOrigView = subview (X.origView_, origRowRng, ALL ());
3189  // FIXME (mfh 29 Sep 2016) If we just use X.view_ here, it breaks
3190  // CrsMatrix's Gauss-Seidel implementation (which assumes the
3191  // ability to create domain Map views of column Map MultiVectors,
3192  // and then get the original column Map MultiVector out again).
3193  // If we just use X.origView_ here, it breaks the fix for #46.
3194  // The test for rowOffset == 0 is a hack that makes both tests
3195  // pass, but doesn't actually fix the more general issue. In
3196  // particular, the right way to fix Gauss-Seidel would be to fix
3197  // #385; that would make "getting the original column Map
3198  // MultiVector out again" unnecessary.
3199  dual_view_type newView =
3200  subview (rowOffset == 0 ? X.origView_ : X.view_, rowRng, ALL ());
3201 
3202  // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3203  // handling subviews of degenerate Views quite so well. For some
3204  // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3205  // 0. We work around by creating a new empty DualView of the
3206  // desired (degenerate) dimensions.
3207  if (newOrigView.extent (0) == 0 &&
3208  newOrigView.extent (1) != X.origView_.extent (1)) {
3209  newOrigView =
3210  allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3211  }
3212  if (newView.extent (0) == 0 &&
3213  newView.extent (1) != X.view_.extent (1)) {
3214  newView =
3215  allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3216  }
3217 
3218  MV subViewMV = X.isConstantStride () ?
3219  MV (subMap, newView, newOrigView) :
3220  MV (subMap, newView, newOrigView, X.whichVectors_ ());
3221 
3222  if (debug) {
3223  const LO lclNumRowsRet = static_cast<LO> (subViewMV.getLocalLength ());
3224  const LO numColsRet = static_cast<LO> (subViewMV.getNumVectors ());
3225  if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3226  lclGood = 0;
3227  if (errStrm.get () == nullptr) {
3228  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3229  }
3230  *errStrm << " Proc " << myRank <<
3231  ": subMap.getNodeNumElements(): " << newNumRows <<
3232  ", subViewMV.getLocalLength(): " << lclNumRowsRet <<
3233  ", X.getNumVectors(): " << numCols <<
3234  ", subViewMV.getNumVectors(): " << numColsRet << endl;
3235  }
3236  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3237  if (gblGood != 1) {
3238  std::ostringstream gblErrStrm;
3239  if (myRank == 0) {
3240  gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3241  "dimensions on one or more processes:" << endl;
3242  }
3243  const std::string myErrStr =
3244  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3245  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3246  gblErrStrm << suffix << endl;
3247  TEUCHOS_TEST_FOR_EXCEPTION
3248  (true, std::invalid_argument, gblErrStrm.str ());
3249  }
3250  }
3251 
3252  if (verbose) {
3253  std::ostringstream os;
3254  os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3255  std::cerr << os.str ();
3256  }
3257 
3258  *this = subViewMV; // shallow copy
3259 
3260  if (verbose) {
3261  std::ostringstream os;
3262  os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3263  std::cerr << os.str ();
3264  }
3265  }
3266 
3267  template <class Scalar, class LO, class GO, class Node>
3268  MultiVector<Scalar, LO, GO, Node>::
3269  MultiVector (const MultiVector<Scalar, LO, GO, Node>& X,
3270  const map_type& subMap,
3271  const size_t rowOffset) :
3272  MultiVector (X, Teuchos::RCP<const map_type> (new map_type (subMap)),
3273  static_cast<local_ordinal_type> (rowOffset))
3274  {}
3275 
3276  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3277  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3279  offsetView (const Teuchos::RCP<const map_type>& subMap,
3280  const size_t offset) const
3281  {
3283  return Teuchos::rcp (new MV (*this, *subMap, offset));
3284  }
3285 
3286  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3287  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3289  offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3290  const size_t offset)
3291  {
3293  return Teuchos::rcp (new MV (*this, *subMap, offset));
3294  }
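 // Usage sketch of the offset-view idiom (illustrative; 'X', 'domainMap',
 // and 'remoteMap' are hypothetical). Given a column-Map MultiVector and
 // a Map that is a per-process prefix of the column Map, a row-offset
 // view avoids any copy:
 //
 //   auto X_dom = X.offsetView (domainMap, 0);             // rows [0, n_dom)
 //   auto X_rem = X.offsetViewNonConst (remoteMap,
 //                  domainMap->getNodeNumElements ());      // rows after n_dom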
3295 
3296  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3297  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3299  subView (const Teuchos::ArrayView<const size_t>& cols) const
3300  {
3301  using Teuchos::Array;
3302  using Teuchos::rcp;
3304 
3305  const size_t numViewCols = static_cast<size_t> (cols.size ());
3306  TEUCHOS_TEST_FOR_EXCEPTION(
3307  numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3308  "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3309  "contain at least one entry, but cols.size() = " << cols.size ()
3310  << " == 0.");
3311 
3312  // Check whether the index set in cols is contiguous. If it is,
3313  // use the more efficient Range1D version of subView.
3314  bool contiguous = true;
3315  for (size_t j = 1; j < numViewCols; ++j) {
3316  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3317  contiguous = false;
3318  break;
3319  }
3320  }
3321  if (contiguous) {
3322  if (numViewCols == 0) {
3323  // The output MV has no columns, so there is nothing to view.
3324  return rcp (new MV (this->getMap (), numViewCols));
3325  } else {
3326  // Use the more efficient contiguous-index-range version.
3327  return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3328  }
3329  }
3330 
3331  if (isConstantStride ()) {
3332  return rcp (new MV (this->getMap (), view_, origView_, cols));
3333  }
3334  else {
3335  Array<size_t> newcols (cols.size ());
3336  for (size_t j = 0; j < numViewCols; ++j) {
3337  newcols[j] = whichVectors_[cols[j]];
3338  }
3339  return rcp (new MV (this->getMap (), view_, origView_, newcols ()));
3340  }
3341  }
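 // Usage sketch (illustrative; 'X' is hypothetical). subView aliases the
 // selected columns; picking non-adjacent columns yields a view with
 // isConstantStride() == false, which the kernels above then handle one
 // column at a time:
 //
 //   Teuchos::Array<size_t> cols (2); cols[0] = 0; cols[1] = 3;
 //   Teuchos::RCP<const Tpetra::MultiVector<> > Xv = X.subView (cols ());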
3342 
3343 
3344  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3345  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3347  subView (const Teuchos::Range1D& colRng) const
3348  {
3349  using ::Tpetra::Details::Behavior;
3350  using Kokkos::ALL;
3351  using Kokkos::subview;
3352  using Teuchos::Array;
3353  using Teuchos::RCP;
3354  using Teuchos::rcp;
3356  const char tfecfFuncName[] = "subView(Range1D): ";
3357 
3358  const size_t lclNumRows = this->getLocalLength ();
3359  const size_t numVecs = this->getNumVectors ();
3360  // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3361  // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3362  // "at least one vector.");
3363  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3364  static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3365  "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3366  << numVecs << ".");
3367  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3368  numVecs != 0 && colRng.size () != 0 &&
3369  (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3370  static_cast<size_t> (colRng.ubound ()) >= numVecs),
3371  std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3372  "," << colRng.ubound () << "] exceeds the valid range of column indices "
3373  "[0, " << numVecs << "].");
3374 
3375  RCP<const MV> X_ret; // the MultiVector subview to return
3376 
3377  // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3378  // broken for the case of views with zero rows. I will brutally
3379  // enforce that the subview has the correct dimensions. In
3380  // particular, in the case of zero rows, I will, if necessary,
3381  // create a new dual_view_type with zero rows and the correct
3382  // number of columns. In a debug build, I will use an all-reduce
3383  // to ensure that it has the correct dimensions on all processes.
3384 
3385  const std::pair<size_t, size_t> rows (0, lclNumRows);
3386  if (colRng.size () == 0) {
3387  const std::pair<size_t, size_t> cols (0, 0); // empty range
3388  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3389  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3390  }
3391  else {
3392  // Returned MultiVector is constant stride only if *this is.
3393  if (isConstantStride ()) {
3394  const std::pair<size_t, size_t> cols (colRng.lbound (),
3395  colRng.ubound () + 1);
3396  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3397  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3398  }
3399  else {
3400  if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3401  // We're only asking for one column, so the result does have
3402  // constant stride, even though this MultiVector does not.
3403  const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3404  whichVectors_[0] + colRng.ubound () + 1);
3405  dual_view_type X_sub = takeSubview (view_, ALL (), col);
3406  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3407  }
3408  else {
3409  Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3410  whichVectors_.begin () + colRng.ubound () + 1);
3411  X_ret = rcp (new MV (this->getMap (), view_, origView_, which));
3412  }
3413  }
3414  }
3415 
3416  const bool debug = Behavior::debug ();
3417  if (debug) {
3418  using Teuchos::Comm;
3419  using Teuchos::outArg;
3420  using Teuchos::REDUCE_MIN;
3421  using Teuchos::reduceAll;
3422 
3423  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
3424  Teuchos::null : this->getMap ()->getComm ();
3425  if (! comm.is_null ()) {
3426  int lclSuccess = 1;
3427  int gblSuccess = 1;
3428 
3429  if (X_ret.is_null ()) {
3430  lclSuccess = 0;
3431  }
3432  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3433  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3434  (lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3435  "MultiVector; the return value of this method) is null on some MPI "
3436  "process in this MultiVector's communicator. This should never "
3437  "happen. Please report this bug to the Tpetra developers.");
3438  if (! X_ret.is_null () &&
3439  X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3440  lclSuccess = 0;
3441  }
3442  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess,
3443  outArg (gblSuccess));
3444  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3445  (lclSuccess != 1, std::logic_error, "X_ret->getNumVectors() != "
3446  "colRng.size(), on at least one MPI process in this MultiVector's "
3447  "communicator. This should never happen. "
3448  "Please report this bug to the Tpetra developers.");
3449  }
3450  }
3451  return X_ret;
3452  }
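 // Note (illustrative): the Range1D overload is the cheaper path for
 // adjacent columns; e.g. X.subView (Teuchos::Range1D (1, 3)) views
 // columns 1..3 inclusive, and stays constant stride whenever X is.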
3453 
3454 
3455  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3456  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3458  subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3459  {
3461  return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3462  }
3463 
3464 
3465  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3466  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3468  subViewNonConst (const Teuchos::Range1D &colRng)
3469  {
3471  return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3472  }
3473 
3474 
3475  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3476  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3477  MultiVector (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& X,
3478  const size_t j)
3479  : base_type (X.getMap ())
3480  {
3481  using Kokkos::subview;
3482  typedef std::pair<size_t, size_t> range_type;
3483  const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3484 
3485  const size_t numCols = X.getNumVectors ();
3486  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3487  (j >= numCols, std::invalid_argument, "Input index j (== " << j
3488  << ") exceeds valid column index range [0, " << numCols << " - 1].");
3489  const size_t jj = X.isConstantStride () ?
3490  static_cast<size_t> (j) :
3491  static_cast<size_t> (X.whichVectors_[j]);
3492  this->view_ = takeSubview (X.view_, Kokkos::ALL (), range_type (jj, jj+1));
3493  this->origView_ = X.origView_;
3494 
3495  // mfh 31 Jul 2017: It would be unwise to execute concurrent
3496  // Export or Import operations with different subviews of a
3497  // MultiVector. Thus, it is safe to reuse communication buffers.
3498  // See #1560 discussion.
3499  //
3500  // We only need one column's worth of buffer for imports_ and
3501  // exports_. Taking subviews now ensures that their lengths will
3502  // be exactly what we need, so we won't have to resize them later.
3503  {
3504  const size_t newSize = X.imports_.extent (0) / numCols;
3505  auto newImports = X.imports_;
3506  newImports.d_view = subview (X.imports_.d_view, range_type (0, newSize));
3507  newImports.h_view = subview (X.imports_.h_view, range_type (0, newSize));
3508  }
3509  {
3510  const size_t newSize = X.exports_.extent (0) / numCols;
3511  auto newExports = X.exports_;
3512  newExports.d_view = subview (X.exports_.d_view, range_type (0, newSize));
3513  newExports.h_view = subview (X.exports_.h_view, range_type (0, newSize));
3514  }
3515  // These two DualViews already either have the right number of
3516  // entries, or zero entries. This means that we don't need to
3517  // resize them.
3520  }
3521 
3522 
3523  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3524  Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3526  getVector (const size_t j) const
3527  {
3529  return Teuchos::rcp (new V (*this, j));
3530  }
3531 
3532 
3533  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3534  Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3536  getVectorNonConst (const size_t j)
3537  {
3539  return Teuchos::rcp (new V (*this, j));
3540  }
3541 
3542 
3543  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3544  void
3546  get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3547  {
3548  using dev_view_type = typename dual_view_type::t_dev;
3549  using host_view_type = typename dual_view_type::t_host;
3550  using IST = impl_scalar_type;
3551  using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3552  Kokkos::HostSpace,
3553  Kokkos::MemoryUnmanaged>;
3554  const char tfecfFuncName[] = "get1dCopy: ";
3555 
3556  const size_t numRows = this->getLocalLength ();
3557  const size_t numCols = this->getNumVectors ();
3558 
3559  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3560  (LDA < numRows, std::runtime_error,
3561  "LDA = " << LDA << " < numRows = " << numRows << ".");
3562  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3563  (numRows > size_t (0) && numCols > size_t (0) &&
3564  size_t (A.size ()) < LDA * (numCols - 1) + numRows,
3565  std::runtime_error,
3566  "A.size() = " << A.size () << ", but its size must be at least "
3567  << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3568 
3569  const std::pair<size_t, size_t> rowRange (0, numRows);
3570  const std::pair<size_t, size_t> colRange (0, numCols);
3571 
3572  input_view_type A_view_orig (reinterpret_cast<IST*> (A.getRawPtr ()),
3573  LDA, numCols);
3574  auto A_view = Kokkos::subview (A_view_orig, rowRange, colRange);
3575 
3576  // Use the most recently updated version of this MultiVector's
3577  // data. This avoids sync'ing, which could violate users'
3578  // expectations.
3579  //
3580  // If we need sync to device, then host has the most recent version.
3581  const bool useHostVersion = this->need_sync_device ();
3582 
3583  dev_view_type srcView_dev;
3584  host_view_type srcView_host;
3585  if (useHostVersion) {
3586  srcView_host = this->getLocalViewHost ();
3587  }
3588  else {
3589  srcView_dev = this->getLocalViewDevice ();
3590  }
3591 
3592  if (this->isConstantStride ()) {
3593  if (useHostVersion) {
3594  Kokkos::deep_copy (A_view, srcView_host);
3595  }
3596  else {
3597  Kokkos::deep_copy (A_view, srcView_dev);
3598  }
3599  }
3600  else {
3601  for (size_t j = 0; j < numCols; ++j) {
3602  const size_t srcCol = this->whichVectors_[j];
3603  auto dstColView = Kokkos::subview (A_view, rowRange, j);
3604 
3605  if (useHostVersion) {
3606  auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
3607  Kokkos::deep_copy (dstColView, srcColView_host);
3608  }
3609  else {
3610  auto srcColView_dev = Kokkos::subview (srcView_dev, rowRange, srcCol);
3611  Kokkos::deep_copy (dstColView, srcColView_dev);
3612  }
3613  }
3614  }
3615  }
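 // Usage sketch (illustrative; assumes Scalar = double and a MultiVector
 // 'X'). get1dCopy packs the local values column by column into
 // caller-owned storage with leading dimension LDA >= getLocalLength():
 //
 //   const size_t LDA = X.getLocalLength ();
 //   Teuchos::Array<double> buf (LDA * X.getNumVectors ());
 //   X.get1dCopy (buf (), LDA);   // buf[i + j*LDA] holds local entry (i,j)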
3616 
3617 
3618  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3619  void
3621  get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3622  {
3624  const char tfecfFuncName[] = "get2dCopy: ";
3625  const size_t numRows = this->getLocalLength ();
3626  const size_t numCols = this->getNumVectors ();
3627 
3628  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3629  static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3630  std::runtime_error, "Input array of pointers must contain as many "
3631  "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3632  << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3633 
3634  if (numRows != 0 && numCols != 0) {
3635  // No side effects until we've validated the input.
3636  for (size_t j = 0; j < numCols; ++j) {
3637  const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3638  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3639  dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3640  "the input array of arrays is not long enough to fit all entries in "
3641  "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3642  << " < getLocalLength() = " << numRows << ".");
3643  }
3644 
3645  // We've validated the input, so it's safe to start copying.
3646  for (size_t j = 0; j < numCols; ++j) {
3647  Teuchos::RCP<const V> X_j = this->getVector (j);
3648  const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3649  X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3650  }
3651  }
3652  }
3653 
3654  namespace { // (anonymous)
3655  template <class SC, class LO, class GO, class NT>
3656  typename MultiVector<SC, LO, GO, NT>::dual_view_type::t_host
3657  syncMVToHostIfNeededAndGetHostView (MultiVector<SC, LO, GO, NT>& X,
3658  const bool markModified)
3659  {
3660  // NOTE (mfh 16 May 2016) get?dView() and get?dViewNonConst()
3661  // (replace ? with 1 or 2) have always been device->host
3662  // synchronization points, since <= 2012. We retain this
3663  // behavior for backwards compatibility.
3664  if (X.need_sync_host ()) {
3665  X.sync_host ();
3666  }
3667  if (markModified) {
3668  X.modify_host ();
3669  }
3670  return X.getLocalViewHost ();
3671  }
3672  } // namespace (anonymous)
3673 
3674 
3675  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3676  Teuchos::ArrayRCP<const Scalar>
3678  get1dView () const
3679  {
3680  if (getLocalLength () == 0 || getNumVectors () == 0) {
3681  return Teuchos::null;
3682  } else {
3683  TEUCHOS_TEST_FOR_EXCEPTION(
3684  ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3685  "get1dView: This MultiVector does not have constant stride, so it is "
3686  "not possible to view its data as a single array. You may check "
3687  "whether a MultiVector has constant stride by calling "
3688  "isConstantStride().");
3689  // Since get1dView() is and was always marked const, I have to
3690  // cast away const here in order not to break backwards
3691  // compatibility.
3692  constexpr bool markModified = false;
3694  auto X_lcl = syncMVToHostIfNeededAndGetHostView (const_cast<MV&> (*this),
3695  markModified);
3696  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3697  Kokkos::Compat::persistingView (X_lcl);
3698  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3699  }
3700  }
3701 
3702  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3703  Teuchos::ArrayRCP<Scalar>
3704  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3705  get1dViewNonConst ()
3706  {
3707  if (this->getLocalLength () == 0 || this->getNumVectors () == 0) {
3708  return Teuchos::null;
3709  }
3710  else {
3711  TEUCHOS_TEST_FOR_EXCEPTION
3712  (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3713  "get1dViewNonConst: This MultiVector does not have constant stride, "
3714  "so it is not possible to view its data as a single array. You may "
3715  "check whether a MultiVector has constant stride by calling "
3716  "isConstantStride().");
3717  constexpr bool markModified = true;
3718  auto X_lcl = syncMVToHostIfNeededAndGetHostView (*this, markModified);
3719  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3720  Kokkos::Compat::persistingView (X_lcl);
3721  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3722  }
3723  }
3724 
3725  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3726  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3727  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3728  get2dViewNonConst ()
3729  {
3730  constexpr bool markModified = true;
3731  auto X_lcl = syncMVToHostIfNeededAndGetHostView (*this, markModified);
3732 
3733  // Don't use the row range here on the outside, in order to avoid
3734  // a strided return type (in case Kokkos::subview is conservative
3735  // about that). Instead, use the row range for the column views
3736  // in the loop.
3737  const size_t myNumRows = this->getLocalLength ();
3738  const size_t numCols = this->getNumVectors ();
3739  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3740 
3741  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
3742  for (size_t j = 0; j < numCols; ++j) {
3743  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3744  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3745  Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3746  Kokkos::Compat::persistingView (X_lcl_j);
3747  views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
3748  }
3749  return views;
3750  }
3751 
3752  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3753  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
3755  get2dView () const
3756  {
3757  // Since get2dView() is and was always marked const, I have to
3758  // cast away const here in order not to break backwards
3759  // compatibility.
3760  constexpr bool markModified = false;
3762  auto X_lcl = syncMVToHostIfNeededAndGetHostView (const_cast<MV&> (*this),
3763  markModified);
3764  // Don't use the row range here on the outside, in order to avoid
3765  // a strided return type (in case Kokkos::subview is conservative
3766  // about that). Instead, use the row range for the column views
3767  // in the loop.
3768  const size_t myNumRows = this->getLocalLength ();
3769  const size_t numCols = this->getNumVectors ();
3770  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3771 
3772  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
3773  for (size_t j = 0; j < numCols; ++j) {
3774  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3775  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3776  Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
3777  Kokkos::Compat::persistingView (X_lcl_j);
3778  views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
3779  }
3780  return views;
3781  }
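 // Usage sketch (illustrative; assumes Scalar = double and a MultiVector
 // 'X'). get2dView() and get2dViewNonConst() return one host ArrayRCP per
 // column:
 //
 //   auto cols = X.get2dView ();
 //   const double x01 = cols[1][0];   // local row 0 of column 1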
3782 
3783  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3784  void
3786  multiply (Teuchos::ETransp transA,
3787  Teuchos::ETransp transB,
3788  const Scalar& alpha,
3789  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3790  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
3791  const Scalar& beta)
3792  {
3793  using ::Tpetra::Details::ProfilingRegion;
3794  using Teuchos::CONJ_TRANS;
3795  using Teuchos::NO_TRANS;
3796  using Teuchos::TRANS;
3797  using Teuchos::RCP;
3798  using Teuchos::rcp;
3799  using Teuchos::rcpFromRef;
3800  using std::endl;
3801  using ATS = Kokkos::ArithTraits<impl_scalar_type>;
3802  using LO = local_ordinal_type;
3803  using STS = Teuchos::ScalarTraits<Scalar>;
3805  const char tfecfFuncName[] = "multiply: ";
3806  ProfilingRegion region ("Tpetra::MV::multiply");
3807 
3808  // This routine performs a variety of matrix-matrix multiply
3809  // operations, interpreting the MultiVector (this-aka C , A and B)
3810  // as 2D matrices. Variations are due to the fact that A, B and C
3811  // can be locally replicated or globally distributed MultiVectors
3812  // and that we may or may not operate with the transpose of A and
3813  // B. Possible cases are:
3814  //
3815  // Operations # Cases Notes
3816  // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
3817  // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
3818  // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
3819  //
3820  // The following operations are not meaningful for 1-D
3821  // distributions:
3822  //
3823  // u1) C(local) = A^T(distr) * B^T(distr) 1
3824  // u2) C(local) = A (distr) * B^X(distr) 2
3825  // u3) C(distr) = A^X(local) * B^X(local) 4
3826  // u4) C(distr) = A^X(local) * B^X(distr) 4
3827  // u5) C(distr) = A^T(distr) * B^X(local) 2
3828  // u6) C(local) = A^X(distr) * B^X(local) 4
3829  // u7) C(distr) = A^X(distr) * B^X(local) 4
3830  // u8) C(local) = A^X(local) * B^X(distr) 4
3831  //
3832  // Total number of cases: 32 (= 2^5).
3833 
3834  impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
3835 
3836  const bool A_is_local = ! A.isDistributed ();
3837  const bool B_is_local = ! B.isDistributed ();
3838  const bool C_is_local = ! this->isDistributed ();
3839 
3840  // In debug mode, check compatibility of local dimensions. We
3841  // only do this in debug mode, since it requires an all-reduce
3842  // to ensure correctness on all processes. It's entirely
3843  // possible that only some processes may have incompatible local
3844  // dimensions. Throwing an exception only on those processes
3845  // could cause this method to hang.
3846  const bool debug = ::Tpetra::Details::Behavior::debug ();
3847  if (debug) {
3848  auto myMap = this->getMap ();
3849  if (! myMap.is_null () && ! myMap->getComm ().is_null ()) {
3850  using Teuchos::REDUCE_MIN;
3851  using Teuchos::reduceAll;
3852  using Teuchos::outArg;
3853 
3854  auto comm = myMap->getComm ();
3855  const size_t A_nrows =
3856  (transA != NO_TRANS) ? A.getNumVectors () : A.getLocalLength ();
3857  const size_t A_ncols =
3858  (transA != NO_TRANS) ? A.getLocalLength () : A.getNumVectors ();
3859  const size_t B_nrows =
3860  (transB != NO_TRANS) ? B.getNumVectors () : B.getLocalLength ();
3861  const size_t B_ncols =
3862  (transB != NO_TRANS) ? B.getLocalLength () : B.getNumVectors ();
3863 
3864  const bool lclBad = this->getLocalLength () != A_nrows ||
3865  this->getNumVectors () != B_ncols || A_ncols != B_nrows;
3866 
3867  const int myRank = comm->getRank ();
3868  std::ostringstream errStrm;
3869  if (this->getLocalLength () != A_nrows) {
3870  errStrm << "Proc " << myRank << ": this->getLocalLength()="
3871  << this->getLocalLength () << " != A_nrows=" << A_nrows
3872  << "." << std::endl;
3873  }
3874  if (this->getNumVectors () != B_ncols) {
3875  errStrm << "Proc " << myRank << ": this->getNumVectors()="
3876  << this->getNumVectors () << " != B_ncols=" << B_ncols
3877  << "." << std::endl;
3878  }
3879  if (A_ncols != B_nrows) {
3880  errStrm << "Proc " << myRank << ": A_ncols="
3881  << A_ncols << " != B_nrows=" << B_nrows
3882  << "." << std::endl;
3883  }
3884 
3885  const int lclGood = lclBad ? 0 : 1;
3886  int gblGood = 0;
3887  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3888  if (gblGood != 1) {
3889  std::ostringstream os;
3890  ::Tpetra::Details::gathervPrint (os, errStrm.str (), *comm);
3891 
3892  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3893  (true, std::runtime_error, "Inconsistent local dimensions on at "
3894  "least one process in this object's communicator." << std::endl
3895  << "Operation: "
3896  << "C(" << (C_is_local ? "local" : "distr") << ") = "
3897  << alpha << "*A"
3898  << (transA == Teuchos::TRANS ? "^T" :
3899  (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3900  << "(" << (A_is_local ? "local" : "distr") << ") * "
3901  << beta << "*B"
3902  << (transB == Teuchos::TRANS ? "^T" :
3903  (transB == Teuchos::CONJ_TRANS ? "^H" : ""))
3904  << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
3905  << "Global dimensions: C(" << this->getGlobalLength () << ", "
3906  << this->getNumVectors () << "), A(" << A.getGlobalLength ()
3907  << ", " << A.getNumVectors () << "), B(" << B.getGlobalLength ()
3908  << ", " << B.getNumVectors () << ")." << std::endl
3909  << os.str ());
3910  }
3911  }
3912  }
3913 
3914  // Case 1: C(local) = A^X(local) * B^X(local)
3915  const bool Case1 = C_is_local && A_is_local && B_is_local;
3916  // Case 2: C(local) = A^T(distr) * B (distr)
3917  const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
3918  transA != NO_TRANS &&
3919  transB == NO_TRANS;
3920  // Case 3: C(distr) = A (distr) * B^X(local)
3921  const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
3922  transA == NO_TRANS;
3923 
3924  // Test that we are considering a meaningful case
3925  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3926  (! Case1 && ! Case2 && ! Case3, std::runtime_error,
3927  "Multiplication of op(A) and op(B) into *this is not a "
3928  "supported use case.");
3929 
3930  if (beta != STS::zero () && Case2) {
3931  // If Case2, then C is local and contributions must be summed
3932  // across all processes. However, if beta != 0, then accumulate
3933  // beta*C into the sum. When summing across all processes, we
3934  // only want to accumulate this once, so set beta == 0 on all
3935  // processes except Process 0.
3936  const int myRank = this->getMap ()->getComm ()->getRank ();
3937  if (myRank != 0) {
3938  beta_local = ATS::zero ();
3939  }
3940  }
3941 
3942  // We only know how to do matrix-matrix multiplies if all the
3943  // MultiVectors have constant stride. If not, we have to make
3944  // temporary copies of those MultiVectors (including possibly
3945  // *this) that don't have constant stride.
3946  RCP<MV> C_tmp;
3947  if (! isConstantStride ()) {
3948  C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
3949  } else {
3950  C_tmp = rcp (this, false);
3951  }
3952 
3953  RCP<const MV> A_tmp;
3954  if (! A.isConstantStride ()) {
3955  A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
3956  } else {
3957  A_tmp = rcpFromRef (A);
3958  }
3959 
3960  RCP<const MV> B_tmp;
3961  if (! B.isConstantStride ()) {
3962  B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
3963  } else {
3964  B_tmp = rcpFromRef (B);
3965  }
3966 
3967  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3968  (! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
3969  ! A_tmp->isConstantStride (), std::logic_error,
3970  "Failed to make temporary constant-stride copies of MultiVectors.");
3971 
3972  {
3973  if (A_tmp->need_sync_device ()) {
3974  const_cast<MV&> (*A_tmp).sync_device (); // const is a lie
3975  }
3976  const LO A_lclNumRows = A_tmp->getLocalLength ();
3977  const LO A_numVecs = A_tmp->getNumVectors ();
3978  auto A_lcl = A_tmp->getLocalViewDevice ();
3979  auto A_sub = Kokkos::subview (A_lcl,
3980  std::make_pair (LO (0), A_lclNumRows),
3981  std::make_pair (LO (0), A_numVecs));
3982 
3983  if (B_tmp->need_sync_device ()) {
3984  const_cast<MV&> (*B_tmp).sync_device (); // const is a lie
3985  }
3986  const LO B_lclNumRows = B_tmp->getLocalLength ();
3987  const LO B_numVecs = B_tmp->getNumVectors ();
3988  auto B_lcl = B_tmp->getLocalViewDevice ();
3989  auto B_sub = Kokkos::subview (B_lcl,
3990  std::make_pair (LO (0), B_lclNumRows),
3991  std::make_pair (LO (0), B_numVecs));
3992 
3993  if (C_tmp->need_sync_device ()) {
3994  const_cast<MV&> (*C_tmp).sync_device (); // const is a lie
3995  }
3996  const LO C_lclNumRows = C_tmp->getLocalLength ();
3997  const LO C_numVecs = C_tmp->getNumVectors ();
3998  auto C_lcl = C_tmp->getLocalViewDevice ();
3999  auto C_sub = Kokkos::subview (C_lcl,
4000  std::make_pair (LO (0), C_lclNumRows),
4001  std::make_pair (LO (0), C_numVecs));
4002  const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' :
4003  (transA == Teuchos::TRANS ? 'T' : 'C'));
4004  const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' :
4005  (transB == Teuchos::TRANS ? 'T' : 'C'));
4006  const impl_scalar_type alpha_IST (alpha);
4007 
4008  ProfilingRegion regionGemm ("Tpetra::MV::multiply-call-gemm");
4009  KokkosBlas::gemm (&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
4010  beta_local, C_sub);
4011  }
4012 
4013  if (! isConstantStride ()) {
4014  ::Tpetra::deep_copy (*this, *C_tmp); // Copy the result back into *this.
4015  }
4016 
4017  // Dispose of (possibly) extra copies of A and B.
4018  A_tmp = Teuchos::null;
4019  B_tmp = Teuchos::null;
4020 
4021  // If Case 2 then sum up *this and distribute it to all processes.
4022  if (Case2) {
4023  this->reduce ();
4024  }
4025  }
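 // Usage sketch (illustrative; 'G' and 'Q' are hypothetical). multiply()
 // performs C = beta*C + alpha*op(A)*op(B) with C = *this. A typical
 // "Case 2" use is forming a Gram matrix in a locally replicated
 // MultiVector, after which reduce() (called internally) sums the result
 // over all processes:
 //
 //   // G is numVecs x numVecs and locally replicated; Q is distributed.
 //   G.multiply (Teuchos::CONJ_TRANS, Teuchos::NO_TRANS, 1.0, Q, Q, 0.0);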
4026 
4027  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4028  void
4030  elementWiseMultiply (Scalar scalarAB,
4031  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
4032  const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
4033  Scalar scalarThis)
4034  {
4035  using Kokkos::ALL;
4036  using Kokkos::subview;
4039  const char tfecfFuncName[] = "elementWiseMultiply: ";
4040 
4041  const size_t lclNumRows = this->getLocalLength ();
4042  const size_t numVecs = this->getNumVectors ();
4043 
4044  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4045  (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
4046  std::runtime_error, "MultiVectors do not have the same local length.");
4047  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4048  numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
4049  "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
4050  << ".");
4051 
4052  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
4053  if (this->need_sync_device ()) {
4054  this->sync_device ();
4055  }
4056  if (A.need_sync_device ()) {
4057  const_cast<V&>(A).sync_device ();
4058  }
4059  if (B.need_sync_device ()) {
4060  const_cast<MV&>(B).sync_device ();
4061  }
4062  this->modify_device ();
4063 
4064  auto this_view = this->getLocalViewDevice ();
4065  auto A_view = A.getLocalViewDevice ();
4066  auto B_view = B.getLocalViewDevice ();
4067 
4068  if (isConstantStride () && B.isConstantStride ()) {
4069  // A is just a Vector; it only has one column, so it always has
4070  // constant stride.
4071  //
4072  // If both *this and B have constant stride, we can do an
4073  // element-wise multiply on all columns at once.
4074  KokkosBlas::mult (scalarThis,
4075  this_view,
4076  scalarAB,
4077  subview (A_view, ALL (), 0),
4078  B_view);
4079  }
4080  else {
4081  for (size_t j = 0; j < numVecs; ++j) {
4082  const size_t C_col = isConstantStride () ? j : whichVectors_[j];
4083  const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
4084  KokkosBlas::mult (scalarThis,
4085  subview (this_view, ALL (), C_col),
4086  scalarAB,
4087  subview (A_view, ALL (), 0),
4088  subview (B_view, ALL (), B_col));
4089  }
4090  }
4091  }
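 // Usage sketch (illustrative; 'Y', 'D', and 'X' are hypothetical).
 // elementWiseMultiply computes, entry-wise,
 //   C(i,j) = scalarThis*C(i,j) + scalarAB * A(i) * B(i,j),
 // so applying a diagonal scaling stored in a Vector D looks like:
 //
 //   Y.elementWiseMultiply (1.0, D, X, 0.0);   // Y = diag(D) * X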
4092 
4093  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4094  void
4095  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4096  reduce ()
4097  {
4099  using ::Tpetra::Details::ProfilingRegion;
4100  ProfilingRegion region ("Tpetra::MV::reduce");
4101 
4102  const auto map = this->getMap ();
4103  if (map.get () == nullptr) {
4104  return;
4105  }
4106  const auto comm = map->getComm ();
4107  if (comm.get () == nullptr) {
4108  return;
4109  }
4110 
4111  // Avoid giving device buffers to MPI. Even if MPI handles them
4112  // correctly, doing so may not perform well.
4113  const bool changed_on_device = this->need_sync_host ();
4114  if (changed_on_device) {
4115  // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4116  // and host will be separate allocations. In that case, it may
4117  // pay to do the all-reduce from device to host.
4118  this->modify_device ();
4119  auto X_lcl = this->getLocalViewDevice ();
4120  allReduceView (X_lcl, X_lcl, *comm);
4121  }
4122  else {
4123  this->modify_host ();
4124  auto X_lcl = this->getLocalViewHost ();
4125  allReduceView (X_lcl, X_lcl, *comm);
4126  }
4127  }
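 // Note (illustrative): reduce() element-wise all-reduces a locally
 // replicated MultiVector over its Map's communicator, so afterwards
 // every process holds the same summed values; this is the collective
 // step behind "Case 2" of multiply() above.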
4128 
4129  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4130  void
4132  replaceLocalValue (const LocalOrdinal lclRow,
4133  const size_t col,
4134  const impl_scalar_type& ScalarValue) const
4135  {
4136 #ifdef HAVE_TPETRA_DEBUG
4137  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4138  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4139  TEUCHOS_TEST_FOR_EXCEPTION(
4140  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4141  std::runtime_error,
4142  "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4143  << " is invalid. The range of valid row indices on this process "
4144  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4145  << ", " << maxLocalIndex << "].");
4146  TEUCHOS_TEST_FOR_EXCEPTION(
4147  vectorIndexOutOfRange(col),
4148  std::runtime_error,
4149  "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4150  << " of the multivector is invalid.");
4151 #endif
4152  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4153  view_.h_view (lclRow, colInd) = ScalarValue;
4154  }
4155 
4156 
4157  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4158  void
4160  sumIntoLocalValue (const LocalOrdinal lclRow,
4161  const size_t col,
4162  const impl_scalar_type& value,
4163  const bool atomic) const
4164  {
4165 #ifdef HAVE_TPETRA_DEBUG
4166  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4167  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4168  TEUCHOS_TEST_FOR_EXCEPTION(
4169  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4170  std::runtime_error,
4171  "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4172  << " is invalid. The range of valid row indices on this process "
4173  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4174  << ", " << maxLocalIndex << "].");
4175  TEUCHOS_TEST_FOR_EXCEPTION(
4176  vectorIndexOutOfRange(col),
4177  std::runtime_error,
4178  "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4179  << " of the multivector is invalid.");
4180 #endif
4181  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4182  if (atomic) {
4183  Kokkos::atomic_add (& (view_.h_view(lclRow, colInd)), value);
4184  }
4185  else {
4186  view_.h_view (lclRow, colInd) += value;
4187  }
4188  }
4189 
4190 
4191  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4192  void
4194  replaceGlobalValue (const GlobalOrdinal gblRow,
4195  const size_t col,
4196  const impl_scalar_type& ScalarValue) const
4197  {
4198  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4199  // touches the RCP's reference count, which isn't thread safe.
4200  const LocalOrdinal lclRow = this->map_->getLocalElement (gblRow);
4201 #ifdef HAVE_TPETRA_DEBUG
4202  const char tfecfFuncName[] = "replaceGlobalValue: ";
4203  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4204  (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4205  std::runtime_error,
4206  "Global row index " << gblRow << " is not present on this process "
4207  << this->getMap ()->getComm ()->getRank () << ".");
4208  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4209  (this->vectorIndexOutOfRange (col), std::runtime_error,
4210  "Vector index " << col << " of the MultiVector is invalid.");
4211 #endif // HAVE_TPETRA_DEBUG
4212  this->replaceLocalValue (lclRow, col, ScalarValue);
4213  }
4214 
4215  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4216  void
4218  sumIntoGlobalValue (const GlobalOrdinal globalRow,
4219  const size_t col,
4220  const impl_scalar_type& value,
4221  const bool atomic) const
4222  {
4223  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4224  // touches the RCP's reference count, which isn't thread safe.
4225  const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4226 #ifdef HAVE_TEUCHOS_DEBUG
4227  TEUCHOS_TEST_FOR_EXCEPTION(
4228  lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4229  std::runtime_error,
4230  "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4231  << " is not present on this process "
4232  << this->getMap ()->getComm ()->getRank () << ".");
4233  TEUCHOS_TEST_FOR_EXCEPTION(
4234  vectorIndexOutOfRange(col),
4235  std::runtime_error,
4236  "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4237  << " of the multivector is invalid.");
4238 #endif
4239  this->sumIntoLocalValue (lclRow, col, value, atomic);
4240  }
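 // Usage sketch (illustrative; 'X', 'lclRow', and 'gblRow' are
 // hypothetical). The four value-modification methods above write to the
 // host view only, so a typical pattern is to refresh the host side first
 // and push the result back to device afterwards:
 //
 //   X.sync_host ();
 //   X.modify_host ();
 //   X.replaceLocalValue (lclRow, 0, 42.0);
 //   X.sumIntoGlobalValue (gblRow, 0, 1.0);
 //   X.sync_device ();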
4241 
4242  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4243  template <class T>
4244  Teuchos::ArrayRCP<T>
4246  getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
4247  size_t j) const
4248  {
4249  typedef Kokkos::DualView<impl_scalar_type*,
4250  typename dual_view_type::array_layout,
4251  execution_space> col_dual_view_type;
4252  const size_t col = isConstantStride () ? j : whichVectors_[j];
4253  col_dual_view_type X_col =
4254  Kokkos::subview (view_, Kokkos::ALL (), col);
4255  return Kokkos::Compat::persistingView (X_col.d_view);
4256  }
4257 
4258 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
4259  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4261  TPETRA_DEPRECATED
4263  getDualView () const {
4264  return view_;
4265  }
4266 #endif // TPETRA_ENABLE_DEPRECATED_CODE
4267 
4268  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4269  void
4272  view_.clear_sync_state ();
4273  }
4274 
4275  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4276  void
4279  view_.sync_host ();
4280  }
4281 
4282  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4283  void
4286  view_.sync_device ();
4287  }
4288 
4289  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4290  bool
4293  return view_.need_sync_host ();
4294  }
4295 
4296  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4297  bool
4300  return view_.need_sync_device ();
4301  }
4302 
4303  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4304  void
4307  view_.modify_device ();
4308  }
4309 
4310  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4311  void
4314  view_.modify_host ();
4315  }
4316 
4317  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4321  return view_.view_device ();
4322  }
4323 
4324  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4328  return view_.view_host ();
4329  }
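 // Note (illustrative): the methods above expose the DualView
 // synchronization model: need_sync_*() reports whether a memory space is
 // stale, sync_*() copies it up to date, and modify_*() marks it as most
 // recently written. A device-side read-modify-write, sketched with a
 // hypothetical MultiVector 'X', looks like:
 //
 //   if (X.need_sync_device ()) X.sync_device ();
 //   X.modify_device ();
 //   auto X_lcl = X.getLocalViewDevice ();
 //   // ... launch a Kokkos kernel that writes X_lcl ...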
4330 
4331  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4332  std::string
4334  descriptionImpl (const std::string& className) const
4335  {
4336  using Teuchos::TypeNameTraits;
4337 
4338  std::ostringstream out;
4339  out << "\"" << className << "\": {";
4340  out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
4341  << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
4342  << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
4343  << ", Node: " << Node::name ()
4344  << "}, ";
4345  if (this->getObjectLabel () != "") {
4346  out << "Label: \"" << this->getObjectLabel () << "\", ";
4347  }
4348  out << "numRows: " << this->getGlobalLength ();
4349  if (className != "Tpetra::Vector") {
4350  out << ", numCols: " << this->getNumVectors ()
4351  << ", isConstantStride: " << this->isConstantStride ();
4352  }
4353  if (this->isConstantStride ()) {
4354  out << ", columnStride: " << this->getStride ();
4355  }
4356  out << "}";
4357 
4358  return out.str ();
4359  }
4360 
4361  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4362  std::string
4363  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4364  description () const
4365  {
4366  return this->descriptionImpl ("Tpetra::MultiVector");
4367  }
4368 
4369  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4370  std::string
4372  localDescribeToString (const Teuchos::EVerbosityLevel vl) const
4373  {
4374  typedef LocalOrdinal LO;
4375  using std::endl;
4376 
4377  if (vl <= Teuchos::VERB_LOW) {
4378  return std::string ();
4379  }
4380  auto map = this->getMap ();
4381  if (map.is_null ()) {
4382  return std::string ();
4383  }
4384  auto outStringP = Teuchos::rcp (new std::ostringstream ());
4385  auto outp = Teuchos::getFancyOStream (outStringP);
4386  Teuchos::FancyOStream& out = *outp;
4387  auto comm = map->getComm ();
4388  const int myRank = comm->getRank ();
4389  const int numProcs = comm->getSize ();
4390 
4391  out << "Process " << myRank << " of " << numProcs << ":" << endl;
4392  Teuchos::OSTab tab1 (out);
4393 
4394  // VERB_MEDIUM and higher prints getLocalLength()
4395  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
4396  out << "Local number of rows: " << lclNumRows << endl;
4397 
4398  if (vl > Teuchos::VERB_MEDIUM) {
4399  // VERB_HIGH and higher prints isConstantStride() and getStride().
4400  // The first is only relevant if the Vector has multiple columns.
4401  if (this->getNumVectors () != static_cast<size_t> (1)) {
4402  out << "isConstantStride: " << this->isConstantStride () << endl;
4403  }
4404  if (this->isConstantStride ()) {
4405  out << "Column stride: " << this->getStride () << endl;
4406  }
4407 
4408  if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4409  // VERB_EXTREME prints values. Get a host View of the
4410  // Vector's local data, so we can print it. (Don't assume
4411  // that we can access device data directly in host code.)
4412  typename dual_view_type::t_host X_host;
4413  if (need_sync_host ()) {
4414  // Device memory has the latest version. Don't actually
4415  // sync to host; that changes the Vector's state, and may
4416  // change future computations (that use the data's current
4417  // place to decide where to compute). Instead, create a
4418  // temporary host copy and print that.
4419  auto X_dev = getLocalViewDevice ();
4420  auto X_host_copy = Kokkos::create_mirror_view (X_dev);
4421  Kokkos::deep_copy (X_host_copy, X_dev);
4422  X_host = X_host_copy;
4423  }
4424  else {
4425  // Either host and device are in sync, or host has the
4426  // latest version of the Vector's data. Thus, we can use
4427  // the host version directly.
4428  X_host = getLocalViewHost ();
4429  }
4430  // The square braces [] and their contents are in Matlab
4431  // format, so users may copy and paste directly into Matlab.
4432  out << "Values: " << endl
4433  << "[";
4434  const LO numCols = static_cast<LO> (this->getNumVectors ());
4435  if (numCols == 1) {
4436  for (LO i = 0; i < lclNumRows; ++i) {
4437  out << X_host(i,0);
4438  if (i + 1 < lclNumRows) {
4439  out << "; ";
4440  }
4441  }
4442  }
4443  else {
4444  for (LO i = 0; i < lclNumRows; ++i) {
4445  for (LO j = 0; j < numCols; ++j) {
4446  out << X_host(i,j);
4447  if (j + 1 < numCols) {
4448  out << ", ";
4449  }
4450  }
4451  if (i + 1 < lclNumRows) {
4452  out << "; ";
4453  }
4454  }
4455  }
4456  out << "]" << endl;
4457  }
4458  }
4459 
4460  out.flush (); // make sure the ostringstream got everything
4461  return outStringP->str ();
4462  }
4463 
4464  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4465  void
4467  describeImpl (Teuchos::FancyOStream& out,
4468  const std::string& className,
4469  const Teuchos::EVerbosityLevel verbLevel) const
4470  {
4471  using Teuchos::TypeNameTraits;
4472  using Teuchos::VERB_DEFAULT;
4473  using Teuchos::VERB_NONE;
4474  using Teuchos::VERB_LOW;
4475  using std::endl;
4476  const Teuchos::EVerbosityLevel vl =
4477  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4478 
4479  if (vl == VERB_NONE) {
4480  return; // don't print anything
4481  }
4482  // If this Vector's Comm is null, then the Vector does not
4483  // participate in collective operations with the other processes.
4484  // In that case, it is not even legal to call this method. The
4485  // reasonable thing to do in that case is nothing.
4486  auto map = this->getMap ();
4487  if (map.is_null ()) {
4488  return;
4489  }
4490  auto comm = map->getComm ();
4491  if (comm.is_null ()) {
4492  return;
4493  }
4494 
4495  const int myRank = comm->getRank ();
4496 
4497  // Only Process 0 should touch the output stream, but this method
4498  // in general may need to do communication. Thus, we may need to
4499  // preserve the current tab level across multiple "if (myRank ==
4500  // 0) { ... }" inner scopes. This is why we sometimes create
4501  // OSTab instances by pointer, instead of by value. We only need
4502  // to create them by pointer if the tab level must persist through
4503  // multiple inner scopes.
4504  Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4505 
4506  // VERB_LOW and higher prints the equivalent of description().
4507  if (myRank == 0) {
4508  tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4509  out << "\"" << className << "\":" << endl;
4510  tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4511  {
4512  out << "Template parameters:" << endl;
4513  Teuchos::OSTab tab2 (out);
4514  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4515  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4516  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4517  << "Node: " << Node::name () << endl;
4518  }
4519  if (this->getObjectLabel () != "") {
4520  out << "Label: \"" << this->getObjectLabel () << "\", ";
4521  }
4522  out << "Global number of rows: " << this->getGlobalLength () << endl;
4523  if (className != "Tpetra::Vector") {
4524  out << "Number of columns: " << this->getNumVectors () << endl;
4525  }
4526  // getStride() may differ on different processes, so it (and
4527  // isConstantStride()) properly belong to per-process data.
4528  }
4529 
4530  // This is collective over the Map's communicator.
4531  if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4532  const std::string lclStr = this->localDescribeToString (vl);
4533  ::Tpetra::Details::gathervPrint (out, lclStr, *comm);
4534  }
4535  }
4536 
4537  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4538  void
4539  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4540  describe (Teuchos::FancyOStream &out,
4541  const Teuchos::EVerbosityLevel verbLevel) const
4542  {
4543  this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4544  }
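Usage sketch (editor's addition; wrapping std::cout in a FancyOStream is an assumption): describe() is collective over the Map's communicator for verbosity levels above VERB_LOW, so every process must call it even though only Process 0 writes the summary.

  #include "Tpetra_MultiVector.hpp"
  #include "Teuchos_FancyOStream.hpp"
  #include <iostream>

  void printVerbose (const Tpetra::MultiVector<>& X)
  {
    auto out = Teuchos::getFancyOStream (Teuchos::rcpFromRef (std::cout));
    // VERB_EXTREME also prints each process' local values in Matlab format.
    X.describe (*out, Teuchos::VERB_EXTREME);
  }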
4545 
4546  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4547  void
4548  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4549  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4550  {
4551  replaceMap (newMap);
4552  }
4553 
4554  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4555  void
4556  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4557  assign (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& src)
4558  {
4559  using ::Tpetra::Details::localDeepCopy;
4560  const char prefix[] = "Tpetra::MultiVector::assign: ";
4561 
4562  TEUCHOS_TEST_FOR_EXCEPTION
4563  (this->getGlobalLength () != src.getGlobalLength () ||
4564  this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
4565  prefix << "Global dimensions of the two Tpetra::MultiVector "
4566  "objects do not match. src has dimensions [" << src.getGlobalLength ()
4567  << "," << src.getNumVectors () << "], and *this has dimensions ["
4568  << this->getGlobalLength () << "," << this->getNumVectors () << "].");
4569  // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4570  TEUCHOS_TEST_FOR_EXCEPTION
4571  (this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
4572  prefix << "The local row counts of the two Tpetra::MultiVector "
4573  "objects do not match. src has " << src.getLocalLength () << " row(s) "
4574  << " and *this has " << this->getLocalLength () << " row(s).");
4575 
4576  // See #1510. We're writing to *this, so we don't care about its
4577  // contents in either memory space, and we don't want
4578  // DualView::modify to complain about "concurrent modification" of
4579  // host and device Views.
4580  this->clear_sync_state();
4581  this->modify_device ();
4582 
4583  // If need sync to device, then host has most recent version.
4584  const bool src_last_updated_on_host = src.need_sync_device ();
4585 
4586  if (src_last_updated_on_host) {
4587  localDeepCopy (this->getLocalViewDevice (),
4588  src.getLocalViewHost (),
4589  this->isConstantStride (),
4590  src.isConstantStride (),
4591  this->whichVectors_ (),
4592  src.whichVectors_ ());
4593  }
4594  else {
4595  localDeepCopy (this->getLocalViewDevice (),
4596  src.getLocalViewDevice (),
4597  this->isConstantStride (),
4598  src.isConstantStride (),
4599  this->whichVectors_ (),
4600  src.whichVectors_ ());
4601  }
4602  }
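A usage sketch of assign (editor's addition; X and Y are assumptions). Source and target must have matching global and local dimensions, but they may have different sync states and column strides:

  #include "Tpetra_MultiVector.hpp"

  void copyInto (Tpetra::MultiVector<>& Y, const Tpetra::MultiVector<>& X)
  {
    // Throws std::invalid_argument if the global or local dimensions differ.
    Y.assign (X); // deep copy of X's values into Y; Y keeps its own Map and layout
  }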
4603 
4604  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4605  bool
4606  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4607  isSameSize (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& vec) const {
4608  using ::Tpetra::Details::PackTraits;
4609  using ST = impl_scalar_type;
4610  using HES =
4611  typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
4612 
4613  const size_t l1 = this->getLocalLength();
4614  const size_t l2 = vec.getLocalLength();
4615  if ((l1!=l2) || (this->getNumVectors() != vec.getNumVectors())) {
4616  return false;
4617  }
4618  if (l1==0) {
4619  return true;
4620  }
4621 
4622  auto v1 = this->getLocalViewHost ();
4623  auto v2 = vec.getLocalViewHost ();
4624  if (PackTraits<ST, HES>::packValueCount (v1(0,0)) !=
4625  PackTraits<ST, HES>::packValueCount (v2(0,0))) {
4626  return false;
4627  }
4628 
4629  return true;
4630  }
4631 
4632  template <class ST, class LO, class GO, class NT>
4633  void
4634  MultiVector<ST, LO, GO, NT>::swap (MultiVector<ST, LO, GO, NT>& mv) {
4635  std::swap(mv.map_, this->map_);
4636  std::swap(mv.view_, this->view_);
4637  std::swap(mv.origView_, this->origView_);
4638  std::swap(mv.whichVectors_, this->whichVectors_);
4639  }
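A usage sketch of swap (editor's addition; A and B are assumptions). Because only the Map pointer, the two DualViews, and the whichVectors_ indices are exchanged, the swap is O(1) and copies no entries:

  #include "Tpetra_MultiVector.hpp"

  void exchange (Tpetra::MultiVector<>& A, Tpetra::MultiVector<>& B)
  {
    A.swap (B); // afterwards A views B's original data and vice versa
  }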
4640 
4641 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4642  template <class ST, class LO, class GO, class NT>
4643  void
4644  deep_copy (MultiVector<ST, LO, GO, NT>& dst,
4645  const Teuchos::SerialDenseMatrix<int, ST>& src)
4646  {
4647  using ::Tpetra::Details::localDeepCopy;
4648  using MV = MultiVector<ST, LO, GO, NT>;
4649  using IST = typename MV::impl_scalar_type;
4650  using input_view_type =
4651  Kokkos::View<const IST**, Kokkos::LayoutLeft,
4652  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4653  using pair_type = std::pair<LO, LO>;
4654 
4655  const LO numRows = static_cast<LO> (src.numRows ());
4656  const LO numCols = static_cast<LO> (src.numCols ());
4657 
4658  TEUCHOS_TEST_FOR_EXCEPTION
4659  (numRows != static_cast<LO> (dst.getLocalLength ()) ||
4660  numCols != static_cast<LO> (dst.getNumVectors ()),
4661  std::invalid_argument, "Tpetra::deep_copy: On Process "
4662  << dst.getMap ()->getComm ()->getRank () << ", dst is "
4663  << dst.getLocalLength () << " x " << dst.getNumVectors ()
4664  << ", but src is " << numRows << " x " << numCols << ".");
4665 
4666  const IST* src_raw = reinterpret_cast<const IST*> (src.values ());
4667  input_view_type src_orig (src_raw, src.stride (), numCols);
4668  input_view_type src_view =
4669  Kokkos::subview (src_orig, pair_type (0, numRows), Kokkos::ALL ());
4670 
4671  dst.clear_sync_state ();
4672  dst.modify_device ();
4673  constexpr bool src_isConstantStride = true;
4674  Teuchos::ArrayView<const size_t> srcWhichVectors (nullptr, 0);
4675  localDeepCopy (dst.getLocalViewDevice (),
4676  src_view,
4677  dst.isConstantStride (),
4678  src_isConstantStride,
4679  getMultiVectorWhichVectors (dst),
4680  srcWhichVectors);
4681  }
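Usage sketch for the overload above (editor's addition; the fill value passed to putScalar is an assumption). It is available only when HAVE_TPETRACORE_TEUCHOSNUMERICS is defined, and the SerialDenseMatrix must match the MultiVector's local dimensions on each calling process:

  #include "Tpetra_MultiVector.hpp"
  #include "Teuchos_SerialDenseMatrix.hpp"

  void fillFromDense (Tpetra::MultiVector<double>& X)
  {
    const int numRows = static_cast<int> (X.getLocalLength ());
    const int numCols = static_cast<int> (X.getNumVectors ());
    Teuchos::SerialDenseMatrix<int, double> A (numRows, numCols);
    A.putScalar (1.0);         // fill the local (host) matrix
    Tpetra::deep_copy (X, A);  // copy it into X's device view
  }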
4682 
4683  template <class ST, class LO, class GO, class NT>
4684  void
4685  deep_copy (Teuchos::SerialDenseMatrix<int, ST>& dst,
4686  const MultiVector<ST, LO, GO, NT>& src)
4687  {
4688  using ::Tpetra::Details::localDeepCopy;
4689  using MV = MultiVector<ST, LO, GO, NT>;
4690  using IST = typename MV::impl_scalar_type;
4691  using output_view_type =
4692  Kokkos::View<IST**, Kokkos::LayoutLeft,
4693  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4694  using pair_type = std::pair<LO, LO>;
4695 
4696  const LO numRows = static_cast<LO> (dst.numRows ());
4697  const LO numCols = static_cast<LO> (dst.numCols ());
4698 
4699  TEUCHOS_TEST_FOR_EXCEPTION
4700  (numRows != static_cast<LO> (src.getLocalLength ()) ||
4701  numCols != static_cast<LO> (src.getNumVectors ()),
4702  std::invalid_argument, "Tpetra::deep_copy: On Process "
4703  << src.getMap ()->getComm ()->getRank () << ", src is "
4704  << src.getLocalLength () << " x " << src.getNumVectors ()
4705  << ", but dst is " << numRows << " x " << numCols << ".");
4706 
4707  IST* dst_raw = reinterpret_cast<IST*> (dst.values ());
4708  output_view_type dst_orig (dst_raw, dst.stride (), numCols);
4709  auto dst_view =
4710  Kokkos::subview (dst_orig, pair_type (0, numRows), Kokkos::ALL ());
4711 
4712  constexpr bool dst_isConstantStride = true;
4713  Teuchos::ArrayView<const size_t> dstWhichVectors (nullptr, 0);
4714 
4715  // Prefer the host version of src's data.
4716  if (src.need_sync_host ()) { // last modified on device
4717  localDeepCopy (dst_view,
4718  src.getLocalViewDevice (),
4719  dst_isConstantStride,
4720  src.isConstantStride (),
4721  dstWhichVectors,
4722  getMultiVectorWhichVectors (src));
4723  }
4724  else {
4725  localDeepCopy (dst_view,
4726  src.getLocalViewHost (),
4727  dst_isConstantStride,
4728  src.isConstantStride (),
4729  dstWhichVectors,
4730  getMultiVectorWhichVectors (src));
4731  }
4732  }
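The reverse direction, also only available with HAVE_TPETRACORE_TEUCHOSNUMERICS; a sketch (editor's addition; X is an assumption):

  #include "Tpetra_MultiVector.hpp"
  #include "Teuchos_SerialDenseMatrix.hpp"

  Teuchos::SerialDenseMatrix<int, double>
  localCopyOf (const Tpetra::MultiVector<double>& X)
  {
    Teuchos::SerialDenseMatrix<int, double>
      A (static_cast<int> (X.getLocalLength ()),
         static_cast<int> (X.getNumVectors ()));
    Tpetra::deep_copy (A, X); // reads host or device data, whichever is current
    return A;
  }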
4733 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4734 
4735  template <class Scalar, class LO, class GO, class NT>
4736  Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4737  createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4738  size_t numVectors)
4739  {
4740  typedef MultiVector<Scalar, LO, GO, NT> MV;
4741  return Teuchos::rcp (new MV (map, numVectors));
4742  }
4743 
4744  template <class ST, class LO, class GO, class NT>
4745  MultiVector<ST, LO, GO, NT>
4746  createCopy (const MultiVector<ST, LO, GO, NT>& src)
4747  {
4748  typedef MultiVector<ST, LO, GO, NT> MV;
4749  MV cpy (src.getMap (), src.getNumVectors (), false);
4750  cpy.assign (src);
4751  return cpy;
4752  }
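A sketch (editor's addition; the contiguous Map with 100 global rows and 3 columns is an assumption) showing the two nonmember helpers above together:

  #include "Tpetra_Map.hpp"
  #include "Tpetra_MultiVector.hpp"

  void makeAndCopy (const Teuchos::RCP<const Teuchos::Comm<int> >& comm)
  {
    using map_type = Tpetra::Map<>;
    const Tpetra::global_size_t numGlobalRows = 100;
    Teuchos::RCP<const map_type> map =
      Teuchos::rcp (new map_type (numGlobalRows, 0, comm));
    auto X = Tpetra::createMultiVector<double> (map, 3); // 100 x 3, zero-filled
    X->randomize ();
    auto X_copy = Tpetra::createCopy (*X); // deep copy, returned by value
  }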
4753 
4754 } // namespace Tpetra
4755 
4756 //
4757 // Explicit instantiation macro
4758 //
4759 // Must be expanded from within the Tpetra namespace!
4760 //
4761 
4762 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4763 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4764  template class MultiVector< SCALAR , LO , GO , NODE >; \
4765  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
4766  template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4767  template void deep_copy (MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4768  template void deep_copy (Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4769 
4770 #else
4771 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4772  template class MultiVector< SCALAR , LO , GO , NODE >; \
4773  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src);
4774 
4775 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
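Sketch (editor's addition; the Node type name is an assumption) of how an explicit-template-instantiation source file might expand the macro, from within the Tpetra namespace as the comment above requires:

  namespace Tpetra {
    TPETRA_MULTIVECTOR_INSTANT(double, int, long long, Kokkos::Compat::KokkosSerialWrapperNode)
  }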
4776 
4777 #endif // TPETRA_MULTIVECTOR_DEF_HPP