Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_MULTIVECTOR_DEF_HPP
41 #define TPETRA_MULTIVECTOR_DEF_HPP
42 
50 
51 #include "Tpetra_Util.hpp"
52 #include "Tpetra_Vector.hpp"
55 #include "Tpetra_Details_checkView.hpp"
56 #include "Tpetra_Details_fill.hpp"
64 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
65 # include "Teuchos_SerialDenseMatrix.hpp"
66 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
67 #include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
68 #include "KokkosCompat_View.hpp"
69 #include "KokkosBlas.hpp"
70 #include "KokkosKernels_Utils.hpp"
71 #include "Kokkos_Random.hpp"
72 #include "Kokkos_ArithTraits.hpp"
73 #include <memory>
74 #include <sstream>
75 
76 #ifdef HAVE_TPETRA_INST_FLOAT128
77 namespace Kokkos {
78  // FIXME (mfh 04 Sep 2015) Just a stub for now!
79  template<class Generator>
80  struct rand<Generator, __float128> {
81  static KOKKOS_INLINE_FUNCTION __float128 max ()
82  {
83  return static_cast<__float128> (1.0);
84  }
85  static KOKKOS_INLINE_FUNCTION __float128
86  draw (Generator& gen)
87  {
88  // Half the smallest normalized double, is the scaling factor of
89  // the lower-order term in the double-double representation.
90  const __float128 scalingFactor =
91  static_cast<__float128> (std::numeric_limits<double>::min ()) /
92  static_cast<__float128> (2.0);
93  const __float128 higherOrderTerm = static_cast<__float128> (gen.drand ());
94  const __float128 lowerOrderTerm =
95  static_cast<__float128> (gen.drand ()) * scalingFactor;
96  return higherOrderTerm + lowerOrderTerm;
97  }
98  static KOKKOS_INLINE_FUNCTION __float128
99  draw (Generator& gen, const __float128& range)
100  {
101  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
102  const __float128 scalingFactor =
103  static_cast<__float128> (std::numeric_limits<double>::min ()) /
104  static_cast<__float128> (2.0);
105  const __float128 higherOrderTerm =
106  static_cast<__float128> (gen.drand (range));
107  const __float128 lowerOrderTerm =
108  static_cast<__float128> (gen.drand (range)) * scalingFactor;
109  return higherOrderTerm + lowerOrderTerm;
110  }
111  static KOKKOS_INLINE_FUNCTION __float128
112  draw (Generator& gen, const __float128& start, const __float128& end)
113  {
114  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
115  const __float128 scalingFactor =
116  static_cast<__float128> (std::numeric_limits<double>::min ()) /
117  static_cast<__float128> (2.0);
118  const __float128 higherOrderTerm =
119  static_cast<__float128> (gen.drand (start, end));
120  const __float128 lowerOrderTerm =
121  static_cast<__float128> (gen.drand (start, end)) * scalingFactor;
122  return higherOrderTerm + lowerOrderTerm;
123  }
124  };
125 } // namespace Kokkos
126 #endif // HAVE_TPETRA_INST_FLOAT128
127 
128 namespace { // (anonymous)
129 
144  template<class ST, class LO, class GO, class NT>
146  allocDualView (const size_t lclNumRows,
147  const size_t numCols,
148  const bool zeroOut = true,
149  const bool allowPadding = false)
150  {
151  using ::Tpetra::Details::Behavior;
152  using Kokkos::AllowPadding;
153  using Kokkos::view_alloc;
154  using Kokkos::WithoutInitializing;
155  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
156  typedef typename dual_view_type::t_dev dev_view_type;
157  // This needs to be a string and not a char*, if given as an
158  // argument to Kokkos::view_alloc. This is because view_alloc
159  // also allows a raw pointer as its first argument. See
160  // https://github.com/kokkos/kokkos/issues/434.
161  const std::string label ("MV::DualView");
162  const bool debug = Behavior::debug ();
163 
164  // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
165  // creation of the DualView's Views works around
166  // Kokkos::DualView's current inability to accept an
167  // AllocationProperties initial argument (as Kokkos::View does).
168  // However, the work-around is harmless, since it does what the
169  // (currently nonexistent) equivalent DualView constructor would
170  // have done anyway.
171 
172  dev_view_type d_view;
173  if (zeroOut) {
174  if (allowPadding) {
175  d_view = dev_view_type (view_alloc (label, AllowPadding),
176  lclNumRows, numCols);
177  }
178  else {
179  d_view = dev_view_type (view_alloc (label),
180  lclNumRows, numCols);
181  }
182  }
183  else {
184  if (allowPadding) {
185  d_view = dev_view_type (view_alloc (label,
186  WithoutInitializing,
187  AllowPadding),
188  lclNumRows, numCols);
189  }
190  else {
191  d_view = dev_view_type (view_alloc (label, WithoutInitializing),
192  lclNumRows, numCols);
193  }
194  if (debug) {
195  // Filling with NaN is a cheap and effective way to tell if
196  // downstream code is trying to use a MultiVector's data
197  // without them having been initialized. ArithTraits lets us
198  // call nan() even if the scalar type doesn't define it; it
199  // just returns some undefined value in the latter case. This
200  // won't hurt anything because by setting zeroOut=false, users
201  // already agreed that they don't care about the contents of
202  // the MultiVector.
203  const ST nan = Kokkos::Details::ArithTraits<ST>::nan ();
204  KokkosBlas::fill (d_view, nan);
205  }
206  }
207  if (debug) {
208  TEUCHOS_TEST_FOR_EXCEPTION
209  (static_cast<size_t> (d_view.extent (0)) != lclNumRows ||
210  static_cast<size_t> (d_view.extent (1)) != numCols, std::logic_error,
211  "allocDualView: d_view's dimensions actual dimensions do not match "
212  "requested dimensions. d_view is " << d_view.extent (0) << " x " <<
213  d_view.extent (1) << "; requested " << lclNumRows << " x " << numCols
214  << ". Please report this bug to the Tpetra developers.");
215  }
216 
217  dual_view_type dv (d_view, Kokkos::create_mirror_view (d_view));
218  // Whether or not the user cares about the initial contents of the
219  // MultiVector, the device and host views are out of sync. We
220  // prefer to work in device memory. The way to ensure this
221  // happens is to mark the device view as modified.
222  dv.modify_device ();
223 
224  return dv;
225  }
226 
227  // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
228  //
229  // T: The type of the entries of the View.
230  // ExecSpace: The Kokkos execution space.
231  template<class T, class ExecSpace>
232  struct MakeUnmanagedView {
233  // The 'false' part of the branch carefully ensures that this
234  // won't attempt to use a host execution space that hasn't been
235  // initialized. For example, if Kokkos::OpenMP is disabled and
236  // Kokkos::Threads is enabled, the latter is always the default
237  // execution space of Kokkos::HostSpace, even when ExecSpace is
238  // Kokkos::Serial. That's why we go through the trouble of asking
239  // Kokkos::DualView what _its_ space is. That seems to work
240  // around this default execution space issue.
241  //
242  // NOTE (mfh 29 Jan 2016): See kokkos/kokkos#178 for why we use
243  // a memory space, rather than an execution space, as the first
244  // argument of VerifyExecutionCanAccessMemorySpace.
245  typedef typename Kokkos::Impl::if_c<
246  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
247  typename ExecSpace::memory_space,
248  Kokkos::HostSpace>::value,
249  typename ExecSpace::device_type,
250  typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
251  typedef Kokkos::LayoutLeft array_layout;
252  typedef Kokkos::View<T*, array_layout, host_exec_space,
253  Kokkos::MemoryUnmanaged> view_type;
254 
255  static view_type getView (const Teuchos::ArrayView<T>& x_in)
256  {
257  const size_t numEnt = static_cast<size_t> (x_in.size ());
258  if (numEnt == 0) {
259  return view_type ();
260  } else {
261  return view_type (x_in.getRawPtr (), numEnt);
262  }
263  }
264  };
265 
266  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
267  // taking a subview of a 0 x N DualView incorrectly always results
268  // in a 0 x 0 DualView.
269  template<class DualViewType>
270  DualViewType
271  takeSubview (const DualViewType& X,
272  const Kokkos::Impl::ALL_t&,
273  const std::pair<size_t, size_t>& colRng)
274  {
275  if (X.extent (0) == 0 && X.extent (1) != 0) {
276  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
277  }
278  else {
279  return subview (X, Kokkos::ALL (), colRng);
280  }
281  }
282 
283  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
284  // taking a subview of a 0 x N DualView incorrectly always results
285  // in a 0 x 0 DualView.
286  template<class DualViewType>
287  DualViewType
288  takeSubview (const DualViewType& X,
289  const std::pair<size_t, size_t>& rowRng,
290  const std::pair<size_t, size_t>& colRng)
291  {
292  if (X.extent (0) == 0 && X.extent (1) != 0) {
293  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
294  }
295  else {
296  return subview (X, rowRng, colRng);
297  }
298  }
299 
300  template<class DualViewType>
301  size_t
302  getDualViewStride (const DualViewType& dv)
303  {
304  // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
305  // method yet, but its Views do.
306  const size_t LDA = dv.d_view.stride (1);
307  const size_t numRows = dv.extent (0);
308 
309  if (LDA == 0) {
310  return (numRows == 0) ? size_t (1) : numRows;
311  }
312  else {
313  return LDA;
314  }
315  }
316 
317  template<class ViewType>
318  size_t
319  getViewStride (const ViewType& view)
320  {
321  const size_t LDA = view.stride (1);
322  const size_t numRows = view.extent (0);
323 
324  if (LDA == 0) {
325  return (numRows == 0) ? size_t (1) : numRows;
326  }
327  else {
328  return LDA;
329  }
330  }
331 
332  template <class impl_scalar_type, class buffer_device_type>
333  bool
334  runKernelOnHost ( Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports )
335  {
336  if (! imports.need_sync_device ()) {
337  return false; // most up-to-date on device
338  }
339  else { // most up-to-date on host
341  return imports.extent(0) <= localLengthThreshold;
342  }
343  }
344 
345 
346  template <class SC, class LO, class GO, class NT>
347  bool
348  runKernelOnHost (const ::Tpetra::MultiVector<SC, LO, GO, NT>& X)
349  {
350  if (! X.need_sync_device ()) {
351  return false; // most up-to-date on device
352  }
353  else { // most up-to-date on host
354  constexpr size_t localLengthThreshold = 10000;
355  return X.getLocalLength () <= localLengthThreshold;
356  }
357  }
358 
359  template <class SC, class LO, class GO, class NT>
360  void
361  multiVectorNormImpl (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
364  {
367  using val_type = typename MV::impl_scalar_type;
368  using mag_type = typename MV::mag_type;
369  using dual_view_type = typename MV::dual_view_type;
370 
371  auto map = X.getMap ();
372  auto comm = map.is_null () ? nullptr : map->getComm ().getRawPtr ();
373  auto whichVecs = getMultiVectorWhichVectors (X);
374  const bool isConstantStride = X.isConstantStride ();
375  const bool isDistributed = X.isDistributed ();
376 
377  const bool runOnHost = runKernelOnHost (X);
378  if (runOnHost) {
379  using view_type = typename dual_view_type::t_host;
380  using array_layout = typename view_type::array_layout;
381  using device_type = typename view_type::device_type;
382 
383  if (X.need_sync_host ()) {
384  X.sync_host ();
385  }
386  view_type X_lcl = X.getLocalViewHost ();
387  normImpl<val_type, array_layout, device_type,
388  mag_type> (norms, X_lcl, whichNorm, whichVecs,
389  isConstantStride, isDistributed, comm);
390  }
391  else {
392  using view_type = typename dual_view_type::t_dev;
393  using array_layout = typename view_type::array_layout;
394  using device_type = typename view_type::device_type;
395 
396  if (X.need_sync_device ()) {
397  X.sync_device ();
398  }
399  view_type X_lcl = X.getLocalViewDevice ();
400  normImpl<val_type, array_layout, device_type,
401  mag_type> (norms, X_lcl, whichNorm, whichVecs,
402  isConstantStride, isDistributed, comm);
403  }
404  }
405 } // namespace (anonymous)
406 
407 
408 namespace Tpetra {
409 
410  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
411  bool
412  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
413  vectorIndexOutOfRange (const size_t VectorIndex) const {
414  return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
415  }
416 
417  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
420  base_type (Teuchos::rcp (new map_type ()))
421  {}
422 
423  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
425  MultiVector (const Teuchos::RCP<const map_type>& map,
426  const size_t numVecs,
427  const bool zeroOut) : /* default is true */
428  base_type (map)
429  {
430  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,numVecs,zeroOut)");
431 
432  const size_t lclNumRows = this->getLocalLength ();
433  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
434  origView_ = view_;
435  }
436 
437  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
440  const Teuchos::DataAccess copyOrView) :
441  base_type (source),
442  view_ (source.view_),
443  origView_ (source.origView_),
444  whichVectors_ (source.whichVectors_)
445  {
447  const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
448  "const Teuchos::DataAccess): ";
449 
450  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV 2-arg \"copy\" ctor");
451 
452  if (copyOrView == Teuchos::Copy) {
453  // Reuse the conveniently already existing function that creates
454  // a deep copy.
455  MV cpy = createCopy (source);
456  this->view_ = cpy.view_;
457  this->origView_ = cpy.origView_;
458  this->whichVectors_ = cpy.whichVectors_;
459  }
460  else if (copyOrView == Teuchos::View) {
461  }
462  else {
463  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
464  true, std::invalid_argument, "Second argument 'copyOrView' has an "
465  "invalid value " << copyOrView << ". Valid values include "
466  "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
467  << Teuchos::View << ".");
468  }
469  }
470 
471  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
473  MultiVector (const Teuchos::RCP<const map_type>& map,
474  const dual_view_type& view) :
475  base_type (map),
476  view_ (view),
477  origView_ (view)
478  {
479  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
480  const size_t lclNumRows_map = map.is_null () ? size_t (0) :
481  map->getNodeNumElements ();
482  const size_t lclNumRows_view = view.extent (0);
483  const size_t LDA = getDualViewStride (view);
484 
485  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
486  (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
487  std::invalid_argument, "Kokkos::DualView does not match Map. "
488  "map->getNodeNumElements() = " << lclNumRows_map
489  << ", view.extent(0) = " << lclNumRows_view
490  << ", and getStride() = " << LDA << ".");
491 
492  using ::Tpetra::Details::Behavior;
493  const bool debug = Behavior::debug ();
494  if (debug) {
495  using ::Tpetra::Details::checkGlobalDualViewValidity;
496  std::ostringstream gblErrStrm;
497  const bool verbose = Behavior::verbose ();
498  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
499  const bool gblValid =
500  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
501  comm.getRawPtr ());
502  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
503  (! gblValid, std::runtime_error, gblErrStrm.str ());
504  }
505  }
506 
507 
508  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
510  MultiVector (const Teuchos::RCP<const map_type>& map,
511  const typename dual_view_type::t_dev& d_view) :
512  base_type (map)
513  {
514  using Teuchos::ArrayRCP;
515  using Teuchos::RCP;
516  const char tfecfFuncName[] = "MultiVector(map,d_view): ";
517 
518  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,d_view)");
519 
520  const size_t LDA = getViewStride (d_view);
521  const size_t lclNumRows = map->getNodeNumElements ();
522  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
523  (LDA < lclNumRows, std::invalid_argument, "Map does not match "
524  "Kokkos::View. map->getNodeNumElements() = " << lclNumRows
525  << ", View's column stride = " << LDA
526  << ", and View's extent(0) = " << d_view.extent (0) << ".");
527 
528  auto h_view = Kokkos::create_mirror_view (d_view);
529  view_ = dual_view_type (d_view, h_view);
530 
531  using ::Tpetra::Details::Behavior;
532  const bool debug = Behavior::debug ();
533  if (debug) {
534  using ::Tpetra::Details::checkGlobalDualViewValidity;
535  std::ostringstream gblErrStrm;
536  const bool verbose = Behavior::verbose ();
537  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
538  const bool gblValid =
539  checkGlobalDualViewValidity (&gblErrStrm, view_, verbose,
540  comm.getRawPtr ());
541  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
542  (! gblValid, std::runtime_error, gblErrStrm.str ());
543  }
544  // The user gave us a device view. In order to respect its
545  // initial contents, we mark the DualView as "modified on device."
546  // That way, the next sync will synchronize it with the host view.
547  this->modify_device ();
548  origView_ = view_;
549  }
550 
551  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
553  MultiVector (const Teuchos::RCP<const map_type>& map,
554  const dual_view_type& view,
555  const dual_view_type& origView) :
556  base_type (map),
557  view_ (view),
558  origView_ (origView)
559  {
560  const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
561 
562  const size_t LDA = getDualViewStride (origView);
563  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
564  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
565  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
566  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
567  << ". This may also mean that the input origView's first dimension (number "
568  "of rows = " << origView.extent (0) << ") does not not match the number "
569  "of entries in the Map on the calling process.");
570 
571  using ::Tpetra::Details::Behavior;
572  const bool debug = Behavior::debug ();
573  if (debug) {
574  using ::Tpetra::Details::checkGlobalDualViewValidity;
575  std::ostringstream gblErrStrm;
576  const bool verbose = Behavior::verbose ();
577  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
578  const bool gblValid_0 =
579  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
580  comm.getRawPtr ());
581  const bool gblValid_1 =
582  checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
583  comm.getRawPtr ());
584  const bool gblValid = gblValid_0 && gblValid_1;
585  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
586  (! gblValid, std::runtime_error, gblErrStrm.str ());
587  }
588  }
589 
590 
591  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
593  MultiVector (const Teuchos::RCP<const map_type>& map,
594  const dual_view_type& view,
595  const Teuchos::ArrayView<const size_t>& whichVectors) :
596  base_type (map),
597  view_ (view),
598  origView_ (view),
599  whichVectors_ (whichVectors.begin (), whichVectors.end ())
600  {
601  using Kokkos::ALL;
602  using Kokkos::subview;
603  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
604 
605  using ::Tpetra::Details::Behavior;
606  const bool debug = Behavior::debug ();
607  if (debug) {
608  using ::Tpetra::Details::checkGlobalDualViewValidity;
609  std::ostringstream gblErrStrm;
610  const bool verbose = Behavior::verbose ();
611  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
612  const bool gblValid =
613  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
614  comm.getRawPtr ());
615  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
616  (! gblValid, std::runtime_error, gblErrStrm.str ());
617  }
618 
619  const size_t lclNumRows = map.is_null () ? size_t (0) :
620  map->getNodeNumElements ();
621  // Check dimensions of the input DualView. We accept that Kokkos
622  // might not allow construction of a 0 x m (Dual)View with m > 0,
623  // so we only require the number of rows to match if the
624  // (Dual)View has more than zero columns. Likewise, we only
625  // require the number of columns to match if the (Dual)View has
626  // more than zero rows.
627  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
628  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
629  std::invalid_argument, "view.extent(0) = " << view.extent (0)
630  << " < map->getNodeNumElements() = " << lclNumRows << ".");
631  if (whichVectors.size () != 0) {
632  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
633  view.extent (1) != 0 && view.extent (1) == 0,
634  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
635  " = " << whichVectors.size () << " > 0.");
636  size_t maxColInd = 0;
637  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
638  for (size_type k = 0; k < whichVectors.size (); ++k) {
639  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
640  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
641  std::invalid_argument, "whichVectors[" << k << "] = "
642  "Teuchos::OrdinalTraits<size_t>::invalid().");
643  maxColInd = std::max (maxColInd, whichVectors[k]);
644  }
645  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
646  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
647  std::invalid_argument, "view.extent(1) = " << view.extent (1)
648  << " <= max(whichVectors) = " << maxColInd << ".");
649  }
650 
651  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
652  // zero strides, so modify in that case.
653  const size_t LDA = getDualViewStride (view);
654  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
655  (LDA < lclNumRows, std::invalid_argument,
656  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
657 
658  if (whichVectors.size () == 1) {
659  // If whichVectors has only one entry, we don't need to bother
660  // with nonconstant stride. Just shift the view over so it
661  // points to the desired column.
662  //
663  // NOTE (mfh 10 May 2014) This is a special case where we set
664  // origView_ just to view that one column, not all of the
665  // original columns. This ensures that the use of origView_ in
666  // offsetView works correctly.
667  const std::pair<size_t, size_t> colRng (whichVectors[0],
668  whichVectors[0] + 1);
669  view_ = takeSubview (view_, ALL (), colRng);
670  origView_ = takeSubview (origView_, ALL (), colRng);
671  // whichVectors_.size() == 0 means "constant stride."
672  whichVectors_.clear ();
673  }
674  }
675 
676  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
678  MultiVector (const Teuchos::RCP<const map_type>& map,
679  const dual_view_type& view,
680  const dual_view_type& origView,
681  const Teuchos::ArrayView<const size_t>& whichVectors) :
682  base_type (map),
683  view_ (view),
684  origView_ (origView),
685  whichVectors_ (whichVectors.begin (), whichVectors.end ())
686  {
687  using Kokkos::ALL;
688  using Kokkos::subview;
689  const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
690 
691  using ::Tpetra::Details::Behavior;
692  const bool debug = Behavior::debug ();
693  if (debug) {
694  using ::Tpetra::Details::checkGlobalDualViewValidity;
695  std::ostringstream gblErrStrm;
696  const bool verbose = Behavior::verbose ();
697  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
698  const bool gblValid_0 =
699  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
700  comm.getRawPtr ());
701  const bool gblValid_1 =
702  checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
703  comm.getRawPtr ());
704  const bool gblValid = gblValid_0 && gblValid_1;
705  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
706  (! gblValid, std::runtime_error, gblErrStrm.str ());
707  }
708 
709  const size_t lclNumRows = this->getLocalLength ();
710  // Check dimensions of the input DualView. We accept that Kokkos
711  // might not allow construction of a 0 x m (Dual)View with m > 0,
712  // so we only require the number of rows to match if the
713  // (Dual)View has more than zero columns. Likewise, we only
714  // require the number of columns to match if the (Dual)View has
715  // more than zero rows.
716  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
717  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
718  std::invalid_argument, "view.extent(0) = " << view.extent (0)
719  << " < map->getNodeNumElements() = " << lclNumRows << ".");
720  if (whichVectors.size () != 0) {
721  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
722  view.extent (1) != 0 && view.extent (1) == 0,
723  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
724  " = " << whichVectors.size () << " > 0.");
725  size_t maxColInd = 0;
726  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
727  for (size_type k = 0; k < whichVectors.size (); ++k) {
728  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
729  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
730  std::invalid_argument, "whichVectors[" << k << "] = "
731  "Teuchos::OrdinalTraits<size_t>::invalid().");
732  maxColInd = std::max (maxColInd, whichVectors[k]);
733  }
734  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
735  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
736  std::invalid_argument, "view.extent(1) = " << view.extent (1)
737  << " <= max(whichVectors) = " << maxColInd << ".");
738  }
739 
740  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
741  // zero strides, so modify in that case.
742  const size_t LDA = getDualViewStride (origView);
743  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
744  (LDA < lclNumRows, std::invalid_argument, "Map and DualView origView "
745  "do not match. LDA = " << LDA << " < this->getLocalLength() = " <<
746  lclNumRows << ". origView.extent(0) = " << origView.extent(0)
747  << ", origView.stride(1) = " << origView.d_view.stride(1) << ".");
748 
749  if (whichVectors.size () == 1) {
750  // If whichVectors has only one entry, we don't need to bother
751  // with nonconstant stride. Just shift the view over so it
752  // points to the desired column.
753  //
754  // NOTE (mfh 10 May 2014) This is a special case where we set
755  // origView_ just to view that one column, not all of the
756  // original columns. This ensures that the use of origView_ in
757  // offsetView works correctly.
758  const std::pair<size_t, size_t> colRng (whichVectors[0],
759  whichVectors[0] + 1);
760  view_ = takeSubview (view_, ALL (), colRng);
761  origView_ = takeSubview (origView_, ALL (), colRng);
762  // whichVectors_.size() == 0 means "constant stride."
763  whichVectors_.clear ();
764  }
765  }
766 
767  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
769  MultiVector (const Teuchos::RCP<const map_type>& map,
770  const Teuchos::ArrayView<const Scalar>& data,
771  const size_t LDA,
772  const size_t numVecs) :
773  base_type (map)
774  {
775  typedef LocalOrdinal LO;
776  typedef GlobalOrdinal GO;
777  const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
778  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
779 
780  // Deep copy constructor, constant stride (NO whichVectors_).
781  // There is no need for a deep copy constructor with nonconstant stride.
782 
783  const size_t lclNumRows =
784  map.is_null () ? size_t (0) : map->getNodeNumElements ();
785  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
786  (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
787  "map->getNodeNumElements() = " << lclNumRows << ".");
788  if (numVecs != 0) {
789  const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
790  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
791  (static_cast<size_t> (data.size ()) < minNumEntries,
792  std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
793  "entries, given the input Map and number of vectors in the MultiVector."
794  " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
795  "map->getNodeNumElements () = " << minNumEntries << ".");
796  }
797 
798  this->view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
799  this->modify_device ();
800  auto X_out = this->getLocalViewDevice ();
801  origView_ = view_;
802 
803  // Make an unmanaged host Kokkos::View of the input data. First
804  // create a View (X_in_orig) with the original stride. Then,
805  // create a subview (X_in) with the right number of columns.
806  const impl_scalar_type* const X_in_raw =
807  reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
808  Kokkos::View<const impl_scalar_type**,
809  Kokkos::LayoutLeft,
810  Kokkos::HostSpace,
811  Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
812  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
813  auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
814 
815  // If LDA != X_out's column stride, then we need to copy one
816  // column at a time; Kokkos::deep_copy refuses to work in that
817  // case.
818  const size_t outStride =
819  X_out.extent (1) == 0 ? size_t (1) : X_out.stride (1);
820  if (LDA == outStride) { // strides are the same; deep_copy once
821  // This only works because MultiVector uses LayoutLeft.
822  // We would need a custom copy functor otherwise.
823  Kokkos::deep_copy (X_out, X_in);
824  }
825  else { // strides differ; copy one column at a time
826  typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
827  out_col_view_type;
828  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
829  in_col_view_type;
830  for (size_t j = 0; j < numVecs; ++j) {
831  out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
832  in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
833  Kokkos::deep_copy (X_out_j, X_in_j);
834  }
835  }
836  }
837 
838  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
840  MultiVector (const Teuchos::RCP<const map_type>& map,
841  const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
842  const size_t numVecs) :
843  base_type (map)
844  {
845  typedef impl_scalar_type IST;
846  typedef LocalOrdinal LO;
847  typedef GlobalOrdinal GO;
848  const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
849  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
850 
851  const size_t lclNumRows =
852  map.is_null () ? size_t (0) : map->getNodeNumElements ();
853  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
854  (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
855  std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
856  "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
857  for (size_t j = 0; j < numVecs; ++j) {
858  Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
859  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
860  static_cast<size_t> (X_j_av.size ()) < lclNumRows,
861  std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
862  << X_j_av.size () << " < map->getNodeNumElements() = " << lclNumRows
863  << ".");
864  }
865 
866  view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
867  this->modify_device ();
868  auto X_out = this->getLocalViewDevice ();
869 
870  // Make sure that the type of a single input column has the same
871  // array layout as each output column does, so we can deep_copy.
872  using array_layout = typename decltype (X_out)::array_layout;
873  using input_col_view_type = typename Kokkos::View<const IST*,
874  array_layout,
875  Kokkos::HostSpace,
876  Kokkos::MemoryUnmanaged>;
877 
878  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
879  for (size_t j = 0; j < numVecs; ++j) {
880  Teuchos::ArrayView<const IST> X_j_av =
881  Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
882  input_col_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
883  auto X_j_out = Kokkos::subview (X_out, rowRng, j);
884  Kokkos::deep_copy (X_j_out, X_j_in);
885  }
886  origView_ = view_;
887  }
888 
889  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
892  return whichVectors_.empty ();
893  }
894 
895  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
896  size_t
899  {
900  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
901  return static_cast<size_t> (0);
902  } else {
903  return this->getMap ()->getNodeNumElements ();
904  }
905  }
906 
907  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
911  {
912  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
913  return static_cast<size_t> (0);
914  } else {
915  return this->getMap ()->getGlobalNumElements ();
916  }
917  }
918 
919  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
920  size_t
922  getStride () const
923  {
924  return isConstantStride () ? getDualViewStride (origView_) : size_t (0);
925  }
926 
927  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
928  bool
930  checkSizes (const SrcDistObject& sourceObj)
931  {
932  // Check whether the source object is a MultiVector. If not, then
933  // we can't even compare sizes, so it's definitely not OK to
934  // Import or Export from it.
936  const this_type* src = dynamic_cast<const this_type*> (&sourceObj);
937  if (src == nullptr) {
938  return false;
939  }
940  else {
941  // The target of the Import or Export calls checkSizes() in
942  // DistObject::doTransfer(). By that point, we've already
943  // constructed an Import or Export object using the two
944  // multivectors' Maps, which means that (hopefully) we've
945  // already checked other attributes of the multivectors. Thus,
946  // all we need to do here is check the number of columns.
947  return src->getNumVectors () == this->getNumVectors ();
948  }
949  }
950 
951  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
952  size_t
955  return this->getNumVectors ();
956  }
957 
958  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
959  void
962  (const SrcDistObject& sourceObj,
963  const size_t numSameIDs,
964  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
965  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs)
966  {
967  using ::Tpetra::Details::Behavior;
969  using ::Tpetra::Details::ProfilingRegion;
970  using std::endl;
971  using KokkosRefactor::Details::permute_array_multi_column;
972  using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
973  using Kokkos::Compat::create_const_view;
975  const char tfecfFuncName[] = "copyAndPermute: ";
976  ProfilingRegion regionCAP ("Tpetra::MultiVector::copyAndPermute");
977 
978  const bool verbose = Behavior::verbose ();
979  std::unique_ptr<std::string> prefix;
980  if (verbose) {
981  auto map = this->getMap ();
982  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
983  const int myRank = comm.is_null () ? -1 : comm->getRank ();
984  std::ostringstream os;
985  os << "Proc " << myRank << ": MV::copyAndPermute: ";
986  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
987  os << "Start" << endl;
988  std::cerr << os.str ();
989  }
990 
991  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
992  (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0),
993  std::logic_error, "permuteToLIDs.extent(0) = "
994  << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = "
995  << permuteFromLIDs.extent (0) << ".");
996 
997  // We've already called checkSizes(), so this cast must succeed.
998  MV& sourceMV = const_cast<MV &>(dynamic_cast<const MV&> (sourceObj));
999  const size_t numCols = this->getNumVectors ();
1000 
1001  // sourceMV doesn't belong to us, so we can't sync it. Do the
1002  // copying where it's currently sync'd.
1003  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1004  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1005  std::logic_error, "Input MultiVector needs sync to both host "
1006  "and device.");
1007  const bool copyOnHost = runKernelOnHost(sourceMV);
1008  if (verbose) {
1009  std::ostringstream os;
1010  os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1011  std::cerr << os.str ();
1012  }
1013 
1014  if (copyOnHost) {
1015  sourceMV.sync_host();
1016  this->sync_host ();
1017  this->modify_host ();
1018  }
1019  else {
1020  sourceMV.sync_device();
1021  if (this->need_sync_device ()) {
1022  this->sync_device ();
1023  }
1024  this->modify_device ();
1025  }
1026 
1027  if (verbose) {
1028  std::ostringstream os;
1029  os << *prefix << "Copy" << endl;
1030  std::cerr << os.str ();
1031  }
1032 
1033  // TODO (mfh 15 Sep 2013) When we replace
1034  // KokkosClassic::MultiVector with a Kokkos::View, there are two
1035  // ways to copy the data:
1036  //
1037  // 1. Get a (sub)view of each column and call deep_copy on that.
1038  // 2. Write a custom kernel to copy the data.
1039  //
1040  // The first is easier, but the second might be more performant in
1041  // case we decide to use layouts other than LayoutLeft. It might
1042  // even make sense to hide whichVectors_ in an entirely new layout
1043  // for Kokkos Views.
1044 
1045  // Copy rows [0, numSameIDs-1] of the local multivectors.
1046  //
1047  // Note (ETP 2 Jul 2014) We need to always copy one column at a
1048  // time, even when both multivectors are constant-stride, since
1049  // deep_copy between strided subviews with more than one column
1050  // doesn't currently work.
1051 
1052  // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1053  // both source and target are constant stride and have multiple
1054  // columns.
1055  if (numSameIDs > 0) {
1056  const std::pair<size_t, size_t> rows (0, numSameIDs);
1057  if (copyOnHost) {
1058  auto tgt_h = this->getLocalViewHost ();
1059  auto src_h = create_const_view (sourceMV.getLocalViewHost ());
1060 
1061  for (size_t j = 0; j < numCols; ++j) {
1062  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1063  const size_t srcCol =
1064  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1065 
1066  auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
1067  auto src_j = Kokkos::subview (src_h, rows, srcCol);
1068  Kokkos::deep_copy (tgt_j, src_j); // Copy src_j into tgt_j
1069  }
1070  }
1071  else { // copy on device
1072  auto tgt_d = this->getLocalViewDevice ();
1073  auto src_d = create_const_view (sourceMV.getLocalViewDevice ());
1074 
1075  for (size_t j = 0; j < numCols; ++j) {
1076  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1077  const size_t srcCol =
1078  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1079 
1080  auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
1081  auto src_j = Kokkos::subview (src_d, rows, srcCol);
1082  Kokkos::deep_copy (tgt_j, src_j); // Copy src_j into tgt_j
1083  }
1084  }
1085  }
1086 
1087  // For the remaining GIDs, execute the permutations. This may
1088  // involve noncontiguous access of both source and destination
1089  // vectors, depending on the LID lists.
1090  //
1091  // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1092  // the same process, this merges their values by replacement of
1093  // the last encountered GID, not by the specified merge rule
1094  // (such as ADD).
1095 
1096  // If there are no permutations, we are done
1097  if (permuteFromLIDs.extent (0) == 0 ||
1098  permuteToLIDs.extent (0) == 0) {
1099  if (verbose) {
1100  std::ostringstream os;
1101  os << *prefix << "No permutations. Done!" << endl;
1102  std::cerr << os.str ();
1103  }
1104  return;
1105  }
1106 
1107  if (verbose) {
1108  std::ostringstream os;
1109  os << *prefix << "Permute" << endl;
1110  std::cerr << os.str ();
1111  }
1112 
1113  // We could in theory optimize for the case where exactly one of
1114  // them is constant stride, but we don't currently do that.
1115  const bool nonConstStride =
1116  ! this->isConstantStride () || ! sourceMV.isConstantStride ();
1117 
1118  if (verbose) {
1119  std::ostringstream os;
1120  os << *prefix << "nonConstStride="
1121  << (nonConstStride ? "true" : "false") << endl;
1122  std::cerr << os.str ();
1123  }
1124 
1125  // We only need the "which vectors" arrays if either the source or
1126  // target MV is not constant stride. Since we only have one
1127  // kernel that must do double-duty, we have to create a "which
1128  // vectors" array for the MV that _is_ constant stride.
1129  Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1130  Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1131  if (nonConstStride) {
1132  if (this->whichVectors_.size () == 0) {
1133  Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
1134  tmpTgt.modify_host ();
1135  for (size_t j = 0; j < numCols; ++j) {
1136  tmpTgt.h_view(j) = j;
1137  }
1138  if (! copyOnHost) {
1139  tmpTgt.sync_device ();
1140  }
1141  tgtWhichVecs = tmpTgt;
1142  }
1143  else {
1144  Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
1145  tgtWhichVecs =
1146  getDualViewCopyFromArrayView<size_t, device_type> (tgtWhichVecsT,
1147  "tgtWhichVecs",
1148  copyOnHost);
1149  }
1150  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1151  (static_cast<size_t> (tgtWhichVecs.extent (0)) !=
1152  this->getNumVectors (),
1153  std::logic_error, "tgtWhichVecs.extent(0) = " <<
1154  tgtWhichVecs.extent (0) << " != this->getNumVectors() = " <<
1155  this->getNumVectors () << ".");
1156 
1157  if (sourceMV.whichVectors_.size () == 0) {
1158  Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
1159  tmpSrc.modify_host ();
1160  for (size_t j = 0; j < numCols; ++j) {
1161  tmpSrc.h_view(j) = j;
1162  }
1163  if (! copyOnHost) {
1164  tmpSrc.sync_device ();
1165  }
1166  srcWhichVecs = tmpSrc;
1167  }
1168  else {
1169  Teuchos::ArrayView<const size_t> srcWhichVecsT =
1170  sourceMV.whichVectors_ ();
1171  srcWhichVecs =
1172  getDualViewCopyFromArrayView<size_t, device_type> (srcWhichVecsT,
1173  "srcWhichVecs",
1174  copyOnHost);
1175  }
1176  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1177  (static_cast<size_t> (srcWhichVecs.extent (0)) !=
1178  sourceMV.getNumVectors (), std::logic_error,
1179  "srcWhichVecs.extent(0) = " << srcWhichVecs.extent (0)
1180  << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
1181  << ".");
1182  }
1183 
1184  if (copyOnHost) { // permute on host too
1185  if (verbose) {
1186  std::ostringstream os;
1187  os << *prefix << "Get permute LIDs on host" << std::endl;
1188  std::cerr << os.str ();
1189  }
1190  auto tgt_h = this->getLocalViewHost ();
1191  auto src_h = create_const_view (sourceMV.getLocalViewHost ());
1192 
1193  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
1194  auto permuteToLIDs_h = create_const_view (permuteToLIDs.view_host ());
1195  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
1196  auto permuteFromLIDs_h =
1197  create_const_view (permuteFromLIDs.view_host ());
1198 
1199  if (verbose) {
1200  std::ostringstream os;
1201  os << *prefix << "Permute on host" << endl;
1202  std::cerr << os.str ();
1203  }
1204  if (nonConstStride) {
1205  // No need to sync first, because copyOnHost argument to
1206  // getDualViewCopyFromArrayView puts them in the right place.
1207  auto tgtWhichVecs_h =
1208  create_const_view (tgtWhichVecs.view_host ());
1209  auto srcWhichVecs_h =
1210  create_const_view (srcWhichVecs.view_host ());
1211  permute_array_multi_column_variable_stride (tgt_h, src_h,
1212  permuteToLIDs_h,
1213  permuteFromLIDs_h,
1214  tgtWhichVecs_h,
1215  srcWhichVecs_h, numCols);
1216  }
1217  else {
1218  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1219  permuteFromLIDs_h, numCols);
1220  }
1221  }
1222  else { // permute on device
1223  if (verbose) {
1224  std::ostringstream os;
1225  os << *prefix << "Get permute LIDs on device" << endl;
1226  std::cerr << os.str ();
1227  }
1228  auto tgt_d = this->getLocalViewDevice ();
1229  auto src_d = create_const_view (sourceMV.getLocalViewDevice ());
1230 
1231  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () );
1232  auto permuteToLIDs_d = create_const_view (permuteToLIDs.view_device ());
1233  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () );
1234  auto permuteFromLIDs_d =
1235  create_const_view (permuteFromLIDs.view_device ());
1236 
1237  if (verbose) {
1238  std::ostringstream os;
1239  os << *prefix << "Permute on device" << endl;
1240  std::cerr << os.str ();
1241  }
1242  if (nonConstStride) {
1243  // No need to sync first, because copyOnHost argument to
1244  // getDualViewCopyFromArrayView puts them in the right place.
1245  auto tgtWhichVecs_d = create_const_view (tgtWhichVecs.view_device ());
1246  auto srcWhichVecs_d = create_const_view (srcWhichVecs.view_device ());
1247  permute_array_multi_column_variable_stride (tgt_d, src_d,
1248  permuteToLIDs_d,
1249  permuteFromLIDs_d,
1250  tgtWhichVecs_d,
1251  srcWhichVecs_d, numCols);
1252  }
1253  else {
1254  permute_array_multi_column (tgt_d, src_d, permuteToLIDs_d,
1255  permuteFromLIDs_d, numCols);
1256  }
1257  }
1258 
1259  if (verbose) {
1260  std::ostringstream os;
1261  os << *prefix << "Done!" << endl;
1262  std::cerr << os.str ();
1263  }
1264  }
1265 
1266  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1267  void
1268  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1269  packAndPrepare
1270  (const SrcDistObject& sourceObj,
1271  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1272  Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1273  Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1274  size_t& constantNumPackets,
1275  Distributor & /* distor */ )
1276  {
1277  using ::Tpetra::Details::Behavior;
1278  using ::Tpetra::Details::ProfilingRegion;
1280  using Kokkos::Compat::create_const_view;
1281  using Kokkos::Compat::getKokkosViewDeepCopy;
1282  using std::endl;
1283  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
1284  const char tfecfFuncName[] = "packAndPrepare: ";
1285  ProfilingRegion regionPAP ("Tpetra::MultiVector::packAndPrepare");
1286 
1287  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1288  // have the option to check indices. We do so when Tpetra is in
1289  // debug mode. It is in debug mode by default in a debug build,
1290  // but you may control this at run time, before launching the
1291  // executable, by setting the TPETRA_DEBUG environment variable to
1292  // "1" (or "TRUE").
1293  const bool debugCheckIndices = Behavior::debug ();
1294  // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1295  // environment variable to "1" (or "TRUE") for copious debug
1296  // output to std::cerr on every MPI process. This is unwise for
1297  // runs with large numbers of MPI processes.
1298  const bool printDebugOutput = Behavior::verbose ();
1299 
1300  std::unique_ptr<std::string> prefix;
1301  if (printDebugOutput) {
1302  auto map = this->getMap ();
1303  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1304  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1305  std::ostringstream os;
1306  os << "Proc " << myRank << ": MV::packAndPrepare: ";
1307  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1308  os << "Start" << endl;
1309  std::cerr << os.str ();
1310  }
1311 
1312  // We've already called checkSizes(), so this cast must succeed.
1313  MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&> (sourceObj));
1314 
1315  const size_t numCols = sourceMV.getNumVectors ();
1316 
1317  // This spares us from needing to fill numExportPacketsPerLID.
1318  // Setting constantNumPackets to a nonzero value signals that
1319  // all packets have the same number of entries.
1320  constantNumPackets = numCols;
1321 
1322  // If we have no exports, there is nothing to do. Make sure this
1323  // goes _after_ setting constantNumPackets correctly.
1324  if (exportLIDs.extent (0) == 0) {
1325  if (printDebugOutput) {
1326  std::ostringstream os;
1327  os << *prefix << "No exports on this proc, DONE" << std::endl;
1328  std::cerr << os.str ();
1329  }
1330  return;
1331  }
1332 
1333  /* The layout in the export for MultiVectors is as follows:
1334  exports = { all of the data from row exportLIDs.front() ;
1335  ....
1336  all of the data from row exportLIDs.back() }
1337  This doesn't have the best locality, but is necessary because
1338  the data for a Packet (all data associated with an LID) is
1339  required to be contiguous. */
1340 
1341  // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1342  // packing scheme in the above comment? The data going to a
1343  // particular process must be contiguous, of course, but those
1344  // data could include entries from multiple LIDs. DistObject just
1345  // needs to know how to index into that data. Kokkos is good at
1346  // decoupling storage intent from data layout choice.
1347 
1348  const size_t numExportLIDs = exportLIDs.extent (0);
1349  const size_t newExportsSize = numCols * numExportLIDs;
1350  if (printDebugOutput) {
1351  std::ostringstream os;
1352  os << *prefix << "realloc: "
1353  << "numExportLIDs: " << numExportLIDs
1354  << ", exports.extent(0): " << exports.extent (0)
1355  << ", newExportsSize: " << newExportsSize << std::endl;
1356  std::cerr << os.str ();
1357  }
1358  reallocDualViewIfNeeded (exports, newExportsSize, "exports");
1359 
1360  // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1361  // sync it. Pack it where it's currently sync'd.
1362  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1363  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1364  std::logic_error, "Input MultiVector needs sync to both host "
1365  "and device.");
1366  const bool packOnHost = runKernelOnHost(sourceMV);
1367  auto src_dev = sourceMV.getLocalViewHost ();
1368  auto src_host = sourceMV.getLocalViewDevice ();
1369  if (printDebugOutput) {
1370  std::ostringstream os;
1371  os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1372  std::cerr << os.str ();
1373  }
1374 
1375  // Mark 'exports' here, since we might have resized it above.
1376  // Resizing currently requires calling the constructor, which
1377  // clears out the 'modified' flags.
1378  if (packOnHost) {
1379  // nde 06 Feb 2020: If 'exports' does not require resize
1380  // when reallocDualViewIfNeeded is called, the modified flags
1381  // are not cleared out. This can result in host and device views
1382  // being out-of-sync, resuling in an error in exports.modify_* calls.
1383  // Clearing the sync flags prevents this possible case.
1384  exports.clear_sync_state ();
1385  exports.modify_host ();
1386  sourceMV.sync_host();
1387  }
1388  else {
1389  // nde 06 Feb 2020: If 'exports' does not require resize
1390  // when reallocDualViewIfNeeded is called, the modified flags
1391  // are not cleared out. This can result in host and device views
1392  // being out-of-sync, resuling in an error in exports.modify_* calls.
1393  // Clearing the sync flags prevents this possible case.
1394  exports.clear_sync_state ();
1395  exports.modify_device ();
1396  sourceMV.sync_device();
1397  }
1398 
1399  if (numCols == 1) { // special case for one column only
1400  // MultiVector always represents a single column with constant
1401  // stride, but it doesn't hurt to implement both cases anyway.
1402  //
1403  // ETP: I'm not sure I agree with the above statement. Can't a single-
1404  // column multivector be a subview of another multi-vector, in which case
1405  // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1406  // separately.
1407  //
1408  // mfh 18 Jan 2016: In answer to ETP's comment above:
1409  // MultiVector treats single-column MultiVectors created using a
1410  // "nonconstant stride constructor" as a special case, and makes
1411  // them constant stride (by making whichVectors_ have length 0).
1412  if (sourceMV.isConstantStride ()) {
1413  using KokkosRefactor::Details::pack_array_single_column;
1414  if (printDebugOutput) {
1415  std::ostringstream os;
1416  os << *prefix << "Pack numCols=1 const stride" << endl;
1417  std::cerr << os.str ();
1418  }
1419  if (packOnHost) {
1420  pack_array_single_column (exports.view_host (),
1421  create_const_view (src_host),
1422  exportLIDs.view_host (),
1423  0,
1424  debugCheckIndices);
1425  }
1426  else { // pack on device
1427  pack_array_single_column (exports.view_device (),
1428  create_const_view (src_dev),
1429  exportLIDs.view_device (),
1430  0,
1431  debugCheckIndices);
1432  }
1433  }
1434  else {
1435  using KokkosRefactor::Details::pack_array_single_column;
1436  if (printDebugOutput) {
1437  std::ostringstream os;
1438  os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1439  std::cerr << os.str ();
1440  }
1441  if (packOnHost) {
1442  pack_array_single_column (exports.view_host (),
1443  create_const_view (src_host),
1444  exportLIDs.view_host (),
1445  sourceMV.whichVectors_[0],
1446  debugCheckIndices);
1447  }
1448  else { // pack on device
1449  pack_array_single_column (exports.view_device (),
1450  create_const_view (src_dev),
1451  exportLIDs.view_device (),
1452  sourceMV.whichVectors_[0],
1453  debugCheckIndices);
1454  }
1455  }
1456  }
1457  else { // the source MultiVector has multiple columns
1458  if (sourceMV.isConstantStride ()) {
1459  using KokkosRefactor::Details::pack_array_multi_column;
1460  if (printDebugOutput) {
1461  std::ostringstream os;
1462  os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1463  std::cerr << os.str ();
1464  }
1465  if (packOnHost) {
1466  pack_array_multi_column (exports.view_host (),
1467  create_const_view (src_host),
1468  exportLIDs.view_host (),
1469  numCols,
1470  debugCheckIndices);
1471  }
1472  else { // pack on device
1473  pack_array_multi_column (exports.view_device (),
1474  create_const_view (src_dev),
1475  exportLIDs.view_device (),
1476  numCols,
1477  debugCheckIndices);
1478  }
1479  }
1480  else {
1481  using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1482  if (printDebugOutput) {
1483  std::ostringstream os;
1484  os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1485  << endl;
1486  std::cerr << os.str ();
1487  }
1488  // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1489  // whichVectors_ can be expensive, but pack and unpack for
1490  // nonconstant-stride MultiVectors is slower anyway.
1491  using IST = impl_scalar_type;
1492  using DV = Kokkos::DualView<IST*, device_type>;
1493  using HES = typename DV::t_host::execution_space;
1494  using DES = typename DV::t_dev::execution_space;
1495  Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_ ();
1496  if (packOnHost) {
1497  pack_array_multi_column_variable_stride
1498  (exports.view_host (),
1499  create_const_view (src_host),
1500  exportLIDs.view_host (),
1501  getKokkosViewDeepCopy<HES> (whichVecs),
1502  numCols,
1503  debugCheckIndices);
1504  }
1505  else { // pack on device
1506  pack_array_multi_column_variable_stride
1507  (exports.view_device (),
1508  create_const_view (src_dev),
1509  exportLIDs.view_device (),
1510  getKokkosViewDeepCopy<DES> (whichVecs),
1511  numCols,
1512  debugCheckIndices);
1513  }
1514  }
1515  }
1516 
1517  if (printDebugOutput) {
1518  std::ostringstream os;
1519  os << *prefix << "Done!" << endl;
1520  std::cerr << os.str ();
1521  }
1522  }
1523 
1524 
1525  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1526  void
1527  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1528  unpackAndCombine
1529  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1530  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1531  Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1532  const size_t constantNumPackets,
1533  Distributor& /* distor */,
1534  const CombineMode CM)
1535  {
1536  using ::Tpetra::Details::Behavior;
1537  using ::Tpetra::Details::ProfilingRegion;
1538  using KokkosRefactor::Details::unpack_array_multi_column;
1539  using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1540  using Kokkos::Compat::getKokkosViewDeepCopy;
1541  using std::endl;
1542  using IST = impl_scalar_type;
1543  const char longFuncName[] = "Tpetra::MultiVector::unpackAndCombine";
1544  const char tfecfFuncName[] = "unpackAndCombine: ";
1545  ProfilingRegion regionUAC (longFuncName);
1546 
1547  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1548  // have the option to check indices. We do so when Tpetra is in
1549  // debug mode. It is in debug mode by default in a debug build,
1550  // but you may control this at run time, before launching the
1551  // executable, by setting the TPETRA_DEBUG environment variable to
1552  // "1" (or "TRUE").
1553  const bool debugCheckIndices = Behavior::debug ();
1554 
1555  const bool printDebugOutput = Behavior::verbose ();
1556  std::unique_ptr<std::string> prefix;
1557  if (printDebugOutput) {
1558  auto map = this->getMap ();
1559  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1560  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1561  std::ostringstream os;
1562  os << "Proc " << myRank << ": " << longFuncName << ": ";
1563  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1564  os << "Start" << endl;
1565  std::cerr << os.str ();
1566  }
1567 
1568  // If we have no imports, there is nothing to do
1569  if (importLIDs.extent (0) == 0) {
1570  if (printDebugOutput) {
1571  std::ostringstream os;
1572  os << *prefix << "No imports. Done!" << endl;
1573  }
1574  return;
1575  }
1576 
1577  const size_t numVecs = getNumVectors ();
1578  if (debugCheckIndices) {
1579  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1580  (static_cast<size_t> (imports.extent (0)) !=
1581  numVecs * importLIDs.extent (0),
1582  std::runtime_error,
1583  "imports.extent(0) = " << imports.extent (0)
1584  << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1585  << " * " << importLIDs.extent (0) << " = "
1586  << numVecs * importLIDs.extent (0) << ".");
1587 
1588  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1589  (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1590  "constantNumPackets input argument must be nonzero.");
1591 
1592  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1593  (static_cast<size_t> (numVecs) !=
1594  static_cast<size_t> (constantNumPackets),
1595  std::runtime_error, "constantNumPackets must equal numVecs.");
1596  }
1597 
1598  // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1599  // the memory space in which the imports buffer was last modified.
1600  // DistObject::doTransferNew gets to decide this.
1601  const bool unpackOnHost = runKernelOnHost(imports);
1602 
1603  if (printDebugOutput) {
1604  std::ostringstream os;
1605  os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1606  << endl;
1607  std::cerr << os.str ();
1608  }
1609 
1610  // We have to sync before modifying, because this method may read
1611  // as well as write (depending on the CombineMode).
1612  if (unpackOnHost) {
1613  imports.sync_host();
1614  this->sync_host ();
1615  this->modify_host ();
1616  }
1617  else {
1618  imports.sync_device();
1619  this->sync_device ();
1620  this->modify_device ();
1621  }
1622  auto X_d = this->getLocalViewDevice ();
1623  auto X_h = this->getLocalViewHost ();
1624  auto imports_d = imports.view_device ();
1625  auto imports_h = imports.view_host ();
1626  auto importLIDs_d = importLIDs.view_device ();
1627  auto importLIDs_h = importLIDs.view_host ();
1628 
1629  Kokkos::DualView<size_t*, device_type> whichVecs;
1630  if (! isConstantStride ()) {
1631  Kokkos::View<const size_t*, Kokkos::HostSpace,
1632  Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
1633  numVecs);
1634  whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
1635  if (unpackOnHost) {
1636  whichVecs.modify_host ();
1637  Kokkos::deep_copy (whichVecs.view_host (), whichVecsIn);
1638  }
1639  else {
1640  whichVecs.modify_device ();
1641  Kokkos::deep_copy (whichVecs.view_device (), whichVecsIn);
1642  }
1643  }
1644  auto whichVecs_d = whichVecs.view_device ();
1645  auto whichVecs_h = whichVecs.view_host ();
1646 
1647  /* The layout in the export for MultiVectors is as follows:
1648  imports = { all of the data from row exportLIDs.front() ;
1649  ....
1650  all of the data from row exportLIDs.back() }
1651  This doesn't have the best locality, but is necessary because
1652  the data for a Packet (all data associated with an LID) is
1653  required to be contiguous. */
1654 
1655  if (numVecs > 0 && importLIDs.extent (0) > 0) {
1656  using dev_exec_space = typename dual_view_type::t_dev::execution_space;
1657  using host_exec_space = typename dual_view_type::t_host::execution_space;
1658 
1659  // This fixes GitHub Issue #4418.
1660  const bool use_atomic_updates = unpackOnHost ?
1661  host_exec_space::concurrency () != 1 :
1662  dev_exec_space::concurrency () != 1;
1663 
1664  if (printDebugOutput) {
1665  std::ostringstream os;
1666  os << *prefix << "Unpack: " << combineModeToString (CM) << endl;
1667  std::cerr << os.str ();
1668  }
1669 
1670  // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
1671  // custom combine modes, start editing here.
1672 
1673  if (CM == INSERT || CM == REPLACE) {
1674  using op_type = KokkosRefactor::Details::InsertOp<IST>;
1675  if (isConstantStride ()) {
1676  if (unpackOnHost) {
1677  unpack_array_multi_column (host_exec_space (),
1678  X_h, imports_h, importLIDs_h,
1679  op_type (), numVecs,
1680  use_atomic_updates,
1681  debugCheckIndices);
1682 
1683  }
1684  else { // unpack on device
1685  unpack_array_multi_column (dev_exec_space (),
1686  X_d, imports_d, importLIDs_d,
1687  op_type (), numVecs,
1688  use_atomic_updates,
1689  debugCheckIndices);
1690  }
1691  }
1692  else { // not constant stride
1693  if (unpackOnHost) {
1694  unpack_array_multi_column_variable_stride (host_exec_space (),
1695  X_h, imports_h,
1696  importLIDs_h,
1697  whichVecs_h,
1698  op_type (),
1699  numVecs,
1700  use_atomic_updates,
1701  debugCheckIndices);
1702  }
1703  else { // unpack on device
1704  unpack_array_multi_column_variable_stride (dev_exec_space (),
1705  X_d, imports_d,
1706  importLIDs_d,
1707  whichVecs_d,
1708  op_type (),
1709  numVecs,
1710  use_atomic_updates,
1711  debugCheckIndices);
1712  }
1713  }
1714  }
1715  else if (CM == ADD) {
1716  using op_type = KokkosRefactor::Details::AddOp<IST>;
1717  if (isConstantStride ()) {
1718  if (unpackOnHost) {
1719  unpack_array_multi_column (host_exec_space (),
1720  X_h, imports_h, importLIDs_h,
1721  op_type (), numVecs,
1722  use_atomic_updates,
1723  debugCheckIndices);
1724  }
1725  else { // unpack on device
1726  unpack_array_multi_column (dev_exec_space (),
1727  X_d, imports_d, importLIDs_d,
1728  op_type (), numVecs,
1729  use_atomic_updates,
1730  debugCheckIndices);
1731  }
1732  }
1733  else { // not constant stride
1734  if (unpackOnHost) {
1735  unpack_array_multi_column_variable_stride (host_exec_space (),
1736  X_h, imports_h,
1737  importLIDs_h,
1738  whichVecs_h,
1739  op_type (),
1740  numVecs,
1741  use_atomic_updates,
1742  debugCheckIndices);
1743  }
1744  else { // unpack on device
1745  unpack_array_multi_column_variable_stride (dev_exec_space (),
1746  X_d, imports_d,
1747  importLIDs_d,
1748  whichVecs_d,
1749  op_type (),
1750  numVecs,
1751  use_atomic_updates,
1752  debugCheckIndices);
1753  }
1754  }
1755  }
1756  else if (CM == ABSMAX) {
1757  using op_type = KokkosRefactor::Details::AbsMaxOp<IST>;
1758  if (isConstantStride ()) {
1759  if (unpackOnHost) {
1760  unpack_array_multi_column (host_exec_space (),
1761  X_h, imports_h, importLIDs_h,
1762  op_type (), numVecs,
1763  use_atomic_updates,
1764  debugCheckIndices);
1765  }
1766  else { // unpack on device
1767  unpack_array_multi_column (dev_exec_space (),
1768  X_d, imports_d, importLIDs_d,
1769  op_type (), numVecs,
1770  use_atomic_updates,
1771  debugCheckIndices);
1772  }
1773  }
1774  else {
1775  if (unpackOnHost) {
1776  unpack_array_multi_column_variable_stride (host_exec_space (),
1777  X_h, imports_h,
1778  importLIDs_h,
1779  whichVecs_h,
1780  op_type (),
1781  numVecs,
1782  use_atomic_updates,
1783  debugCheckIndices);
1784  }
1785  else { // unpack on device
1786  unpack_array_multi_column_variable_stride (dev_exec_space (),
1787  X_d, imports_d,
1788  importLIDs_d,
1789  whichVecs_d,
1790  op_type (),
1791  numVecs,
1792  use_atomic_updates,
1793  debugCheckIndices);
1794  }
1795  }
1796  }
1797  else {
1798  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1799  (true, std::logic_error, "Invalid CombineMode");
1800  }
1801  }
1802  else {
1803  if (printDebugOutput) {
1804  std::ostringstream os;
1805  os << *prefix << "Nothing to unpack" << endl;
1806  std::cerr << os.str ();
1807  }
1808  }
1809 
1810  if (printDebugOutput) {
1811  std::ostringstream os;
1812  os << *prefix << "Done!" << endl;
1813  std::cerr << os.str ();
1814  }
1815  }
1816 
1817  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1818  size_t
1821  {
1822  if (isConstantStride ()) {
1823  return static_cast<size_t> (view_.extent (1));
1824  } else {
1825  return static_cast<size_t> (whichVectors_.size ());
1826  }
1827  }
1828 
1829  namespace { // (anonymous)
1830 
1831  template<class RV>
1832  void
1833  gblDotImpl (const RV& dotsOut,
1834  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
1835  const bool distributed)
1836  {
1837  using Teuchos::REDUCE_MAX;
1838  using Teuchos::REDUCE_SUM;
1839  using Teuchos::reduceAll;
1840  typedef typename RV::non_const_value_type dot_type;
1841 
1842  const size_t numVecs = dotsOut.extent (0);
1843 
1844  // If the MultiVector is distributed over multiple processes, do
1845  // the distributed (interprocess) part of the dot product. We
1846  // assume that the MPI implementation can read from and write to
1847  // device memory.
1848  //
1849  // replaceMap() may have removed some processes. Those
1850  // processes have a null Map. They must not participate in any
1851  // collective operations. We ask first whether the Map is null,
1852  // because isDistributed() defers that question to the Map. We
1853  // still compute and return local dots for processes not
1854  // participating in collective operations; those probably don't
1855  // make any sense, but it doesn't hurt to do them, since it's
1856  // illegal to call dot() on those processes anyway.
1857  if (distributed && ! comm.is_null ()) {
1858  // The calling process only participates in the collective if
1859  // both the Map and its Comm on that process are nonnull.
1860  const int nv = static_cast<int> (numVecs);
1861  const bool commIsInterComm = ::Tpetra::Details::isInterComm (*comm);
1862 
1863  if (commIsInterComm) {
1864  // If comm is an intercomm, then we may not alias input and
1865  // output buffers, so we have to make a copy of the local
1866  // sum.
1867  typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
1868  Kokkos::deep_copy (lclDots, dotsOut);
1869  const dot_type* const lclSum = lclDots.data ();
1870  dot_type* const gblSum = dotsOut.data ();
1871  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
1872  }
1873  else {
1874  dot_type* const inout = dotsOut.data ();
1875  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, inout, inout);
1876  }
1877  }
1878  }
1879  } // namespace (anonymous)
1880 
1881  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1882  void
1885  const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const
1886  {
1887  using ::Tpetra::Details::Behavior;
1888  using Kokkos::subview;
1889  using Teuchos::Comm;
1890  using Teuchos::null;
1891  using Teuchos::RCP;
1892  // View of all the dot product results.
1893  typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
1895  typedef typename dual_view_type::t_dev XMV;
1896  const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
1897 
1898  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Kokkos::View)");
1899 
1900  const size_t numVecs = this->getNumVectors ();
1901  if (numVecs == 0) {
1902  return; // nothing to do
1903  }
1904  const size_t lclNumRows = this->getLocalLength ();
1905  const size_t numDots = static_cast<size_t> (dots.extent (0));
1906  const bool debug = Behavior::debug ();
1907 
1908  if (debug) {
1909  const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
1910  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1911  (! compat, std::invalid_argument, "'*this' MultiVector is not "
1912  "compatible with the input MultiVector A. We only test for this "
1913  "in debug mode.");
1914  }
1915 
1916  // FIXME (mfh 11 Jul 2014) These exception tests may not
1917  // necessarily be thrown on all processes consistently. We should
1918  // instead pass along error state with the inner product. We
1919  // could do this by setting an extra slot to
1920  // Kokkos::Details::ArithTraits<dot_type>::one() on error. The
1921  // final sum should be
1922  // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
1923  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1924  lclNumRows != A.getLocalLength (), std::runtime_error,
1925  "MultiVectors do not have the same local length. "
1926  "this->getLocalLength() = " << lclNumRows << " != "
1927  "A.getLocalLength() = " << A.getLocalLength () << ".");
1928  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1929  numVecs != A.getNumVectors (), std::runtime_error,
1930  "MultiVectors must have the same number of columns (vectors). "
1931  "this->getNumVectors() = " << numVecs << " != "
1932  "A.getNumVectors() = " << A.getNumVectors () << ".");
1933  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1934  numDots != numVecs, std::runtime_error,
1935  "The output array 'dots' must have the same number of entries as the "
1936  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
1937  numDots << " != this->getNumVectors() = " << numVecs << ".");
1938 
1939  const std::pair<size_t, size_t> colRng (0, numVecs);
1940  RV dotsOut = subview (dots, colRng);
1941  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
1942  this->getMap ()->getComm ();
1943 
1944  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
1945  if (this->need_sync_device ()) {
1946  const_cast<MV&>(*this).sync_device ();
1947  }
1948  if (A.need_sync_device ()) {
1949  const_cast<MV&>(A).sync_device ();
1950  }
1951 
1952  auto thisView = this->getLocalViewDevice ();
1953  auto A_view = A.getLocalViewDevice ();
1954 
1955  ::Tpetra::Details::lclDot<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
1956  this->whichVectors_.getRawPtr (),
1957  A.whichVectors_.getRawPtr (),
1958  this->isConstantStride (), A.isConstantStride ());
1959  gblDotImpl (dotsOut, comm, this->isDistributed ());
1960  }
1961 
1962  namespace { // (anonymous)
1963  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1965  multiVectorSingleColumnDot (MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& x,
1967  {
1968  using ::Tpetra::Details::ProfilingRegion;
1970  using dot_type = typename MV::dot_type;
1971  ProfilingRegion region ("Tpetra::multiVectorSingleColumnDot");
1972 
1973  auto map = x.getMap ();
1974  Teuchos::RCP<const Teuchos::Comm<int> > comm =
1975  map.is_null () ? Teuchos::null : map->getComm ();
1976  if (comm.is_null ()) {
1977  return Kokkos::ArithTraits<dot_type>::zero ();
1978  }
1979  else {
1980  using LO = LocalOrdinal;
1981  // The min just ensures that we don't overwrite memory that
1982  // doesn't belong to us, in the erroneous input case where x
1983  // and y have different numbers of rows.
1984  const LO lclNumRows = static_cast<LO> (std::min (x.getLocalLength (),
1985  y.getLocalLength ()));
1986  const Kokkos::pair<LO, LO> rowRng (0, lclNumRows);
1987  dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero ();
1988  dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();
1989 
1990  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
1991  if (x.need_sync_device ()) {
1992  x.sync_device ();
1993  }
1994  if (y.need_sync_device ()) {
1995  const_cast<MV&>(y).sync_device ();
1996  }
1997 
1998  x.modify_device ();
1999  auto x_2d = x.getLocalViewDevice ();
2000  auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
2001  auto y_2d = y.getLocalViewDevice ();
2002  auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
2003  lclDot = KokkosBlas::dot (x_1d, y_1d);
2004 
2005  if (x.isDistributed ()) {
2006  using Teuchos::outArg;
2007  using Teuchos::REDUCE_SUM;
2008  using Teuchos::reduceAll;
2009  reduceAll<int, dot_type> (*comm, REDUCE_SUM, lclDot, outArg (gblDot));
2010  }
2011  else {
2012  gblDot = lclDot;
2013  }
2014  return gblDot;
2015  }
2016  }
2017  } // namespace (anonymous)
2018 
2019 
2020 
2021  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2022  void
2025  const Teuchos::ArrayView<dot_type>& dots) const
2026  {
2028  const char tfecfFuncName[] = "dot: ";
2029  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Teuchos::ArrayView)");
2030 
2031  const size_t numVecs = this->getNumVectors ();
2032  const size_t lclNumRows = this->getLocalLength ();
2033  const size_t numDots = static_cast<size_t> (dots.size ());
2034 
2035  // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
2036  // not necessarily be thrown on all processes consistently. We
2037  // keep them for now, because MultiVector's unit tests insist on
2038  // them. In the future, we should instead pass along error state
2039  // with the inner product. We could do this by setting an extra
2040  // slot to Kokkos::Details::ArithTraits<dot_type>::one() on error.
2041  // The final sum should be
2042  // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
2043  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2044  (lclNumRows != A.getLocalLength (), std::runtime_error,
2045  "MultiVectors do not have the same local length. "
2046  "this->getLocalLength() = " << lclNumRows << " != "
2047  "A.getLocalLength() = " << A.getLocalLength () << ".");
2048  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2049  (numVecs != A.getNumVectors (), std::runtime_error,
2050  "MultiVectors must have the same number of columns (vectors). "
2051  "this->getNumVectors() = " << numVecs << " != "
2052  "A.getNumVectors() = " << A.getNumVectors () << ".");
2053  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2054  (numDots != numVecs, std::runtime_error,
2055  "The output array 'dots' must have the same number of entries as the "
2056  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2057  numDots << " != this->getNumVectors() = " << numVecs << ".");
2058 
2059  if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
2060  const dot_type gblDot = multiVectorSingleColumnDot (const_cast<MV&> (*this), A);
2061  dots[0] = gblDot;
2062  }
2063  else {
2064  this->dot (A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr (), numDots));
2065  }
2066  }
2067 
2068  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2069  void
2071  norm2 (const Teuchos::ArrayView<mag_type>& norms) const
2072  {
2074  using ::Tpetra::Details::NORM_TWO;
2075  using ::Tpetra::Details::ProfilingRegion;
2076  ProfilingRegion region ("Tpetra::MV::norm2 (host output)");
2077 
2078  // The function needs to be able to sync X.
2079  MV& X = const_cast<MV&> (*this);
2080  multiVectorNormImpl (norms.getRawPtr (), X, NORM_TWO);
2081  }
2082 
2083  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2084  void
2086  norm2 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2087  {
2088  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2089  this->norm2 (norms_av);
2090  }
2091 
2092 
2093  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2094  void
2096  norm1 (const Teuchos::ArrayView<mag_type>& norms) const
2097  {
2099  using ::Tpetra::Details::NORM_ONE;
2100  using ::Tpetra::Details::ProfilingRegion;
2101  ProfilingRegion region ("Tpetra::MV::norm1 (host output)");
2102 
2103  // The function needs to be able to sync X.
2104  MV& X = const_cast<MV&> (*this);
2105  multiVectorNormImpl (norms.data (), X, NORM_ONE);
2106  }
2107 
2108  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2109  void
2111  norm1 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2112  {
2113  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2114  this->norm1 (norms_av);
2115  }
2116 
2117  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2118  void
2120  normInf (const Teuchos::ArrayView<mag_type>& norms) const
2121  {
2123  using ::Tpetra::Details::NORM_INF;
2124  using ::Tpetra::Details::ProfilingRegion;
2125  ProfilingRegion region ("Tpetra::MV::normInf (host output)");
2126 
2127  // The function needs to be able to sync X.
2128  MV& X = const_cast<MV&> (*this);
2129  multiVectorNormImpl (norms.getRawPtr (), X, NORM_INF);
2130  }
2131 
2132  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2133  void
2135  normInf (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2136  {
2137  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2138  this->normInf (norms_av);
2139  }
2140 
2141  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2142  void
2144  meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
2145  {
2146  // KR FIXME Overload this method to take a View.
2147  using Kokkos::ALL;
2148  using Kokkos::subview;
2149  using Teuchos::Comm;
2150  using Teuchos::RCP;
2151  using Teuchos::reduceAll;
2152  using Teuchos::REDUCE_SUM;
2153  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
2154 
2155  const size_t lclNumRows = this->getLocalLength ();
2156  const size_t numVecs = this->getNumVectors ();
2157  const size_t numMeans = static_cast<size_t> (means.size ());
2158 
2159  TEUCHOS_TEST_FOR_EXCEPTION(
2160  numMeans != numVecs, std::runtime_error,
2161  "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2162  << " != this->getNumVectors() = " << numVecs << ".");
2163 
2164  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2165  const std::pair<size_t, size_t> colRng (0, numVecs);
2166 
2167  // Make sure that the final output view has the same layout as the
2168  // temporary view's HostMirror. Left or Right doesn't matter for
2169  // a 1-D array anyway; this is just to placate the compiler.
2170  typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2171  typedef Kokkos::View<impl_scalar_type*,
2172  typename local_view_type::HostMirror::array_layout,
2173  Kokkos::HostSpace,
2174  Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
2175  host_local_view_type meansOut (means.getRawPtr (), numMeans);
2176 
2177  RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
2178  this->getMap ()->getComm ();
2179 
2180  // If we need sync to device, then host has the most recent version.
2181  const bool useHostVersion = this->need_sync_device ();
2182  if (useHostVersion) {
2183  // DualView was last modified on host, so run the local kernel there.
2184  auto X_lcl = subview (getLocalViewHost (),
2185  rowRng, Kokkos::ALL ());
2186  // Compute the local sum of each column.
2187  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2188  if (isConstantStride ()) {
2189  KokkosBlas::sum (lclSums, X_lcl);
2190  }
2191  else {
2192  for (size_t j = 0; j < numVecs; ++j) {
2193  const size_t col = whichVectors_[j];
2194  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2195  }
2196  }
2197 
2198  // If there are multiple MPI processes, the all-reduce reads
2199  // from lclSums, and writes to meansOut. Otherwise, we just
2200  // copy lclSums into meansOut.
2201  if (! comm.is_null () && this->isDistributed ()) {
2202  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2203  lclSums.data (), meansOut.data ());
2204  }
2205  else {
2206  Kokkos::deep_copy (meansOut, lclSums);
2207  }
2208  }
2209  else {
2210  // DualView was last modified on device, so run the local kernel there.
2211  auto X_lcl = subview (this->getLocalViewDevice (),
2212  rowRng, Kokkos::ALL ());
2213 
2214  // Compute the local sum of each column.
2215  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2216  if (isConstantStride ()) {
2217  KokkosBlas::sum (lclSums, X_lcl);
2218  }
2219  else {
2220  for (size_t j = 0; j < numVecs; ++j) {
2221  const size_t col = whichVectors_[j];
2222  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2223  }
2224  }
2225 
2226  // If there are multiple MPI processes, the all-reduce reads
2227  // from lclSums, and writes to meansOut. (We assume that MPI
2228  // can read device memory.) Otherwise, we just copy lclSums
2229  // into meansOut.
2230  if (! comm.is_null () && this->isDistributed ()) {
2231  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2232  lclSums.data (), meansOut.data ());
2233  }
2234  else {
2235  Kokkos::deep_copy (meansOut, lclSums);
2236  }
2237  }
2238 
2239  // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2240  // to the magnitude type, since operator/ (std::complex<T>, int)
2241  // isn't necessarily defined.
2242  const impl_scalar_type OneOverN =
2243  ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
2244  for (size_t k = 0; k < numMeans; ++k) {
2245  meansOut(k) = meansOut(k) * OneOverN;
2246  }
2247  }
2248 
2249 
2250  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2251  void
2254  {
2255  typedef impl_scalar_type IST;
2256  typedef Kokkos::Details::ArithTraits<IST> ATS;
2257  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2258  typedef typename pool_type::generator_type generator_type;
2259 
2260  const IST max = Kokkos::rand<generator_type, IST>::max ();
2261  const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();
2262 
2263  this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
2264  }
2265 
2266 
2267  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2268  void
2270  randomize (const Scalar& minVal, const Scalar& maxVal)
2271  {
2272  typedef impl_scalar_type IST;
2273  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2274 
2275  // Seed the pseudorandom number generator using the calling
2276  // process' rank. This helps decorrelate different process'
2277  // pseudorandom streams. It's not perfect but it's effective and
2278  // doesn't require MPI communication. The seed also includes bits
2279  // from the standard library's rand().
2280  //
2281  // FIXME (mfh 07 Jan 2015) Should we save the seed for later use?
2282  // The code below just makes a new seed each time.
2283 
2284  const uint64_t myRank =
2285  static_cast<uint64_t> (this->getMap ()->getComm ()->getRank ());
2286  uint64_t seed64 = static_cast<uint64_t> (std::rand ()) + myRank + 17311uLL;
2287  unsigned int seed = static_cast<unsigned int> (seed64&0xffffffff);
2288 
2289  pool_type rand_pool (seed);
2290  const IST max = static_cast<IST> (maxVal);
2291  const IST min = static_cast<IST> (minVal);
2292 
2293  // See #1510. In case diag has already been marked modified on
2294  // host or device, we need to clear those flags, since the code
2295  // below works on device.
2296  this->view_.clear_sync_state();
2297 
2298  this->modify_device ();
2299  auto thisView = this->getLocalViewDevice ();
2300 
2301  if (isConstantStride ()) {
2302  Kokkos::fill_random (thisView, rand_pool, min, max);
2303  }
2304  else {
2305  const size_t numVecs = getNumVectors ();
2306  for (size_t k = 0; k < numVecs; ++k) {
2307  const size_t col = whichVectors_[k];
2308  auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
2309  Kokkos::fill_random (X_k, rand_pool, min, max);
2310  }
2311  }
2312  }
2313 
2314  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2315  void
2317  putScalar (const Scalar& alpha)
2318  {
2319  using ::Tpetra::Details::ProfilingRegion;
2320  using ::Tpetra::Details::Blas::fill;
2321  using DES = typename dual_view_type::t_dev::execution_space;
2322  using HES = typename dual_view_type::t_host::execution_space;
2323  using LO = LocalOrdinal;
2324  ProfilingRegion region ("Tpetra::MultiVector::putScalar");
2325 
2326  // We need this cast for cases like Scalar = std::complex<T> but
2327  // impl_scalar_type = Kokkos::complex<T>.
2328  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2329  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
2330  const LO numVecs = static_cast<LO> (this->getNumVectors ());
2331 
2332  // Modify the most recently updated version of the data. This
2333  // avoids sync'ing, which could violate users' expectations.
2334  //
2335  // If we need sync to device, then host has the most recent version.
2336  const bool runOnHost = runKernelOnHost(*this);
2337 
2338  this->clear_sync_state();
2339  if (! runOnHost) {
2340  this->modify_device ();
2341  auto X = this->getLocalViewDevice ();
2342  if (this->isConstantStride ()) {
2343  fill (DES (), X, theAlpha, lclNumRows, numVecs);
2344  }
2345  else {
2346  fill (DES (), X, theAlpha, lclNumRows, numVecs,
2347  this->whichVectors_.getRawPtr ());
2348  }
2349  }
2350  else { // last modified in host memory, so modify data there.
2351  this->modify_host ();
2352  auto X = this->getLocalViewHost ();
2353  if (this->isConstantStride ()) {
2354  fill (HES (), X, theAlpha, lclNumRows, numVecs);
2355  }
2356  else {
2357  fill (HES (), X, theAlpha, lclNumRows, numVecs,
2358  this->whichVectors_.getRawPtr ());
2359  }
2360  }
2361  }
2362 
2363 
2364  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2365  void
2367  replaceMap (const Teuchos::RCP<const map_type>& newMap)
2368  {
2369  using Teuchos::ArrayRCP;
2370  using Teuchos::Comm;
2371  using Teuchos::RCP;
2372  using ST = Scalar;
2373  using LO = LocalOrdinal;
2374  using GO = GlobalOrdinal;
2375 
2376  // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2377  // it might work if the MV is a column view of another MV.
2378  // However, things might go wrong when restoring the original
2379  // Map, so we don't allow this case for now.
2380  TEUCHOS_TEST_FOR_EXCEPTION(
2381  ! this->isConstantStride (), std::logic_error,
2382  "Tpetra::MultiVector::replaceMap: This method does not currently work "
2383  "if the MultiVector is a column view of another MultiVector (that is, if "
2384  "isConstantStride() == false).");
2385 
2386  // Case 1: current Map and new Map are both nonnull on this process.
2387  // Case 2: current Map is nonnull, new Map is null.
2388  // Case 3: current Map is null, new Map is nonnull.
2389  // Case 4: both Maps are null: forbidden.
2390  //
2391  // Case 1 means that we don't have to do anything on this process,
2392  // other than assign the new Map. (We always have to do that.)
2393  // It's an error for the user to supply a Map that requires
2394  // resizing in this case.
2395  //
2396  // Case 2 means that the calling process is in the current Map's
2397  // communicator, but will be excluded from the new Map's
2398  // communicator. We don't have to do anything on the calling
2399  // process; just leave whatever data it may have alone.
2400  //
2401  // Case 3 means that the calling process is excluded from the
2402  // current Map's communicator, but will be included in the new
2403  // Map's communicator. This means we need to (re)allocate the
2404  // local DualView if it does not have the right number of rows.
2405  // If the new number of rows is nonzero, we'll fill the newly
2406  // allocated local data with zeros, as befits a projection
2407  // operation.
2408  //
2409  // The typical use case for Case 3 is that the MultiVector was
2410  // first created with the Map with more processes, then that Map
2411  // was replaced with a Map with fewer processes, and finally the
2412  // original Map was restored on this call to replaceMap.
2413 
2414 #ifdef HAVE_TEUCHOS_DEBUG
2415  // mfh 28 Mar 2013: We can't check for compatibility across the
2416  // whole communicator, unless we know that the current and new
2417  // Maps are nonnull on _all_ participating processes.
2418  // TEUCHOS_TEST_FOR_EXCEPTION(
2419  // origNumProcs == newNumProcs && ! this->getMap ()->isCompatible (*map),
2420  // std::invalid_argument, "Tpetra::MultiVector::project: "
2421  // "If the input Map's communicator is compatible (has the same number of "
2422  // "processes as) the current Map's communicator, then the two Maps must be "
2423  // "compatible. The replaceMap() method is not for data redistribution; "
2424  // "use Import or Export for that purpose.");
2425 
2426  // TODO (mfh 28 Mar 2013) Add compatibility checks for projections
2427  // of the Map, in case the process counts don't match.
2428 #endif // HAVE_TEUCHOS_DEBUG
2429 
2430  if (this->getMap ().is_null ()) { // current Map is null
2431  // If this->getMap() is null, that means that this MultiVector
2432  // has already had replaceMap happen to it. In that case, just
2433  // reallocate the DualView with the right size.
2434 
2435  TEUCHOS_TEST_FOR_EXCEPTION(
2436  newMap.is_null (), std::invalid_argument,
2437  "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2438  "This probably means that the input Map is incorrect.");
2439 
2440  // Case 3: current Map is null, new Map is nonnull.
2441  // Reallocate the DualView with the right dimensions.
2442  const size_t newNumRows = newMap->getNodeNumElements ();
2443  const size_t origNumRows = view_.extent (0);
2444  const size_t numCols = this->getNumVectors ();
2445 
2446  if (origNumRows != newNumRows || view_.extent (1) != numCols) {
2447  view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2448  }
2449  }
2450  else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
2451  // I am an excluded process. Reinitialize my data so that I
2452  // have 0 rows. Keep the number of columns as before.
2453  const size_t newNumRows = static_cast<size_t> (0);
2454  const size_t numCols = this->getNumVectors ();
2455  view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2456  }
2457 
2458  this->map_ = newMap;
2459  }
2460 
2461  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2462  void
2464  scale (const Scalar& alpha)
2465  {
2466  using Kokkos::ALL;
2467  using IST = impl_scalar_type;
2468 
2469  const IST theAlpha = static_cast<IST> (alpha);
2470  if (theAlpha == Kokkos::ArithTraits<IST>::one ()) {
2471  return; // do nothing
2472  }
2473  const size_t lclNumRows = getLocalLength ();
2474  const size_t numVecs = getNumVectors ();
2475  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2476  const std::pair<size_t, size_t> colRng (0, numVecs);
2477 
2478  // We can't substitute putScalar(0.0) for scale(0.0), because the
2479  // former will overwrite NaNs present in the MultiVector. The
2480  // semantics of this call require multiplying them by 0, which
2481  // IEEE 754 requires to be NaN.
2482 
2483  // If we need sync to device, then host has the most recent version.
2484  const bool useHostVersion = need_sync_device ();
2485  if (useHostVersion) {
2486  auto Y_lcl = Kokkos::subview (getLocalViewHost (), rowRng, ALL ());
2487  if (isConstantStride ()) {
2488  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2489  }
2490  else {
2491  for (size_t k = 0; k < numVecs; ++k) {
2492  const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2493  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2494  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2495  }
2496  }
2497  }
2498  else { // work on device
2499  auto Y_lcl = Kokkos::subview (getLocalViewDevice (), rowRng, ALL ());
2500  if (isConstantStride ()) {
2501  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2502  }
2503  else {
2504  for (size_t k = 0; k < numVecs; ++k) {
2505  const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2506  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2507  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2508  }
2509  }
2510  }
2511  }
2512 
2513 
2514  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2515  void
2517  scale (const Teuchos::ArrayView<const Scalar>& alphas)
2518  {
2519  const size_t numVecs = this->getNumVectors ();
2520  const size_t numAlphas = static_cast<size_t> (alphas.size ());
2521  TEUCHOS_TEST_FOR_EXCEPTION(
2522  numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2523  "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2524  << numVecs << ".");
2525 
2526  // Use a DualView to copy the scaling constants onto the device.
2527  using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2528  k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2529  k_alphas.modify_host ();
2530  for (size_t i = 0; i < numAlphas; ++i) {
2531  k_alphas.h_view(i) = static_cast<impl_scalar_type> (alphas[i]);
2532  }
2533  k_alphas.sync_device ();
2534  // Invoke the scale() overload that takes a device View of coefficients.
2535  this->scale (k_alphas.view_device ());
2536  }
2537 
2538  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2539  void
2541  scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
2542  {
2543  using Kokkos::ALL;
2544  using Kokkos::subview;
2545 
2546  const size_t lclNumRows = this->getLocalLength ();
2547  const size_t numVecs = this->getNumVectors ();
2548  TEUCHOS_TEST_FOR_EXCEPTION(
2549  static_cast<size_t> (alphas.extent (0)) != numVecs,
2550  std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2551  "alphas.extent(0) = " << alphas.extent (0)
2552  << " != this->getNumVectors () = " << numVecs << ".");
2553  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2554  const std::pair<size_t, size_t> colRng (0, numVecs);
2555 
2556  // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2557  // type of the return value of subview. This is because if we
2558  // switch the array layout from LayoutLeft to LayoutRight
2559  // (preferred for performance of block operations), the types
2560  // below won't be valid. (A view of a column of a LayoutRight
2561  // multivector has LayoutStride, not LayoutLeft.)
2562 
2563  // If we need sync to device, then host has the most recent version.
2564  const bool useHostVersion = this->need_sync_device ();
2565  if (useHostVersion) {
2566  // Work in host memory. This means we need to create a host
2567  // mirror of the input View of coefficients.
2568  auto alphas_h = Kokkos::create_mirror_view (alphas);
2569  Kokkos::deep_copy (alphas_h, alphas);
2570 
2571  auto Y_lcl = subview (this->getLocalViewHost (), rowRng, ALL ());
2572  if (isConstantStride ()) {
2573  KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2574  }
2575  else {
2576  for (size_t k = 0; k < numVecs; ++k) {
2577  const size_t Y_col = this->isConstantStride () ? k :
2578  this->whichVectors_[k];
2579  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2580  // We don't have to use the entire 1-D View here; we can use
2581  // the version that takes a scalar coefficient.
2582  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2583  }
2584  }
2585  }
2586  else { // Work in device memory, using the input View 'alphas' directly.
2587  auto Y_lcl = subview (this->getLocalViewDevice (), rowRng, ALL ());
2588  if (isConstantStride ()) {
2589  KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2590  }
2591  else {
2592  // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2593  // as values on host, so copy them to host. Another approach
2594  // would be to fix scal() so that it takes a 0-D View as the
2595  // second argument.
2596  auto alphas_h = Kokkos::create_mirror_view (alphas);
2597  Kokkos::deep_copy (alphas_h, alphas);
2598 
2599  for (size_t k = 0; k < numVecs; ++k) {
2600  const size_t Y_col = this->isConstantStride () ? k :
2601  this->whichVectors_[k];
2602  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2603  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2604  }
2605  }
2606  }
2607  }
2608 
2609  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2610  void
2612  scale (const Scalar& alpha,
2614  {
2615  using Kokkos::ALL;
2616  using Kokkos::subview;
2618  const char tfecfFuncName[] = "scale: ";
2619 
2620  const size_t lclNumRows = getLocalLength ();
2621  const size_t numVecs = getNumVectors ();
2622 
2623  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2624  lclNumRows != A.getLocalLength (), std::invalid_argument,
2625  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2626  << A.getLocalLength () << ".");
2627  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2628  numVecs != A.getNumVectors (), std::invalid_argument,
2629  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2630  << A.getNumVectors () << ".");
2631 
2632  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2633  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2634  const std::pair<size_t, size_t> colRng (0, numVecs);
2635 
2636  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2637  if (this->need_sync_device ()) {
2638  this->sync_device ();
2639  }
2640  if (A.need_sync_device ()) {
2641  const_cast<MV&>(A).sync_device ();
2642  }
2643 
2644  this->modify_device ();
2645  auto Y_lcl_orig = this->getLocalViewDevice ();
2646  auto X_lcl_orig = A.getLocalViewDevice ();
2647  auto Y_lcl = subview (Y_lcl_orig, rowRng, ALL ());
2648  auto X_lcl = subview (X_lcl_orig, rowRng, ALL ());
2649 
2650  if (isConstantStride () && A.isConstantStride ()) {
2651  KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
2652  }
2653  else {
2654  // Make sure that Kokkos only uses the local length for add.
2655  for (size_t k = 0; k < numVecs; ++k) {
2656  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2657  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2658  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2659  auto X_k = subview (X_lcl, ALL (), X_col);
2660 
2661  KokkosBlas::scal (Y_k, theAlpha, X_k);
2662  }
2663  }
2664  }
2665 
2666 
2667 
2668  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2669  void
2672  {
2674  const char tfecfFuncName[] = "reciprocal: ";
2675 
2676  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2677  getLocalLength () != A.getLocalLength (), std::runtime_error,
2678  "MultiVectors do not have the same local length. "
2679  "this->getLocalLength() = " << getLocalLength ()
2680  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2681  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2682  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2683  ": MultiVectors do not have the same number of columns (vectors). "
2684  "this->getNumVectors() = " << getNumVectors ()
2685  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2686 
2687  const size_t numVecs = getNumVectors ();
2688 
2689  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2690  if (this->need_sync_device ()) {
2691  this->sync_device ();
2692  }
2693  if (A.need_sync_device ()) {
2694  const_cast<MV&>(A).sync_device ();
2695  }
2696  this->modify_device ();
2697 
2698  auto this_view_dev = this->getLocalViewDevice ();
2699  auto A_view_dev = A.getLocalViewDevice ();
2700 
2701  if (isConstantStride () && A.isConstantStride ()) {
2702  KokkosBlas::reciprocal (this_view_dev, A_view_dev);
2703  }
2704  else {
2705  using Kokkos::ALL;
2706  using Kokkos::subview;
2707  for (size_t k = 0; k < numVecs; ++k) {
2708  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2709  auto vector_k = subview (this_view_dev, ALL (), this_col);
2710  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2711  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
2712  KokkosBlas::reciprocal (vector_k, vector_Ak);
2713  }
2714  }
2715  }
2716 
2717  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2718  void
2721  {
2723  const char tfecfFuncName[] = "abs";
2724 
2725  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2726  getLocalLength () != A.getLocalLength (), std::runtime_error,
2727  ": MultiVectors do not have the same local length. "
2728  "this->getLocalLength() = " << getLocalLength ()
2729  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2730  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2731  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2732  ": MultiVectors do not have the same number of columns (vectors). "
2733  "this->getNumVectors() = " << getNumVectors ()
2734  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2735  const size_t numVecs = getNumVectors ();
2736 
2737  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2738  if (this->need_sync_device ()) {
2739  this->sync_device ();
2740  }
2741  if (A.need_sync_device ()) {
2742  const_cast<MV&>(A).sync_device ();
2743  }
2744  this->modify_device ();
2745 
2746  auto this_view_dev = this->getLocalViewDevice ();
2747  auto A_view_dev = A.getLocalViewDevice ();
2748 
2749  if (isConstantStride () && A.isConstantStride ()) {
2750  KokkosBlas::abs (this_view_dev, A_view_dev);
2751  }
2752  else {
2753  using Kokkos::ALL;
2754  using Kokkos::subview;
2755 
2756  for (size_t k=0; k < numVecs; ++k) {
2757  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2758  auto vector_k = subview (this_view_dev, ALL (), this_col);
2759  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2760  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
2761  KokkosBlas::abs (vector_k, vector_Ak);
2762  }
2763  }
2764  }
2765 
2766  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2767  void
2769  update (const Scalar& alpha,
2771  const Scalar& beta)
2772  {
2773  const char tfecfFuncName[] = "update: ";
2774  using Kokkos::subview;
2775  using Kokkos::ALL;
2777 
2778  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");
2779 
2780  const size_t lclNumRows = getLocalLength ();
2781  const size_t numVecs = getNumVectors ();
2782 
2783  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2784  lclNumRows != A.getLocalLength (), std::invalid_argument,
2785  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2786  << A.getLocalLength () << ".");
2787  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2788  numVecs != A.getNumVectors (), std::invalid_argument,
2789  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2790  << A.getNumVectors () << ".");
2791 
2792  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2793  if (this->need_sync_device ()) {
2794  this->sync_device ();
2795  }
2796  if (A.need_sync_device ()) {
2797  const_cast<MV&>(A).sync_device ();
2798  }
2799 
2800  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2801  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
2802  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2803  const std::pair<size_t, size_t> colRng (0, numVecs);
2804 
2805  auto Y_lcl_orig = this->getLocalViewDevice ();
2806  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
2807  auto X_lcl_orig = A.getLocalViewDevice ();
2808  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
2809 
2810  // The device memory of *this is about to be modified
2811  this->modify_device ();
2812  if (isConstantStride () && A.isConstantStride ()) {
2813  KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
2814  }
2815  else {
2816  // Make sure that Kokkos only uses the local length for add.
2817  for (size_t k = 0; k < numVecs; ++k) {
2818  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2819  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2820  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2821  auto X_k = subview (X_lcl, ALL (), X_col);
2822 
2823  KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
2824  }
2825  }
2826  }
2827 
2828  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2829  void
2831  update (const Scalar& alpha,
2833  const Scalar& beta,
2835  const Scalar& gamma)
2836  {
2837  using Kokkos::ALL;
2838  using Kokkos::subview;
2840 
2841  const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
2842 
2843  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)");
2844 
2845  const size_t lclNumRows = this->getLocalLength ();
2846  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2847  lclNumRows != A.getLocalLength (), std::invalid_argument,
2848  "The input MultiVector A has " << A.getLocalLength () << " local "
2849  "row(s), but this MultiVector has " << lclNumRows << " local "
2850  "row(s).");
2851  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2852  lclNumRows != B.getLocalLength (), std::invalid_argument,
2853  "The input MultiVector B has " << B.getLocalLength () << " local "
2854  "row(s), but this MultiVector has " << lclNumRows << " local "
2855  "row(s).");
2856  const size_t numVecs = getNumVectors ();
2857  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2858  A.getNumVectors () != numVecs, std::invalid_argument,
2859  "The input MultiVector A has " << A.getNumVectors () << " column(s), "
2860  "but this MultiVector has " << numVecs << " column(s).");
2861  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2862  B.getNumVectors () != numVecs, std::invalid_argument,
2863  "The input MultiVector B has " << B.getNumVectors () << " column(s), "
2864  "but this MultiVector has " << numVecs << " column(s).");
2865 
2866  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2867  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
2868  const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
2869 
2870  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2871  if (this->need_sync_device ()) this->sync_device ();
2872  if (A.need_sync_device ()) const_cast<MV&>(A).sync_device ();
2873  if (B.need_sync_device ()) const_cast<MV&>(B).sync_device ();
2874 
2875  // This method modifies *this.
2876  this->modify_device ();
2877 
2878  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2879  const std::pair<size_t, size_t> colRng (0, numVecs);
2880 
2881  // Prefer 'auto' over specifying the type explicitly. This avoids
2882  // issues with a subview possibly having a different type than the
2883  // original view.
2884  auto C_lcl = subview (this->getLocalViewDevice (), rowRng, ALL ());
2885  auto A_lcl = subview (A.getLocalViewDevice (), rowRng, ALL ());
2886  auto B_lcl = subview (B.getLocalViewDevice (), rowRng, ALL ());
2887 
2888  if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
2889  KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
2890  }
2891  else {
2892  // Some input (or *this) is not constant stride,
2893  // so perform the update one column at a time.
2894  for (size_t k = 0; k < numVecs; ++k) {
2895  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2896  const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
2897  const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
2898  KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
2899  theBeta, subview (B_lcl, rowRng, B_col),
2900  theGamma, subview (C_lcl, rowRng, this_col));
2901  }
2902  }
2903  }
2904 
2905  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2906  Teuchos::ArrayRCP<const Scalar>
2908  getData (size_t j) const
2909  {
2910  using Kokkos::ALL;
2912  using IST = impl_scalar_type;
2913  const char tfecfFuncName[] = "getData: ";
2914 
2915  // Any MultiVector method that called the (classic) Kokkos Node's
2916  // viewBuffer or viewBufferNonConst methods always implied a
2917  // device->host synchronization. Thus, we synchronize here as
2918  // well.
2919  const_cast<MV&> (*this).sync_host ();
2920 
2921  auto hostView = getLocalViewHost ();
2922  const size_t col = isConstantStride () ? j : whichVectors_[j];
2923  auto hostView_j = Kokkos::subview (hostView, ALL (), col);
2924  Teuchos::ArrayRCP<const IST> dataAsArcp =
2925  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
2926 
2927  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2928  (static_cast<size_t> (hostView_j.extent (0)) <
2929  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
2930  "hostView_j.extent(0) = " << hostView_j.extent (0)
2931  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
2932  "Please report this bug to the Tpetra developers.");
2933 
2934  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
2935  }
2936 
2937  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2938  Teuchos::ArrayRCP<Scalar>
2941  {
2942  using Kokkos::ALL;
2943  using Kokkos::subview;
2945  using IST = impl_scalar_type;
2946  const char tfecfFuncName[] = "getDataNonConst: ";
2947 
2948  // Any MultiVector method that called the (classic) Kokkos Node's
2949  // viewBuffer or viewBufferNonConst methods always implied a
2950  // device->host synchronization. Thus, we synchronize here as
2951  // well.
2952  const_cast<MV*> (this)->sync_host ();
2953  // Calling getDataNonConst() implies that the user plans to modify
2954  // the values in the MultiVector, so we mark the host data as modified.
2955  modify_host ();
2956 
2957  auto hostView = getLocalViewHost ();
2958  const size_t col = isConstantStride () ? j : whichVectors_[j];
2959  auto hostView_j = subview (hostView, ALL (), col);
2960  Teuchos::ArrayRCP<IST> dataAsArcp =
2961  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
2962 
2963  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2964  (static_cast<size_t> (hostView_j.extent (0)) <
2965  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
2966  "hostView_j.extent(0) = " << hostView_j.extent (0)
2967  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
2968  "Please report this bug to the Tpetra developers.");
2969 
2970  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
2971  }
2972 
2973  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2974  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
2976  subCopy (const Teuchos::ArrayView<const size_t>& cols) const
2977  {
2978  using Teuchos::RCP;
2980 
2981  // Check whether the index set in cols is contiguous. If it is,
2982  // use the more efficient Range1D version of subCopy.
2983  bool contiguous = true;
2984  const size_t numCopyVecs = static_cast<size_t> (cols.size ());
2985  for (size_t j = 1; j < numCopyVecs; ++j) {
2986  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
2987  contiguous = false;
2988  break;
2989  }
2990  }
2991  if (contiguous && numCopyVecs > 0) {
2992  return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
2993  }
2994  else {
2995  RCP<const MV> X_sub = this->subView (cols);
2996  RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
2997  Y->assign (*X_sub);
2998  return Y;
2999  }
3000  }
3001 
3002  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3003  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3005  subCopy (const Teuchos::Range1D &colRng) const
3006  {
3007  using Teuchos::RCP;
3009 
3010  RCP<const MV> X_sub = this->subView (colRng);
3011  RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3012  Y->assign (*X_sub);
3013  return Y;
3014  }
3015 
3016  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3017  size_t
3020  return origView_.extent (0);
3021  }
3022 
3023  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3024  size_t
3027  return origView_.extent (1);
3028  }
3029 
3030  template <class Scalar, class LO, class GO, class Node>
3033  const Teuchos::RCP<const map_type>& subMap,
3034  const local_ordinal_type rowOffset) :
3035  base_type (subMap)
3036  {
3037  using Kokkos::ALL;
3038  using Kokkos::subview;
3039  using Teuchos::outArg;
3040  using Teuchos::RCP;
3041  using Teuchos::rcp;
3042  using Teuchos::reduceAll;
3043  using Teuchos::REDUCE_MIN;
3044  using std::endl;
3046  const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3047  const char suffix[] = "Please report this bug to the Tpetra developers.";
3048  int lclGood = 1;
3049  int gblGood = 1;
3050  std::unique_ptr<std::ostringstream> errStrm;
3051  const bool debug = ::Tpetra::Details::Behavior::debug ();
3052  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
3053 
3054  // Be careful to use the input Map's communicator, not X's. The
3055  // idea is that, on return, *this is a subview of X, using the
3056  // input Map.
3057  const auto comm = subMap->getComm ();
3058  TEUCHOS_ASSERT( ! comm.is_null () );
3059  const int myRank = comm->getRank ();
3060 
3061  const LO lclNumRowsBefore = static_cast<LO> (X.getLocalLength ());
3062  const LO numCols = static_cast<LO> (X.getNumVectors ());
3063  const LO newNumRows = static_cast<LO> (subMap->getNodeNumElements ());
3064  if (verbose) {
3065  std::ostringstream os;
3066  os << "Proc " << myRank << ": " << prefix
3067  << "X: {lclNumRows: " << lclNumRowsBefore
3068  << ", origLclNumRows: " << X.getOrigNumLocalRows ()
3069  << ", numCols: " << numCols << "}, "
3070  << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3071  std::cerr << os.str ();
3072  }
3073  // We ask for the _original_ number of rows in X, because X could
3074  // be a shorter (fewer rows) view of a longer MV. (For example, X
3075  // could be a domain Map view of a column Map MV.)
3076  const bool tooManyElts =
3077  newNumRows + rowOffset > static_cast<LO> (X.getOrigNumLocalRows ());
3078  if (tooManyElts) {
3079  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3080  *errStrm << " Proc " << myRank << ": subMap->getNodeNumElements() (="
3081  << newNumRows << ") + rowOffset (=" << rowOffset
3082  << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows ()
3083  << ")." << endl;
3084  lclGood = 0;
3085  TEUCHOS_TEST_FOR_EXCEPTION
3086  (! debug && tooManyElts, std::invalid_argument,
3087  prefix << errStrm->str () << suffix);
3088  }
3089 
3090  if (debug) {
3091  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3092  if (gblGood != 1) {
3093  std::ostringstream gblErrStrm;
3094  const std::string myErrStr =
3095  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3096  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3097  TEUCHOS_TEST_FOR_EXCEPTION
3098  (true, std::invalid_argument, gblErrStrm.str ());
3099  }
3100  }
3101 
3102  using range_type = std::pair<LO, LO>;
3103  const range_type origRowRng
3104  (rowOffset, static_cast<LO> (X.origView_.extent (0)));
3105  const range_type rowRng
3106  (rowOffset, rowOffset + newNumRows);
3107 
3108  dual_view_type newOrigView = subview (X.origView_, origRowRng, ALL ());
3109  // FIXME (mfh 29 Sep 2016) If we just use X.view_ here, it breaks
3110  // CrsMatrix's Gauss-Seidel implementation (which assumes the
3111  // ability to create domain Map views of column Map MultiVectors,
3112  // and then get the original column Map MultiVector out again).
3113  // If we just use X.origView_ here, it breaks the fix for #46.
3114  // The test for rowOffset == 0 is a hack that makes both tests
3115  // pass, but doesn't actually fix the more general issue. In
3116  // particular, the right way to fix Gauss-Seidel would be to fix
3117  // #385; that would make "getting the original column Map
3118  // MultiVector out again" unnecessary.
3119  dual_view_type newView =
3120  subview (rowOffset == 0 ? X.origView_ : X.view_, rowRng, ALL ());
3121 
3122  // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3123  // handling subviews of degenerate Views quite so well. For some
3124  // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3125  // 0. We work around by creating a new empty DualView of the
3126  // desired (degenerate) dimensions.
3127  if (newOrigView.extent (0) == 0 &&
3128  newOrigView.extent (1) != X.origView_.extent (1)) {
3129  newOrigView =
3130  allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3131  }
3132  if (newView.extent (0) == 0 &&
3133  newView.extent (1) != X.view_.extent (1)) {
3134  newView =
3135  allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3136  }
3137 
3138  MV subViewMV = X.isConstantStride () ?
3139  MV (subMap, newView, newOrigView) :
3140  MV (subMap, newView, newOrigView, X.whichVectors_ ());
3141 
3142  if (debug) {
3143  const LO lclNumRowsRet = static_cast<LO> (subViewMV.getLocalLength ());
3144  const LO numColsRet = static_cast<LO> (subViewMV.getNumVectors ());
3145  if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3146  lclGood = 0;
3147  if (errStrm.get () == nullptr) {
3148  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3149  }
3150  *errStrm << " Proc " << myRank <<
3151  ": subMap.getNodeNumElements(): " << newNumRows <<
3152  ", subViewMV.getLocalLength(): " << lclNumRowsRet <<
3153  ", X.getNumVectors(): " << numCols <<
3154  ", subViewMV.getNumVectors(): " << numColsRet << endl;
3155  }
3156  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3157  if (gblGood != 1) {
3158  std::ostringstream gblErrStrm;
3159  if (myRank == 0) {
3160  gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3161  "dimensions on one or more processes:" << endl;
3162  }
3163  const std::string myErrStr =
3164  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3165  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3166  gblErrStrm << suffix << endl;
3167  TEUCHOS_TEST_FOR_EXCEPTION
3168  (true, std::invalid_argument, gblErrStrm.str ());
3169  }
3170  }
3171 
3172  if (verbose) {
3173  std::ostringstream os;
3174  os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3175  std::cerr << os.str ();
3176  }
3177 
3178  *this = subViewMV; // shallow copy
3179 
3180  if (verbose) {
3181  std::ostringstream os;
3182  os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3183  std::cerr << os.str ();
3184  }
3185  }
3186 
3187  template <class Scalar, class LO, class GO, class Node>
3190  const map_type& subMap,
3191  const size_t rowOffset) :
3192  MultiVector (X, Teuchos::RCP<const map_type> (new map_type (subMap)),
3193  static_cast<local_ordinal_type> (rowOffset))
3194  {}
3195 
3196  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3197  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3199  offsetView (const Teuchos::RCP<const map_type>& subMap,
3200  const size_t offset) const
3201  {
3203  return Teuchos::rcp (new MV (*this, *subMap, offset));
3204  }
3205 
3206  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3207  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3209  offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3210  const size_t offset)
3211  {
3213  return Teuchos::rcp (new MV (*this, *subMap, offset));
3214  }
3215 
3216  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3217  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3219  subView (const Teuchos::ArrayView<const size_t>& cols) const
3220  {
3221  using Teuchos::Array;
3222  using Teuchos::rcp;
3224 
3225  const size_t numViewCols = static_cast<size_t> (cols.size ());
3226  TEUCHOS_TEST_FOR_EXCEPTION(
3227  numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3228  "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3229  "contain at least one entry, but cols.size() = " << cols.size ()
3230  << " == 0.");
3231 
3232  // Check whether the index set in cols is contiguous. If it is,
3233  // use the more efficient Range1D version of subView.
3234  bool contiguous = true;
3235  for (size_t j = 1; j < numViewCols; ++j) {
3236  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3237  contiguous = false;
3238  break;
3239  }
3240  }
3241  if (contiguous) {
3242  if (numViewCols == 0) {
3243  // The output MV has no columns, so there is nothing to view.
3244  return rcp (new MV (this->getMap (), numViewCols));
3245  } else {
3246  // Use the more efficient contiguous-index-range version.
3247  return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3248  }
3249  }
3250 
3251  if (isConstantStride ()) {
3252  return rcp (new MV (this->getMap (), view_, origView_, cols));
3253  }
3254  else {
3255  Array<size_t> newcols (cols.size ());
3256  for (size_t j = 0; j < numViewCols; ++j) {
3257  newcols[j] = whichVectors_[cols[j]];
3258  }
3259  return rcp (new MV (this->getMap (), view_, origView_, newcols ()));
3260  }
3261  }
3262 
3263 
3264  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3265  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3267  subView (const Teuchos::Range1D& colRng) const
3268  {
3269  using ::Tpetra::Details::Behavior;
3270  using Kokkos::ALL;
3271  using Kokkos::subview;
3272  using Teuchos::Array;
3273  using Teuchos::RCP;
3274  using Teuchos::rcp;
3276  const char tfecfFuncName[] = "subView(Range1D): ";
3277 
3278  const size_t lclNumRows = this->getLocalLength ();
3279  const size_t numVecs = this->getNumVectors ();
3280  // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3281  // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3282  // "at least one vector.");
3283  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3284  static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3285  "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3286  << numVecs << ".");
3287  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3288  numVecs != 0 && colRng.size () != 0 &&
3289  (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3290  static_cast<size_t> (colRng.ubound ()) >= numVecs),
3291  std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3292  "," << colRng.ubound () << "] exceeds the valid range of column indices "
3293  "[0, " << numVecs << "].");
3294 
3295  RCP<const MV> X_ret; // the MultiVector subview to return
3296 
3297  // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3298  // broken for the case of views with zero rows. I will brutally
3299  // enforce that the subview has the correct dimensions. In
3300  // particular, in the case of zero rows, I will, if necessary,
3301  // create a new dual_view_type with zero rows and the correct
3302  // number of columns. In a debug build, I will use an all-reduce
3303  // to ensure that it has the correct dimensions on all processes.
3304 
3305  const std::pair<size_t, size_t> rows (0, lclNumRows);
3306  if (colRng.size () == 0) {
3307  const std::pair<size_t, size_t> cols (0, 0); // empty range
3308  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3309  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3310  }
3311  else {
3312  // Returned MultiVector is constant stride only if *this is.
3313  if (isConstantStride ()) {
3314  const std::pair<size_t, size_t> cols (colRng.lbound (),
3315  colRng.ubound () + 1);
3316  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3317  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3318  }
3319  else {
3320  if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3321  // We're only asking for one column, so the result does have
3322  // constant stride, even though this MultiVector does not.
3323  const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3324  whichVectors_[0] + colRng.ubound () + 1);
3325  dual_view_type X_sub = takeSubview (view_, ALL (), col);
3326  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3327  }
3328  else {
3329  Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3330  whichVectors_.begin () + colRng.ubound () + 1);
3331  X_ret = rcp (new MV (this->getMap (), view_, origView_, which));
3332  }
3333  }
3334  }
3335 
3336  const bool debug = Behavior::debug ();
3337  if (debug) {
3338  using Teuchos::Comm;
3339  using Teuchos::outArg;
3340  using Teuchos::REDUCE_MIN;
3341  using Teuchos::reduceAll;
3342 
3343  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
3344  Teuchos::null : this->getMap ()->getComm ();
3345  if (! comm.is_null ()) {
3346  int lclSuccess = 1;
3347  int gblSuccess = 1;
3348 
3349  if (X_ret.is_null ()) {
3350  lclSuccess = 0;
3351  }
3352  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3353  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3354  (lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3355  "MultiVector; the return value of this method) is null on some MPI "
3356  "process in this MultiVector's communicator. This should never "
3357  "happen. Please report this bug to the Tpetra developers.");
3358  if (! X_ret.is_null () &&
3359  X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3360  lclSuccess = 0;
3361  }
3362  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess,
3363  outArg (gblSuccess));
3364  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3365  (lclSuccess != 1, std::logic_error, "X_ret->getNumVectors() != "
3366  "colRng.size(), on at least one MPI process in this MultiVector's "
3367  "communicator. This should never happen. "
3368  "Please report this bug to the Tpetra developers.");
3369  }
3370  }
3371  return X_ret;
3372  }
3373 
3374 
3375  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3376  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3378  subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3379  {
3381  return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3382  }
3383 
3384 
3385  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3386  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3388  subViewNonConst (const Teuchos::Range1D &colRng)
3389  {
3391  return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3392  }
3393 
3394 
3395  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3398  const size_t j)
3399  : base_type (X.getMap ())
3400  {
3401  using Kokkos::subview;
3402  typedef std::pair<size_t, size_t> range_type;
3403  const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3404 
3405  const size_t numCols = X.getNumVectors ();
3406  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3407  (j >= numCols, std::invalid_argument, "Input index j (== " << j
3408  << ") exceeds valid column index range [0, " << numCols << " - 1].");
3409  const size_t jj = X.isConstantStride () ?
3410  static_cast<size_t> (j) :
3411  static_cast<size_t> (X.whichVectors_[j]);
3412  this->view_ = takeSubview (X.view_, Kokkos::ALL (), range_type (jj, jj+1));
3413  this->origView_ = X.origView_;
3414 
3415  // mfh 31 Jul 2017: It would be unwise to execute concurrent
3416  // Export or Import operations with different subviews of a
3417  // MultiVector. Thus, it is safe to reuse communication buffers.
3418  // See #1560 discussion.
3419  //
3420  // We only need one column's worth of buffer for imports_ and
3421  // exports_. Taking subviews now ensures that their lengths will
3422  // be exactly what we need, so we won't have to resize them later.
3423  {
3424  const size_t newSize = X.imports_.extent (0) / numCols;
3425  const size_t offset = jj*newSize;
3426  auto newImports = X.imports_;
3427  newImports.d_view = subview (X.imports_.d_view,
3428  range_type (offset, offset+newSize));
3429  newImports.h_view = subview (X.imports_.h_view,
3430  range_type (offset, offset+newSize));
3431  this->imports_ = newImports;
3432  }
3433  {
3434  const size_t newSize = X.exports_.extent (0) / numCols;
3435  const size_t offset = jj*newSize;
3436  auto newExports = X.exports_;
3437  newExports.d_view = subview (X.exports_.d_view,
3438  range_type (offset, offset+newSize));
3439  newExports.h_view = subview (X.exports_.h_view,
3440  range_type (offset, offset+newSize));
3441  this->exports_ = newExports;
3442  }
3443  // These two DualViews already either have the right number of
3444  // entries, or zero entries. This means that we don't need to
3445  // resize them.
3448  }
3449 
3450 
3451  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3452  Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3454  getVector (const size_t j) const
3455  {
3457  return Teuchos::rcp (new V (*this, j));
3458  }
3459 
3460 
3461  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3462  Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3464  getVectorNonConst (const size_t j)
3465  {
3467  return Teuchos::rcp (new V (*this, j));
3468  }
3469 
3470 
3471  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3472  void
3474  get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3475  {
3476  using dev_view_type = typename dual_view_type::t_dev;
3477  using host_view_type = typename dual_view_type::t_host;
3478  using IST = impl_scalar_type;
3479  using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3480  Kokkos::HostSpace,
3481  Kokkos::MemoryUnmanaged>;
3482  const char tfecfFuncName[] = "get1dCopy: ";
3483 
3484  const size_t numRows = this->getLocalLength ();
3485  const size_t numCols = this->getNumVectors ();
3486 
3487  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3488  (LDA < numRows, std::runtime_error,
3489  "LDA = " << LDA << " < numRows = " << numRows << ".");
3490  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3491  (numRows > size_t (0) && numCols > size_t (0) &&
3492  size_t (A.size ()) < LDA * (numCols - 1) + numRows,
3493  std::runtime_error,
3494  "A.size() = " << A.size () << ", but its size must be at least "
3495  << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3496 
3497  const std::pair<size_t, size_t> rowRange (0, numRows);
3498  const std::pair<size_t, size_t> colRange (0, numCols);
3499 
3500  input_view_type A_view_orig (reinterpret_cast<IST*> (A.getRawPtr ()),
3501  LDA, numCols);
3502  auto A_view = Kokkos::subview (A_view_orig, rowRange, colRange);
3503 
3504  // Use the most recently updated version of this MultiVector's
3505  // data. This avoids sync'ing, which could violate users'
3506  // expectations.
3507  //
3508  // If we need sync to device, then host has the most recent version.
3509  const bool useHostVersion = this->need_sync_device ();
3510 
3511  dev_view_type srcView_dev;
3512  host_view_type srcView_host;
3513  if (useHostVersion) {
3514  srcView_host = this->getLocalViewHost ();
3515  }
3516  else {
3517  srcView_dev = this->getLocalViewDevice ();
3518  }
3519 
3520  if (this->isConstantStride ()) {
3521  if (useHostVersion) {
3522  Kokkos::deep_copy (A_view, srcView_host);
3523  }
3524  else {
3525  Kokkos::deep_copy (A_view, srcView_dev);
3526  }
3527  }
3528  else {
3529  for (size_t j = 0; j < numCols; ++j) {
3530  const size_t srcCol = this->whichVectors_[j];
3531  auto dstColView = Kokkos::subview (A_view, rowRange, j);
3532 
3533  if (useHostVersion) {
3534  auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
3535  Kokkos::deep_copy (dstColView, srcColView_host);
3536  }
3537  else {
3538  auto srcColView_dev = Kokkos::subview (srcView_dev, rowRange, srcCol);
3539  Kokkos::deep_copy (dstColView, srcColView_dev);
3540  }
3541  }
3542  }
3543  }
3544 
3545 
3546  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3547  void
3549  get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3550  {
3552  const char tfecfFuncName[] = "get2dCopy: ";
3553  const size_t numRows = this->getLocalLength ();
3554  const size_t numCols = this->getNumVectors ();
3555 
3556  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3557  static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3558  std::runtime_error, "Input array of pointers must contain as many "
3559  "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3560  << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3561 
3562  if (numRows != 0 && numCols != 0) {
3563  // No side effects until we've validated the input.
3564  for (size_t j = 0; j < numCols; ++j) {
3565  const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3566  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3567  dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3568  "the input array of arrays is not long enough to fit all entries in "
3569  "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3570  << " < getLocalLength() = " << numRows << ".");
3571  }
3572 
3573  // We've validated the input, so it's safe to start copying.
3574  for (size_t j = 0; j < numCols; ++j) {
3575  Teuchos::RCP<const V> X_j = this->getVector (j);
3576  const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3577  X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3578  }
3579  }
3580  }
3581 
3582  namespace { // (anonymous)
3583  template <class SC, class LO, class GO, class NT>
3585  syncMVToHostIfNeededAndGetHostView (MultiVector<SC, LO, GO, NT>& X,
3586  const bool markModified)
3587  {
3588  // NOTE (mfh 16 May 2016) get?dView() and get?dViewNonConst()
3589  // (replace ? with 1 or 2) have always been device->host
3590  // synchronization points, since <= 2012. We retain this
3591  // behavior for backwards compatibility.
3592  if (X.need_sync_host ()) {
3593  X.sync_host ();
3594  }
3595  if (markModified) {
3596  X.modify_host ();
3597  }
3598  return X.getLocalViewHost ();
3599  }
3600  } // namespace (anonymous)
3601 
3602 
3603  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3604  Teuchos::ArrayRCP<const Scalar>
3606  get1dView () const
3607  {
3608  if (getLocalLength () == 0 || getNumVectors () == 0) {
3609  return Teuchos::null;
3610  } else {
3611  TEUCHOS_TEST_FOR_EXCEPTION(
3612  ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3613  "get1dView: This MultiVector does not have constant stride, so it is "
3614  "not possible to view its data as a single array. You may check "
3615  "whether a MultiVector has constant stride by calling "
3616  "isConstantStride().");
3617  // Since get1dView() is and was always marked const, I have to
3618  // cast away const here in order not to break backwards
3619  // compatibility.
3620  constexpr bool markModified = false;
3622  auto X_lcl = syncMVToHostIfNeededAndGetHostView (const_cast<MV&> (*this),
3623  markModified);
3624  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3625  Kokkos::Compat::persistingView (X_lcl);
3626  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3627  }
3628  }
3629 
3630  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3631  Teuchos::ArrayRCP<Scalar>
3634  {
3635  if (this->getLocalLength () == 0 || this->getNumVectors () == 0) {
3636  return Teuchos::null;
3637  }
3638  else {
3639  TEUCHOS_TEST_FOR_EXCEPTION
3640  (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3641  "get1dViewNonConst: This MultiVector does not have constant stride, "
3642  "so it is not possible to view its data as a single array. You may "
3643  "check whether a MultiVector has constant stride by calling "
3644  "isConstantStride().");
3645  constexpr bool markModified = true;
3646  auto X_lcl = syncMVToHostIfNeededAndGetHostView (*this, markModified);
3647  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3648  Kokkos::Compat::persistingView (X_lcl);
3649  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3650  }
3651  }
3652 
3653  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3654  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3657  {
3658  constexpr bool markModified = true;
3659  auto X_lcl = syncMVToHostIfNeededAndGetHostView (*this, markModified);
3660 
3661  // Don't use the row range here on the outside, in order to avoid
3662  // a strided return type (in case Kokkos::subview is conservative
3663  // about that). Instead, use the row range for the column views
3664  // in the loop.
3665  const size_t myNumRows = this->getLocalLength ();
3666  const size_t numCols = this->getNumVectors ();
3667  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3668 
3669  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
3670  for (size_t j = 0; j < numCols; ++j) {
3671  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3672  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3673  Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3674  Kokkos::Compat::persistingView (X_lcl_j);
3675  views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
3676  }
3677  return views;
3678  }
3679 
3680  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3681  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
3683  get2dView () const
3684  {
3685  // Since get2dView() is and was always marked const, I have to
3686  // cast away const here in order not to break backwards
3687  // compatibility.
3688  constexpr bool markModified = false;
3690  auto X_lcl = syncMVToHostIfNeededAndGetHostView (const_cast<MV&> (*this),
3691  markModified);
3692  // Don't use the row range here on the outside, in order to avoid
3693  // a strided return type (in case Kokkos::subview is conservative
3694  // about that). Instead, use the row range for the column views
3695  // in the loop.
3696  const size_t myNumRows = this->getLocalLength ();
3697  const size_t numCols = this->getNumVectors ();
3698  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3699 
3700  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
3701  for (size_t j = 0; j < numCols; ++j) {
3702  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3703  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3704  Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
3705  Kokkos::Compat::persistingView (X_lcl_j);
3706  views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
3707  }
3708  return views;
3709  }
3710 
3711  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3712  void
3714  multiply (Teuchos::ETransp transA,
3715  Teuchos::ETransp transB,
3716  const Scalar& alpha,
3719  const Scalar& beta)
3720  {
3721  using ::Tpetra::Details::ProfilingRegion;
3722  using Teuchos::CONJ_TRANS;
3723  using Teuchos::NO_TRANS;
3724  using Teuchos::TRANS;
3725  using Teuchos::RCP;
3726  using Teuchos::rcp;
3727  using Teuchos::rcpFromRef;
3728  using std::endl;
3729  using ATS = Kokkos::ArithTraits<impl_scalar_type>;
3730  using LO = local_ordinal_type;
3731  using STS = Teuchos::ScalarTraits<Scalar>;
3733  const char tfecfFuncName[] = "multiply: ";
3734  ProfilingRegion region ("Tpetra::MV::multiply");
3735 
3736  // This routine performs a variety of matrix-matrix multiply
3737  // operations, interpreting the MultiVector (this-aka C , A and B)
3738  // as 2D matrices. Variations are due to the fact that A, B and C
3739  // can be locally replicated or globally distributed MultiVectors
3740  // and that we may or may not operate with the transpose of A and
3741  // B. Possible cases are:
3742  //
3743  // Operations # Cases Notes
3744  // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
3745  // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
3746  // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
3747  //
3748  // The following operations are not meaningful for 1-D
3749  // distributions:
3750  //
3751  // u1) C(local) = A^T(distr) * B^T(distr) 1
3752  // u2) C(local) = A (distr) * B^X(distr) 2
3753  // u3) C(distr) = A^X(local) * B^X(local) 4
3754  // u4) C(distr) = A^X(local) * B^X(distr) 4
3755  // u5) C(distr) = A^T(distr) * B^X(local) 2
3756  // u6) C(local) = A^X(distr) * B^X(local) 4
3757  // u7) C(distr) = A^X(distr) * B^X(local) 4
3758  // u8) C(local) = A^X(local) * B^X(distr) 4
3759  //
3760  // Total number of cases: 32 (= 2^5).
3761 
3762  impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
3763 
3764  const bool A_is_local = ! A.isDistributed ();
3765  const bool B_is_local = ! B.isDistributed ();
3766  const bool C_is_local = ! this->isDistributed ();
3767 
3768  // In debug mode, check compatibility of local dimensions. We
3769  // only do this in debug mode, since it requires an all-reduce
3770  // to ensure correctness on all processses. It's entirely
3771  // possible that only some processes may have incompatible local
3772  // dimensions. Throwing an exception only on those processes
3773  // could cause this method to hang.
3774  const bool debug = ::Tpetra::Details::Behavior::debug ();
3775  if (debug) {
3776  auto myMap = this->getMap ();
3777  if (! myMap.is_null () && ! myMap->getComm ().is_null ()) {
3778  using Teuchos::REDUCE_MIN;
3779  using Teuchos::reduceAll;
3780  using Teuchos::outArg;
3781 
3782  auto comm = myMap->getComm ();
3783  const size_t A_nrows =
3784  (transA != NO_TRANS) ? A.getNumVectors () : A.getLocalLength ();
3785  const size_t A_ncols =
3786  (transA != NO_TRANS) ? A.getLocalLength () : A.getNumVectors ();
3787  const size_t B_nrows =
3788  (transB != NO_TRANS) ? B.getNumVectors () : B.getLocalLength ();
3789  const size_t B_ncols =
3790  (transB != NO_TRANS) ? B.getLocalLength () : B.getNumVectors ();
3791 
3792  const bool lclBad = this->getLocalLength () != A_nrows ||
3793  this->getNumVectors () != B_ncols || A_ncols != B_nrows;
3794 
3795  const int myRank = comm->getRank ();
3796  std::ostringstream errStrm;
3797  if (this->getLocalLength () != A_nrows) {
3798  errStrm << "Proc " << myRank << ": this->getLocalLength()="
3799  << this->getLocalLength () << " != A_nrows=" << A_nrows
3800  << "." << std::endl;
3801  }
3802  if (this->getNumVectors () != B_ncols) {
3803  errStrm << "Proc " << myRank << ": this->getNumVectors()="
3804  << this->getNumVectors () << " != B_ncols=" << B_ncols
3805  << "." << std::endl;
3806  }
3807  if (A_ncols != B_nrows) {
3808  errStrm << "Proc " << myRank << ": A_ncols="
3809  << A_ncols << " != B_nrows=" << B_nrows
3810  << "." << std::endl;
3811  }
3812 
3813  const int lclGood = lclBad ? 0 : 1;
3814  int gblGood = 0;
3815  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3816  if (gblGood != 1) {
3817  std::ostringstream os;
3818  ::Tpetra::Details::gathervPrint (os, errStrm.str (), *comm);
3819 
3820  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3821  (true, std::runtime_error, "Inconsistent local dimensions on at "
3822  "least one process in this object's communicator." << std::endl
3823  << "Operation: "
3824  << "C(" << (C_is_local ? "local" : "distr") << ") = "
3825  << alpha << "*A"
3826  << (transA == Teuchos::TRANS ? "^T" :
3827  (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3828  << "(" << (A_is_local ? "local" : "distr") << ") * "
3829  << beta << "*B"
3830  << (transA == Teuchos::TRANS ? "^T" :
3831  (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3832  << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
3833  << "Global dimensions: C(" << this->getGlobalLength () << ", "
3834  << this->getNumVectors () << "), A(" << A.getGlobalLength ()
3835  << ", " << A.getNumVectors () << "), B(" << B.getGlobalLength ()
3836  << ", " << B.getNumVectors () << ")." << std::endl
3837  << os.str ());
3838  }
3839  }
3840  }
3841 
3842  // Case 1: C(local) = A^X(local) * B^X(local)
3843  const bool Case1 = C_is_local && A_is_local && B_is_local;
3844  // Case 2: C(local) = A^T(distr) * B (distr)
3845  const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
3846  transA != NO_TRANS &&
3847  transB == NO_TRANS;
3848  // Case 3: C(distr) = A (distr) * B^X(local)
3849  const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
3850  transA == NO_TRANS;
3851 
3852  // Test that we are considering a meaningful case
3853  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3854  (! Case1 && ! Case2 && ! Case3, std::runtime_error,
3855  "Multiplication of op(A) and op(B) into *this is not a "
3856  "supported use case.");
3857 
3858  if (beta != STS::zero () && Case2) {
3859  // If Case2, then C is local and contributions must be summed
3860  // across all processes. However, if beta != 0, then accumulate
3861  // beta*C into the sum. When summing across all processes, we
3862  // only want to accumulate this once, so set beta == 0 on all
3863  // processes except Process 0.
3864  const int myRank = this->getMap ()->getComm ()->getRank ();
3865  if (myRank != 0) {
3866  beta_local = ATS::zero ();
3867  }
3868  }
3869 
3870  // We only know how to do matrix-matrix multiplies if all the
3871  // MultiVectors have constant stride. If not, we have to make
3872  // temporary copies of those MultiVectors (including possibly
3873  // *this) that don't have constant stride.
3874  RCP<MV> C_tmp;
3875  if (! isConstantStride ()) {
3876  C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
3877  } else {
3878  C_tmp = rcp (this, false);
3879  }
3880 
3881  RCP<const MV> A_tmp;
3882  if (! A.isConstantStride ()) {
3883  A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
3884  } else {
3885  A_tmp = rcpFromRef (A);
3886  }
3887 
3888  RCP<const MV> B_tmp;
3889  if (! B.isConstantStride ()) {
3890  B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
3891  } else {
3892  B_tmp = rcpFromRef (B);
3893  }
3894 
3895  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3896  (! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
3897  ! A_tmp->isConstantStride (), std::logic_error,
3898  "Failed to make temporary constant-stride copies of MultiVectors.");
3899 
3900  {
3901  if (A_tmp->need_sync_device ()) {
3902  const_cast<MV&> (*A_tmp).sync_device (); // const is a lie
3903  }
3904  const LO A_lclNumRows = A_tmp->getLocalLength ();
3905  const LO A_numVecs = A_tmp->getNumVectors ();
3906  auto A_lcl = A_tmp->getLocalViewDevice ();
3907  auto A_sub = Kokkos::subview (A_lcl,
3908  std::make_pair (LO (0), A_lclNumRows),
3909  std::make_pair (LO (0), A_numVecs));
3910 
3911  if (B_tmp->need_sync_device ()) {
3912  const_cast<MV&> (*B_tmp).sync_device (); // const is a lie
3913  }
3914  const LO B_lclNumRows = B_tmp->getLocalLength ();
3915  const LO B_numVecs = B_tmp->getNumVectors ();
3916  auto B_lcl = B_tmp->getLocalViewDevice ();
3917  auto B_sub = Kokkos::subview (B_lcl,
3918  std::make_pair (LO (0), B_lclNumRows),
3919  std::make_pair (LO (0), B_numVecs));
3920 
3921  if (C_tmp->need_sync_device ()) {
3922  const_cast<MV&> (*C_tmp).sync_device (); // const is a lie
3923  }
3924  const LO C_lclNumRows = C_tmp->getLocalLength ();
3925  const LO C_numVecs = C_tmp->getNumVectors ();
3926  auto C_lcl = C_tmp->getLocalViewDevice ();
3927  auto C_sub = Kokkos::subview (C_lcl,
3928  std::make_pair (LO (0), C_lclNumRows),
3929  std::make_pair (LO (0), C_numVecs));
3930  const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' :
3931  (transA == Teuchos::TRANS ? 'T' : 'C'));
3932  const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' :
3933  (transB == Teuchos::TRANS ? 'T' : 'C'));
3934  const impl_scalar_type alpha_IST (alpha);
3935 
3936  ProfilingRegion regionGemm ("Tpetra::MV::multiply-call-gemm");
3937 
3938  this->modify_device ();
3939 
3940  KokkosBlas::gemm (&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
3941  beta_local, C_sub);
3942  }
3943 
3944  if (! isConstantStride ()) {
3945  ::Tpetra::deep_copy (*this, *C_tmp); // Copy the result back into *this.
3946  }
3947 
3948  // Dispose of (possibly) extra copies of A and B.
3949  A_tmp = Teuchos::null;
3950  B_tmp = Teuchos::null;
3951 
3952  // If Case 2 then sum up *this and distribute it to all processes.
3953  if (Case2) {
3954  Kokkos::fence();
3955  this->reduce ();
3956  }
3957  }
3958 
3959  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3960  void
3962  elementWiseMultiply (Scalar scalarAB,
3965  Scalar scalarThis)
3966  {
3967  using Kokkos::ALL;
3968  using Kokkos::subview;
3971  const char tfecfFuncName[] = "elementWiseMultiply: ";
3972 
3973  const size_t lclNumRows = this->getLocalLength ();
3974  const size_t numVecs = this->getNumVectors ();
3975 
3976  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3977  (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
3978  std::runtime_error, "MultiVectors do not have the same local length.");
3979  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3980  numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
3981  "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
3982  << ".");
3983 
3984  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
3985  if (this->need_sync_device ()) {
3986  this->sync_device ();
3987  }
3988  if (A.need_sync_device ()) {
3989  const_cast<V&>(A).sync_device ();
3990  }
3991  if (B.need_sync_device ()) {
3992  const_cast<MV&>(B).sync_device ();
3993  }
3994  this->modify_device ();
3995 
3996  auto this_view = this->getLocalViewDevice ();
3997  auto A_view = A.getLocalViewDevice ();
3998  auto B_view = B.getLocalViewDevice ();
3999 
4000  if (isConstantStride () && B.isConstantStride ()) {
4001  // A is just a Vector; it only has one column, so it always has
4002  // constant stride.
4003  //
4004  // If both *this and B have constant stride, we can do an
4005  // element-wise multiply on all columns at once.
4006  KokkosBlas::mult (scalarThis,
4007  this_view,
4008  scalarAB,
4009  subview (A_view, ALL (), 0),
4010  B_view);
4011  }
4012  else {
4013  for (size_t j = 0; j < numVecs; ++j) {
4014  const size_t C_col = isConstantStride () ? j : whichVectors_[j];
4015  const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
4016  KokkosBlas::mult (scalarThis,
4017  subview (this_view, ALL (), C_col),
4018  scalarAB,
4019  subview (A_view, ALL (), 0),
4020  subview (B_view, ALL (), B_col));
4021  }
4022  }
4023  }
4024 
4025  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4026  void
4029  {
4031  using ::Tpetra::Details::ProfilingRegion;
4032  ProfilingRegion region ("Tpetra::MV::reduce");
4033 
4034  const auto map = this->getMap ();
4035  if (map.get () == nullptr) {
4036  return;
4037  }
4038  const auto comm = map->getComm ();
4039  if (comm.get () == nullptr) {
4040  return;
4041  }
4042 
4043  // Avoid giving device buffers to MPI. Even if MPI handles them
4044  // correctly, doing so may not perform well.
4045  const bool changed_on_device = this->need_sync_host ();
4046  if (changed_on_device) {
4047  // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4048  // and host will be separate allocations. In that case, it may
4049  // pay to do the all-reduce from device to host.
4050  this->modify_device ();
4051  auto X_lcl = this->getLocalViewDevice ();
4052  allReduceView (X_lcl, X_lcl, *comm);
4053  }
4054  else {
4055  this->modify_host ();
4056  auto X_lcl = this->getLocalViewHost ();
4057  allReduceView (X_lcl, X_lcl, *comm);
4058  }
4059  }
4060 
4061  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4062  void
4064  replaceLocalValue (const LocalOrdinal lclRow,
4065  const size_t col,
4066  const impl_scalar_type& ScalarValue) const
4067  {
4068 #ifdef HAVE_TPETRA_DEBUG
4069  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4070  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4071  TEUCHOS_TEST_FOR_EXCEPTION(
4072  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4073  std::runtime_error,
4074  "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4075  << " is invalid. The range of valid row indices on this process "
4076  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4077  << ", " << maxLocalIndex << "].");
4078  TEUCHOS_TEST_FOR_EXCEPTION(
4079  vectorIndexOutOfRange(col),
4080  std::runtime_error,
4081  "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4082  << " of the multivector is invalid.");
4083 #endif
4084  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4085  view_.h_view (lclRow, colInd) = ScalarValue;
4086  }
4087 
4088 
4089  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4090  void
4092  sumIntoLocalValue (const LocalOrdinal lclRow,
4093  const size_t col,
4094  const impl_scalar_type& value,
4095  const bool atomic) const
4096  {
4097 #ifdef HAVE_TPETRA_DEBUG
4098  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4099  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4100  TEUCHOS_TEST_FOR_EXCEPTION(
4101  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4102  std::runtime_error,
4103  "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4104  << " is invalid. The range of valid row indices on this process "
4105  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4106  << ", " << maxLocalIndex << "].");
4107  TEUCHOS_TEST_FOR_EXCEPTION(
4108  vectorIndexOutOfRange(col),
4109  std::runtime_error,
4110  "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4111  << " of the multivector is invalid.");
4112 #endif
4113  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4114  if (atomic) {
4115  Kokkos::atomic_add (& (view_.h_view(lclRow, colInd)), value);
4116  }
4117  else {
4118  view_.h_view (lclRow, colInd) += value;
4119  }
4120  }
4121 
4122 
4123  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4124  void
4126  replaceGlobalValue (const GlobalOrdinal gblRow,
4127  const size_t col,
4128  const impl_scalar_type& ScalarValue) const
4129  {
4130  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4131  // touches the RCP's reference count, which isn't thread safe.
4132  const LocalOrdinal lclRow = this->map_->getLocalElement (gblRow);
4133 #ifdef HAVE_TPETRA_DEBUG
4134  const char tfecfFuncName[] = "replaceGlobalValue: ";
4135  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4136  (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4137  std::runtime_error,
4138  "Global row index " << gblRow << " is not present on this process "
4139  << this->getMap ()->getComm ()->getRank () << ".");
4140  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4141  (this->vectorIndexOutOfRange (col), std::runtime_error,
4142  "Vector index " << col << " of the MultiVector is invalid.");
4143 #endif // HAVE_TPETRA_DEBUG
4144  this->replaceLocalValue (lclRow, col, ScalarValue);
4145  }
4146 
4147  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4148  void
4150  sumIntoGlobalValue (const GlobalOrdinal globalRow,
4151  const size_t col,
4152  const impl_scalar_type& value,
4153  const bool atomic) const
4154  {
4155  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4156  // touches the RCP's reference count, which isn't thread safe.
4157  const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4158 #ifdef HAVE_TEUCHOS_DEBUG
4159  TEUCHOS_TEST_FOR_EXCEPTION(
4160  lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4161  std::runtime_error,
4162  "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4163  << " is not present on this process "
4164  << this->getMap ()->getComm ()->getRank () << ".");
4165  TEUCHOS_TEST_FOR_EXCEPTION(
4166  vectorIndexOutOfRange(col),
4167  std::runtime_error,
4168  "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4169  << " of the multivector is invalid.");
4170 #endif
4171  this->sumIntoLocalValue (lclRow, col, value, atomic);
4172  }
4173 
4174  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4175  template <class T>
4176  Teuchos::ArrayRCP<T>
4178  getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
4179  size_t j) const
4180  {
4181  typedef Kokkos::DualView<impl_scalar_type*,
4182  typename dual_view_type::array_layout,
4183  execution_space> col_dual_view_type;
4184  const size_t col = isConstantStride () ? j : whichVectors_[j];
4185  col_dual_view_type X_col =
4186  Kokkos::subview (view_, Kokkos::ALL (), col);
4187  return Kokkos::Compat::persistingView (X_col.d_view);
4188  }
4189 
4190 
4191  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4192  void
4195  view_.clear_sync_state ();
4196  }
4197 
4198  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4199  void
4202  view_.sync_host ();
4203 
4204  // This fence was motivated by the following specific situation:
4205  // For transform Y to X:
4206  // Y.putScalar() // acts on device
4207  // Y.sync_host() // now need_sync_host() and need_sync_device() are false
4208  // transform (on device)
4209  // Y.sync_host() // no modifications so no fence - this usually will be a fence
4210  // read Y // crashes
4211  // The expectation is that Tpetra developers would not normally be using sync_host
4212  // so this fence should not be an issue for internal performance.
4213  execution_space ().fence ();
4214  }
4215 
4216  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4217  void
4220  view_.sync_device ();
4221  }
4222 
4223  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4224  bool
4227  return view_.need_sync_host ();
4228  }
4229 
4230  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4231  bool
4234  return view_.need_sync_device ();
4235  }
4236 
4237  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4238  void
4241  view_.modify_device ();
4242  }
4243 
4244  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4245  void
4248  view_.modify_host ();
4249  }
4250 
4251  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4255  return view_.view_device ();
4256  }
4257 
4258  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4262  return view_.view_host ();
4263  }
4264 
4265  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4266  std::string
4268  descriptionImpl (const std::string& className) const
4269  {
4270  using Teuchos::TypeNameTraits;
4271 
4272  std::ostringstream out;
4273  out << "\"" << className << "\": {";
4274  out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
4275  << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
4276  << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
4277  << ", Node" << Node::name ()
4278  << "}, ";
4279  if (this->getObjectLabel () != "") {
4280  out << "Label: \"" << this->getObjectLabel () << "\", ";
4281  }
4282  out << ", numRows: " << this->getGlobalLength ();
4283  if (className != "Tpetra::Vector") {
4284  out << ", numCols: " << this->getNumVectors ()
4285  << ", isConstantStride: " << this->isConstantStride ();
4286  }
4287  if (this->isConstantStride ()) {
4288  out << ", columnStride: " << this->getStride ();
4289  }
4290  out << "}";
4291 
4292  return out.str ();
4293  }
4294 
4295  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4296  std::string
4299  {
4300  return this->descriptionImpl ("Tpetra::MultiVector");
4301  }
4302 
4303  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4304  std::string
4306  localDescribeToString (const Teuchos::EVerbosityLevel vl) const
4307  {
4308  typedef LocalOrdinal LO;
4309  using std::endl;
4310 
4311  if (vl <= Teuchos::VERB_LOW) {
4312  return std::string ();
4313  }
4314  auto map = this->getMap ();
4315  if (map.is_null ()) {
4316  return std::string ();
4317  }
4318  auto outStringP = Teuchos::rcp (new std::ostringstream ());
4319  auto outp = Teuchos::getFancyOStream (outStringP);
4320  Teuchos::FancyOStream& out = *outp;
4321  auto comm = map->getComm ();
4322  const int myRank = comm->getRank ();
4323  const int numProcs = comm->getSize ();
4324 
4325  out << "Process " << myRank << " of " << numProcs << ":" << endl;
4326  Teuchos::OSTab tab1 (out);
4327 
4328  // VERB_MEDIUM and higher prints getLocalLength()
4329  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
4330  out << "Local number of rows: " << lclNumRows << endl;
4331 
4332  if (vl > Teuchos::VERB_MEDIUM) {
4333  // VERB_HIGH and higher prints isConstantStride() and getStride().
4334  // The first is only relevant if the Vector has multiple columns.
4335  if (this->getNumVectors () != static_cast<size_t> (1)) {
4336  out << "isConstantStride: " << this->isConstantStride () << endl;
4337  }
4338  if (this->isConstantStride ()) {
4339  out << "Column stride: " << this->getStride () << endl;
4340  }
4341 
4342  if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4343  // VERB_EXTREME prints values. Get a host View of the
4344  // Vector's local data, so we can print it. (Don't assume
4345  // that we can access device data directly in host code.)
4346  typename dual_view_type::t_host X_host;
4347  if (need_sync_host ()) {
4348  // Device memory has the latest version. Don't actually
4349  // sync to host; that changes the Vector's state, and may
4350  // change future computations (that use the data's current
4351  // place to decide where to compute). Instead, create a
4352  // temporary host copy and print that.
4353  auto X_dev = getLocalViewDevice ();
4354  auto X_host_copy = Kokkos::create_mirror_view (X_dev);
4355  Kokkos::deep_copy (X_host_copy, X_dev);
4356  X_host = X_host_copy;
4357  }
4358  else {
4359  // Either host and device are in sync, or host has the
4360  // latest version of the Vector's data. Thus, we can use
4361  // the host version directly.
4362  X_host = getLocalViewHost ();
4363  }
4364  // The square braces [] and their contents are in Matlab
4365  // format, so users may copy and paste directly into Matlab.
4366  out << "Values: " << endl
4367  << "[";
4368  const LO numCols = static_cast<LO> (this->getNumVectors ());
4369  if (numCols == 1) {
4370  for (LO i = 0; i < lclNumRows; ++i) {
4371  out << X_host(i,0);
4372  if (i + 1 < lclNumRows) {
4373  out << "; ";
4374  }
4375  }
4376  }
4377  else {
4378  for (LO i = 0; i < lclNumRows; ++i) {
4379  for (LO j = 0; j < numCols; ++j) {
4380  out << X_host(i,j);
4381  if (j + 1 < numCols) {
4382  out << ", ";
4383  }
4384  }
4385  if (i + 1 < lclNumRows) {
4386  out << "; ";
4387  }
4388  }
4389  }
4390  out << "]" << endl;
4391  }
4392  }
4393 
4394  out.flush (); // make sure the ostringstream got everything
4395  return outStringP->str ();
4396  }
4397 
4398  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4399  void
4401  describeImpl (Teuchos::FancyOStream& out,
4402  const std::string& className,
4403  const Teuchos::EVerbosityLevel verbLevel) const
4404  {
4405  using Teuchos::TypeNameTraits;
4406  using Teuchos::VERB_DEFAULT;
4407  using Teuchos::VERB_NONE;
4408  using Teuchos::VERB_LOW;
4409  using std::endl;
4410  const Teuchos::EVerbosityLevel vl =
4411  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4412 
4413  if (vl == VERB_NONE) {
4414  return; // don't print anything
4415  }
4416  // If this Vector's Comm is null, then the Vector does not
4417  // participate in collective operations with the other processes.
4418  // In that case, it is not even legal to call this method. The
4419  // reasonable thing to do in that case is nothing.
4420  auto map = this->getMap ();
4421  if (map.is_null ()) {
4422  return;
4423  }
4424  auto comm = map->getComm ();
4425  if (comm.is_null ()) {
4426  return;
4427  }
4428 
4429  const int myRank = comm->getRank ();
4430 
4431  // Only Process 0 should touch the output stream, but this method
4432  // in general may need to do communication. Thus, we may need to
4433  // preserve the current tab level across multiple "if (myRank ==
4434  // 0) { ... }" inner scopes. This is why we sometimes create
4435  // OSTab instances by pointer, instead of by value. We only need
4436  // to create them by pointer if the tab level must persist through
4437  // multiple inner scopes.
4438  Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4439 
4440  // VERB_LOW and higher prints the equivalent of description().
4441  if (myRank == 0) {
4442  tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4443  out << "\"" << className << "\":" << endl;
4444  tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4445  {
4446  out << "Template parameters:" << endl;
4447  Teuchos::OSTab tab2 (out);
4448  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4449  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4450  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4451  << "Node: " << Node::name () << endl;
4452  }
4453  if (this->getObjectLabel () != "") {
4454  out << "Label: \"" << this->getObjectLabel () << "\", ";
4455  }
4456  out << "Global number of rows: " << this->getGlobalLength () << endl;
4457  if (className != "Tpetra::Vector") {
4458  out << "Number of columns: " << this->getNumVectors () << endl;
4459  }
4460  // getStride() may differ on different processes, so it (and
4461  // isConstantStride()) properly belong to per-process data.
4462  }
4463 
4464  // This is collective over the Map's communicator.
4465  if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4466  const std::string lclStr = this->localDescribeToString (vl);
4467  ::Tpetra::Details::gathervPrint (out, lclStr, *comm);
4468  }
4469  }
4470 
4471  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4472  void
4474  describe (Teuchos::FancyOStream &out,
4475  const Teuchos::EVerbosityLevel verbLevel) const
4476  {
4477  this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4478  }
4479 
4480  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4481  void
4483  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4484  {
4485  replaceMap (newMap);
4486  }
4487 
4488  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4489  void
4492  {
4493  using ::Tpetra::Details::localDeepCopy;
4494  const char prefix[] = "Tpetra::MultiVector::assign: ";
4495 
4496  TEUCHOS_TEST_FOR_EXCEPTION
4497  (this->getGlobalLength () != src.getGlobalLength () ||
4498  this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
4499  prefix << "Global dimensions of the two Tpetra::MultiVector "
4500  "objects do not match. src has dimensions [" << src.getGlobalLength ()
4501  << "," << src.getNumVectors () << "], and *this has dimensions ["
4502  << this->getGlobalLength () << "," << this->getNumVectors () << "].");
4503  // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4504  TEUCHOS_TEST_FOR_EXCEPTION
4505  (this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
4506  prefix << "The local row counts of the two Tpetra::MultiVector "
4507  "objects do not match. src has " << src.getLocalLength () << " row(s) "
4508  << " and *this has " << this->getLocalLength () << " row(s).");
4509 
4510  // See #1510. We're writing to *this, so we don't care about its
4511  // contents in either memory space, and we don't want
4512  // DualView::modify to complain about "concurrent modification" of
4513  // host and device Views.
4514  this->clear_sync_state();
4515  this->modify_device ();
4516 
4517  // If need sync to device, then host has most recent version.
4518  const bool src_last_updated_on_host = src.need_sync_device ();
4519 
4520  if (src_last_updated_on_host) {
4521  localDeepCopy (this->getLocalViewDevice (),
4522  src.getLocalViewHost (),
4523  this->isConstantStride (),
4524  src.isConstantStride (),
4525  this->whichVectors_ (),
4526  src.whichVectors_ ());
4527  }
4528  else {
4529  localDeepCopy (this->getLocalViewDevice (),
4530  src.getLocalViewDevice (),
4531  this->isConstantStride (),
4532  src.isConstantStride (),
4533  this->whichVectors_ (),
4534  src.whichVectors_ ());
4535  }
4536  }
4537 
4538  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4539  bool
4542  {
4543  using ::Tpetra::Details::PackTraits;
4544  using ST = impl_scalar_type;
4545 
4546  const size_t l1 = this->getLocalLength();
4547  const size_t l2 = vec.getLocalLength();
4548  if ((l1!=l2) || (this->getNumVectors() != vec.getNumVectors())) {
4549  return false;
4550  }
4551  if (l1==0) {
4552  return true;
4553  }
4554 
4555  auto v1 = this->getLocalViewHost ();
4556  auto v2 = vec.getLocalViewHost ();
4557  if (PackTraits<ST>::packValueCount (v1(0,0)) !=
4558  PackTraits<ST>::packValueCount (v2(0,0))) {
4559  return false;
4560  }
4561 
4562  return true;
4563  }
4564 
4565  template <class ST, class LO, class GO, class NT>
4568  std::swap(mv.map_, this->map_);
4569  std::swap(mv.view_, this->view_);
4570  std::swap(mv.origView_, this->origView_);
4571  std::swap(mv.whichVectors_, this->whichVectors_);
4572  }
4573 
4574 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4575  template <class ST, class LO, class GO, class NT>
4576  void
4578  const Teuchos::SerialDenseMatrix<int, ST>& src)
4579  {
4580  using ::Tpetra::Details::localDeepCopy;
4581  using MV = MultiVector<ST, LO, GO, NT>;
4582  using IST = typename MV::impl_scalar_type;
4583  using input_view_type =
4584  Kokkos::View<const IST**, Kokkos::LayoutLeft,
4585  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4586  using pair_type = std::pair<LO, LO>;
4587 
4588  const LO numRows = static_cast<LO> (src.numRows ());
4589  const LO numCols = static_cast<LO> (src.numCols ());
4590 
4591  TEUCHOS_TEST_FOR_EXCEPTION
4592  (numRows != static_cast<LO> (dst.getLocalLength ()) ||
4593  numCols != static_cast<LO> (dst.getNumVectors ()),
4594  std::invalid_argument, "Tpetra::deep_copy: On Process "
4595  << dst.getMap ()->getComm ()->getRank () << ", dst is "
4596  << dst.getLocalLength () << " x " << dst.getNumVectors ()
4597  << ", but src is " << numRows << " x " << numCols << ".");
4598 
4599  const IST* src_raw = reinterpret_cast<const IST*> (src.values ());
4600  input_view_type src_orig (src_raw, src.stride (), numCols);
4601  input_view_type src_view =
4602  Kokkos::subview (src_orig, pair_type (0, numRows), Kokkos::ALL ());
4603 
4604  dst.clear_sync_state ();
4605  dst.modify_device ();
4606  constexpr bool src_isConstantStride = true;
4607  Teuchos::ArrayView<const size_t> srcWhichVectors (nullptr, 0);
4608  localDeepCopy (dst.getLocalViewDevice (),
4609  src_view,
4610  dst.isConstantStride (),
4611  src_isConstantStride,
4612  getMultiVectorWhichVectors (dst),
4613  srcWhichVectors);
4614  }
4615 
4616  template <class ST, class LO, class GO, class NT>
4617  void
4618  deep_copy (Teuchos::SerialDenseMatrix<int, ST>& dst,
4619  const MultiVector<ST, LO, GO, NT>& src)
4620  {
4621  using ::Tpetra::Details::localDeepCopy;
4622  using MV = MultiVector<ST, LO, GO, NT>;
4623  using IST = typename MV::impl_scalar_type;
4624  using output_view_type =
4625  Kokkos::View<IST**, Kokkos::LayoutLeft,
4626  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4627  using pair_type = std::pair<LO, LO>;
4628 
4629  const LO numRows = static_cast<LO> (dst.numRows ());
4630  const LO numCols = static_cast<LO> (dst.numCols ());
4631 
4632  TEUCHOS_TEST_FOR_EXCEPTION
4633  (numRows != static_cast<LO> (src.getLocalLength ()) ||
4634  numCols != static_cast<LO> (src.getNumVectors ()),
4635  std::invalid_argument, "Tpetra::deep_copy: On Process "
4636  << src.getMap ()->getComm ()->getRank () << ", src is "
4637  << src.getLocalLength () << " x " << src.getNumVectors ()
4638  << ", but dst is " << numRows << " x " << numCols << ".");
4639 
4640  IST* dst_raw = reinterpret_cast<IST*> (dst.values ());
4641  output_view_type dst_orig (dst_raw, dst.stride (), numCols);
4642  auto dst_view =
4643  Kokkos::subview (dst_orig, pair_type (0, numRows), Kokkos::ALL ());
4644 
4645  constexpr bool dst_isConstantStride = true;
4646  Teuchos::ArrayView<const size_t> dstWhichVectors (nullptr, 0);
4647 
4648  // Prefer the host version of src's data.
4649  if (src.need_sync_host ()) { // last modified on device
4650  localDeepCopy (dst_view,
4651  src.getLocalViewDevice (),
4652  dst_isConstantStride,
4653  src.isConstantStride (),
4654  dstWhichVectors,
4655  getMultiVectorWhichVectors (src));
4656  }
4657  else {
4658  localDeepCopy (dst_view,
4659  src.getLocalViewHost (),
4660  dst_isConstantStride,
4661  src.isConstantStride (),
4662  dstWhichVectors,
4663  getMultiVectorWhichVectors (src));
4664  }
4665  }
4666 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4667 
4668  template <class Scalar, class LO, class GO, class NT>
4669  Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4670  createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4671  size_t numVectors)
4672  {
4673  typedef MultiVector<Scalar, LO, GO, NT> MV;
4674  return Teuchos::rcp (new MV (map, numVectors));
4675  }
4676 
4677  template <class ST, class LO, class GO, class NT>
4678  MultiVector<ST, LO, GO, NT>
4680  {
4681  typedef MultiVector<ST, LO, GO, NT> MV;
4682  MV cpy (src.getMap (), src.getNumVectors (), false);
4683  cpy.assign (src);
4684  return cpy;
4685  }
4686 
4687 } // namespace Tpetra
4688 
4689 //
4690 // Explicit instantiation macro
4691 //
4692 // Must be expanded from within the Tpetra namespace!
4693 //
4694 
4695 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4696 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4697  template class MultiVector< SCALAR , LO , GO , NODE >; \
4698  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
4699  template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4700  template void deep_copy (MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4701  template void deep_copy (Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4702 
4703 #else
4704 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4705  template class MultiVector< SCALAR , LO , GO , NODE >; \
4706  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src);
4707 
4708 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4709 
4710 #endif // TPETRA_MULTIVECTOR_DEF_HPP
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector &quot;constructor&quot;: Create a MultiVector from a given Map.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subView(const Teuchos::Range1D &colRng) const
Return a const MultiVector with const views of selected columns.
dual_view_type::t_host getLocalViewHost() const
A local Kokkos::View of host memory.
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Print the object with the given verbosity level to a FancyOStream.
dual_view_type view_
The Kokkos::DualView containing the MultiVector&#39;s data.
Kokkos::DualView< size_t *, buffer_device_type > numExportPacketsPerLID_
Number of packets to send for each send operation.
Teuchos::ArrayRCP< Scalar > get1dViewNonConst()
Nonconst persisting (1-D) view of this multivector&#39;s local values.
void meanValue(const Teuchos::ArrayView< impl_scalar_type > &means) const
Compute mean (average) value of each column.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetView(const Teuchos::RCP< const map_type > &subMap, const size_t offset) const
Return a const view of a subset of rows.
void get2dCopy(const Teuchos::ArrayView< const Teuchos::ArrayView< Scalar > > &ArrayOfPtrs) const
Fill the given array with a copy of this multivector&#39;s local values.
Declaration and definition of Tpetra::Details::Blas::fill, an implementation detail of Tpetra::MultiV...
void clear_sync_state()
Clear &quot;modified&quot; flags on both host and device sides.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
typename device_type::execution_space execution_space
Type of the (new) Kokkos execution space.
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, execution_space > dual_view_type
Kokkos::DualView specialization used by this class.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
size_t getNumVectors() const
Number of columns in the multivector.
size_t getLocalLength() const
Local number of rows on the calling process.
Declaration of a function that prints strings from each process.
void replaceMap(const Teuchos::RCP< const map_type > &map)
Replace the underlying Map in place.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< Scalar > > get2dViewNonConst()
Return non-const persisting pointers to values.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
std::string localDescribeToString(const Teuchos::EVerbosityLevel vl) const
Print the calling process&#39; verbose describe() information to the returned string. ...
One or more distributed dense vectors.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subCopy(const Teuchos::Range1D &colRng) const
Return a MultiVector with copies of selected columns.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t multivectorKernelLocationThreshold()
the threshold for transitioning from device to host
void sumIntoGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value, const bool atomic=useAtomicUpdatesByDefault) const
Update (+=) a value in host memory, using global row index.
bool isDistributed() const
Whether this is a globally distributed object.
void get1dCopy(const Teuchos::ArrayView< Scalar > &A, const size_t LDA) const
Fill the given array with a copy of this multivector&#39;s local values.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
static bool debug()
Whether Tpetra is in debug mode.
bool need_sync_device() const
Whether this MultiVector needs synchronization to the device.
bool isSameSize(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &vec) const
size_t getOrigNumLocalCols() const
&quot;Original&quot; number of columns in the (local) data.
Teuchos::ArrayRCP< const Scalar > getData(size_t j) const
Const view of the local values in a particular vector of this multivector.
size_t global_size_t
Global size_t object.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Kokkos::DualView< size_t *, buffer_device_type > numImportPacketsPerLID_
Number of packets to receive for each receive operation.
Insert new values that don&#39;t currently exist.
Kokkos::DualView< packet_type *, buffer_device_type > imports_
Buffer into which packed data are imported (received from other processes).
Kokkos::DualView< packet_type *, buffer_device_type > exports_
Buffer from which packed data are exported (sent to other processes).
void sync_host()
Synchronize to Host.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
void multiply(Teuchos::ETransp transA, Teuchos::ETransp transB, const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Scalar &beta)
Matrix-matrix multiplication: this = beta*this + alpha*op(A)*op(B).
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
void update(const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta)
Update: this = beta*this + alpha*A.
void elementWiseMultiply(Scalar scalarAB, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarThis)
Multiply a Vector A elementwise by a MultiVector B.
Teuchos::RCP< Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVectorNonConst(const size_t j)
Return a Vector which is a nonconst view of column j.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &src)
Copy the contents of src into *this (deep copy).
static void allReduceView(const OutputViewType &output, const InputViewType &input, const Teuchos::Comm< int > &comm)
All-reduce from input Kokkos::View to output Kokkos::View.
dual_view_type origView_
The &quot;original view&quot; of the MultiVector&#39;s data.
std::string descriptionImpl(const std::string &className) const
Implementation of description() for this class, and its subclass Vector.
void normInf(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the infinity-norm of each vector (column), storing the result in a host View.
void replaceGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value) const
Replace value in host memory, using global row index.
typename Kokkos::Details::InnerProductSpaceTraits< impl_scalar_type >::dot_type dot_type
Type of an inner (&quot;dot&quot;) product result.
static bool verbose()
Whether Tpetra is in verbose mode.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
Teuchos::RCP< const map_type > map_
The Map over which this object is distributed.
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
Declaration of Tpetra::Details::isInterComm.
void norm2(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the two-norm of each vector (column), storing the result in a host View.
void dot(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Teuchos::ArrayView< dot_type > &dots) const
Compute the dot product of each corresponding pair of vectors (columns) in A and B.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subViewNonConst(const Teuchos::Range1D &colRng)
Return a MultiVector with views of selected columns.
virtual size_t constantNumberOfPackets() const
Number of packets to send per LID.
bool need_sync_host() const
Whether this MultiVector needs synchronization to the host.
void modify_device()
Mark data as modified on the device side.
Replace old value with maximum of magnitudes of old and new values.
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
Abstract base class for objects that can be the source of an Import or Export operation.
Teuchos::ArrayRCP< Scalar > getDataNonConst(size_t j)
View of the local values in a particular vector of this multivector.
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
Replace existing values with new values.
void swap(MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &mv)
Swap contents of mv with contents of *this.
void randomize()
Set all values in the multivector to pseudorandom numbers.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetViewNonConst(const Teuchos::RCP< const map_type > &subMap, const size_t offset)
Return a nonconst view of a subset of rows.
Teuchos::ArrayRCP< T > getSubArrayRCP(Teuchos::ArrayRCP< T > arr, size_t j) const
Persisting view of j-th column in the given ArrayRCP.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
Teuchos::RCP< const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVector(const size_t j) const
Return a Vector which is a const view of column j.
void modify_host()
Mark data as modified on the host side.
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
void replaceLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &value) const
Replace value in host memory, using local (row) index.
typename Kokkos::Details::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
void norm1(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the one-norm of each vector (column), storing the result in a host view.
A parallel distribution of indices over processes.
size_t getStride() const
Stride between columns in the multivector.
A distributed dense vector.
Stand-alone utility functions and macros.
virtual std::string description() const
A simple one-line description of this object.
void reduce()
Sum values of a locally replicated multivector across all processes.
void describeImpl(Teuchos::FancyOStream &out, const std::string &className, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Implementation of describe() for this class, and its subclass Vector.
typename map_type::local_ordinal_type local_ordinal_type
The type of local indices that this class uses.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
virtual bool checkSizes(const SrcDistObject &sourceObj)
Whether data redistribution between sourceObj and this object is legal.
dual_view_type::t_dev getLocalViewDevice() const
A local Kokkos::View of device memory.
void sumIntoLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &val, const bool atomic=useAtomicUpdatesByDefault) const
Update (+=) a value in host memory, using local row index.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< const Scalar > > get2dView() const
Return const persisting pointers to values.
Declaration and definition of Tpetra::Details::normImpl, which is an implementation detail of Tpetra:...
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
void reciprocal(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise reciprocal values of input Multi-vector in target, this(i,j) = 1/A(i,j).
Base class for distributed Tpetra objects that support data redistribution.
void localDeepCopy(const DstViewType &dst, const SrcViewType &src, const bool dstConstStride, const bool srcConstStride, const DstWhichVecsType &dstWhichVecs, const SrcWhichVecsType &srcWhichVecs)
Implementation of Tpetra::MultiVector deep copy of local data.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
void abs(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise absolute values of input Multi-vector in target: A = abs(this)
EWhichNorm
Input argument for normImpl() (which see).
Teuchos::ArrayRCP< const Scalar > get1dView() const
Const persisting (1-D) view of this multivector&#39;s local values.
size_t getOrigNumLocalRows() const
&quot;Original&quot; number of rows in the (local) data.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap)
Remove processes owning zero rows from the Map and their communicator.
All-reduce a 1-D or 2-D Kokkos::View.
Declaration and definition of Tpetra::Details::lclDot, an implementation detail of Tpetra::MultiVecto...
void sync_device()
Synchronize to Device.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.