Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_idot.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_DETAILS_IDOT_HPP
11 #define TPETRA_DETAILS_IDOT_HPP
12 
28 
29 #include "Tpetra_iallreduce.hpp"
30 #include "Tpetra_MultiVector.hpp"
31 #include "Tpetra_Vector.hpp"
32 #include "Teuchos_CommHelpers.hpp"
33 #include "KokkosBlas1_dot.hpp"
34 #include <stdexcept>
35 #include <sstream>
36 
37 namespace Tpetra {
38 namespace Details {
39 
40 // Temporary helper to get read-only view from multivector in requested space.
41 // TODO: when https://github.com/kokkos/kokkos/issues/3850 is resolved,
42 // remove this and just use templated getLocalView<Device>(ReadOnly).
43 template<typename MV>
44 struct GetReadOnly
45 {
46  using DevView = typename MV::dual_view_type::t_dev::const_type;
47  using HostView = typename MV::dual_view_type::t_host::const_type;
48 
49  template<typename exec_space>
50  static DevView get(const MV& x, typename std::enable_if<std::is_same<exec_space, typename MV::execution_space>::value>::type* = nullptr)
51  {
52  return x.getLocalViewDevice(Tpetra::Access::ReadOnly);
53  }
54 
55  template<typename exec_space>
56  static HostView get(const MV& x, typename std::enable_if<!std::is_same<exec_space, typename MV::execution_space>::value>::type* = nullptr)
57  {
58  return x.getLocalViewHost(Tpetra::Access::ReadOnly);
59  }
60 };
61 
63 
64 template<class MV, class ResultView, bool runOnDevice>
65 void idotLocal(const ResultView& localResult,
66  const MV& X,
67  const MV& Y)
68 {
69  using pair_type = Kokkos::pair<size_t, size_t>;
70  using exec_space = typename std::conditional<runOnDevice, typename MV::execution_space, Kokkos::DefaultHostExecutionSpace>::type;
71  //if the execution space can access localResult, use it directly. Otherwise need to make a copy.
72  static_assert(Kokkos::SpaceAccessibility<exec_space, typename ResultView::memory_space>::accessible,
73  "idotLocal: Execution space must be able to access localResult");
74  //If Dot executes on Serial, it requires the result to be HostSpace. If localResult is CudaUVMSpace,
75  //we can just treat it like HostSpace.
76  Kokkos::View<typename ResultView::data_type, typename exec_space::memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
77  localResultUnmanaged(localResult.data(), localResult.extent(0));
78  const size_t numRows = X.getLocalLength ();
79  const pair_type rowRange (0, numRows);
80  const size_t X_numVecs = X.getNumVectors ();
81  const size_t Y_numVecs = Y.getNumVectors ();
82  const size_t numVecs = X_numVecs > Y_numVecs ? X_numVecs : Y_numVecs;
83  // Check compatibility of number of columns; allow special cases of
84  // a multivector dot a single vector, or vice versa.
85  if (X_numVecs != Y_numVecs &&
86  X_numVecs != size_t (1) &&
87  Y_numVecs != size_t (1)) {
88  std::ostringstream os;
89  os << "Tpetra::idot: X.getNumVectors() = " << X_numVecs
90  << " != Y.getNumVectors() = " << Y_numVecs
91  << ", but neither is 1.";
92  throw std::invalid_argument (os.str ());
93  }
94  auto X_lcl = GetReadOnly<MV>::template get<exec_space>(X);
95  auto Y_lcl = GetReadOnly<MV>::template get<exec_space>(Y);
96  //Can the multivector dot kernel be used?
97  bool useMVDot = X.isConstantStride() && Y.isConstantStride() && X_numVecs == Y_numVecs;
98  if(useMVDot)
99  {
100  if (numVecs == size_t (1)) {
101  auto X_lcl_1d = Kokkos::subview (X_lcl, rowRange, 0);
102  auto Y_lcl_1d = Kokkos::subview (Y_lcl, rowRange, 0);
103  auto result_0d = Kokkos::subview (localResultUnmanaged, 0);
104  KokkosBlas::dot (result_0d, X_lcl_1d, Y_lcl_1d);
105  }
106  else {
107  auto X_lcl_2d = Kokkos::subview (X_lcl, rowRange, pair_type (0, X_numVecs));
108  auto Y_lcl_2d = Kokkos::subview (Y_lcl, rowRange, pair_type (0, Y_numVecs));
109  KokkosBlas::dot (localResultUnmanaged, X_lcl_2d, Y_lcl_2d);
110  }
111  }
112  else
113  {
114  auto XWhichVectors = Tpetra::getMultiVectorWhichVectors(X);
115  auto YWhichVectors = Tpetra::getMultiVectorWhichVectors(Y);
116  //Need to compute each dot individually, using 1D subviews of X_lcl and Y_lcl
117  for(size_t vec = 0; vec < numVecs; vec++) {
118  //Get the Vectors for each column of X and Y that will be dotted together.
119  //Note: "which vectors" is not populated for constant stride MVs (but it's the identity mapping)
120  size_t Xj = (numVecs == X_numVecs) ? vec : 0;
121  Xj = X.isConstantStride() ? Xj : XWhichVectors[Xj];
122  size_t Yj = (numVecs == Y_numVecs) ? vec : 0;
123  Yj = Y.isConstantStride() ? Yj : YWhichVectors[Yj];
124  auto Xcol = Kokkos::subview(X_lcl, rowRange, Xj);
125  auto Ycol = Kokkos::subview(Y_lcl, rowRange, Yj);
126 
127  //Compute the rank-1 dot product, and place the result in an element of localResult
128  KokkosBlas::dot(Kokkos::subview(localResultUnmanaged, vec), Xcol, Ycol);
129  }
130  }
131 }
132 
133 //Helper to avoid extra instantiations of KokkosBlas::dot and iallreduce.
134 template<typename MV, typename ResultView>
135 struct IdotHelper
136 {
137  using dot_type = typename MV::dot_type;
138 
139  //First version: use result directly
140  template<typename exec_space>
141  static std::shared_ptr< ::Tpetra::Details::CommRequest> run(
142  const ResultView& globalResult, const MV& X, const MV& Y,
143  typename std::enable_if<Kokkos::SpaceAccessibility<exec_space, typename ResultView::memory_space>::accessible>::type* = nullptr)
144 
145  {
146  constexpr bool runOnDevice = std::is_same<exec_space, typename MV::execution_space>::value;
147  idotLocal<MV, ResultView, runOnDevice>(globalResult, X, Y);
148  //Fence because we're giving result of device kernel to MPI
149  if(runOnDevice)
150  exec_space().fence();
151  auto comm = X.getMap()->getComm();
152  return iallreduce(globalResult, globalResult, ::Teuchos::REDUCE_SUM, *comm);
153  }
154 
155  //Second version: use a temporary local result, because exec_space can't write to globalResult
156  template<typename exec_space>
157  static std::shared_ptr< ::Tpetra::Details::CommRequest> run(
158  const ResultView& globalResult, const MV& X, const MV& Y,
159  typename std::enable_if<!Kokkos::SpaceAccessibility<exec_space, typename ResultView::memory_space>::accessible>::type* = nullptr)
160  {
161  constexpr bool runOnDevice = std::is_same<exec_space, typename MV::execution_space>::value;
162  Kokkos::View<dot_type*, typename exec_space::memory_space> localResult(Kokkos::ViewAllocateWithoutInitializing("idot:localResult"), X.getNumVectors());
163  idotLocal<MV, decltype(localResult), runOnDevice>(localResult, X, Y);
164  //Fence because we're giving result of device kernel to MPI
165  if(runOnDevice)
166  exec_space().fence();
167  auto comm = X.getMap()->getComm();
168  return iallreduce(localResult, globalResult, ::Teuchos::REDUCE_SUM, *comm);
169  }
170 
171 };
172 
173 
176 template<class MV, class ResultView>
177 std::shared_ptr< ::Tpetra::Details::CommRequest>
178 idotImpl(const ResultView& globalResult,
179  const MV& X,
180  const MV& Y)
181 {
182  static_assert(std::is_same<typename ResultView::non_const_value_type, typename MV::dot_type>::value,
183  "Tpetra::idot: result view's element type must match MV::dot_type");
184 
185  // Execution space to use for dot kernel(s) is whichever has access to up-to-date X.
186  if(X.need_sync_device())
187  {
188  //run on host.
189  return IdotHelper<MV, ResultView>::template run<Kokkos::DefaultHostExecutionSpace>(globalResult, X, Y);
190  }
191  else
192  {
193  //run on device.
194  return IdotHelper<MV, ResultView>::template run<typename MV::execution_space>(globalResult, X, Y);
195  }
196 }
197 } // namespace Details
198 
199 //
200 // SKIP DOWN TO HERE
201 //
202 
255 template<class SC, class LO, class GO, class NT>
256 std::shared_ptr< ::Tpetra::Details::CommRequest>
257 idot (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::dot_type* resultRaw,
258  const ::Tpetra::MultiVector<SC, LO, GO, NT>& X,
259  const ::Tpetra::MultiVector<SC, LO, GO, NT>& Y)
260 {
261  using dot_type = typename ::Tpetra::Vector<SC, LO, GO, NT>::dot_type;
262  const size_t X_numVecs = X.getNumVectors ();
263  const size_t Y_numVecs = Y.getNumVectors ();
264  const size_t numVecs = (X_numVecs > Y_numVecs) ? X_numVecs : Y_numVecs;
265  Kokkos::View<dot_type*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
266  resultView(resultRaw, numVecs);
267  return Details::idotImpl(resultView, X, Y);
268 }
269 
332 template<class SC, class LO, class GO, class NT>
333 std::shared_ptr< ::Tpetra::Details::CommRequest>
334 idot (const Kokkos::View<typename ::Tpetra::MultiVector<SC, LO, GO, NT>::dot_type*,
335  typename ::Tpetra::MultiVector<SC, LO, GO, NT>::device_type>& result,
336  const ::Tpetra::MultiVector<SC, LO, GO, NT>& X,
337  const ::Tpetra::MultiVector<SC, LO, GO, NT>& Y)
338 {
339  return Details::idotImpl(result, X, Y);
340 }
341 
383 template<class SC, class LO, class GO, class NT>
384 std::shared_ptr< ::Tpetra::Details::CommRequest>
385 idot (const Kokkos::View<typename ::Tpetra::Vector<SC, LO, GO, NT>::dot_type,
386  typename ::Tpetra::Vector<SC, LO, GO, NT>::device_type>& result,
387  const ::Tpetra::Vector<SC, LO, GO, NT>& X,
388  const ::Tpetra::Vector<SC, LO, GO, NT>& Y)
389 {
390  using dot_type = typename ::Tpetra::Vector<SC, LO, GO, NT>::dot_type;
391  using result_device_t = typename ::Tpetra::Vector<SC, LO, GO, NT>::device_type;
392  Kokkos::View<dot_type*, result_device_t, Kokkos::MemoryTraits<Kokkos::Unmanaged>> result1D(result.data(), 1);
393  return Details::idotImpl(result1D, X, Y);
394 }
395 
396 } // namespace Tpetra
397 
398 #endif // TPETRA_DETAILS_IDOT_HPP
std::shared_ptr< ::Tpetra::Details::CommRequest > idotImpl(const ResultView &globalResult, const MV &X, const MV &Y)
Internal (common) version of idot, a global dot product that uses a non-blocking MPI reduction...
void idotLocal(const ResultView &localResult, const MV &X, const MV &Y)
Compute dot product locally. Where the kernel runs controlled by runOnDevice.
Definition: Tpetra_idot.hpp:65
std::shared_ptr< ::Tpetra::Details::CommRequest > idot(typename::Tpetra::MultiVector< SC, LO, GO, NT >::dot_type *resultRaw, const ::Tpetra::MultiVector< SC, LO, GO, NT > &X, const ::Tpetra::MultiVector< SC, LO, GO, NT > &Y)
Nonblocking dot product, with either Tpetra::MultiVector or Tpetra::Vector inputs, and raw pointer or raw array output.
Declaration of Tpetra::iallreduce.