Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_idot.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_DETAILS_IDOT_HPP
11 #define TPETRA_DETAILS_IDOT_HPP
12 
28 
29 #include "Tpetra_iallreduce.hpp"
30 #include "Tpetra_MultiVector.hpp"
31 #include "Tpetra_Vector.hpp"
32 #include "Teuchos_CommHelpers.hpp"
33 #include "KokkosBlas1_dot.hpp"
34 #include <stdexcept>
35 #include <sstream>
36 
37 namespace Tpetra {
38 namespace Details {
39 
40 // Temporary helper to get read-only view from multivector in requested space.
41 // TODO: when https://github.com/kokkos/kokkos/issues/3850 is resolved,
42 // remove this and just use templated getLocalView<Device>(ReadOnly).
43 template <typename MV>
44 struct GetReadOnly {
45  using DevView = typename MV::dual_view_type::t_dev::const_type;
46  using HostView = typename MV::dual_view_type::t_host::const_type;
47 
48  template <typename exec_space>
49  static DevView get(const MV& x, typename std::enable_if<std::is_same<exec_space, typename MV::execution_space>::value>::type* = nullptr) {
50  return x.getLocalViewDevice(Tpetra::Access::ReadOnly);
51  }
52 
53  template <typename exec_space>
54  static HostView get(const MV& x, typename std::enable_if<!std::is_same<exec_space, typename MV::execution_space>::value>::type* = nullptr) {
55  return x.getLocalViewHost(Tpetra::Access::ReadOnly);
56  }
57 };
58 
60 
61 template <class MV, class ResultView, bool runOnDevice>
62 void idotLocal(const ResultView& localResult,
63  const MV& X,
64  const MV& Y) {
65  using pair_type = Kokkos::pair<size_t, size_t>;
66  using exec_space = typename std::conditional<runOnDevice, typename MV::execution_space, Kokkos::DefaultHostExecutionSpace>::type;
67  // if the execution space can access localResult, use it directly. Otherwise need to make a copy.
68  static_assert(Kokkos::SpaceAccessibility<exec_space, typename ResultView::memory_space>::accessible,
69  "idotLocal: Execution space must be able to access localResult");
70  // If Dot executes on Serial, it requires the result to be HostSpace. If localResult is CudaUVMSpace,
71  // we can just treat it like HostSpace.
72  Kokkos::View<typename ResultView::data_type, typename exec_space::memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
73  localResultUnmanaged(localResult.data(), localResult.extent(0));
74  const size_t numRows = X.getLocalLength();
75  const pair_type rowRange(0, numRows);
76  const size_t X_numVecs = X.getNumVectors();
77  const size_t Y_numVecs = Y.getNumVectors();
78  const size_t numVecs = X_numVecs > Y_numVecs ? X_numVecs : Y_numVecs;
79  // Check compatibility of number of columns; allow special cases of
80  // a multivector dot a single vector, or vice versa.
81  if (X_numVecs != Y_numVecs &&
82  X_numVecs != size_t(1) &&
83  Y_numVecs != size_t(1)) {
84  std::ostringstream os;
85  os << "Tpetra::idot: X.getNumVectors() = " << X_numVecs
86  << " != Y.getNumVectors() = " << Y_numVecs
87  << ", but neither is 1.";
88  throw std::invalid_argument(os.str());
89  }
90  auto X_lcl = GetReadOnly<MV>::template get<exec_space>(X);
91  auto Y_lcl = GetReadOnly<MV>::template get<exec_space>(Y);
92  // Can the multivector dot kernel be used?
93  bool useMVDot = X.isConstantStride() && Y.isConstantStride() && X_numVecs == Y_numVecs;
94  if (useMVDot) {
95  if (numVecs == size_t(1)) {
96  auto X_lcl_1d = Kokkos::subview(X_lcl, rowRange, 0);
97  auto Y_lcl_1d = Kokkos::subview(Y_lcl, rowRange, 0);
98  auto result_0d = Kokkos::subview(localResultUnmanaged, 0);
99  KokkosBlas::dot(result_0d, X_lcl_1d, Y_lcl_1d);
100  } else {
101  auto X_lcl_2d = Kokkos::subview(X_lcl, rowRange, pair_type(0, X_numVecs));
102  auto Y_lcl_2d = Kokkos::subview(Y_lcl, rowRange, pair_type(0, Y_numVecs));
103  KokkosBlas::dot(localResultUnmanaged, X_lcl_2d, Y_lcl_2d);
104  }
105  } else {
106  auto XWhichVectors = Tpetra::getMultiVectorWhichVectors(X);
107  auto YWhichVectors = Tpetra::getMultiVectorWhichVectors(Y);
108  // Need to compute each dot individually, using 1D subviews of X_lcl and Y_lcl
109  for (size_t vec = 0; vec < numVecs; vec++) {
110  // Get the Vectors for each column of X and Y that will be dotted together.
111  // Note: "which vectors" is not populated for constant stride MVs (but it's the identity mapping)
112  size_t Xj = (numVecs == X_numVecs) ? vec : 0;
113  Xj = X.isConstantStride() ? Xj : XWhichVectors[Xj];
114  size_t Yj = (numVecs == Y_numVecs) ? vec : 0;
115  Yj = Y.isConstantStride() ? Yj : YWhichVectors[Yj];
116  auto Xcol = Kokkos::subview(X_lcl, rowRange, Xj);
117  auto Ycol = Kokkos::subview(Y_lcl, rowRange, Yj);
118 
119  // Compute the rank-1 dot product, and place the result in an element of localResult
120  KokkosBlas::dot(Kokkos::subview(localResultUnmanaged, vec), Xcol, Ycol);
121  }
122  }
123 }
124 
125 // Helper to avoid extra instantiations of KokkosBlas::dot and iallreduce.
126 template <typename MV, typename ResultView>
127 struct IdotHelper {
128  using dot_type = typename MV::dot_type;
129 
130  // First version: use result directly
131  template <typename exec_space>
132  static std::shared_ptr<::Tpetra::Details::CommRequest> run(
133  const ResultView& globalResult, const MV& X, const MV& Y,
134  typename std::enable_if<Kokkos::SpaceAccessibility<exec_space, typename ResultView::memory_space>::accessible>::type* = nullptr)
135 
136  {
137  constexpr bool runOnDevice = std::is_same<exec_space, typename MV::execution_space>::value;
138  idotLocal<MV, ResultView, runOnDevice>(globalResult, X, Y);
139  // Fence because we're giving result of device kernel to MPI
140  if (runOnDevice)
141  exec_space().fence();
142  auto comm = X.getMap()->getComm();
143  return iallreduce(globalResult, globalResult, ::Teuchos::REDUCE_SUM, *comm);
144  }
145 
146  // Second version: use a temporary local result, because exec_space can't write to globalResult
147  template <typename exec_space>
148  static std::shared_ptr<::Tpetra::Details::CommRequest> run(
149  const ResultView& globalResult, const MV& X, const MV& Y,
150  typename std::enable_if<!Kokkos::SpaceAccessibility<exec_space, typename ResultView::memory_space>::accessible>::type* = nullptr) {
151  constexpr bool runOnDevice = std::is_same<exec_space, typename MV::execution_space>::value;
152  Kokkos::View<dot_type*, typename exec_space::memory_space> localResult(Kokkos::ViewAllocateWithoutInitializing("idot:localResult"), X.getNumVectors());
153  idotLocal<MV, decltype(localResult), runOnDevice>(localResult, X, Y);
154  // Fence because we're giving result of device kernel to MPI
155  if (runOnDevice)
156  exec_space().fence();
157  auto comm = X.getMap()->getComm();
158  return iallreduce(localResult, globalResult, ::Teuchos::REDUCE_SUM, *comm);
159  }
160 };
161 
164 template <class MV, class ResultView>
165 std::shared_ptr<::Tpetra::Details::CommRequest>
166 idotImpl(const ResultView& globalResult,
167  const MV& X,
168  const MV& Y) {
169  static_assert(std::is_same<typename ResultView::non_const_value_type, typename MV::dot_type>::value,
170  "Tpetra::idot: result view's element type must match MV::dot_type");
171 
172  // Execution space to use for dot kernel(s) is whichever has access to up-to-date X.
173  if (X.need_sync_device()) {
174  // run on host.
175  return IdotHelper<MV, ResultView>::template run<Kokkos::DefaultHostExecutionSpace>(globalResult, X, Y);
176  } else {
177  // run on device.
178  return IdotHelper<MV, ResultView>::template run<typename MV::execution_space>(globalResult, X, Y);
179  }
180 }
181 } // namespace Details
182 
183 //
184 // SKIP DOWN TO HERE
185 //
186 
239 template <class SC, class LO, class GO, class NT>
240 std::shared_ptr<::Tpetra::Details::CommRequest>
241 idot(typename ::Tpetra::MultiVector<SC, LO, GO, NT>::dot_type* resultRaw,
242  const ::Tpetra::MultiVector<SC, LO, GO, NT>& X,
243  const ::Tpetra::MultiVector<SC, LO, GO, NT>& Y) {
244  using dot_type = typename ::Tpetra::Vector<SC, LO, GO, NT>::dot_type;
245  const size_t X_numVecs = X.getNumVectors();
246  const size_t Y_numVecs = Y.getNumVectors();
247  const size_t numVecs = (X_numVecs > Y_numVecs) ? X_numVecs : Y_numVecs;
248  Kokkos::View<dot_type*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
249  resultView(resultRaw, numVecs);
250  return Details::idotImpl(resultView, X, Y);
251 }
252 
315 template <class SC, class LO, class GO, class NT>
316 std::shared_ptr<::Tpetra::Details::CommRequest>
317 idot(const Kokkos::View<typename ::Tpetra::MultiVector<SC, LO, GO, NT>::dot_type*,
318  typename ::Tpetra::MultiVector<SC, LO, GO, NT>::device_type>& result,
319  const ::Tpetra::MultiVector<SC, LO, GO, NT>& X,
320  const ::Tpetra::MultiVector<SC, LO, GO, NT>& Y) {
321  return Details::idotImpl(result, X, Y);
322 }
323 
365 template <class SC, class LO, class GO, class NT>
366 std::shared_ptr<::Tpetra::Details::CommRequest>
367 idot(const Kokkos::View<typename ::Tpetra::Vector<SC, LO, GO, NT>::dot_type,
368  typename ::Tpetra::Vector<SC, LO, GO, NT>::device_type>& result,
369  const ::Tpetra::Vector<SC, LO, GO, NT>& X,
370  const ::Tpetra::Vector<SC, LO, GO, NT>& Y) {
371  using dot_type = typename ::Tpetra::Vector<SC, LO, GO, NT>::dot_type;
372  using result_device_t = typename ::Tpetra::Vector<SC, LO, GO, NT>::device_type;
373  Kokkos::View<dot_type*, result_device_t, Kokkos::MemoryTraits<Kokkos::Unmanaged>> result1D(result.data(), 1);
374  return Details::idotImpl(result1D, X, Y);
375 }
376 
377 } // namespace Tpetra
378 
379 #endif // TPETRA_DETAILS_IDOT_HPP
std::shared_ptr<::Tpetra::Details::CommRequest > idotImpl(const ResultView &globalResult, const MV &X, const MV &Y)
Internal (common) version of idot, a global dot product that uses a non-blocking MPI reduction...
void idotLocal(const ResultView &localResult, const MV &X, const MV &Y)
Compute dot product locally. Where the kernel runs controlled by runOnDevice.
Definition: Tpetra_idot.hpp:62
std::shared_ptr<::Tpetra::Details::CommRequest > idot(typename::Tpetra::MultiVector< SC, LO, GO, NT >::dot_type *resultRaw, const ::Tpetra::MultiVector< SC, LO, GO, NT > &X, const ::Tpetra::MultiVector< SC, LO, GO, NT > &Y)
Nonblocking dot product, with either Tpetra::MultiVector or Tpetra::Vector inputs, and raw pointer or raw array output.
Declaration of Tpetra::iallreduce.