Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_Details_allReduceView.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DETAILS_ALLREDUCEVIEW_HPP
43 #define TPETRA_DETAILS_ALLREDUCEVIEW_HPP
44 
#include "Tpetra_Details_Behavior.hpp"
#include "Tpetra_Details_isInterComm.hpp"
#include "Kokkos_Core.hpp"
#include "Teuchos_CommHelpers.hpp"
#include <limits>
#include <type_traits>
51 
54 
55 namespace { // (anonymous)
56 
57 // helper for detecting views that have Cuda memory
58 // Technically, UVM should not be here, but it seems we
59 // treat UVM as CudaSpace, so I have left it. With Cuda >= 8 on Linux
60 // UVM is handled by page faults in the Kernel, so it should always
61 // be addressable.
// Compile-time test: does ViewType's memory live in a CUDA space?
//
// Technically UVM memory is host-addressable and should not need to be
// listed here, but existing code treats CudaUVMSpace like CudaSpace, so
// it is included.  (With CUDA >= 8 on Linux, UVM is handled by page
// faults in the kernel driver, so it should always be addressable.)
template<class ViewType>
struct view_uses_cuda_spaces {
#ifdef KOKKOS_ENABLE_CUDA
  static constexpr bool value =
    std::is_same<typename ViewType::memory_space,
                 Kokkos::CudaSpace>::value ||
    std::is_same<typename ViewType::memory_space,
                 Kokkos::CudaUVMSpace>::value;
#else
  static constexpr bool value = false;
#endif // KOKKOS_ENABLE_CUDA
};
72 
73 template<class ViewType>
74 struct MakeContiguousBuffer {
75  static constexpr bool is_contiguous_layout =
76  std::is_same<
77  typename ViewType::array_layout,
78  Kokkos::LayoutLeft>::value ||
79  std::is_same<
80  typename ViewType::array_layout,
81  Kokkos::LayoutRight>::value;
82  using contiguous_array_layout =
83  typename std::conditional<is_contiguous_layout,
84  typename ViewType::array_layout,
85  Kokkos::LayoutLeft>::type;
86  using contiguous_device_type =
87  typename std::conditional<
88  std::is_same<
89  typename ViewType::memory_space,
90  Kokkos::HostSpace>::value,
91  typename ViewType::device_type,
92  Kokkos::HostSpace::device_type>::type;
93  using contiguous_buffer_type =
94  Kokkos::View<typename ViewType::non_const_data_type,
95  contiguous_array_layout,
96  contiguous_device_type>;
97 
98  static contiguous_array_layout
99  makeLayout (const ViewType& view)
100  {
101  // NOTE (mfh 17 Mar 2019) This would be a good chance to use if
102  // constexpr, once we have C++17.
103  return contiguous_array_layout (view.extent (0), view.extent (1),
104  view.extent (2), view.extent (3),
105  view.extent (4), view.extent (5),
106  view.extent (6), view.extent (7));
107  }
108 
109  static contiguous_buffer_type
110  make (const ViewType& view)
111  {
112  using Kokkos::view_alloc;
113  using Kokkos::WithoutInitializing;
114  return contiguous_buffer_type
115  (view_alloc (view.label (), WithoutInitializing),
116  makeLayout (view));
117  }
118 };
119 
120 template<class ViewType>
121 typename MakeContiguousBuffer<ViewType>::contiguous_buffer_type
122 makeContiguousBuffer (const ViewType& view)
123 {
124  return MakeContiguousBuffer<ViewType>::make (view);
125 }
126 
127 template<class ValueType>
128 static void
129 allReduceRawContiguous (ValueType output[],
130  const ValueType input[],
131  const size_t count,
132  const Teuchos::Comm<int>& comm)
133 {
134  using Teuchos::outArg;
135  using Teuchos::REDUCE_SUM;
136  using Teuchos::reduceAll;
137  constexpr size_t max_int = size_t (std::numeric_limits<int>::max ());
138  TEUCHOS_ASSERT( count <= size_t (max_int) );
139  reduceAll<int, ValueType> (comm, REDUCE_SUM, static_cast<int> (count),
140  input, output);
141 }
142 
143 } // namespace (anonymous)
144 
145 namespace Tpetra {
146 namespace Details {
147 
151 template<class InputViewType, class OutputViewType>
152 static void
153 allReduceView (const OutputViewType& output,
154  const InputViewType& input,
155  const Teuchos::Comm<int>& comm)
156 {
157  // If all the right conditions hold, we may all-reduce directly from
158  // the input to the output. Here are the relevant conditions:
159  //
160  // - assumeMpiCanAccessBuffers: May we safely assume that MPI may
161  // read from the input View and write to the output View? (Just
162  // because MPI _can_, doesn't mean that doing so will be faster.)
163  // - Do input and output Views alias each other, and is the
164  // communicator an intercommunicator? (Intercommunicators do not
165  // permit collectives to alias input and output buffers.)
166  // - Is either View noncontiguous?
167  //
168  // If either View is noncontiguous, we could use MPI_Type_Vector to
169  // create a noncontiguous MPI_Datatype, instead of packing and
170  // unpacking to resp. from a contiguous temporary buffer. Since
171  // MPI_Allreduce requires that the input and output buffers both
172  // have the same MPI_Datatype, this optimization might only work if
173  // the MPI communicator is an intercommunicator. Furthermore,
174  // creating an MPI_Datatype instance may require memory allocation
175  // anyway. Thus, it's probably better just to use a temporary
176  // contiguous buffer. We use a host buffer for that, since device
177  // buffers are slow to allocate.
178 
179  const bool viewsAlias = output.data () == input.data ();
180  if (comm.getSize () == 1) {
181  if (! viewsAlias) {
182  // InputViewType and OutputViewType can't be AnonymousSpace
183  // Views, because deep_copy needs to know their memory spaces.
184  Kokkos::deep_copy (output, input);
185  }
186  return;
187  }
188 
189  // we must esnure MPI can handle the pointers we pass it
190  // if CudaAware, we are done
191  // otherwise, if the views use Cuda, then we should copy them
192  const bool mpiCannotAccessBuffers =
193  // if assumeMpiIsCudaAware, then we can access cuda buffers
195  && (
196  view_uses_cuda_spaces<OutputViewType>::value
197  ||
198  view_uses_cuda_spaces<InputViewType>::value
199  );
200 
201  const bool needContiguousTemporaryBuffers =
202  // we must alloc/copy if MPI cannot access the buffers
203  mpiCannotAccessBuffers ||
204  // If the comm is Inter and the views alias we must alloc/copy
205  (::Tpetra::Details::isInterComm (comm)
206  &&
207  viewsAlias) ||
208  // if either view is not contiguous then we must alloc/copy
209  ! output.span_is_contiguous () ||
210  ! input.span_is_contiguous ();
211 
212  if (needContiguousTemporaryBuffers) {
213  auto output_tmp = makeContiguousBuffer (output);
214  auto input_tmp = makeContiguousBuffer (input);
215  Kokkos::deep_copy (input_tmp, input);
216  // It's OK if LayoutLeft allocations have padding at the end of
217  // each row. MPI might write to those padding bytes, but it's
218  // undefined behavior for users to use Kokkos to access whatever
219  // bytes are there, and the bytes there don't need to define valid
220  // ValueType instances.
221  allReduceRawContiguous (output_tmp.data (), input_tmp.data (),
222  output_tmp.span (), comm);
223  Kokkos::deep_copy (output, output_tmp);
224  }
225  else {
226  allReduceRawContiguous (output.data (), input.data (),
227  output.span (), comm);
228  }
229 }
230 
231 } // namespace Details
232 } // namespace Tpetra
233 
234 #endif // TPETRA_DETAILS_ALLREDUCEVIEW_HPP
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
static void allReduceView(const OutputViewType &output, const InputViewType &input, const Teuchos::Comm< int > &comm)
All-reduce from input Kokkos::View to output Kokkos::View.
static bool assumeMpiIsCudaAware()
Whether to assume that MPI is CUDA aware.
Declaration of Tpetra::Details::isInterComm.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.