Tpetra_idot.hpp
// @HEADER
// ***********************************************************************
//
// Tpetra: Templated Linear Algebra Services Package
// Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
// @HEADER

#ifndef TPETRA_DETAILS_IDOT_HPP
#define TPETRA_DETAILS_IDOT_HPP

// Declarations of Tpetra::Details::iallreduce (and CommRequest) and
// Tpetra::Details::isInterComm, both of which are used below.
#include "Tpetra_Details_iallreduce.hpp"
#include "Tpetra_Details_isInterComm.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Tpetra_Vector.hpp"
#include "KokkosBlas1_dot.hpp"
#include <stdexcept>
#include <sstream>

namespace Tpetra {
namespace Details {

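/// \brief Compute the local (per-process) dot product(s), columnwise,
///   of X and Y, storing the result(s) in resultRaw.
///
/// resultRaw must point to storage for max(X.getNumVectors(),
/// Y.getNumVectors()) entries.  resultOnDevice tells this function
/// whether that storage is accessible from the device's memory space;
/// if not, the results are computed into a temporary buffer and copied
/// back at the end.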
template<class SC, class LO, class GO, class NT>
void
lclDotRaw (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::dot_type* const resultRaw,
           const ::Tpetra::MultiVector<SC, LO, GO, NT>& X,
           const ::Tpetra::MultiVector<SC, LO, GO, NT>& Y,
           const bool resultOnDevice)
{
  using ::Kokkos::Impl::MemorySpaceAccess;
  using ::Kokkos::HostSpace;
  using ::Kokkos::subview;
  using ::Kokkos::View;
  typedef ::Kokkos::pair<size_t, size_t> pair_type;
  typedef ::Tpetra::MultiVector<SC, LO, GO, NT> MV;
  typedef typename MV::dot_type dot_type;
  typedef typename MV::device_type device_type;
  typedef typename MV::dual_view_type::t_dev::memory_space dev_memory_space;
  typedef typename MV::dual_view_type::t_host::memory_space host_memory_space;
  typedef View<dot_type*, device_type> result_dev_view_type;
  typedef typename result_dev_view_type::HostMirror result_host_view_type;

  const size_t numRows = X.getLocalLength ();
  const pair_type rowRange (0, numRows);
  const size_t X_numVecs = X.getNumVectors ();
  const size_t Y_numVecs = Y.getNumVectors ();
  const size_t numVecs = X_numVecs > Y_numVecs ? X_numVecs : Y_numVecs;

  // Check compatibility of number of columns; allow special cases of
  // a multivector dot a single vector, or vice versa.
  if (X_numVecs != Y_numVecs &&
      X_numVecs != size_t (1) &&
      Y_numVecs != size_t (1)) {
    std::ostringstream os;
    os << "Tpetra::idot: X.getNumVectors() = " << X_numVecs
       << " != Y.getNumVectors() = " << Y_numVecs
       << ", but neither is 1.";
    throw std::invalid_argument (os.str ());
  }

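  // need_sync<Space>() returns true when the MultiVector's copy of its
  // data in Space is stale.  Hence "latest on host" below means that the
  // device copy is stale but the host copy is not.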
  const bool X_latestOnHost = X.template need_sync<dev_memory_space> () &&
    ! X.template need_sync<host_memory_space> ();
  const bool Y_latestOnHost = Y.template need_sync<dev_memory_space> () &&
    ! Y.template need_sync<host_memory_space> ();
  // Let X guide whether to execute on device or host.
  const bool runOnHost = X_latestOnHost;
  // Assume that we have to copy Y to where we need to run.  idot()
  // takes the input MultiVectors as const, so it can't sync them.  We
  // just have to copy their data, if needed, to the memory space
  // where we want to run.
  const bool Y_copyToHost = (X_latestOnHost && ! Y_latestOnHost);
  const bool Y_copyToDev = (! X_latestOnHost && Y_latestOnHost);

  result_dev_view_type result;
  result_host_view_type result_h;
  if (resultOnDevice) {
    result = result_dev_view_type (resultRaw, numVecs);
    if (runOnHost) {
      // create_mirror_view doesn't promise to create a deep copy, so we
      // can't rely on this case to avoid an extra buffer if the input
      // communicator is an intercomm.
      result_h = ::Kokkos::create_mirror_view (result);
    }
    // If running on device, not host, then we don't use result_h.
  }
  else {
    result_h = result_host_view_type (resultRaw, numVecs);
    if (! runOnHost) { // running on device, not host
      // We have to do a separate allocation here, since resultRaw is
      // not accessible from device.
      result = result_dev_view_type ("result", numVecs);
    }
    // If running on host, not device, then we don't use result.
  }

  const bool X_or_Y_have_nonconstant_stride =
    ! X.isConstantStride () || ! Y.isConstantStride ();
  if (X_or_Y_have_nonconstant_stride && numVecs > size_t (1)) {
    // The "nonconstant stride" case relates to the ability of a
    // Tpetra::MultiVector to view multiple, noncontiguous columns of
    // another Tpetra::MultiVector.  In this case, we can't just get
    // the Kokkos::View out of X and Y, since the Kokkos::View has all
    // the columns, not just the columns that X or Y view.  The
    // Kokkos::View of X or Y would view a contiguous subset of
    // columns of the latter, "parent" MultiVector, not of the former,
    // "child" MultiVector.  (Kokkos::View doesn't have a layout to
    // cover this case where consecutive entries within a column are
    // contiguous, but consecutive entries within a row may have
    // varying strides.)  Instead, we work one column at a time.
    for (size_t j = 0; j < numVecs; ++j) {
      // numVecs = max(X_numVecs, Y_numVecs).  Allow special case of
      // X_numVecs != Y_numVecs && (X_numVecs == 1 || Y_numVecs == 1).
      const size_t X_col = (X_numVecs == size_t (1)) ? size_t (0) : j;
      const size_t Y_col = (Y_numVecs == size_t (1)) ? size_t (0) : j;
      auto X_j = X.getVector (X_col);
      auto Y_j = Y.getVector (Y_col);

      if (runOnHost) {
        auto X_j_2d_h = X_j->template getLocalView<host_memory_space> ();
        auto X_j_1d_h = subview (X_j_2d_h, rowRange, 0);
        decltype (X_j_1d_h) Y_j_1d_h;

        if (Y_copyToHost) {
          auto Y_j_2d = Y_j->template getLocalView<dev_memory_space> ();
          auto Y_j_1d = subview (Y_j_2d, rowRange, 0);
          typename decltype (Y_j_1d_h)::non_const_type
            Y_j_1d_h_nc ("Y_j_1d_h", Y_j_1d.extent (0));
          deep_copy (Y_j_1d_h_nc, Y_j_1d);
          Y_j_1d_h = Y_j_1d_h_nc;
        }
        else { // Y already sync'd to host; just use its host view
          auto Y_j_2d_h = Y_j->template getLocalView<host_memory_space> ();
          Y_j_1d_h = subview (Y_j_2d_h, rowRange, 0);
        }
        auto result_h_j = subview (result_h, j);
        KokkosBlas::dot (result_h_j, X_j_1d_h, Y_j_1d_h);
      }
      else { // run on device
        auto X_j_2d = X_j->template getLocalView<dev_memory_space> ();
        auto X_j_1d = subview (X_j_2d, rowRange, 0);
        decltype (X_j_1d) Y_j_1d;

        if (Y_copyToDev) {
          auto Y_j_2d_h = Y_j->template getLocalView<host_memory_space> ();
          auto Y_j_1d_h = subview (Y_j_2d_h, rowRange, 0);
          typename decltype (Y_j_1d)::non_const_type
            Y_j_1d_nc ("Y_j_1d", Y_j_1d_h.extent (0));
          deep_copy (Y_j_1d_nc, Y_j_1d_h);
          Y_j_1d = Y_j_1d_nc;
        }
        else { // Y already sync'd to dev; just use its dev view
          auto Y_j_2d = Y_j->template getLocalView<dev_memory_space> ();
          Y_j_1d = subview (Y_j_2d, rowRange, 0);
        }
        auto result_j = subview (result, j);
        KokkosBlas::dot (result_j, X_j_1d, Y_j_1d);
      }
    } // for each column j of X and Y
  }
  else { // numVecs == 1 || ! X_or_Y_have_nonconstant_stride
    if (runOnHost) {
      auto X_lcl_h = X.template getLocalView<host_memory_space> ();
      decltype (X_lcl_h) Y_lcl_h;

      if (Y_copyToHost) {
        auto Y_lcl = Y.template getLocalView<dev_memory_space> ();
        typename decltype (Y_lcl_h)::non_const_type
          Y_lcl_h_nc ("Y_lcl_h", Y_lcl.extent (0), Y_numVecs);
        deep_copy (Y_lcl_h_nc, Y_lcl);
        Y_lcl_h = Y_lcl_h_nc;
      }
      else { // Y already sync'd to host; just use its host view
        Y_lcl_h = Y.template getLocalView<host_memory_space> ();
      }

      if (numVecs == size_t (1)) {
        auto X_lcl_h_1d = subview (X_lcl_h, rowRange, 0);
        auto Y_lcl_h_1d = subview (Y_lcl_h, rowRange, 0);
        auto result_h_0d = subview (result_h, 0);
        KokkosBlas::dot (result_h_0d, X_lcl_h_1d, Y_lcl_h_1d);
      }
      else {
        auto X_lcl_h_2d = subview (X_lcl_h, rowRange,
                                   pair_type (0, X_numVecs));
        auto Y_lcl_h_2d = subview (Y_lcl_h, rowRange,
                                   pair_type (0, Y_numVecs));
        KokkosBlas::dot (result_h, X_lcl_h_2d, Y_lcl_h_2d);
      }
    }
    else { // run on device
      auto X_lcl = X.template getLocalView<dev_memory_space> ();
      decltype (X_lcl) Y_lcl;

      if (Y_copyToDev) {
        auto Y_lcl_h = Y.template getLocalView<host_memory_space> ();
        typename decltype (Y_lcl)::non_const_type
          Y_lcl_nc ("Y_lcl", Y_lcl_h.extent (0), Y_numVecs);
        deep_copy (Y_lcl_nc, Y_lcl_h);
        Y_lcl = Y_lcl_nc;
      }
      else { // Y already sync'd to dev; just use its dev view
        Y_lcl = Y.template getLocalView<dev_memory_space> ();
      }

      if (numVecs == size_t (1)) {
        auto X_lcl_1d = subview (X_lcl, rowRange, 0);
        auto Y_lcl_1d = subview (Y_lcl, rowRange, 0);
        auto result_0d = subview (result, 0);
        KokkosBlas::dot (result_0d, X_lcl_1d, Y_lcl_1d);
      }
      else {
        auto X_lcl_2d = subview (X_lcl, rowRange,
                                 pair_type (0, X_numVecs));
        auto Y_lcl_2d = subview (Y_lcl, rowRange,
                                 pair_type (0, Y_numVecs));
        KokkosBlas::dot (result, X_lcl_2d, Y_lcl_2d);
      }
    } // host or device?
  } // multivector with nonconstant stride?

  if (runOnHost && resultOnDevice) {
    // Copy result from host to device, where the user wanted it.
    deep_copy (result, result_h);
  }
  else if (! runOnHost && ! resultOnDevice) {
    // Copy result from device to host, where the user wanted it.
    deep_copy (result_h, result);
  }
}

} // namespace Details

//
// SKIP DOWN TO HERE
//

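/// \brief Nonblocking dot product, with Tpetra::MultiVector inputs and
///   raw pointer (host array) output.
///
/// The returned CommRequest represents the pending sum over all
/// processes; the output is not valid until wait() has been called on
/// the request.  The usage sketch below is an illustration only: the
/// MultiVector setup and the names X, Y, numVecs, and dots are
/// placeholders, not part of this file.
///
/// \code
/// using MV = Tpetra::MultiVector<>;      // default template arguments
/// using dot_type = MV::dot_type;
///
/// // X and Y have the same Map; numVecs is the larger of their
/// // column counts.  (std::vector requires <vector>.)
/// std::vector<dot_type> dots (numVecs);  // host output array
/// auto request = Tpetra::idot (dots.data (), X, Y);
/// // ... overlap other local work here ...
/// request->wait ();  // now dots[j] holds the global dot of column j
/// \endcode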
template<class SC, class LO, class GO, class NT>
std::shared_ptr< ::Tpetra::Details::CommRequest>
idot (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::dot_type* resultRaw,
      const ::Tpetra::MultiVector<SC, LO, GO, NT>& X,
      const ::Tpetra::MultiVector<SC, LO, GO, NT>& Y)
{
  using ::Kokkos::Impl::MemorySpaceAccess;
  using ::Kokkos::HostSpace;
  using ::Kokkos::View;
  // Bring Details::iallreduce into scope for the unqualified calls below.
  using ::Tpetra::Details::iallreduce;
  typedef ::Tpetra::MultiVector<SC, LO, GO, NT> MV;
  typedef typename MV::dot_type dot_type;
  typedef typename MV::device_type device_type;
  typedef View<dot_type*, device_type> result_dev_view_type;
  typedef typename result_dev_view_type::HostMirror result_host_view_type;
  typedef typename device_type::memory_space dev_memory_space;

  auto map = X.getMap ();
  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
  if (comm.is_null ()) { // calling process does not participate
    return std::shared_ptr< ::Tpetra::Details::CommRequest> (NULL);
  }

  const size_t X_numVecs = X.getNumVectors ();
  const size_t Y_numVecs = Y.getNumVectors ();
  const size_t numVecs = (X_numVecs > Y_numVecs) ? X_numVecs : Y_numVecs;

  // Check compatibility of number of columns; allow special cases of
  // a multivector dot a single vector, or vice versa.
  if (X_numVecs != Y_numVecs &&
      X_numVecs != size_t (1) &&
      Y_numVecs != size_t (1)) {
    std::ostringstream os;
    os << "Tpetra::idot: X.getNumVectors() = " << X_numVecs
       << " != Y.getNumVectors() = " << Y_numVecs
       << ", but neither is 1.";
    throw std::invalid_argument (os.str ());
  }

  // If the input communicator is an intercomm, then the input and
  // output buffers of iallreduce may not alias one another, so we
  // must allocate a temporary buffer for the local dot product(s).
  const bool needCopy = Details::isInterComm (*comm);

  // We know that resultRaw is host memory.  Can the device access it?
  // If so, lclDotRaw may be able to avoid a copy.
  const bool resultOnDevice =
    MemorySpaceAccess<dev_memory_space, HostSpace>::accessible;
  if (resultOnDevice) {
    result_dev_view_type gblResult (resultRaw, numVecs);
    result_dev_view_type lclResult = needCopy ?
      result_dev_view_type ("lclResult", numVecs) :
      gblResult;
    Details::lclDotRaw (lclResult.data (), X, Y, resultOnDevice);
    return iallreduce (lclResult, gblResult, ::Teuchos::REDUCE_SUM, *comm);
  }
  else {
    result_host_view_type gblResult (resultRaw, numVecs);
    result_host_view_type lclResult = needCopy ?
      result_host_view_type ("lclResult", numVecs) :
      gblResult;
    Details::lclDotRaw (lclResult.data (), X, Y, resultOnDevice);
    return iallreduce (lclResult, gblResult, ::Teuchos::REDUCE_SUM, *comm);
  }
}

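/// \brief Nonblocking dot product, with Tpetra::Vector inputs and
///   rank-0 (single value) Kokkos::View output.
///
/// Usage sketch (an illustration only; the Vector setup and the names
/// x, y, and result are assumed, not part of this file):
///
/// \code
/// using vec_type = Tpetra::Vector<>;
/// using dot_type = vec_type::dot_type;
/// using device_type = vec_type::device_type;
///
/// Kokkos::View<dot_type, device_type> result ("result");  // rank-0 View
/// auto request = Tpetra::idot (result, x, y);  // x, y: Tpetra::Vector
/// request->wait ();
/// // Copy the completed value to host if it lives in device memory:
/// auto result_h = Kokkos::create_mirror_view (result);
/// Kokkos::deep_copy (result_h, result);
/// \endcode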
template<class SC, class LO, class GO, class NT>
std::shared_ptr< ::Tpetra::Details::CommRequest>
idot (const Kokkos::View<typename ::Tpetra::Vector<SC, LO, GO, NT>::dot_type,
        typename ::Tpetra::Vector<SC, LO, GO, NT>::device_type>& result,
      const ::Tpetra::Vector<SC, LO, GO, NT>& X,
      const ::Tpetra::Vector<SC, LO, GO, NT>& Y)
{
  using ::Kokkos::Impl::MemorySpaceAccess;
  using ::Kokkos::HostSpace;
  using ::Kokkos::View;
  using ::Tpetra::Details::iallreduce;
  typedef ::Tpetra::MultiVector<SC, LO, GO, NT> MV;
  typedef typename MV::dot_type dot_type;
  typedef typename MV::device_type device_type;
  typedef View<dot_type, device_type> result_view_type;

  auto map = X.getMap ();
  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
  if (comm.is_null ()) { // calling process does not participate
    return std::shared_ptr< ::Tpetra::Details::CommRequest> (NULL);
  }

  // If the input communicator is an intercomm, then the input and
  // output buffers of iallreduce may not alias one another, so we
  // must allocate a temporary buffer for the local dot product.
  const bool needCopy = Details::isInterComm (*comm);

  constexpr bool resultOnDevice = true; // 'result' is a device View
  result_view_type gblResult = result;
  result_view_type lclResult = needCopy ?
    result_view_type ("lclResult") :
    gblResult;
  Details::lclDotRaw (lclResult.data (), X, Y, resultOnDevice);
  return iallreduce (lclResult, gblResult, ::Teuchos::REDUCE_SUM, *comm);
}

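/// \brief Nonblocking dot product, with Tpetra::MultiVector inputs and
///   rank-1 (array of values) Kokkos::View output.
///
/// Usage sketch (an illustration only; the MultiVector setup and the
/// names X, Y, and results are assumed, not part of this file):
///
/// \code
/// using MV = Tpetra::MultiVector<>;
/// using dot_type = MV::dot_type;
/// using device_type = MV::device_type;
///
/// const size_t numVecs = X.getNumVectors ();
/// Kokkos::View<dot_type*, device_type> results ("results", numVecs);
/// auto request = Tpetra::idot (results, X, Y);
/// request->wait ();  // results(j) now holds the global dot of column j
/// \endcode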
template<class SC, class LO, class GO, class NT>
std::shared_ptr< ::Tpetra::Details::CommRequest>
idot (const Kokkos::View<typename ::Tpetra::MultiVector<SC, LO, GO, NT>::dot_type*,
        typename ::Tpetra::MultiVector<SC, LO, GO, NT>::device_type>& result,
      const ::Tpetra::MultiVector<SC, LO, GO, NT>& X,
      const ::Tpetra::MultiVector<SC, LO, GO, NT>& Y)
{
  using ::Kokkos::Impl::MemorySpaceAccess;
  using ::Kokkos::HostSpace;
  using ::Kokkos::View;
  using ::Tpetra::Details::iallreduce;
  typedef ::Tpetra::MultiVector<SC, LO, GO, NT> MV;
  typedef typename MV::dot_type dot_type;
  typedef typename MV::device_type device_type;
  typedef View<dot_type*, device_type> result_view_type;

  auto map = X.getMap ();
  auto comm = map.is_null () ? ::Teuchos::null : map->getComm ();
  if (comm.is_null ()) { // calling process does not participate
    return std::shared_ptr< ::Tpetra::Details::CommRequest> (NULL);
  }

  // Check compatibility of number of columns; allow special cases of
  // a multivector dot a single vector, or vice versa.  It's OK to
  // throw without MPI communication, since the number of columns of a
  // Tpetra::MultiVector must be the same on all processes of its
  // communicator.
  const size_t X_numVecs = X.getNumVectors ();
  const size_t Y_numVecs = Y.getNumVectors ();
  if (X_numVecs != Y_numVecs &&
      X_numVecs != size_t (1) &&
      Y_numVecs != size_t (1)) {
    std::ostringstream os;
    os << "Tpetra::idot: X.getNumVectors() = " << X_numVecs
       << " != Y.getNumVectors() = " << Y_numVecs
       << ", but neither is 1.";
    throw std::invalid_argument (os.str ());
  }

  // If the input communicator is an intercomm, then the input and
  // output buffers of iallreduce may not alias one another, so we
  // must allocate a temporary buffer for the local dot product(s).
  const bool needCopy = Details::isInterComm (*comm);

  constexpr bool resultOnDevice = true; // 'result' is a device View
  result_view_type gblResult = result;
  result_view_type lclResult = needCopy ?
    result_view_type ("lclResult", result.extent (0)) :
    gblResult;
  Details::lclDotRaw (lclResult.data (), X, Y, resultOnDevice);
  return iallreduce (lclResult, gblResult, ::Teuchos::REDUCE_SUM, *comm);
}

} // namespace Tpetra

#endif // TPETRA_DETAILS_IDOT_HPP