Tpetra_idot.hpp
// @HEADER
// ***********************************************************************
//
//          Tpetra: Templated Linear Algebra Services Package
//                 Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
// @HEADER

#ifndef TPETRA_DETAILS_IDOT_HPP
#define TPETRA_DETAILS_IDOT_HPP

#include "Tpetra_Details_iallreduce.hpp"
#include "Tpetra_Details_isInterComm.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Tpetra_Vector.hpp"
#include "KokkosBlas1_dot.hpp"
#include <stdexcept>
#include <sstream>

namespace Tpetra {
namespace Details {

/// \brief Compute the local dot product(s), columnwise, of X and Y.
///
/// This is an implementation detail of idot().  It writes the local
/// results to resultRaw, which must be device-accessible memory if
/// resultOnDevice is true, and host-accessible memory otherwise.
template<class SC, class LO, class GO, class NT>
void
lclDotRaw (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::dot_type* const resultRaw,
           const ::Tpetra::MultiVector<SC, LO, GO, NT>& X,
           const ::Tpetra::MultiVector<SC, LO, GO, NT>& Y,
           const bool resultOnDevice)
{
  using ::Kokkos::Impl::MemorySpaceAccess;
  using ::Kokkos::HostSpace;
  using ::Kokkos::subview;
  using ::Kokkos::View;
  typedef ::Kokkos::pair<size_t, size_t> pair_type;
  typedef ::Tpetra::MultiVector<SC, LO, GO, NT> MV;
  typedef typename MV::dot_type dot_type;
  typedef typename MV::device_type device_type;
  typedef typename MV::dual_view_type::t_dev::memory_space dev_memory_space;
  typedef typename MV::dual_view_type::t_host::memory_space host_memory_space;
  typedef View<dot_type*, device_type> result_dev_view_type;
  typedef typename result_dev_view_type::HostMirror result_host_view_type;

  const size_t numRows = X.getLocalLength ();
  const pair_type rowRange (0, numRows);
  const size_t X_numVecs = X.getNumVectors ();
  const size_t Y_numVecs = Y.getNumVectors ();
  const size_t numVecs = X_numVecs > Y_numVecs ? X_numVecs : Y_numVecs;

  // Check compatibility of the number of columns; allow the special
  // cases of a multivector dot a single vector, or vice versa.
  if (X_numVecs != Y_numVecs &&
      X_numVecs != size_t (1) &&
      Y_numVecs != size_t (1)) {
    std::ostringstream os;
    os << "Tpetra::idot: X.getNumVectors() = " << X_numVecs
       << " != Y.getNumVectors() = " << Y_numVecs
       << ", but neither is 1.";
    throw std::invalid_argument (os.str ());
  }

  const bool X_latestOnHost = X.template need_sync<dev_memory_space> () &&
    ! X.template need_sync<host_memory_space> ();
  const bool Y_latestOnHost = Y.template need_sync<dev_memory_space> () &&
    ! Y.template need_sync<host_memory_space> ();
  // Let X guide whether to execute on device or host.
  const bool runOnHost = X_latestOnHost;
  // Assume that we have to copy Y to wherever we need to run.
  // idot() takes the input MultiVectors as const, so it can't sync
  // them.  We just have to copy their data, if needed, to the memory
  // space where we want to run.
  const bool Y_copyToHost = (X_latestOnHost && ! Y_latestOnHost);
  const bool Y_copyToDev = (! X_latestOnHost && Y_latestOnHost);

  result_dev_view_type result;
  result_host_view_type result_h;
  if (resultOnDevice) {
    result = result_dev_view_type (resultRaw, numVecs);
    if (runOnHost) {
      // create_mirror_view doesn't promise to create a deep copy, so
      // we can't rely on this case to avoid an extra buffer if the
      // input communicator is an intercomm.
      result_h = ::Kokkos::create_mirror_view (result);
    }
    // If running on device, not host, then we don't use result_h.
  }
  else {
    result_h = result_host_view_type (resultRaw, numVecs);
    if (! runOnHost) { // running on device, not host
      // We have to do a separate allocation here, since resultRaw is
      // not accessible from device.
      result = result_dev_view_type ("result", numVecs);
    }
    // If running on host, not device, then we don't use result.
  }

  const bool X_or_Y_have_nonconstant_stride =
    ! X.isConstantStride () || ! Y.isConstantStride ();
  if (X_or_Y_have_nonconstant_stride && numVecs > size_t (1)) {
    // The "nonconstant stride" case relates to the ability of a
    // Tpetra::MultiVector to view multiple, noncontiguous columns of
    // another Tpetra::MultiVector.  In this case, we can't just get
    // the Kokkos::View out of X and Y, since the Kokkos::View has
    // all the columns, not just the columns that X or Y view.  The
    // Kokkos::View of X or Y would view a contiguous subset of
    // columns of the latter, "parent" MultiVector, not of the
    // former, "child" MultiVector.  (Kokkos::View doesn't have a
    // layout to cover this case, where consecutive entries within a
    // column are contiguous, but consecutive entries within a row
    // may have varying strides.)  Instead, we work one column at a
    // time.
    for (size_t j = 0; j < numVecs; ++j) {
      // numVecs = max(X_numVecs, Y_numVecs).  Allow the special case
      // of X_numVecs != Y_numVecs && (X_numVecs == 1 || Y_numVecs == 1).
      const size_t X_col = (X_numVecs == size_t (1)) ? size_t (0) : j;
      const size_t Y_col = (Y_numVecs == size_t (1)) ? size_t (0) : j;
      auto X_j = X.getVector (X_col);
      auto Y_j = Y.getVector (Y_col);

      if (runOnHost) {
        auto X_j_2d_h = X_j->template getLocalView<host_memory_space> ();
        auto X_j_1d_h = subview (X_j_2d_h, rowRange, 0);
        decltype (X_j_1d_h) Y_j_1d_h;

        if (Y_copyToHost) {
          auto Y_j_2d = Y_j->template getLocalView<dev_memory_space> ();
          auto Y_j_1d = subview (Y_j_2d, rowRange, 0);
          typename decltype (Y_j_1d_h)::non_const_type
            Y_j_1d_h_nc ("Y_j_1d_h", Y_j_1d.extent (0));
          deep_copy (Y_j_1d_h_nc, Y_j_1d);
          Y_j_1d_h = Y_j_1d_h_nc;
        }
        else { // Y already sync'd to host; just use its host view
          auto Y_j_2d_h = Y_j->template getLocalView<host_memory_space> ();
          Y_j_1d_h = subview (Y_j_2d_h, rowRange, 0);
        }
        auto result_h_j = subview (result_h, j);
        KokkosBlas::dot (result_h_j, X_j_1d_h, Y_j_1d_h);
      }
      else { // run on device
        auto X_j_2d = X_j->template getLocalView<dev_memory_space> ();
        auto X_j_1d = subview (X_j_2d, rowRange, 0);
        decltype (X_j_1d) Y_j_1d;

        if (Y_copyToDev) {
          auto Y_j_2d_h = Y_j->template getLocalView<host_memory_space> ();
          auto Y_j_1d_h = subview (Y_j_2d_h, rowRange, 0);
          typename decltype (Y_j_1d)::non_const_type
            Y_j_1d_nc ("Y_j_1d", Y_j_1d_h.extent (0));
          deep_copy (Y_j_1d_nc, Y_j_1d_h);
          Y_j_1d = Y_j_1d_nc;
        }
        else { // Y already sync'd to device; just use its device view
          auto Y_j_2d = Y_j->template getLocalView<dev_memory_space> ();
          Y_j_1d = subview (Y_j_2d, rowRange, 0);
        }
        auto result_j = subview (result, j);
        KokkosBlas::dot (result_j, X_j_1d, Y_j_1d);
        Kokkos::fence();
      }
    } // for each column j of X and Y
  }
  else { // numVecs == 1 || ! X_or_Y_have_nonconstant_stride
    if (runOnHost) {
      auto X_lcl_h = X.template getLocalView<host_memory_space> ();
      decltype (X_lcl_h) Y_lcl_h;

      if (Y_copyToHost) {
        auto Y_lcl = Y.template getLocalView<dev_memory_space> ();
        typename decltype (Y_lcl_h)::non_const_type
          Y_lcl_h_nc ("Y_lcl_h", Y_lcl.extent (0), Y_numVecs);
        deep_copy (Y_lcl_h_nc, Y_lcl);
        Y_lcl_h = Y_lcl_h_nc;
      }
      else { // Y already sync'd to host; just use its host view
        Y_lcl_h = Y.template getLocalView<host_memory_space> ();
      }

      if (numVecs == size_t (1)) {
        auto X_lcl_h_1d = subview (X_lcl_h, rowRange, 0);
        auto Y_lcl_h_1d = subview (Y_lcl_h, rowRange, 0);
        auto result_h_0d = subview (result_h, 0);
        KokkosBlas::dot (result_h_0d, X_lcl_h_1d, Y_lcl_h_1d);
      }
      else {
        auto X_lcl_h_2d = subview (X_lcl_h, rowRange,
                                   pair_type (0, X_numVecs));
        auto Y_lcl_h_2d = subview (Y_lcl_h, rowRange,
                                   pair_type (0, Y_numVecs));
        KokkosBlas::dot (result_h, X_lcl_h_2d, Y_lcl_h_2d);
      }
    }
    else { // run on device
      auto X_lcl = X.template getLocalView<dev_memory_space> ();
      decltype (X_lcl) Y_lcl;

      if (Y_copyToDev) {
        auto Y_lcl_h = Y.template getLocalView<host_memory_space> ();
        typename decltype (Y_lcl)::non_const_type
          Y_lcl_nc ("Y_lcl", Y_lcl_h.extent (0), Y_numVecs);
        deep_copy (Y_lcl_nc, Y_lcl_h);
        Y_lcl = Y_lcl_nc;
      }
      else { // Y already sync'd to device; just use its device view
        Y_lcl = Y.template getLocalView<dev_memory_space> ();
      }

      if (numVecs == size_t (1)) {
        auto X_lcl_1d = subview (X_lcl, rowRange, 0);
        auto Y_lcl_1d = subview (Y_lcl, rowRange, 0);
        auto result_0d = subview (result, 0);
        KokkosBlas::dot (result_0d, X_lcl_1d, Y_lcl_1d);
        Kokkos::fence();
      }
      else {
        auto X_lcl_2d = subview (X_lcl, rowRange,
                                 pair_type (0, X_numVecs));
        auto Y_lcl_2d = subview (Y_lcl, rowRange,
                                 pair_type (0, Y_numVecs));
        KokkosBlas::dot (result, X_lcl_2d, Y_lcl_2d);
        Kokkos::fence();
      }
    } // host or device?
  } // multivector with nonconstant stride?

  if (runOnHost && resultOnDevice) {
    // Copy the result from host to device, where the user wants it.
    deep_copy (result, result_h);
  }
  else if (! runOnHost && ! resultOnDevice) {
    // Copy the result from device to host, where the user wants it.
    deep_copy (result_h, result);
  }
}
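// Illustration of lclDotRaw's columnwise, "broadcast" semantics (a
// sketch added for clarity; not part of the original file).  For
// j = 0 .. numVecs-1, the local result is
//
//   resultRaw[j] = KokkosBlas::dot (X(:,j), Y(:,j)),
//
// except that if X or Y has only one column, that single column is
// reused for every j.  For example, if X has 3 columns and Y has 1,
// then resultRaw[j] = dot (X(:,j), Y(:,0)) for j = 0, 1, 2.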

} // namespace Details

//
// SKIP DOWN TO HERE
//

/// \brief Nonblocking dot product, with either Tpetra::MultiVector or
///   Tpetra::Vector inputs, and raw pointer or raw array output.
template<class SC, class LO, class GO, class NT>
std::shared_ptr< ::Tpetra::Details::CommRequest>
idot (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::dot_type* resultRaw,
      const ::Tpetra::MultiVector<SC, LO, GO, NT>& X,
      const ::Tpetra::MultiVector<SC, LO, GO, NT>& Y)
{
  using ::Kokkos::Impl::MemorySpaceAccess;
  using ::Kokkos::HostSpace;
  using ::Kokkos::View;
  using ::Tpetra::Details::iallreduce;
  typedef ::Tpetra::MultiVector<SC, LO, GO, NT> MV;
  typedef typename MV::dot_type dot_type;
  typedef typename MV::device_type device_type;
  typedef View<dot_type*, device_type> result_dev_view_type;
  typedef typename result_dev_view_type::HostMirror result_host_view_type;
  typedef typename device_type::memory_space dev_memory_space;

  auto map = X.getMap ();
  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
  if (comm.is_null ()) { // calling process does not participate
    return std::shared_ptr< ::Tpetra::Details::CommRequest> (NULL);
  }

  const size_t X_numVecs = X.getNumVectors ();
  const size_t Y_numVecs = Y.getNumVectors ();
  const size_t numVecs = (X_numVecs > Y_numVecs) ? X_numVecs : Y_numVecs;

  // Check compatibility of the number of columns; allow the special
  // cases of a multivector dot a single vector, or vice versa.
  if (X_numVecs != Y_numVecs &&
      X_numVecs != size_t (1) &&
      Y_numVecs != size_t (1)) {
    std::ostringstream os;
    os << "Tpetra::idot: X.getNumVectors() = " << X_numVecs
       << " != Y.getNumVectors() = " << Y_numVecs
       << ", but neither is 1.";
    throw std::invalid_argument (os.str ());
  }

  // If the input communicator is an intercomm, then the input and
  // output buffers of iallreduce may not alias one another, so we
  // must allocate a temporary buffer for the local dot product(s).
  const bool needCopy = Details::isInterComm (*comm);

  // We know that resultRaw is host memory.  Can the device access it?
  // If so, lclDotRaw may be able to avoid a copy.
  const bool resultOnDevice =
    MemorySpaceAccess<dev_memory_space, HostSpace>::accessible;
  if (resultOnDevice) {
    result_dev_view_type gblResult (resultRaw, numVecs);
    result_dev_view_type lclResult = needCopy ?
      result_dev_view_type ("lclResult", numVecs) :
      gblResult;
    Details::lclDotRaw (lclResult.data (), X, Y, resultOnDevice);
    return iallreduce (lclResult, gblResult, ::Teuchos::REDUCE_SUM, *comm);
  }
  else {
    result_host_view_type gblResult (resultRaw, numVecs);
    result_host_view_type lclResult = needCopy ?
      result_host_view_type ("lclResult", numVecs) :
      gblResult;
    Details::lclDotRaw (lclResult.data (), X, Y, resultOnDevice);
    return iallreduce (lclResult, gblResult, ::Teuchos::REDUCE_SUM, *comm);
  }
}
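
// A minimal usage sketch for the raw-pointer overload above (added for
// clarity; the MultiVectors, the Map, and numGlobalRows below are
// placeholders, not part of this file).  Do not modify or deallocate
// the inputs or the output buffer until the returned request completes.
//
//   using multivec_type = Tpetra::MultiVector<>;
//   using dot_type = multivec_type::dot_type;
//
//   auto comm = Tpetra::getDefaultComm ();
//   auto map = Teuchos::rcp (new Tpetra::Map<> (numGlobalRows, 0, comm));
//   multivec_type X (map, 3), Y (map, 3);     // 3 columns each
//   std::vector<dot_type> results (3);
//
//   auto request = Tpetra::idot (results.data (), X, Y);
//   // ... overlap other local work with the communication here ...
//   request->wait ();  // 'results' now holds the 3 global dot products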

/// \brief Nonblocking dot product of two Tpetra::Vector objects, with
///   rank-0 (single value) Kokkos::View output.
template<class SC, class LO, class GO, class NT>
std::shared_ptr< ::Tpetra::Details::CommRequest>
idot (const Kokkos::View<typename ::Tpetra::Vector<SC, LO, GO, NT>::dot_type,
        typename ::Tpetra::Vector<SC, LO, GO, NT>::device_type>& result,
      const ::Tpetra::Vector<SC, LO, GO, NT>& X,
      const ::Tpetra::Vector<SC, LO, GO, NT>& Y)
{
  using ::Kokkos::Impl::MemorySpaceAccess;
  using ::Kokkos::HostSpace;
  using ::Kokkos::View;
  using ::Tpetra::Details::iallreduce;
  typedef ::Tpetra::MultiVector<SC, LO, GO, NT> MV;
  typedef typename MV::dot_type dot_type;
  typedef typename MV::device_type device_type;
  typedef View<dot_type, device_type> result_view_type;

  auto map = X.getMap ();
  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
  if (comm.is_null ()) { // calling process does not participate
    return std::shared_ptr< ::Tpetra::Details::CommRequest> (NULL);
  }

  // If the input communicator is an intercomm, then the input and
  // output buffers of iallreduce may not alias one another, so we
  // must allocate a temporary buffer for the local dot product.
  const bool needCopy = Details::isInterComm (*comm);

  constexpr bool resultOnDevice = true; // 'result' is a device View
  result_view_type gblResult = result;
  result_view_type lclResult = needCopy ?
    result_view_type ("lclResult") :
    gblResult;
  Details::lclDotRaw (lclResult.data (), X, Y, resultOnDevice);
  return iallreduce (lclResult, gblResult, ::Teuchos::REDUCE_SUM, *comm);
}
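
// A minimal usage sketch for the rank-0 Kokkos::View overload above
// (added for clarity; the Vectors and the Map are placeholders, not
// part of this file).  The result lives in a device View, so copy it
// to host after completion if device memory is not host accessible.
//
//   using vec_type = Tpetra::Vector<>;
//   using dot_type = vec_type::dot_type;
//   using device_type = vec_type::device_type;
//
//   vec_type x (map), y (map);  // Vectors sharing the same Map
//   Kokkos::View<dot_type, device_type> result ("result");
//
//   auto request = Tpetra::idot (result, x, y);
//   // ... overlap other local work with the communication here ...
//   request->wait ();  // the rank-0 View now holds the global dot product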

/// \brief Nonblocking dot product of two Tpetra::MultiVector objects,
///   with rank-1 (one-dimensional array) Kokkos::View output.
template<class SC, class LO, class GO, class NT>
std::shared_ptr< ::Tpetra::Details::CommRequest>
idot (const Kokkos::View<typename ::Tpetra::MultiVector<SC, LO, GO, NT>::dot_type*,
        typename ::Tpetra::MultiVector<SC, LO, GO, NT>::device_type>& result,
      const ::Tpetra::MultiVector<SC, LO, GO, NT>& X,
      const ::Tpetra::MultiVector<SC, LO, GO, NT>& Y)
{
  using ::Kokkos::Impl::MemorySpaceAccess;
  using ::Kokkos::HostSpace;
  using ::Kokkos::View;
  using ::Tpetra::Details::iallreduce;
  typedef ::Tpetra::MultiVector<SC, LO, GO, NT> MV;
  typedef typename MV::dot_type dot_type;
  typedef typename MV::device_type device_type;
  typedef View<dot_type*, device_type> result_view_type;

  auto map = X.getMap ();
  auto comm = map.is_null () ? ::Teuchos::null : map->getComm ();
  if (comm.is_null ()) { // calling process does not participate
    return std::shared_ptr< ::Tpetra::Details::CommRequest> (NULL);
  }

  // Check compatibility of the number of columns; allow the special
  // cases of a multivector dot a single vector, or vice versa.  It's
  // OK to throw without MPI communication, since the number of
  // columns of a Tpetra::MultiVector must be the same on all
  // processes of its communicator.
  const size_t X_numVecs = X.getNumVectors ();
  const size_t Y_numVecs = Y.getNumVectors ();
  if (X_numVecs != Y_numVecs &&
      X_numVecs != size_t (1) &&
      Y_numVecs != size_t (1)) {
    std::ostringstream os;
    os << "Tpetra::idot: X.getNumVectors() = " << X_numVecs
       << " != Y.getNumVectors() = " << Y_numVecs
       << ", but neither is 1.";
    throw std::invalid_argument (os.str ());
  }

  // If the input communicator is an intercomm, then the input and
  // output buffers of iallreduce may not alias one another, so we
  // must allocate a temporary buffer for the local dot product(s).
  const bool needCopy = Details::isInterComm (*comm);

  constexpr bool resultOnDevice = true; // 'result' is a device View
  result_view_type gblResult = result;
  result_view_type lclResult = needCopy ?
    result_view_type ("lclResult", result.extent (0)) :
    gblResult;
  Details::lclDotRaw (lclResult.data (), X, Y, resultOnDevice);
  return iallreduce (lclResult, gblResult, ::Teuchos::REDUCE_SUM, *comm);
}
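
// A minimal usage sketch for the rank-1 Kokkos::View overload above
// (added for clarity; the MultiVectors and the Map are placeholders,
// not part of this file).  The result View must have length
// max(X.getNumVectors(), Y.getNumVectors()).
//
//   using multivec_type = Tpetra::MultiVector<>;
//   using dot_type = multivec_type::dot_type;
//   using device_type = multivec_type::device_type;
//
//   multivec_type X (map, 5);   // 5 columns
//   multivec_type Y (map, 5);   // 5 columns (or 1, to reuse Y(:,0) for every column of X)
//   Kokkos::View<dot_type*, device_type> results ("results", 5);
//
//   auto request = Tpetra::idot (results, X, Y);
//   // ... overlap other local work with the communication here ...
//   request->wait ();  // results(j) = global dot product of X(:,j) with Y(:,j)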

} // namespace Tpetra

#endif // TPETRA_DETAILS_IDOT_HPP
Cross-references for entities used in this file:

  void deep_copy (MultiVector<DS, DL, DG, DN>& dst, const MultiVector<SS, SL, SG, SN>& src)
      Copy the contents of the MultiVector src into dst.

  bool isInterComm (const Teuchos::Comm<int>&)
      Return true if and only if the input communicator wraps an MPI intercommunicator.
      (Declaration of Tpetra::Details::isInterComm.)

  std::shared_ptr< ::Tpetra::Details::CommRequest> idot (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::dot_type* resultRaw, const ::Tpetra::MultiVector<SC, LO, GO, NT>& X, const ::Tpetra::MultiVector<SC, LO, GO, NT>& Y)
      Nonblocking dot product, with either Tpetra::MultiVector or Tpetra::Vector inputs, and raw pointer or raw array output.

  void lclDotRaw (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::dot_type* const resultRaw, const ::Tpetra::MultiVector<SC, LO, GO, NT>& X, const ::Tpetra::MultiVector<SC, LO, GO, NT>& Y, const bool resultOnDevice)
      Compute the local dot product(s), columnwise, of X and Y.

  std::shared_ptr<CommRequest> iallreduce (const InputViewType& sendbuf, const OutputViewType& recvbuf, const ::Teuchos::EReductionType op, const ::Teuchos::Comm<int>& comm)
      Nonblocking all-reduce, for either rank-1 or rank-0 Kokkos::View objects.
      (Declaration of Tpetra::Details::iallreduce.)