Tpetra_Experimental_BlockMultiVector_def.hpp
// @HEADER
// ***********************************************************************
//
// Tpetra: Templated Linear Algebra Services Package
// Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
// @HEADER

#ifndef TPETRA_EXPERIMENTAL_BLOCKMULTIVECTOR_DEF_HPP
#define TPETRA_EXPERIMENTAL_BLOCKMULTIVECTOR_DEF_HPP

#include "Tpetra_Experimental_BlockMultiVector_decl.hpp"
#include "Tpetra_Details_Behavior.hpp"
#include "Teuchos_OrdinalTraits.hpp"

namespace { // anonymous

  template<class MultiVectorType>
  struct RawHostPtrFromMultiVector {
    typedef typename MultiVectorType::impl_scalar_type impl_scalar_type;

    static impl_scalar_type* getRawPtr (MultiVectorType& X) {
      // NOTE (mfh 09 Jun 2016) This does NOT sync to host, or mark
      // host as modified. This is on purpose, because we don't want
      // the BlockMultiVector sync'd to host unnecessarily. Otherwise,
      // all the MultiVector and BlockMultiVector kernels would run on
      // host instead of device. See Github Issue #428.
      auto X_view_host = X.template getLocalView<typename MultiVectorType::dual_view_type::t_host::device_type> ();
      impl_scalar_type* X_raw = X_view_host.data ();
      return X_raw;
    }
  };

  // Get the raw host pointer to a MultiVector's local data, without
  // changing the MultiVector's sync state (see note above).
  template<class S, class LO, class GO, class N>
  typename Tpetra::MultiVector<S, LO, GO, N>::impl_scalar_type*
  getRawHostPtrFromMultiVector (Tpetra::MultiVector<S, LO, GO, N>& X) {
    typedef Tpetra::MultiVector<S, LO, GO, N> MV;
    return RawHostPtrFromMultiVector<MV>::getRawPtr (X);
  }

} // namespace (anonymous)

namespace Tpetra {
namespace Experimental {

template<class Scalar, class LO, class GO, class Node>
typename BlockMultiVector<Scalar, LO, GO, Node>::mv_type
BlockMultiVector<Scalar, LO, GO, Node>::
getMultiVectorView () const
{
  return mv_;
}

template<class Scalar, class LO, class GO, class Node>
Teuchos::RCP<const BlockMultiVector<Scalar, LO, GO, Node> >
BlockMultiVector<Scalar, LO, GO, Node>::
getBlockMultiVectorFromSrcDistObject (const Tpetra::SrcDistObject& src)
{
  typedef BlockMultiVector<Scalar, LO, GO, Node> BMV;
  const BMV* src_bmv = dynamic_cast<const BMV*> (&src);
  TEUCHOS_TEST_FOR_EXCEPTION(
    src_bmv == NULL, std::invalid_argument, "Tpetra::Experimental::"
    "BlockMultiVector: The source object of an Import or Export to a "
    "BlockMultiVector must also be a BlockMultiVector.");
  return Teuchos::rcp (src_bmv, false); // nonowning RCP
}

template<class Scalar, class LO, class GO, class Node>
BlockMultiVector<Scalar, LO, GO, Node>::
BlockMultiVector (const map_type& meshMap,
                  const LO blockSize,
                  const LO numVecs) :
  dist_object_type (Teuchos::rcp (new map_type (meshMap))), // shallow copy
  meshMap_ (meshMap),
  pointMap_ (makePointMap (meshMap, blockSize)),
  mv_ (Teuchos::rcpFromRef (pointMap_), numVecs), // nonowning RCP is OK, since pointMap_ won't go away
  mvData_ (getRawHostPtrFromMultiVector (mv_)),
  blockSize_ (blockSize)
{
  // Make sure that mv_ has view semantics.
  mv_.setCopyOrView (Teuchos::View);
}

template<class Scalar, class LO, class GO, class Node>
BlockMultiVector<Scalar, LO, GO, Node>::
BlockMultiVector (const map_type& meshMap,
                  const map_type& pointMap,
                  const LO blockSize,
                  const LO numVecs) :
  dist_object_type (Teuchos::rcp (new map_type (meshMap))), // shallow copy
  meshMap_ (meshMap),
  pointMap_ (pointMap),
  mv_ (Teuchos::rcpFromRef (pointMap_), numVecs),
  mvData_ (getRawHostPtrFromMultiVector (mv_)),
  blockSize_ (blockSize)
{
  // Make sure that mv_ has view semantics.
  mv_.setCopyOrView (Teuchos::View);
}

template<class Scalar, class LO, class GO, class Node>
BlockMultiVector<Scalar, LO, GO, Node>::
BlockMultiVector (const mv_type& X_mv,
                  const map_type& meshMap,
                  const LO blockSize) :
  dist_object_type (Teuchos::rcp (new map_type (meshMap))), // shallow copy
  meshMap_ (meshMap),
  mvData_ (NULL), // just for now
  blockSize_ (blockSize)
{
  using Teuchos::RCP;

  if (X_mv.getCopyOrView () == Teuchos::View) {
    // The input MultiVector has view semantics, so assignment just
    // does a shallow copy.
    mv_ = X_mv;
  }
  else if (X_mv.getCopyOrView () == Teuchos::Copy) {
    // The input MultiVector has copy semantics. We can't change
    // that, but we can make a view of the input MultiVector and
    // change the view to have view semantics. (That sounds silly:
    // shouldn't views always have view semantics? Remember, though,
    // that "view semantics" just governs the default behavior of the
    // copy constructor and assignment operator.)
    RCP<const mv_type> X_view_const;
    const size_t numCols = X_mv.getNumVectors ();
    if (numCols == 0) {
      Teuchos::Array<size_t> cols (0); // view will have zero columns
      X_view_const = X_mv.subView (cols ());
    } else { // Range1D is an inclusive range
      X_view_const = X_mv.subView (Teuchos::Range1D (0, numCols-1));
    }
    TEUCHOS_TEST_FOR_EXCEPTION(
      X_view_const.is_null (), std::logic_error, "Tpetra::Experimental::"
      "BlockMultiVector constructor: X_mv.subView(...) returned null. This "
      "should never happen. Please report this bug to the Tpetra developers.");

    // It's perfectly OK to cast away const here. Those view methods
    // should be marked const anyway, because views can't change the
    // allocation (just the entries themselves).
    RCP<mv_type> X_view = Teuchos::rcp_const_cast<mv_type> (X_view_const);
    X_view->setCopyOrView (Teuchos::View);
    TEUCHOS_TEST_FOR_EXCEPTION(
      X_view->getCopyOrView () != Teuchos::View, std::logic_error, "Tpetra::"
      "Experimental::BlockMultiVector constructor: We just set a MultiVector "
      "to have view semantics, but it claims that it doesn't have view "
      "semantics. This should never happen. "
      "Please report this bug to the Tpetra developers.");
    mv_ = *X_view; // MultiVector::operator= does a shallow copy here
  }

  // At this point, mv_ has been assigned, so we can ignore X_mv.
  Teuchos::RCP<const map_type> pointMap = mv_.getMap ();
  if (! pointMap.is_null ()) {
    pointMap_ = *pointMap; // Map::operator= also does a shallow copy
  }
  mvData_ = getRawHostPtrFromMultiVector (mv_);
}

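// Usage sketch for the viewing constructor above. (Illustrative only,
// not part of the original source; X_mv, meshMap, and blockSize are
// assumed to be set up by the caller, with X_mv's Map being the point
// Map corresponding to meshMap and blockSize.)
//
//   typedef Tpetra::Experimental::BlockMultiVector<Scalar, LO, GO, Node> BMV;
//   // Reinterpret an existing point-indexed MultiVector as a
//   // BlockMultiVector, without copying its data.
//   BMV X_blk (X_mv, meshMap, blockSize);
//   // X_blk.getMultiVectorView () now aliases X_mv's entries.
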
template<class Scalar, class LO, class GO, class Node>
BlockMultiVector<Scalar, LO, GO, Node>::
BlockMultiVector (const BlockMultiVector<Scalar, LO, GO, Node>& X,
                  const map_type& newMeshMap,
                  const map_type& newPointMap,
                  const size_t offset) :
  dist_object_type (Teuchos::rcp (new map_type (newMeshMap))), // shallow copy
  meshMap_ (newMeshMap),
  pointMap_ (newPointMap),
  mv_ (X.mv_, newPointMap, offset * X.getBlockSize ()), // MV "offset view" constructor
  mvData_ (getRawHostPtrFromMultiVector (mv_)),
  blockSize_ (X.getBlockSize ())
{
  // Make sure that mv_ has view semantics.
  mv_.setCopyOrView (Teuchos::View);
}

template<class Scalar, class LO, class GO, class Node>
BlockMultiVector<Scalar, LO, GO, Node>::
BlockMultiVector (const BlockMultiVector<Scalar, LO, GO, Node>& X,
                  const map_type& newMeshMap,
                  const size_t offset) :
  dist_object_type (Teuchos::rcp (new map_type (newMeshMap))), // shallow copy
  meshMap_ (newMeshMap),
  pointMap_ (makePointMap (newMeshMap, X.getBlockSize ())),
  mv_ (X.mv_, pointMap_, offset * X.getBlockSize ()), // MV "offset view" constructor
  mvData_ (getRawHostPtrFromMultiVector (mv_)),
  blockSize_ (X.getBlockSize ())
{
  // Make sure that mv_ has view semantics.
  mv_.setCopyOrView (Teuchos::View);
}

template<class Scalar, class LO, class GO, class Node>
BlockMultiVector<Scalar, LO, GO, Node>::
BlockMultiVector () :
  dist_object_type (Teuchos::null),
  mvData_ (NULL),
  blockSize_ (0)
{
  // Make sure that mv_ has view semantics.
  mv_.setCopyOrView (Teuchos::View);
}

template<class Scalar, class LO, class GO, class Node>
typename BlockMultiVector<Scalar, LO, GO, Node>::map_type
BlockMultiVector<Scalar, LO, GO, Node>::
makePointMap (const map_type& meshMap, const LO blockSize)
{
  typedef Tpetra::global_size_t GST;
  typedef typename Teuchos::ArrayView<const GO>::size_type size_type;

  const GST gblNumMeshMapInds =
    static_cast<GST> (meshMap.getGlobalNumElements ());
  const size_t lclNumMeshMapIndices =
    static_cast<size_t> (meshMap.getNodeNumElements ());
  const GST gblNumPointMapInds =
    gblNumMeshMapInds * static_cast<GST> (blockSize);
  const size_t lclNumPointMapInds =
    lclNumMeshMapIndices * static_cast<size_t> (blockSize);
  const GO indexBase = meshMap.getIndexBase ();

  if (meshMap.isContiguous ()) {
    return map_type (gblNumPointMapInds, lclNumPointMapInds, indexBase,
                     meshMap.getComm ());
  }
  else {
    // "Hilbert's Hotel" trick: multiply each process' mesh GIDs by
    // blockSize, and fill in the blockSize consecutive point GIDs
    // that follow. That ensures correctness even if the mesh Map is
    // overlapping.
    Teuchos::ArrayView<const GO> lclMeshGblInds = meshMap.getNodeElementList ();
    const size_type lclNumMeshGblInds = lclMeshGblInds.size ();
    Teuchos::Array<GO> lclPointGblInds (lclNumPointMapInds);
    for (size_type g = 0; g < lclNumMeshGblInds; ++g) {
      const GO meshGid = lclMeshGblInds[g];
      const GO pointGidStart = indexBase +
        (meshGid - indexBase) * static_cast<GO> (blockSize);
      const size_type offset = g * static_cast<size_type> (blockSize);
      for (LO k = 0; k < blockSize; ++k) {
        const GO pointGid = pointGidStart + static_cast<GO> (k);
        lclPointGblInds[offset + static_cast<size_type> (k)] = pointGid;
      }
    }
    return map_type (gblNumPointMapInds, lclPointGblInds (), indexBase,
                     meshMap.getComm ());
  }
}

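// Worked example of makePointMap's "Hilbert's Hotel" branch above
// (illustrative only, not part of the original source). Suppose a
// process owns the noncontiguous mesh GIDs {0, 5, 7}, blockSize = 3,
// and indexBase = 0. Then pointGidStart = 0 + (meshGid - 0) * 3 for
// each mesh GID, so the process owns the point GIDs
// {0,1,2, 15,16,17, 21,22,23}. Distinct mesh GIDs map to disjoint
// ranges of blockSize point GIDs, so the point Map stays consistent
// even if the mesh Map is overlapping.
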
template<class Scalar, class LO, class GO, class Node>
void
BlockMultiVector<Scalar, LO, GO, Node>::
replaceLocalValuesImpl (const LO localRowIndex,
                        const LO colIndex,
                        const Scalar vals[]) const
{
  auto X_dst = getLocalBlock (localRowIndex, colIndex);
  typename const_little_vec_type::HostMirror::const_type X_src
    (reinterpret_cast<const impl_scalar_type*> (vals), getBlockSize ());
  Kokkos::deep_copy (X_dst, X_src);
}

template<class Scalar, class LO, class GO, class Node>
bool
BlockMultiVector<Scalar, LO, GO, Node>::
replaceLocalValues (const LO localRowIndex,
                    const LO colIndex,
                    const Scalar vals[]) const
{
  if (! meshMap_.isNodeLocalElement (localRowIndex)) {
    return false;
  } else {
    replaceLocalValuesImpl (localRowIndex, colIndex, vals);
    return true;
  }
}

template<class Scalar, class LO, class GO, class Node>
bool
BlockMultiVector<Scalar, LO, GO, Node>::
replaceGlobalValues (const GO globalRowIndex,
                     const LO colIndex,
                     const Scalar vals[]) const
{
  const LO localRowIndex = meshMap_.getLocalElement (globalRowIndex);
  if (localRowIndex == Teuchos::OrdinalTraits<LO>::invalid ()) {
    return false;
  } else {
    replaceLocalValuesImpl (localRowIndex, colIndex, vals);
    return true;
  }
}

template<class Scalar, class LO, class GO, class Node>
void
BlockMultiVector<Scalar, LO, GO, Node>::
sumIntoLocalValuesImpl (const LO localRowIndex,
                        const LO colIndex,
                        const Scalar vals[]) const
{
  auto X_dst = getLocalBlock (localRowIndex, colIndex);
  typename const_little_vec_type::HostMirror::const_type X_src
    (reinterpret_cast<const impl_scalar_type*> (vals), getBlockSize ());
  AXPY (static_cast<impl_scalar_type> (STS::one ()), X_src, X_dst);
}

template<class Scalar, class LO, class GO, class Node>
bool
BlockMultiVector<Scalar, LO, GO, Node>::
sumIntoLocalValues (const LO localRowIndex,
                    const LO colIndex,
                    const Scalar vals[]) const
{
  if (! meshMap_.isNodeLocalElement (localRowIndex)) {
    return false;
  } else {
    sumIntoLocalValuesImpl (localRowIndex, colIndex, vals);
    return true;
  }
}

template<class Scalar, class LO, class GO, class Node>
bool
BlockMultiVector<Scalar, LO, GO, Node>::
sumIntoGlobalValues (const GO globalRowIndex,
                     const LO colIndex,
                     const Scalar vals[]) const
{
  const LO localRowIndex = meshMap_.getLocalElement (globalRowIndex);
  if (localRowIndex == Teuchos::OrdinalTraits<LO>::invalid ()) {
    return false;
  } else {
    sumIntoLocalValuesImpl (localRowIndex, colIndex, vals);
    return true;
  }
}

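// Usage sketch for the replace / sumInto methods above (illustrative
// only, not part of the original source; X and lclRow are
// hypothetical). The vals array must hold at least getBlockSize ()
// entries:
//
//   const LO blockSize = X.getBlockSize ();
//   std::vector<Scalar> vals (blockSize, Teuchos::ScalarTraits<Scalar>::one ());
//   // Overwrite all degrees of freedom at local mesh point lclRow in
//   // column 0, then add the same values on top of them. Both calls
//   // return false if lclRow is not owned in the mesh Map.
//   const bool replaced = X.replaceLocalValues (lclRow, 0, vals.data ());
//   const bool summed   = X.sumIntoLocalValues (lclRow, 0, vals.data ());
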
template<class Scalar, class LO, class GO, class Node>
bool
BlockMultiVector<Scalar, LO, GO, Node>::
getLocalRowView (const LO localRowIndex, const LO colIndex, Scalar*& vals) const
{
  if (! meshMap_.isNodeLocalElement (localRowIndex)) {
    return false;
  } else {
    auto X_ij = getLocalBlock (localRowIndex, colIndex);
    vals = reinterpret_cast<Scalar*> (X_ij.data ());
    return true;
  }
}

template<class Scalar, class LO, class GO, class Node>
bool
BlockMultiVector<Scalar, LO, GO, Node>::
getGlobalRowView (const GO globalRowIndex, const LO colIndex, Scalar*& vals) const
{
  const LO localRowIndex = meshMap_.getLocalElement (globalRowIndex);
  if (localRowIndex == Teuchos::OrdinalTraits<LO>::invalid ()) {
    return false;
  } else {
    auto X_ij = getLocalBlock (localRowIndex, colIndex);
    vals = reinterpret_cast<Scalar*> (X_ij.data ());
    return true;
  }
}

template<class Scalar, class LO, class GO, class Node>
typename BlockMultiVector<Scalar, LO, GO, Node>::little_vec_type::HostMirror
BlockMultiVector<Scalar, LO, GO, Node>::
getLocalBlock (const LO localRowIndex,
               const LO colIndex) const
{
  // NOTE (mfh 07 Jul 2016) It should be correct to add the
  // commented-out test below. However, I've conservatively commented
  // it out, since users might not realize that they need to have
  // things sync'd correctly.

// #ifdef HAVE_TPETRA_DEBUG
//   TEUCHOS_TEST_FOR_EXCEPTION
//     (mv_.need_sync_host (), std::runtime_error,
//      "Tpetra::Experimental::BlockMultiVector::getLocalBlock: This method "
//      "accesses host data, but the object is not in sync on host." );
// #endif // HAVE_TPETRA_DEBUG

  if (! isValidLocalMeshIndex (localRowIndex)) {
    return typename little_vec_type::HostMirror ();
  } else {
    const size_t blockSize = getBlockSize ();
    const size_t offset = colIndex * this->getStrideY () +
      localRowIndex * blockSize;
    impl_scalar_type* blockRaw = this->getRawPtr () + offset;
    return typename little_vec_type::HostMirror (blockRaw, blockSize);
  }
}

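// Layout note for getLocalBlock above, with a small example
// (illustrative only, not part of the original source). The returned
// object is an unmanaged host view of the blockSize consecutive
// entries of column colIndex that belong to mesh point localRowIndex.
// For blockSize = 2, the block for (localRowIndex = 3, colIndex = 1)
// starts at raw offset 1 * getStrideY () + 3 * 2 in the underlying
// MultiVector's host data, and has length 2.
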
template<class Scalar, class LO, class GO, class Node>
Teuchos::RCP<const typename BlockMultiVector<Scalar, LO, GO, Node>::mv_type>
BlockMultiVector<Scalar, LO, GO, Node>::
getMultiVectorFromSrcDistObject (const Tpetra::SrcDistObject& src)
{
  using Teuchos::rcp;
  using Teuchos::rcpFromRef;

  // The source object of an Import or Export must be a
  // BlockMultiVector or MultiVector (a Vector is a MultiVector). Try
  // them in that order; one must succeed. Note that the target of
  // the Import or Export calls checkSizes in DistObject's doTransfer.
  typedef BlockMultiVector<Scalar, LO, GO, Node> this_type;
  const this_type* srcBlkVec = dynamic_cast<const this_type*> (&src);
  if (srcBlkVec == NULL) {
    const mv_type* srcMultiVec = dynamic_cast<const mv_type*> (&src);
    if (srcMultiVec == NULL) {
      // FIXME (mfh 05 May 2014) Tpetra::MultiVector's operator=
      // currently does a shallow copy by default. This is why we
      // return by RCP, rather than by value.
      return rcp (new mv_type ());
    } else { // src is a MultiVector
      return rcp (srcMultiVec, false); // nonowning RCP
    }
  } else { // src is a BlockMultiVector
    return rcpFromRef (srcBlkVec->mv_); // nonowning RCP
  }
}

template<class Scalar, class LO, class GO, class Node>
bool
BlockMultiVector<Scalar, LO, GO, Node>::
checkSizes (const Tpetra::SrcDistObject& src)
{
  return ! getMultiVectorFromSrcDistObject (src).is_null ();
}

template<class SC, class LO, class GO, class NT>
std::pair<int, std::unique_ptr<std::string>>
BlockMultiVector<SC, LO, GO, NT>::
copyAndPermuteImpl
  (const BlockMultiVector<SC, LO, GO, NT>& src,
   const size_t numSameIDs,
   const Kokkos::DualView<
     const local_ordinal_type*,
     buffer_device_type
   >& permuteToLIDs,
   const Kokkos::DualView<
     const local_ordinal_type*,
     buffer_device_type
   >& permuteFromLIDs)
{
  using std::endl;
  const char tfecfFuncName[] = "copyAndPermuteImpl: ";
  std::unique_ptr<std::string> errMsg;
  const bool debug = ::Tpetra::Details::Behavior::debug ("BlockMultiVector");
  const int myRank = src.getMap ()->getComm ()->getRank ();

  std::unique_ptr<std::string> prefix;
  if (debug) {
    std::ostringstream os;
    os << "Proc " << myRank << ": " << tfecfFuncName;
    prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
    os << "Start" << endl;
    std::cerr << os.str ();
  }
  if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
    std::ostringstream os;
    os << "Proc " << myRank << ": " << tfecfFuncName << "permuteToLIDs and/or "
      "permuteFromLIDs need sync to host. This should never happen.";
    errMsg = std::unique_ptr<std::string> (new std::string (os.str ()));
    return { -1, std::move (errMsg) };
  }

  const size_t numPermuteLIDs = static_cast<size_t> (permuteToLIDs.extent (0));
  if (static_cast<size_t> (permuteFromLIDs.extent (0)) != numPermuteLIDs) {
    std::ostringstream os;
    os << "Proc " << myRank << ": " << tfecfFuncName
       << "permuteToLIDs.extent(0) = " << numPermuteLIDs
       << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent (0)
       << ".";
    errMsg = std::unique_ptr<std::string> (new std::string (os.str ()));
    return { -2, std::move (errMsg) };
  }

  const size_t numVecs = static_cast<size_t> (src.getNumVectors ());
  if (debug) {
    std::ostringstream os;
    os << *prefix << "Sames: numSameIDs=" << numSameIDs
       << ", numVecs=" << numVecs << endl;
    std::cerr << os.str ();
  }
  // FIXME (mfh 23 Apr 2014) This implementation is sequential and
  // assumes UVM.
  for (size_t j = 0; j < numVecs; ++j) {
    for (size_t lclRow = 0; lclRow < numSameIDs; ++lclRow) {
      deep_copy (this->getLocalBlock (lclRow, j),
                 src.getLocalBlock (lclRow, j));
    }
  }

  if (debug) {
    std::ostringstream os;
    os << *prefix << "Permutes: numPermuteLIDs=" << numPermuteLIDs << endl;
    std::cerr << os.str ();
  }
  auto permuteToLIDs_h = permuteToLIDs.view_host ();
  auto permuteFromLIDs_h = permuteFromLIDs.view_host ();

  // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on the
  // same process, this merges their values by replacement of the last
  // encountered GID, not by the specified merge rule (such as ADD).
  for (size_t j = 0; j < numVecs; ++j) {
    for (size_t k = numSameIDs; k < numPermuteLIDs; ++k) {
      deep_copy (this->getLocalBlock (permuteToLIDs_h[k], j),
                 src.getLocalBlock (permuteFromLIDs_h[k], j));
    }
  }

  if (debug) {
    std::ostringstream os;
    os << *prefix << "Done" << endl;
    std::cerr << os.str ();
  }
  return { 0, std::move (errMsg) };
}

template<class Scalar, class LO, class GO, class Node>
void BlockMultiVector<Scalar, LO, GO, Node>::
#ifdef TPETRA_ENABLE_DEPRECATED_CODE
copyAndPermuteNew
#else // TPETRA_ENABLE_DEPRECATED_CODE
copyAndPermute
#endif // TPETRA_ENABLE_DEPRECATED_CODE
(const SrcDistObject& src,
 const size_t numSameIDs,
 const Kokkos::DualView<const local_ordinal_type*,
   buffer_device_type>& permuteToLIDs,
 const Kokkos::DualView<const local_ordinal_type*,
   buffer_device_type>& permuteFromLIDs)
{
  const char tfecfFuncName[] = "copyAndPermute: ";

  auto srcPtr = getBlockMultiVectorFromSrcDistObject (src);
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
    (srcPtr.is_null (), std::invalid_argument,
     "The source of an Import or Export to a BlockMultiVector "
     "must also be a BlockMultiVector.");
  const auto ret = copyAndPermuteImpl (*srcPtr, numSameIDs,
                                       permuteToLIDs, permuteFromLIDs);
  // TODO (mfh 15 Apr 2019) Instead of throwing here, which could
  // cause an MPI parallel hang, we should record the error in *this
  // (the target BMV instance).
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
    (ret.first != 0, std::runtime_error, "copyAndPermuteImpl "
     "reports an error: " << * (ret.second));
}

template<class SC, class LO, class GO, class NT>
std::pair<int, std::unique_ptr<std::string>>
BlockMultiVector<SC, LO, GO, NT>::
packAndPrepareImpl
  (const Kokkos::DualView<
     const local_ordinal_type*,
     buffer_device_type
   >& exportLIDs,
   Kokkos::DualView<
     impl_scalar_type*,
     buffer_device_type
   >& exports,
   Kokkos::DualView<
     size_t*,
     buffer_device_type
   > /* numPacketsPerLID */,
   size_t& constantNumPackets,
   Distributor& /* distor */) const
{
  using std::endl;
  using IST = impl_scalar_type;
  using exports_dv_type = Kokkos::DualView<packet_type*, buffer_device_type>;
  using host_device_type = typename exports_dv_type::t_host::device_type;
  using dst_little_vec_type = Kokkos::View<IST*, host_device_type,
    Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
  const char tfecfFuncName[] = "packAndPrepareImpl: ";
  std::unique_ptr<std::string> errMsg;
  const bool debug = ::Tpetra::Details::Behavior::debug ("BlockMultiVector");

  std::unique_ptr<std::string> prefix;
  if (debug) {
    const int myRank = this->getMap ()->getComm ()->getRank ();
    std::ostringstream os;
    os << "Proc " << myRank << ": " << tfecfFuncName;
    prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
    os << "Start" << endl;
    std::cerr << os.str ();
  }
  if (exportLIDs.need_sync_host ()) {
    const int myRank = this->getMap ()->getComm ()->getRank ();
    std::ostringstream os;
    os << "Proc " << myRank << ": " << tfecfFuncName << "exportLIDs "
      "needs sync to host. This should never happen.";
    errMsg = std::unique_ptr<std::string> (new std::string (os.str ()));
    return { -1, std::move (errMsg) };
  }

  const size_t numVecs = static_cast<size_t> (this->getNumVectors ());
  const size_t blockSize = static_cast<size_t> (this->getBlockSize ());

  // Number of things to pack per LID is the block size, times the
  // number of columns. Input LIDs correspond to the mesh points, not
  // the degrees of freedom (DOFs).
  constantNumPackets = blockSize * numVecs;
  const size_t numMeshLIDs = static_cast<size_t> (exportLIDs.extent (0));
  const size_t requiredExportsSize = numMeshLIDs * blockSize * numVecs;

  if (static_cast<size_t> (exports.extent (0)) != requiredExportsSize) {
    exports = exports_dv_type ();
    exports = exports_dv_type ("exports", requiredExportsSize);
  }
  exports.clear_sync_state ();
  exports.modify_host ();
  auto exports_h = exports.view_host ();
  auto exportLIDs_h = exportLIDs.view_host ();

  try {
    size_t curExportPos = 0;
    for (size_t meshLidIndex = 0; meshLidIndex < numMeshLIDs; ++meshLidIndex) {
      for (size_t j = 0; j < numVecs; ++j, curExportPos += blockSize) {
        const LO meshLid = exportLIDs_h[meshLidIndex];

        IST* const curExportPtr = &exports_h[curExportPos];
        dst_little_vec_type X_dst (curExportPtr, blockSize);
        auto X_src = this->getLocalBlock (meshLid, j);

        Kokkos::deep_copy (X_dst, X_src);
      }
    }
  }
  catch (std::exception& e) {
    const int myRank = this->getMap ()->getComm ()->getRank ();
    std::ostringstream os;
    os << "Proc " << myRank << ": " << tfecfFuncName
       << " threw an exception: " << e.what ();
    errMsg = std::unique_ptr<std::string> (new std::string (os.str ()));
    return { -2, std::move (errMsg) };
  }

  if (debug) {
    std::ostringstream os;
    os << *prefix << "Done" << endl;
    std::cerr << os.str ();
  }
  return { 0, std::move (errMsg) };
}

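// Export buffer layout produced by packAndPrepareImpl above
// (illustrative only, not part of the original source). Entries are
// grouped first by export LID, then by column, then by degree of
// freedom within the block. For blockSize = 2 and numVecs = 2,
//
//   exports = [ lid0/col0/dof0, lid0/col0/dof1,
//               lid0/col1/dof0, lid0/col1/dof1,
//               lid1/col0/dof0, ... ],
//
// so constantNumPackets = blockSize * numVecs = 4 entries per mesh LID.
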
template<class Scalar, class LO, class GO, class Node>
void BlockMultiVector<Scalar, LO, GO, Node>::
#ifdef TPETRA_ENABLE_DEPRECATED_CODE
packAndPrepareNew
#else // TPETRA_ENABLE_DEPRECATED_CODE
packAndPrepare
#endif // TPETRA_ENABLE_DEPRECATED_CODE
(const SrcDistObject& src,
 const Kokkos::DualView<const local_ordinal_type*,
   buffer_device_type>& exportLIDs,
 Kokkos::DualView<packet_type*,
   buffer_device_type>& exports,
 Kokkos::DualView<size_t*,
   buffer_device_type> numPacketsPerLID,
 size_t& constantNumPackets,
 Distributor& distor)
{
  const char tfecfFuncName[] = "packAndPrepare: ";

  auto srcAsBmvPtr = getBlockMultiVectorFromSrcDistObject (src);
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
    (srcAsBmvPtr.is_null (), std::invalid_argument,
     "The source of an Import or Export to a BlockMultiVector "
     "must also be a BlockMultiVector.");
  const auto ret =
    srcAsBmvPtr->packAndPrepareImpl (exportLIDs, exports,
                                     numPacketsPerLID,
                                     constantNumPackets, distor);
  // TODO (mfh 15 Apr 2019) Instead of throwing here, which could
  // cause an MPI parallel hang, we should record the error in *this
  // (the target BMV instance).
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
    (ret.first != 0, std::runtime_error, "packAndPrepareImpl "
     "reports an error: " << * (ret.second));
}

template<class SC, class LO, class GO, class NT>
std::pair<int, std::unique_ptr<std::string>>
BlockMultiVector<SC, LO, GO, NT>::
unpackAndCombineImpl
  (const Kokkos::DualView<
     const local_ordinal_type*,
     buffer_device_type
   >& importLIDs,
   Kokkos::DualView<
     impl_scalar_type*,
     buffer_device_type
   > imports,
   Kokkos::DualView<
     size_t*,
     buffer_device_type
   > numPacketsPerLID,
   const size_t constantNumPackets,
   Distributor& distor,
   const CombineMode combineMode)
{
  using std::endl;
  using IST = impl_scalar_type;
  using KAT = Kokkos::ArithTraits<IST>;
  using imports_dv_type = Kokkos::DualView<packet_type*, buffer_device_type>;
  using host_device_type = typename imports_dv_type::t_host::device_type;
  using src_little_vec_type = Kokkos::View<const IST*, host_device_type,
    Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
  const char tfecfFuncName[] = "unpackAndCombineImpl: ";
  std::unique_ptr<std::string> errMsg;
  const bool debug = ::Tpetra::Details::Behavior::debug ("BlockMultiVector");

  std::unique_ptr<std::string> prefix;
  if (debug) {
    const int myRank = this->getMap ()->getComm ()->getRank ();
    std::ostringstream os;
    os << "Proc " << myRank << ": " << tfecfFuncName;
    prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
    os << "Start" << endl;
    std::cerr << os.str ();
  }

  if (combineMode != ADD && combineMode != REPLACE &&
      combineMode != INSERT && combineMode != ABSMAX &&
      combineMode != ZERO) {
    std::ostringstream os;
    os << tfecfFuncName << "Invalid CombineMode: " << combineMode << ". "
      "Valid CombineMode values: ADD, REPLACE, INSERT, ABSMAX, and ZERO.";
    errMsg = std::unique_ptr<std::string> (new std::string (os.str ()));
    return { -1, std::move (errMsg) };
  }
  if (combineMode == ZERO) {
    return { 0, std::move (errMsg) };
  }

  if (importLIDs.need_sync_host ()) {
    const int myRank = this->getMap ()->getComm ()->getRank ();
    std::ostringstream os;
    os << "Proc " << myRank << ": " << tfecfFuncName << "importLIDs "
      "needs sync to host. This should never happen.";
    errMsg = std::unique_ptr<std::string> (new std::string (os.str ()));
    return { -2, std::move (errMsg) };
  }

  // Number of things to pack per LID is the block size.
  // Input LIDs correspond to the mesh points, not the DOFs.
  const size_t numMeshLIDs = static_cast<size_t> (importLIDs.extent (0));
  const size_t blockSize = static_cast<size_t> (this->getBlockSize ());
  const size_t numVecs = static_cast<size_t> (this->getNumVectors ());

  const size_t requiredImportsSize = numMeshLIDs * blockSize * numVecs;
  if (static_cast<size_t> (imports.extent (0)) < requiredImportsSize) {
    const int myRank = this->getMap ()->getComm ()->getRank ();
    std::ostringstream os;
    os << "Proc " << myRank << ": " << tfecfFuncName << "imports.extent(0) = "
       << imports.extent (0) << " < requiredImportsSize = "
       << requiredImportsSize << ".";
    errMsg = std::unique_ptr<std::string> (new std::string (os.str ()));
    return { -3, std::move (errMsg) };
  }

  auto importLIDs_h = importLIDs.view_host ();
  if (imports.need_sync_host ()) {
    imports.sync_host ();
  }
  auto imports_h = imports.view_host ();

  size_t curImportPos = 0;
  for (size_t meshLidIndex = 0; meshLidIndex < numMeshLIDs; ++meshLidIndex) {
    for (size_t j = 0; j < numVecs; ++j, curImportPos += blockSize) {
      const LO meshLid = importLIDs_h[meshLidIndex];
      const IST* const curImportPtr = &imports_h[curImportPos];

      src_little_vec_type X_src (curImportPtr, blockSize);
      auto X_dst = this->getLocalBlock (meshLid, j);

      if (combineMode == INSERT || combineMode == REPLACE) {
        deep_copy (X_dst, X_src);
      }
      else if (combineMode == ADD) {
        AXPY (static_cast<IST> (KAT::one ()), X_src, X_dst);
      }
      else if (combineMode == ABSMAX) {
        ::Tpetra::Experimental::Impl::absMax (X_dst, X_src);
      }
    }
  }

  if (debug) {
    std::ostringstream os;
    os << *prefix << "Done" << endl;
    std::cerr << os.str ();
  }
  return { 0, std::move (errMsg) };
}

template<class Scalar, class LO, class GO, class Node>
void BlockMultiVector<Scalar, LO, GO, Node>::
#ifdef TPETRA_ENABLE_DEPRECATED_CODE
unpackAndCombineNew
#else // TPETRA_ENABLE_DEPRECATED_CODE
unpackAndCombine
#endif // TPETRA_ENABLE_DEPRECATED_CODE
(const Kokkos::DualView<const local_ordinal_type*,
   buffer_device_type>& importLIDs,
 Kokkos::DualView<packet_type*,
   buffer_device_type> imports,
 Kokkos::DualView<size_t*,
   buffer_device_type> numPacketsPerLID,
 const size_t constantNumPackets,
 Distributor& distor,
 const CombineMode combineMode)
{
  const char tfecfFuncName[] = "unpackAndCombine: ";
  const auto ret =
    unpackAndCombineImpl (importLIDs, imports, numPacketsPerLID,
                          constantNumPackets, distor, combineMode);
  // TODO (mfh 15 Apr 2019) Instead of throwing here, which could
  // cause an MPI parallel hang, we should record the error in *this
  // (the target BMV instance).
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
    (ret.first != 0, std::runtime_error, "unpackAndCombineImpl "
     "reports an error: " << * (ret.second));
}

template<class Scalar, class LO, class GO, class Node>
bool BlockMultiVector<Scalar, LO, GO, Node>::
isValidLocalMeshIndex (const LO meshLocalIndex) const
{
  return meshLocalIndex != Teuchos::OrdinalTraits<LO>::invalid () &&
    meshMap_.isNodeLocalElement (meshLocalIndex);
}

template<class Scalar, class LO, class GO, class Node>
void BlockMultiVector<Scalar, LO, GO, Node>::
putScalar (const Scalar& val)
{
  mv_.putScalar (val);
}

template<class Scalar, class LO, class GO, class Node>
void BlockMultiVector<Scalar, LO, GO, Node>::
scale (const Scalar& val)
{
  mv_.scale (val);
}

template<class Scalar, class LO, class GO, class Node>
void BlockMultiVector<Scalar, LO, GO, Node>::
update (const Scalar& alpha,
        const BlockMultiVector<Scalar, LO, GO, Node>& X,
        const Scalar& beta)
{
  mv_.update (alpha, X.mv_, beta);
}

namespace Impl {
// Y := alpha * D * X, applied block by block.
template <typename Scalar, typename ViewY, typename ViewD, typename ViewX>
struct BlockWiseMultiply {
  typedef typename ViewD::size_type Size;

private:
  typedef typename ViewD::device_type Device;
  typedef typename ViewD::non_const_value_type ImplScalar;
  typedef Kokkos::MemoryTraits<Kokkos::Unmanaged> Unmanaged;

  template <typename View>
  using UnmanagedView = Kokkos::View<typename View::data_type, typename View::array_layout,
                                     typename View::device_type, Unmanaged>;
  typedef UnmanagedView<ViewY> UnMViewY;
  typedef UnmanagedView<ViewD> UnMViewD;
  typedef UnmanagedView<ViewX> UnMViewX;

  const Size block_size_;
  Scalar alpha_;
  UnMViewY Y_;
  UnMViewD D_;
  UnMViewX X_;

public:
  BlockWiseMultiply (const Size block_size, const Scalar& alpha,
                     const ViewY& Y, const ViewD& D, const ViewX& X)
    : block_size_(block_size), alpha_(alpha), Y_(Y), D_(D), X_(X)
  {}

  KOKKOS_INLINE_FUNCTION
  void operator() (const Size k) const {
    const auto zero = Kokkos::Details::ArithTraits<Scalar>::zero();
    auto D_curBlk = Kokkos::subview(D_, k, Kokkos::ALL (), Kokkos::ALL ());
    const auto num_vecs = X_.extent(1);
    for (Size i = 0; i < num_vecs; ++i) {
      Kokkos::pair<Size, Size> kslice(k*block_size_, (k+1)*block_size_);
      auto X_curBlk = Kokkos::subview(X_, kslice, i);
      auto Y_curBlk = Kokkos::subview(Y_, kslice, i);
      // Y_curBlk := alpha * D_curBlk * X_curBlk.
      // Recall that GEMV does an update (+=) of the last argument.
      Tpetra::Experimental::FILL(Y_curBlk, zero);
      Tpetra::Experimental::GEMV(alpha_, D_curBlk, X_curBlk, Y_curBlk);
    }
  }
};

template <typename Scalar, typename ViewY, typename ViewD, typename ViewX>
inline BlockWiseMultiply<Scalar, ViewY, ViewD, ViewX>
createBlockWiseMultiply (const int block_size, const Scalar& alpha,
                         const ViewY& Y, const ViewD& D, const ViewX& X) {
  return BlockWiseMultiply<Scalar, ViewY, ViewD, ViewX>(block_size, alpha, Y, D, X);
}

template <typename ViewY,
          typename Scalar,
          typename ViewD,
          typename ViewZ,
          typename LO = typename ViewY::size_type>
class BlockJacobiUpdate {
private:
  typedef typename ViewD::device_type Device;
  typedef typename ViewD::non_const_value_type ImplScalar;
  typedef Kokkos::MemoryTraits<Kokkos::Unmanaged> Unmanaged;

  template <typename ViewType>
  using UnmanagedView = Kokkos::View<typename ViewType::data_type,
                                     typename ViewType::array_layout,
                                     typename ViewType::device_type,
                                     Unmanaged>;
  typedef UnmanagedView<ViewY> UnMViewY;
  typedef UnmanagedView<ViewD> UnMViewD;
  typedef UnmanagedView<ViewZ> UnMViewZ;

  const LO blockSize_;
  UnMViewY Y_;
  const Scalar alpha_;
  UnMViewD D_;
  UnMViewZ Z_;
  const Scalar beta_;

public:
  BlockJacobiUpdate (const ViewY& Y,
                     const Scalar& alpha,
                     const ViewD& D,
                     const ViewZ& Z,
                     const Scalar& beta) :
    blockSize_ (D.extent (1)),
    // numVecs_ (static_cast<int> (ViewY::rank) == 1 ? static_cast<size_type> (1) : static_cast<size_type> (Y_.extent (1))),
    Y_ (Y),
    alpha_ (alpha),
    D_ (D),
    Z_ (Z),
    beta_ (beta)
  {
    static_assert (static_cast<int> (ViewY::rank) == 1,
                   "Y must have rank 1.");
    static_assert (static_cast<int> (ViewD::rank) == 3, "D must have rank 3.");
    static_assert (static_cast<int> (ViewZ::rank) == 1,
                   "Z must have rank 1.");
    // static_assert (static_cast<int> (ViewZ::rank) ==
    //                static_cast<int> (ViewY::rank),
    //                "Z must have the same rank as Y.");
  }

  KOKKOS_INLINE_FUNCTION void
  operator() (const LO& k) const
  {
    using Kokkos::ALL;
    using Kokkos::subview;
    typedef Kokkos::pair<LO, LO> range_type;
    typedef Kokkos::Details::ArithTraits<Scalar> KAT;

    // We only have to implement the alpha != 0 case.

    auto D_curBlk = subview (D_, k, ALL (), ALL ());
    const range_type kslice (k*blockSize_, (k+1)*blockSize_);

    // Z.update (STS::one (), X, -STS::one ()); // assume this is done

    auto Z_curBlk = subview (Z_, kslice);
    auto Y_curBlk = subview (Y_, kslice);
    // Y_curBlk := beta * Y_curBlk + alpha * D_curBlk * Z_curBlk
    if (beta_ == KAT::zero ()) {
      Tpetra::Experimental::FILL (Y_curBlk, KAT::zero ());
    }
    else if (beta_ != KAT::one ()) {
      Tpetra::Experimental::SCAL (beta_, Y_curBlk);
    }
    Tpetra::Experimental::GEMV (alpha_, D_curBlk, Z_curBlk, Y_curBlk);
  }
};

template<class ViewY,
         class Scalar,
         class ViewD,
         class ViewZ,
         class LO = typename ViewD::size_type>
void
blockJacobiUpdate (const ViewY& Y,
                   const Scalar& alpha,
                   const ViewD& D,
                   const ViewZ& Z,
                   const Scalar& beta)
{
  static_assert (Kokkos::Impl::is_view<ViewY>::value, "Y must be a Kokkos::View.");
  static_assert (Kokkos::Impl::is_view<ViewD>::value, "D must be a Kokkos::View.");
  static_assert (Kokkos::Impl::is_view<ViewZ>::value, "Z must be a Kokkos::View.");
  static_assert (static_cast<int> (ViewY::rank) == static_cast<int> (ViewZ::rank),
                 "Y and Z must have the same rank.");
  static_assert (static_cast<int> (ViewD::rank) == 3, "D must have rank 3.");

  const auto lclNumMeshRows = D.extent (0);

#ifdef HAVE_TPETRA_DEBUG
  // D.extent(0) is the (local) number of mesh rows.
  // D.extent(1) is the block size. Thus, their product should be
  // the local number of point rows, that is, the number of rows in Y.
  const auto blkSize = D.extent (1);
  const auto lclNumPtRows = lclNumMeshRows * blkSize;
  TEUCHOS_TEST_FOR_EXCEPTION
    (Y.extent (0) != lclNumPtRows, std::invalid_argument,
     "blockJacobiUpdate: Y.extent(0) = " << Y.extent (0) << " != "
     "D.extent(0)*D.extent(1) = " << lclNumMeshRows << " * " << blkSize
     << " = " << lclNumPtRows << ".");
  TEUCHOS_TEST_FOR_EXCEPTION
    (Y.extent (0) != Z.extent (0), std::invalid_argument,
     "blockJacobiUpdate: Y.extent(0) = " << Y.extent (0) << " != "
     "Z.extent(0) = " << Z.extent (0) << ".");
  TEUCHOS_TEST_FOR_EXCEPTION
    (Y.extent (1) != Z.extent (1), std::invalid_argument,
     "blockJacobiUpdate: Y.extent(1) = " << Y.extent (1) << " != "
     "Z.extent(1) = " << Z.extent (1) << ".");
#endif // HAVE_TPETRA_DEBUG

  BlockJacobiUpdate<ViewY, Scalar, ViewD, ViewZ, LO> functor (Y, alpha, D, Z, beta);
  typedef Kokkos::RangePolicy<typename ViewY::execution_space, LO> range_type;
  // lclNumMeshRows must fit in LO, else the Map would not be correct.
  range_type range (0, static_cast<LO> (lclNumMeshRows));
  Kokkos::parallel_for (range, functor);
}

} // namespace Impl

template<class Scalar, class LO, class GO, class Node>
void BlockMultiVector<Scalar, LO, GO, Node>::
blockWiseMultiply (const Scalar& alpha,
                   const Kokkos::View<const impl_scalar_type***,
                     device_type, Kokkos::MemoryUnmanaged>& D,
                   const BlockMultiVector<Scalar, LO, GO, Node>& X)
{
  using Kokkos::ALL;
  typedef typename device_type::execution_space execution_space;
  typedef typename device_type::memory_space memory_space;
  const LO lclNumMeshRows = meshMap_.getNodeNumElements ();

  if (alpha == STS::zero ()) {
    this->putScalar (STS::zero ());
  }
  else { // alpha != 0
    const LO blockSize = this->getBlockSize ();
    const impl_scalar_type alphaImpl = static_cast<impl_scalar_type> (alpha);
    auto X_lcl = X.mv_.template getLocalView<memory_space> ();
    auto Y_lcl = this->mv_.template getLocalView<memory_space> ();
    auto bwm = Impl::createBlockWiseMultiply (blockSize, alphaImpl, Y_lcl, D, X_lcl);

    // Use an explicit RangePolicy with the desired execution space.
    // Otherwise, if you just use a number, it runs on the default
    // execution space. For example, if the default execution space
    // is Cuda but the desired execution space is Serial, using just
    // a number would incorrectly run with Cuda.
    Kokkos::RangePolicy<execution_space, LO> range (0, lclNumMeshRows);
    Kokkos::parallel_for (range, bwm);
  }
}

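// Usage sketch for blockWiseMultiply above (illustrative only, not
// part of the original source; Y, D, and X are hypothetical). D must
// be a rank-3 device view holding one blockSize x blockSize matrix
// per owned mesh point:
//
//   // Y := 2 * D * X, applied block by block on the device.
//   Y.blockWiseMultiply (static_cast<Scalar> (2.0), D, X);
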
template<class Scalar, class LO, class GO, class Node>
void BlockMultiVector<Scalar, LO, GO, Node>::
blockJacobiUpdate (const Scalar& alpha,
                   const Kokkos::View<const impl_scalar_type***,
                     device_type, Kokkos::MemoryUnmanaged>& D,
                   const BlockMultiVector<Scalar, LO, GO, Node>& X,
                   BlockMultiVector<Scalar, LO, GO, Node>& Z,
                   const Scalar& beta)
{
  using Kokkos::ALL;
  using Kokkos::subview;
  typedef typename device_type::memory_space memory_space;
  typedef impl_scalar_type IST;

  const IST alphaImpl = static_cast<IST> (alpha);
  const IST betaImpl = static_cast<IST> (beta);
  const LO numVecs = mv_.getNumVectors ();

  auto X_lcl = X.mv_.template getLocalView<memory_space> ();
  auto Y_lcl = this->mv_.template getLocalView<memory_space> ();
  auto Z_lcl = Z.mv_.template getLocalView<memory_space> ();

  if (alpha == STS::zero ()) { // Y := beta * Y
    this->scale (beta);
  }
  else { // alpha != 0
    Z.update (STS::one (), X, -STS::one ()); // Z := X - Z
    for (LO j = 0; j < numVecs; ++j) {
      auto X_lcl_j = subview (X_lcl, ALL (), j);
      auto Y_lcl_j = subview (Y_lcl, ALL (), j);
      auto Z_lcl_j = subview (Z_lcl, ALL (), j);
      Impl::blockJacobiUpdate (Y_lcl_j, alphaImpl, D, Z_lcl_j, betaImpl);
    }
  }
}

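// Usage sketch of one damped block Jacobi sweep built on
// blockJacobiUpdate above (illustrative only, not part of the
// original source; A, D, X, Y, Z, and omega are hypothetical, and
// applyBlock stands in for a block matrix-vector product such as
// Tpetra::Experimental::BlockCrsMatrix::applyBlock). Note that
// blockJacobiUpdate overwrites Z with X - Z as a side effect:
//
//   A.applyBlock (Y, Z); // Z := A * Y
//   // Y := Y + omega * D * (X - A*Y), i.e., beta = 1:
//   Y.blockJacobiUpdate (omega, D, X, Z, Teuchos::ScalarTraits<Scalar>::one ());
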
} // namespace Experimental
} // namespace Tpetra

//
// Explicit instantiation macro
//
// Must be expanded from within the Tpetra namespace!
//
#define TPETRA_EXPERIMENTAL_BLOCKMULTIVECTOR_INSTANT(S,LO,GO,NODE) \
  namespace Experimental { \
    template class BlockMultiVector< S, LO, GO, NODE >; \
  }

#endif // TPETRA_EXPERIMENTAL_BLOCKMULTIVECTOR_DEF_HPP