41 #ifndef TPETRA_DETAILS_RESIDUAL_HPP
42 #define TPETRA_DETAILS_RESIDUAL_HPP
44 #include "TpetraCore_config.h"
45 #include "Tpetra_CrsMatrix.hpp"
46 #include "Tpetra_LocalCrsMatrixOperator.hpp"
47 #include "Tpetra_MultiVector.hpp"
48 #include "Teuchos_RCP.hpp"
49 #include "Teuchos_ScalarTraits.hpp"
63 template<
class ExecutionSpace>
65 residual_launch_parameters (int64_t numRows,
67 int64_t rows_per_thread,
71 using execution_space =
typename ExecutionSpace::execution_space;
73 int64_t rows_per_team;
74 int64_t nnz_per_row = nnz/numRows;
76 if (nnz_per_row < 1) {
80 if (vector_length < 1) {
82 while (vector_length<32 && vector_length*6 < nnz_per_row) {
88 if (rows_per_thread < 1) {
89 #ifdef KOKKOS_ENABLE_CUDA
90 if (std::is_same<Kokkos::Cuda, execution_space>::value) {
96 if (nnz_per_row < 20 && nnz > 5000000) {
97 rows_per_thread = 256;
100 rows_per_thread = 64;
105 #ifdef KOKKOS_ENABLE_CUDA
107 if (std::is_same<Kokkos::Cuda,execution_space>::value) {
108 team_size = 256/vector_length;
116 rows_per_team = rows_per_thread * team_size;
118 if (rows_per_team < 0) {
119 int64_t nnz_per_team = 4096;
120 int64_t conc = execution_space::concurrency ();
121 while ((conc * nnz_per_team * 4 > nnz) &&
122 (nnz_per_team > 256)) {
125 rows_per_team = (nnz_per_team + nnz_per_row - 1) / nnz_per_row;
128 return rows_per_team;
135 template<
class SC,
class LO,
class GO,
class NO>
136 void localResidual(
const CrsMatrix<SC,LO,GO,NO> & A,
137 const MultiVector<SC,LO,GO,NO> & X,
138 const MultiVector<SC,LO,GO,NO> & B,
139 MultiVector<SC,LO,GO,NO> & R) {
141 using Teuchos::NO_TRANS;
142 ProfilingRegion regionLocalApply (
"Tpetra::CrsMatrix::localResidual");
144 auto A_lcl = A.getLocalMatrix ();
145 auto X_lcl = X.getLocalViewDevice ();
146 auto B_lcl = B.getLocalViewDevice ();
147 auto R_lcl = R.getLocalViewDevice ();
148 auto lclMatrix_ = A.getLocalMatrix ();
152 TEUCHOS_TEST_FOR_EXCEPTION
153 (X.getNumVectors () != R.getNumVectors (), std::runtime_error,
154 "X.getNumVectors() = " << X.getNumVectors () <<
" != "
155 "R.getNumVectors() = " << R.getNumVectors () <<
".");
156 TEUCHOS_TEST_FOR_EXCEPTION
157 (X.getNumVectors () != R.getNumVectors (), std::runtime_error,
158 "X.getNumVectors() = " << X.getNumVectors () <<
" != "
159 "R.getNumVectors() = " << R.getNumVectors () <<
".");
161 TEUCHOS_TEST_FOR_EXCEPTION
162 (X.getLocalLength () !=
163 A.getColMap ()->getNodeNumElements (), std::runtime_error,
164 "X has the wrong number of local rows. "
165 "X.getLocalLength() = " << X.getLocalLength () <<
" != "
166 "A.getColMap()->getNodeNumElements() = " <<
167 A.getColMap ()->getNodeNumElements () <<
".");
168 TEUCHOS_TEST_FOR_EXCEPTION
169 (R.getLocalLength () !=
170 A.getRowMap ()->getNodeNumElements (), std::runtime_error,
171 "R has the wrong number of local rows. "
172 "R.getLocalLength() = " << R.getLocalLength () <<
" != "
173 "A.getRowMap()->getNodeNumElements() = " <<
174 A.getRowMap ()->getNodeNumElements () <<
".");
175 TEUCHOS_TEST_FOR_EXCEPTION
176 (B.getLocalLength () !=
177 A.getRowMap ()->getNodeNumElements (), std::runtime_error,
178 "B has the wrong number of local rows. "
179 "B.getLocalLength() = " << B.getLocalLength () <<
" != "
180 "A.getRowMap()->getNodeNumElements() = " <<
181 A.getRowMap ()->getNodeNumElements () <<
".");
183 TEUCHOS_TEST_FOR_EXCEPTION
184 (! A.isFillComplete (), std::runtime_error,
"The matrix A is not "
185 "fill complete. You must call fillComplete() (possibly with "
186 "domain and range Map arguments) without an intervening "
187 "resumeFill() call before you may call this method.");
188 TEUCHOS_TEST_FOR_EXCEPTION
189 (! X.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
190 std::runtime_error,
"X, Y and B must be constant stride.");
193 TEUCHOS_TEST_FOR_EXCEPTION
194 ((X_lcl.data () == R_lcl.data () && X_lcl.data () !=
nullptr) ||
195 (X_lcl.data () == B_lcl.data () && X_lcl.data () !=
nullptr),
196 std::runtime_error,
"X, Y and R may not alias one another.");
199 SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
200 #ifdef TPETRA_DETAILS_USE_REFERENCE_RESIDUAL
203 A.localApply(X,R,Teuchos::NO_TRANS, one, zero);
204 R.update(one,B,negone);
206 using execution_space =
typename CrsMatrix<SC,LO,GO,NO>::execution_space;
208 if (A_lcl.numRows() == 0) {
212 int64_t numLocalRows = A_lcl.numRows ();
213 int64_t myNnz = A_lcl.nnz();
217 LO maxRowImbalance = 0;
218 if(numLocalRows != 0)
219 maxRowImbalance = A.getNodeMaxNumRowEntries() - (myNnz / numLocalRows);
223 auto lclOp = A.getLocalMultiplyOperator();
225 lclOp->applyImbalancedRows (X_lcl, R_lcl, Teuchos::NO_TRANS, one, zero);
226 R.update(one,B,negone);
231 int vector_length = -1;
232 int64_t rows_per_thread = -1;
234 int64_t rows_per_team =
235 residual_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
236 int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
238 using policy_type =
typename Kokkos::TeamPolicy<execution_space>;
239 using team_member =
typename policy_type::member_type;
241 using residual_value_type =
typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_value_type;
242 using KAT = Kokkos::ArithTraits<residual_value_type>;
244 policy_type policy (1, 1);
246 policy = policy_type (worksets, Kokkos::AUTO, vector_length);
249 policy = policy_type (worksets, team_size, vector_length);
252 bool is_vector = (X_lcl.extent(1) == 1);
257 Kokkos::parallel_for(
"residual-vector",policy,KOKKOS_LAMBDA(
const team_member& dev) {
258 Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (
const LO& loop) {
259 const LO lclRow =
static_cast<LO
> (dev.league_rank ()) * rows_per_team + loop;
261 if (lclRow >= A_lcl.numRows ()) {
265 const auto A_row = A_lcl.rowConst(lclRow);
266 const LO row_length =
static_cast<LO
> (A_row.length);
267 residual_value_type A_x = KAT::zero ();
269 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (
const LO iEntry, residual_value_type& lsum) {
270 const auto A_val = A_row.value(iEntry);
271 lsum += A_val * X_lcl(A_row.colidx(iEntry),0);
274 Kokkos::single(Kokkos::PerThread(dev),[&] () {
275 R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x;
282 Kokkos::parallel_for(
"residual-multivector",policy,KOKKOS_LAMBDA(
const team_member& dev) {
285 const LO numVectors =
static_cast<LO
>(X_lcl.extent(1));
286 Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (
const LO& loop) {
287 const LO lclRow =
static_cast<LO
> (dev.league_rank ()) * rows_per_team + loop;
289 if (lclRow >= A_lcl.numRows ()) {
292 const auto A_row = A_lcl.rowConst(lclRow);
293 const LO row_length =
static_cast<LO
> (A_row.length);
294 for(LO v=0; v<numVectors; v++) {
295 residual_value_type A_x = KAT::zero ();
297 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (
const LO iEntry, residual_value_type& lsum) {
298 const auto A_val = A_row.value(iEntry);
299 lsum += A_val * X_lcl(A_row.colidx(iEntry),v);
302 Kokkos::single(Kokkos::PerThread(dev),[&] () {
303 R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x;
315 template<
class SC,
class LO,
class GO,
class NO>
323 using Teuchos::rcp_const_cast;
324 using Teuchos::rcpFromRef;
329 SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
330 Aop.
apply(X_in,R_in,Teuchos::NO_TRANS, one, zero);
331 R_in.
update(one,B_in,negone);
341 const bool R_is_replicated =
350 RCP<const import_type> importer = A.
getGraph ()->getImporter ();
351 RCP<const export_type> exporter = A.
getGraph ()->getExporter ();
356 RCP<const MV> X_colMap;
357 if (importer.is_null ()) {
367 X_colMap = rcp_const_cast<
const MV> (X_colMapNonConst);
372 X_colMap = rcpFromRef (X_in);
384 X_colMapNonConst->doImport (X_in, *importer,
INSERT);
385 X_colMap = rcp_const_cast<
const MV> (X_colMapNonConst);
392 if(exporter.is_null()) {
397 R_rowMap = rcpFromRef (R_in);
406 RCP<const MV> B_rowMap;
407 if(exporter.is_null()) {
413 B_rowMap = rcp_const_cast<
const MV> (B_rowMapNonConst);
416 B_rowMap = rcpFromRef (B_in);
422 ProfilingRegion regionExport (
"Tpetra::CrsMatrix::residual: B Import");
424 B_rowMapNonConst->doImport(B_in, *exporter,
ADD);
425 B_rowMap = rcp_const_cast<
const MV> (B_rowMapNonConst);
433 if (! exporter.is_null ()) {
435 localResidual (A, *X_colMap, *B_rowMap, *R_rowMap);
438 ProfilingRegion regionExport (
"Tpetra::CrsMatrix::residual: R Export");
453 localResidual (A, *X_colMap, *B_rowMap, *R_rowMap);
457 localResidual (A, *X_colMap, *B_rowMap, *R_rowMap);
463 if (R_is_replicated) {
464 ProfilingRegion regionReduce (
"Tpetra::CrsMatrix::residual: Reduce Y");
476 #endif // TPETRA_DETAILS_RESIDUAL_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
size_t getNumVectors() const
Number of columns in the multivector.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
One or more distributed dense vectors.
virtual void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const =0
Computes the operator-multivector application.
bool isDistributed() const
Whether this is a globally distributed object.
static bool debug()
Whether Tpetra is in debug mode.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Abstract interface for operators (e.g., matrices and preconditioners).
void update(const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta)
Update: this = beta*this + alpha*A.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Teuchos::RCP< MV > getColumnMapMultiVector(const MV &X_domainMap, const bool force=false) const
Create a (or fetch a cached) column Map MultiVector.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
Sum new values into existing values.
Teuchos::RCP< MV > getRowMapMultiVector(const MV &Y_rangeMap, const bool force=false) const
Create a (or fetch a cached) row Map MultiVector.
static size_t rowImbalanceThreshold()
Threshold for deciding if a local matrix is "imbalanced" in the number of entries per row...
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object ("forward mode").
void reduce()
Sum values of a locally replicated multivector across all processes.
void residual(const Operator< SC, LO, GO, NO > &A, const MultiVector< SC, LO, GO, NO > &X, const MultiVector< SC, LO, GO, NO > &B, MultiVector< SC, LO, GO, NO > &R)
Computes R = B - A * X.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.