Ifpack2 Templated Preconditioning Package, Version 1.0
Ifpack2_BlockComputeResidualVector.hpp
1 /*@HEADER
2 // ***********************************************************************
3 //
4 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ***********************************************************************
40 //@HEADER
41 */
42 
43 #ifndef IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
44 #define IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
45 
46 #include "Ifpack2_BlockHelper.hpp"
47 
48 namespace Ifpack2 {
49 
50  namespace BlockHelperDetails {
51 
55  template <typename MatrixType>
56  struct AmD {
57  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
58  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
59  using size_type_1d_view = typename impl_type::size_type_1d_view;
60  using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
61  // rowptr points to the start of each row of A_colindsub.
62  size_type_1d_view rowptr, rowptr_remote;
63  // Indices into A's rows giving the blocks to extract. rowptr(i) points to
64  // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
65  // where g is A's graph, are the columns AmD uses. If seq_method_, then
66  // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If !
67  // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
68  // contains the remote ones.
69  local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
70 
71  // Currently always true.
72  bool is_tpetra_block_crs;
73 
74  // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
75  impl_scalar_type_1d_view_tpetra tpetra_values;
76 
77  AmD() = default;
78  AmD(const AmD &b) = default;
79  };
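//
// Illustrative sketch (editorial addition, not in the upstream header): AmD holds the R
// part of the splitting A = D + R described in PartInterface below. Mirroring the loops
// in ComputeResidualVector, the R blocks of a local block row lr with block size bs and
// local block graph g are reached roughly as
//
//   const size_type A_k0 = g.row_map[lr];
//   for (size_type k = rowptr[lr]; k < rowptr[lr+1]; ++k) {
//     const size_type j = A_k0 + A_colindsub[k];              // entry index in A's graph
//     const local_ordinal_type col = g.entries[j];            // block column LID
//     const impl_scalar_type *blk = &tpetra_values(j*bs*bs);  // bs x bs block values
//   }
//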
80 
81  template<typename MatrixType>
82  struct PartInterface {
83  using local_ordinal_type = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type;
84  using local_ordinal_type_1d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view;
85  using local_ordinal_type_2d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_2d_view;
86 
87  PartInterface() = default;
88  PartInterface(const PartInterface &b) = default;
89 
90  // Some terms:
91  // The matrix A is split as A = D + R, where D is the matrix of tridiag
92  // blocks and R is the remainder.
93  // A part is roughly a synonym for a tridiag. The distinction is that a part
94  // is the set of rows belonging to one tridiag and, equivalently, the off-diag
95  // rows in R associated with that tridiag. In contrast, the term tridiag is
96  // used to refer specifically to tridiag data, such as the pointer into the
97  // tridiag data array.
98  // Local (lcl) rows are the LIDs. lclrow lists the LIDs belonging to each
99  // tridiag, and partptr points to the beginning of each tridiag. This is the
100  // LID space.
101  // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
102  // by this ordinal. This is the 'index' space.
103  // A flat index is the mathematical index into an array. A pack index
104  // accounts for SIMD packing.
105 
106  // Local row LIDs. Permutation from caller's index space to tridiag index
107  // space.
108  local_ordinal_type_1d_view lclrow;
109  // partptr_ is the pointer array into lclrow_.
110  local_ordinal_type_1d_view partptr; // np+1
111  local_ordinal_type_2d_view partptr_sub;
112  local_ordinal_type_1d_view partptr_schur;
113  // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
114  // is the start of the i'th pack.
115  local_ordinal_type_1d_view packptr; // npack+1
116  local_ordinal_type_1d_view packptr_sub;
117  local_ordinal_type_1d_view packindices_sub;
118  local_ordinal_type_2d_view packindices_schur;
119  // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
120  // an alias of partptr_ in the case of no overlap.
121  local_ordinal_type_1d_view part2rowidx0; // np+1
122  local_ordinal_type_1d_view part2rowidx0_sub;
123  // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
124  // it's the same as part2rowidx0_; if it's > 1, then the value is combined
125  // with i % vector_length to get the location in the packed data.
126  local_ordinal_type_1d_view part2packrowidx0; // np+1
127  local_ordinal_type_2d_view part2packrowidx0_sub;
128  local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
129  // rowidx2part_ maps the row index to the part index.
130  local_ordinal_type_1d_view rowidx2part; // nr
131  local_ordinal_type_1d_view rowidx2part_sub;
132  // True if lcl{row|col} is at most a constant away from row{idx|col}. In
133  // practice, this knowledge is not particularly useful, as packing for batched
134  // processing is done at the same time as the permutation from LID to index
135  // space. But it's easy to detect, so it's recorded in case an optimization
136  // can be made based on it.
137  bool row_contiguous;
138 
139  local_ordinal_type max_partsz;
140  local_ordinal_type max_subpartsz;
141  local_ordinal_type n_subparts_per_part;
142  local_ordinal_type nparts;
143  };
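//
// Illustrative sketch (editorial addition): the functors below recover a row's packed
// location from this interface. For a row index rowidx they compute
//
//   const local_ordinal_type partidx = rowidx2part(rowidx);
//   const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
//   const local_ordinal_type v = partidx % vector_length;
//
// so that y_packed(pri, k, col)[v] addresses entry k of that block row, with v selecting
// the SIMD lane within the pack.
//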
144 
148  static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
149  const int team_size) {
150  int total_team_size(0);
151  if (blksize <= 5) total_team_size = 32;
152  else if (blksize <= 9) total_team_size = 32; // 64
153  else if (blksize <= 12) total_team_size = 96;
154  else if (blksize <= 16) total_team_size = 128;
155  else if (blksize <= 20) total_team_size = 160;
156  else total_team_size = 160;
157  return total_team_size/team_size;
158  }
159 
160  static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
161  const int team_size) {
162  int total_team_size(0);
163  if (blksize <= 5) total_team_size = 32;
164  else if (blksize <= 9) total_team_size = 32; // 64
165  else if (blksize <= 12) total_team_size = 96;
166  else if (blksize <= 16) total_team_size = 128;
167  else if (blksize <= 20) total_team_size = 160;
168  else total_team_size = 160;
169  return total_team_size/team_size;
170  }
171 
172  static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
173  const int team_size) {
174  int total_team_size(0);
175  if (blksize <= 5) total_team_size = 32;
176  else if (blksize <= 9) total_team_size = 32; // 64
177  else if (blksize <= 12) total_team_size = 96;
178  else if (blksize <= 16) total_team_size = 128;
179  else if (blksize <= 20) total_team_size = 160;
180  else total_team_size = 160;
181  return total_team_size/team_size;
182  }
183 
184  template<typename T>
185  static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
186  const int team_size) {
187  if ( is_cuda<T>::value )
188  return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
189  if ( is_hip<T>::value )
190  return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
191  if ( is_sycl<T>::value )
192  return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
193  return -1;
194  }
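//
// Illustrative usage (editorial addition): the run() methods below pair this helper with a
// fixed team size of 8 on device backends, e.g.
//
//   const int team_size = 8;
//   const int vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
//   const Kokkos::TeamPolicy<execution_space, SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
//
// On host backends the helper returns -1; run() does not call it there and uses a
// RangePolicy instead.
//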
195 
196  template<typename MatrixType>
197  struct ComputeResidualVector {
198  public:
199  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
200  using node_device_type = typename impl_type::node_device_type;
201  using execution_space = typename impl_type::execution_space;
202  using memory_space = typename impl_type::memory_space;
203 
204  using local_ordinal_type = typename impl_type::local_ordinal_type;
205  using size_type = typename impl_type::size_type;
206  using impl_scalar_type = typename impl_type::impl_scalar_type;
207  using magnitude_type = typename impl_type::magnitude_type;
208  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
209  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
211  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
212  using size_type_1d_view = typename impl_type::size_type_1d_view;
213  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
214  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
215  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
216  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
217  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
218  static constexpr int vector_length = impl_type::vector_length;
219 
221  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
222 
223  // enum for max blocksize and vector length
224  enum : int { max_blocksize = 32 };
225 
226  private:
227  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
228  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
229  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
230  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
231  Unmanaged<vector_type_3d_view> y_packed;
232  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
233 
234  // AmD information
235  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
236  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
237  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
238 
239  // block crs graph information
240  // for cuda (kokkos crs graph uses a different size_type from size_t)
241  const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_rowptr;
242  const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;
243 
244  // blocksize
245  const local_ordinal_type blocksize_requested;
246 
247  // part interface
248  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
249  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
250  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
251  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
252  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
253  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
254  const bool is_dm2cm_active;
255 
256  public:
257  template<typename LocalCrsGraphType>
258  ComputeResidualVector(const AmD<MatrixType> &amd,
259  const LocalCrsGraphType &graph,
260  const local_ordinal_type &blocksize_requested_,
261  const PartInterface<MatrixType> &interf,
262  const local_ordinal_type_1d_view &dm2cm_)
263  : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
264  colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
265  tpetra_values(amd.tpetra_values),
266  A_rowptr(graph.row_map),
267  A_colind(graph.entries),
268  blocksize_requested(blocksize_requested_),
269  part2packrowidx0(interf.part2packrowidx0),
270  part2rowidx0(interf.part2rowidx0),
271  rowidx2part(interf.rowidx2part),
272  partptr(interf.partptr),
273  lclrow(interf.lclrow),
274  dm2cm(dm2cm_),
275  is_dm2cm_active(dm2cm_.span() > 0)
276  {}
277 
278  inline
279  void
280  SerialGemv(const local_ordinal_type &blocksize,
281  const impl_scalar_type * const KOKKOS_RESTRICT AA,
282  const impl_scalar_type * const KOKKOS_RESTRICT xx,
283  /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
284  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
285  for (local_ordinal_type k0=0;k0<blocksize;++k0) {
286  impl_scalar_type val = 0;
287 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
288 # pragma ivdep
289 #endif
290 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
291 # pragma unroll
292 #endif
293  for (local_ordinal_type k1=0;k1<blocksize;++k1)
294  val += AA[tlb::getFlatIndex(k0,k1,blocksize)]*xx[k1];
295  yy[k0] -= val;
296  }
297  }
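//
// Editorial note: SerialGemv accumulates yy -= AA * xx for a single blocksize x blocksize
// block, with AA stored in the Tpetra little-block layout (tlb::getFlatIndex maps (k0,k1)
// to the flat offset). A plain row-major sketch of the same update would be
//
//   for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
//     for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
//       yy[k0] -= AA[k0*blocksize + k1] * xx[k1];
//
// (illustration only; the actual offset is whatever getFlatIndex encodes).
//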
298 
299  template<typename bbViewType, typename yyViewType>
300  KOKKOS_INLINE_FUNCTION
301  void
302  VectorCopy(const member_type &member,
303  const local_ordinal_type &blocksize,
304  const bbViewType &bb,
305  const yyViewType &yy) const {
306  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
307  yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
308  });
309  }
310 
311  template<typename AAViewType, typename xxViewType, typename yyViewType>
312  KOKKOS_INLINE_FUNCTION
313  void
314  TeamVectorGemv(const member_type &member,
315  const local_ordinal_type &blocksize,
316  const AAViewType &AA,
317  const xxViewType &xx,
318  const yyViewType &yy) const {
319  Kokkos::parallel_for
320  (Kokkos::TeamThreadRange(member, blocksize),
321  [&](const local_ordinal_type &k0) {
322  impl_scalar_type val = 0;
323  Kokkos::parallel_for
324  (Kokkos::ThreadVectorRange(member, blocksize),
325  [&](const local_ordinal_type &k1) {
326  val += AA(k0,k1)*xx(k1);
327  });
328  Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
329  });
330  }
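//
// Editorial note: TeamVectorGemv splits the block rows (k0) over team threads and the dot
// product (k1) over vector lanes; each lane accumulates a partial sum in val and folds
// -val into yy(k0) with an atomic add, so concurrent contributions combine correctly.
// VectorGemv below keeps the k1 loop serial and parallelizes only over k0; it is invoked
// from a TeamThreadRange over matrix entries, so the atomic update on yy is still required.
//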
331 
332  template<typename AAViewType, typename xxViewType, typename yyViewType>
333  KOKKOS_INLINE_FUNCTION
334  void
335  VectorGemv(const member_type &member,
336  const local_ordinal_type &blocksize,
337  const AAViewType &AA,
338  const xxViewType &xx,
339  const yyViewType &yy) const {
340  Kokkos::parallel_for
341  (Kokkos::ThreadVectorRange(member, blocksize),
342  [&](const local_ordinal_type &k0) {
343  impl_scalar_type val(0);
344  for (local_ordinal_type k1=0;k1<blocksize;++k1) {
345  val += AA(k0,k1)*xx(k1);
346  }
347  Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
348  });
349  }
350 
351  // template<typename AAViewType, typename xxViewType, typename yyViewType>
352  // KOKKOS_INLINE_FUNCTION
353  // void
354  // VectorGemv(const member_type &member,
355  // const local_ordinal_type &blocksize,
356  // const AAViewType &AA,
357  // const xxViewType &xx,
358  // const yyViewType &yy) const {
359  // for (local_ordinal_type k0=0;k0<blocksize;++k0) {
360  // impl_scalar_type val = 0;
361  // Kokkos::parallel_for
362  // (Kokkos::ThreadVectorRange(member, blocksize),
363  // [&](const local_ordinal_type &k1) {
364  // val += AA(k0,k1)*xx(k1);
365  // });
366  // Kokkos::atomic_fetch_add(&yy(k0), -val);
367  // }
368  // }
369 
370  struct SeqTag {};
371 
372  // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
373  KOKKOS_INLINE_FUNCTION
374  void
375  operator() (const SeqTag &, const local_ordinal_type& i) const {
376  const local_ordinal_type blocksize = blocksize_requested;
377  const local_ordinal_type blocksize_square = blocksize*blocksize;
378 
379  // constants
380  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
381  const local_ordinal_type num_vectors = y.extent(1);
382  const local_ordinal_type row = i*blocksize;
383  for (local_ordinal_type col=0;col<num_vectors;++col) {
384  // y := b
385  impl_scalar_type *yy = &y(row, col);
386  const impl_scalar_type * const bb = &b(row, col);
387  memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);
388 
389  // y -= Rx
390  const size_type A_k0 = A_rowptr[i];
391  for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
392  const size_type j = A_k0 + colindsub[k];
393  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
394  const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
395  SerialGemv(blocksize,AA,xx,yy);
396  }
397  }
398  }
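//
// Editorial sketch: for block row i and column col, the loop above evaluates
// y_i = b_i - sum over the R blocks in row i of (A_block * x_colblock), i.e. y = b - R x,
// where colindsub selects exactly the blocks of A that belong to R.
//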
399 
400  KOKKOS_INLINE_FUNCTION
401  void
402  operator() (const SeqTag &, const member_type &member) const {
403 
404  // constants
405  const local_ordinal_type blocksize = blocksize_requested;
406  const local_ordinal_type blocksize_square = blocksize*blocksize;
407 
408  const local_ordinal_type lr = member.league_rank();
409  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
410  const local_ordinal_type num_vectors = y.extent(1);
411 
412  // subview pattern
413  auto bb = Kokkos::subview(b, block_range, 0);
414  auto xx = bb;
415  auto yy = Kokkos::subview(y, block_range, 0);
416  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
417 
418  const local_ordinal_type row = lr*blocksize;
419  for (local_ordinal_type col=0;col<num_vectors;++col) {
420  // y := b
421  yy.assign_data(&y(row, col));
422  bb.assign_data(&b(row, col));
423  if (member.team_rank() == 0)
424  VectorCopy(member, blocksize, bb, yy);
425  member.team_barrier();
426 
427  // y -= Rx
428  const size_type A_k0 = A_rowptr[lr];
429  Kokkos::parallel_for
430  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
431  [&](const local_ordinal_type &k) {
432  const size_type j = A_k0 + colindsub[k];
433  A_block.assign_data( &tpetra_values(j*blocksize_square) );
434  xx.assign_data( &x(A_colind[j]*blocksize, col) );
435  VectorGemv(member, blocksize, A_block, xx, yy);
436  });
437  }
438  }
439 
440  template<int B>
441  struct AsyncTag {};
442 
443  template<int B>
444  // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
445  KOKKOS_INLINE_FUNCTION
446  void
447  operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
448  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
449  const local_ordinal_type blocksize_square = blocksize*blocksize;
450 
451  // constants
452  const local_ordinal_type partidx = rowidx2part(rowidx);
453  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
454  const local_ordinal_type v = partidx % vector_length;
455 
456  const local_ordinal_type num_vectors = y_packed.extent(2);
457  const local_ordinal_type num_local_rows = lclrow.extent(0);
458 
459  // temporary buffer for y flat
460  impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
461 
462  const local_ordinal_type lr = lclrow(rowidx);
463  const local_ordinal_type row = lr*blocksize;
464  for (local_ordinal_type col=0;col<num_vectors;++col) {
465  // y := b
466  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
467 
468  // y -= Rx
469  const size_type A_k0 = A_rowptr[lr];
470  for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
471  const size_type j = A_k0 + colindsub[k];
472  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
473  const local_ordinal_type A_colind_at_j = A_colind[j];
474  if (A_colind_at_j < num_local_rows) {
475  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
476  const impl_scalar_type * const xx = &x(loc*blocksize, col);
477  SerialGemv(blocksize, AA,xx,yy);
478  } else {
479  const auto loc = A_colind_at_j - num_local_rows;
480  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
481  SerialGemv(blocksize, AA,xx_remote,yy);
482  }
483  }
484  // move yy to y_packed
485  for (local_ordinal_type k=0;k<blocksize;++k)
486  y_packed(pri, k, col)[v] = yy[k];
487  }
488  }
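//
// Editorial note: in the AsyncTag path, x holds the owned block columns and x_remote the
// off-process ones. Column indices A_colind[j] >= num_local_rows refer to remote columns
// and are shifted by num_local_rows before indexing x_remote; owned columns are optionally
// permuted through dm2cm (presumably a domain-map-to-column-map permutation). Sketch of the
// branch above:
//
//   if (A_colind_at_j < num_local_rows)
//     xx = &x((is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j)*blocksize, col);
//   else
//     xx = &x_remote((A_colind_at_j - num_local_rows)*blocksize, col);
//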
489 
490  template<int B>
491  KOKKOS_INLINE_FUNCTION
492  void
493  operator() (const AsyncTag<B> &, const member_type &member) const {
494  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
495  const local_ordinal_type blocksize_square = blocksize*blocksize;
496 
497  // constants
498  const local_ordinal_type rowidx = member.league_rank();
499  const local_ordinal_type partidx = rowidx2part(rowidx);
500  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
501  const local_ordinal_type v = partidx % vector_length;
502 
503  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
504  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
505  const local_ordinal_type num_local_rows = lclrow.extent(0);
506 
507  // subview pattern
508  using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
509  subview_1D_right_t bb(nullptr, blocksize);
510  subview_1D_right_t xx(nullptr, blocksize);
511  subview_1D_right_t xx_remote(nullptr, blocksize);
512  using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
513  subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
514  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
515 
516  const local_ordinal_type lr = lclrow(rowidx);
517  const local_ordinal_type row = lr*blocksize;
518  for (local_ordinal_type col=0;col<num_vectors;++col) {
519  // y := b
520  bb.assign_data(&b(row, col));
521  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
522  if (member.team_rank() == 0)
523  VectorCopy(member, blocksize, bb, yy);
524  member.team_barrier();
525 
526  // y -= Rx
527  const size_type A_k0 = A_rowptr[lr];
528  Kokkos::parallel_for
529  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
530  [&](const local_ordinal_type &k) {
531  const size_type j = A_k0 + colindsub[k];
532  A_block.assign_data( &tpetra_values(j*blocksize_square) );
533 
534  const local_ordinal_type A_colind_at_j = A_colind[j];
535  if (A_colind_at_j < num_local_rows) {
536  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
537  xx.assign_data( &x(loc*blocksize, col) );
538  VectorGemv(member, blocksize, A_block, xx, yy);
539  } else {
540  const auto loc = A_colind_at_j - num_local_rows;
541  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
542  VectorGemv(member, blocksize, A_block, xx_remote, yy);
543  }
544  });
545  }
546  }
547 
548  template <int P, int B> struct OverlapTag {};
549 
550  template<int P, int B>
551  // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
552  KOKKOS_INLINE_FUNCTION
553  void
554  operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
555  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
556  const local_ordinal_type blocksize_square = blocksize*blocksize;
557 
558  // constants
559  const local_ordinal_type partidx = rowidx2part(rowidx);
560  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
561  const local_ordinal_type v = partidx % vector_length;
562 
563  const local_ordinal_type num_vectors = y_packed.extent(2);
564  const local_ordinal_type num_local_rows = lclrow.extent(0);
565 
566  // temporary buffer for y flat
567  impl_scalar_type yy[max_blocksize] = {};
568 
569  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
570  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
571 
572  const local_ordinal_type lr = lclrow(rowidx);
573  const local_ordinal_type row = lr*blocksize;
574  for (local_ordinal_type col=0;col<num_vectors;++col) {
575  if (P == 0) {
576  // y := b
577  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
578  } else {
579  // y (temporary) := 0
580  memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
581  }
582 
583  // y -= Rx
584  const size_type A_k0 = A_rowptr[lr];
585  for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
586  const size_type j = A_k0 + colindsub_used[k];
587  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
588  const local_ordinal_type A_colind_at_j = A_colind[j];
589  if (P == 0) {
590  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
591  const impl_scalar_type * const xx = &x(loc*blocksize, col);
592  SerialGemv(blocksize,AA,xx,yy);
593  } else {
594  const auto loc = A_colind_at_j - num_local_rows;
595  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
596  SerialGemv(blocksize,AA,xx_remote,yy);
597  }
598  }
599  // move yy to y_packed
600  if (P == 0) {
601  for (local_ordinal_type k=0;k<blocksize;++k)
602  y_packed(pri, k, col)[v] = yy[k];
603  } else {
604  for (local_ordinal_type k=0;k<blocksize;++k)
605  y_packed(pri, k, col)[v] += yy[k];
606  }
607  }
608  }
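//
// Editorial note: OverlapTag splits the residual into two phases, presumably so the owned
// contribution can be computed while the halo exchange filling x_remote is still in flight.
// With P == 0 the buffer starts from b and only owned columns are subtracted; with P == 1 it
// starts from zero, only remote columns are subtracted, and the result is accumulated into
// the already written y_packed entries (+= rather than =).
//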
609 
610  template<int P, int B>
611  KOKKOS_INLINE_FUNCTION
612  void
613  operator() (const OverlapTag<P,B> &, const member_type &member) const {
614  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
615  const local_ordinal_type blocksize_square = blocksize*blocksize;
616 
617  // constants
618  const local_ordinal_type rowidx = member.league_rank();
619  const local_ordinal_type partidx = rowidx2part(rowidx);
620  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
621  const local_ordinal_type v = partidx % vector_length;
622 
623  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
624  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
625  const local_ordinal_type num_local_rows = lclrow.extent(0);
626 
627  // subview pattern
628  using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
629  subview_1D_right_t bb(nullptr, blocksize);
630  subview_1D_right_t xx(nullptr, blocksize);
631  subview_1D_right_t xx_remote(nullptr, blocksize);
632  using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
633  subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
634  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
635  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
636  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
637 
638  const local_ordinal_type lr = lclrow(rowidx);
639  const local_ordinal_type row = lr*blocksize;
640  for (local_ordinal_type col=0;col<num_vectors;++col) {
641  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
642  if (P == 0) {
643  // y := b
644  bb.assign_data(&b(row, col));
645  if (member.team_rank() == 0)
646  VectorCopy(member, blocksize, bb, yy);
647  member.team_barrier();
648  }
649 
650  // y -= Rx
651  const size_type A_k0 = A_rowptr[lr];
652  Kokkos::parallel_for
653  (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
654  [&](const local_ordinal_type &k) {
655  const size_type j = A_k0 + colindsub_used[k];
656  A_block.assign_data( &tpetra_values(j*blocksize_square) );
657 
658  const local_ordinal_type A_colind_at_j = A_colind[j];
659  if (P == 0) {
660  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
661  xx.assign_data( &x(loc*blocksize, col) );
662  VectorGemv(member, blocksize, A_block, xx, yy);
663  } else {
664  const auto loc = A_colind_at_j - num_local_rows;
665  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
666  VectorGemv(member, blocksize, A_block, xx_remote, yy);
667  }
668  });
669  }
670  }
671 
672  // y = b - Rx; seq method
673  template<typename MultiVectorLocalViewTypeY,
674  typename MultiVectorLocalViewTypeB,
675  typename MultiVectorLocalViewTypeX>
676  void run(const MultiVectorLocalViewTypeY &y_,
677  const MultiVectorLocalViewTypeB &b_,
678  const MultiVectorLocalViewTypeX &x_) {
679  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
680  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ComputeResidual::<SeqTag>");
681 
682  y = y_; b = b_; x = x_;
683  if constexpr (is_device<execution_space>::value) {
684  const local_ordinal_type blocksize = blocksize_requested;
685  const local_ordinal_type team_size = 8;
686  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
687  const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
688  Kokkos::parallel_for
689  ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
690  } else {
691  const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
692  Kokkos::parallel_for
693  ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
694  }
695  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
696  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
697  }
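//
// Hypothetical usage sketch (editorial addition; the view names are placeholders, not taken
// from this file):
//
//   ComputeResidualVector<MatrixType> cr(amd, A_local_graph, blocksize, interf, dm2cm);
//   cr.run(y_lcl, b_lcl, x_lcl);   // y = b - R x, sequential-method path
//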
698 
699  // y = b - R (x , x_remote)
700  template<typename MultiVectorLocalViewTypeB,
701  typename MultiVectorLocalViewTypeX,
702  typename MultiVectorLocalViewTypeX_Remote>
703  void run(const vector_type_3d_view &y_packed_,
704  const MultiVectorLocalViewTypeB &b_,
705  const MultiVectorLocalViewTypeX &x_,
706  const MultiVectorLocalViewTypeX_Remote &x_remote_) {
707  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
708  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ComputeResidual::<AsyncTag>");
709 
710  b = b_; x = x_; x_remote = x_remote_;
711  if constexpr (is_device<execution_space>::value) {
712  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
713  y_packed_.extent(0),
714  y_packed_.extent(1),
715  y_packed_.extent(2),
716  vector_length);
717  } else {
718  y_packed = y_packed_;
719  }
720 
721  if constexpr(is_device<execution_space>::value) {
722  const local_ordinal_type blocksize = blocksize_requested;
723  const local_ordinal_type team_size = 8;
724  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
725  // local_ordinal_type vl_power_of_two = 1;
726  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
727  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
728  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
729 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
730  const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > \
731  policy(rowidx2part.extent(0), team_size, vector_size); \
732  Kokkos::parallel_for \
733  ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
734  policy, *this); } break
735  switch (blocksize_requested) {
736  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
737  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
738  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
739  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
740  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
741  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
742  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
743  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
744  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
745  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
746  }
747 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
748  } else {
749 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
750  const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
751  Kokkos::parallel_for \
752  ("ComputeResidual::RangePolicy::run<AsyncTag>", \
753  policy, *this); } break
754  switch (blocksize_requested) {
755  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
756  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
757  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
758  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
759  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
760  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
761  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
762  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
763  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
764  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
765  }
766 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
767  }
768  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
769  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
770  }
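//
// Editorial note: the switch above instantiates the functor with a compile-time block size
// (AsyncTag<B>) for common sizes, so the yy buffer and inner loops can be sized and unrolled
// at compile time; B == 0 is the generic fallback that reads blocksize_requested at run time
// and sizes the buffer with max_blocksize.
//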
771 
772  // y = b - R (y , y_remote)
773  template<typename MultiVectorLocalViewTypeB,
774  typename MultiVectorLocalViewTypeX,
775  typename MultiVectorLocalViewTypeX_Remote>
776  void run(const vector_type_3d_view &y_packed_,
777  const MultiVectorLocalViewTypeB &b_,
778  const MultiVectorLocalViewTypeX &x_,
779  const MultiVectorLocalViewTypeX_Remote &x_remote_,
780  const bool compute_owned) {
781  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
782  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ComputeResidual::<OverlapTag>");
783 
784  b = b_; x = x_; x_remote = x_remote_;
785  if constexpr (is_device<execution_space>::value) {
786  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
787  y_packed_.extent(0),
788  y_packed_.extent(1),
789  y_packed_.extent(2),
790  vector_length);
791  } else {
792  y_packed = y_packed_;
793  }
794 
795  if constexpr (is_device<execution_space>::value) {
796  const local_ordinal_type blocksize = blocksize_requested;
797  const local_ordinal_type team_size = 8;
798  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
799  // local_ordinal_type vl_power_of_two = 1;
800  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
801  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
802  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
803 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
804  if (compute_owned) { \
805  const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
806  policy(rowidx2part.extent(0), team_size, vector_size); \
807  Kokkos::parallel_for \
808  ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
809  } else { \
810  const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
811  policy(rowidx2part.extent(0), team_size, vector_size); \
812  Kokkos::parallel_for \
813  ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
814  } break
815  switch (blocksize_requested) {
816  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
817  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
818  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
819  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
820  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
821  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
822  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
823  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
824  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
825  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
826  }
827 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
828  } else {
829 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
830  if (compute_owned) { \
831  const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > \
832  policy(0, rowidx2part.extent(0)); \
833  Kokkos::parallel_for \
834  ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
835  } else { \
836  const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > \
837  policy(0, rowidx2part.extent(0)); \
838  Kokkos::parallel_for \
839  ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
840  } break
841 
842  switch (blocksize_requested) {
843  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
844  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
845  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
846  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
847  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
848  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
849  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
850  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
851  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
852  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
853  }
854 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
855  }
856  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
857  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
858  }
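//
// Hypothetical calling pattern (editorial addition): run(..., compute_owned = true) is
// presumably issued first, overlapping with the exchange that fills x_remote, and
// run(..., compute_owned = false) afterwards to fold in the remote columns; only the
// P == 0 phase copies b into y_packed.
//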
859  };
860 
861 
862  } // namespace BlockHelperDetails
863 
864 } // namespace Ifpack2
865 
866 #endif