Ifpack2 Templated Preconditioning Package, Version 1.0
Ifpack2_BlockComputeResidualVector.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
11 #define IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
12 
13 #include "Ifpack2_BlockHelper.hpp"
14 
15 namespace Ifpack2 {
16 
17  namespace BlockHelperDetails {
18 
22  template <typename MatrixType>
23  struct AmD {
24  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
25  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
26  using size_type_1d_view = typename impl_type::size_type_1d_view;
27  using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
28  // rowptr points to the start of each row of A_colindsub.
29  size_type_1d_view rowptr, rowptr_remote;
30  // Indices into A's rows giving the blocks to extract. rowptr(i) points to
31  // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
32  // where g is A's graph, are the columns AmD uses. If seq_method_, then
33  // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If
34  // !seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
35  // contains the remote ones.
36  local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
37 
38  // Currently always true.
39  bool is_tpetra_block_crs;
40 
41  // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
42  impl_scalar_type_1d_view_tpetra tpetra_values;
43 
44  AmD() = default;
45  AmD(const AmD &b) = default;
46  };
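 // Illustrative sketch (not part of this header): how rowptr and A_colindsub
 // address the blocks AmD extracts from one local row, following the indexing
 // used by the SeqTag operator() below (A_block_rowptr and graph_entries stand
 // in for A's graph; every name in this sketch is hypothetical).
 //
 //   template <typename AmDType, typename RowMapType, typename EntriesType>
 //   void visit_extracted_blocks(const AmDType &amd, const RowMapType &A_block_rowptr,
 //                               const EntriesType &graph_entries, const int row) {
 //     const auto A_k0 = A_block_rowptr(row);            // start of row in A's graph
 //     for (auto k = amd.rowptr(row); k < amd.rowptr(row + 1); ++k) {
 //       const auto j = A_k0 + amd.A_colindsub(k);       // position in g.entries
 //       const auto block_col = graph_entries(j);        // block column to extract
 //       (void)block_col; // the block's values start at amd.tpetra_values(j*blocksize^2)
 //     }
 //   }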
47 
48  template<typename MatrixType>
49  struct PartInterface {
50  using local_ordinal_type = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type;
51  using local_ordinal_type_1d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view;
52  using local_ordinal_type_2d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_2d_view;
53 
54  PartInterface() = default;
55  PartInterface(const PartInterface &b) = default;
56 
57  // Some terms:
58  // The matrix A is split as A = D + R, where D is the matrix of tridiag
59  // blocks and R is the remainder.
60  // A part is roughly a synonym for a tridiag. The distinction is that a part
61  // is the set of rows belonging to one tridiag and, equivalently, the off-diag
62  // rows in R associated with that tridiag. In contrast, the term tridiag is
63  // used to refer specifically to tridiag data, such as the pointer into the
64  // tridiag data array.
65  // Local (lcl) rows are the LIDs. lclrow lists the LIDs belonging to each
66  // tridiag, and partptr points to the beginning of each tridiag. This is the
67  // LID space.
68  // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
69  // by this ordinal. This is the 'index' space.
70  // A flat index is the mathematical index into an array. A pack index
71  // accounts for SIMD packing.
72 
73  // Local row LIDs. Permutation from caller's index space to tridiag index
74  // space.
75  local_ordinal_type_1d_view lclrow;
76  // partptr_ is the pointer array into lclrow_.
77  local_ordinal_type_1d_view partptr; // np+1
78  local_ordinal_type_2d_view partptr_sub;
79  local_ordinal_type_1d_view partptr_schur;
80  // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
81  // is the start of the i'th pack.
82  local_ordinal_type_1d_view packptr; // npack+1
83  local_ordinal_type_1d_view packptr_sub;
84  local_ordinal_type_1d_view packindices_sub;
85  local_ordinal_type_2d_view packindices_schur;
86  // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
87  // an alias of partptr_ in the case of no overlap.
88  local_ordinal_type_1d_view part2rowidx0; // np+1
89  local_ordinal_type_1d_view part2rowidx0_sub;
90  // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
91  // it's the same as part2rowidx0_; if it's > 1, then the value is combined
92  // with i % vector_length to get the location in the packed data.
93  local_ordinal_type_1d_view part2packrowidx0; // np+1
94  local_ordinal_type_2d_view part2packrowidx0_sub;
95  local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
96  // rowidx2part_ maps the row index to the part index.
97  local_ordinal_type_1d_view rowidx2part; // nr
98  local_ordinal_type_1d_view rowidx2part_sub;
99  // True if lcl{row|col} is at most a constant away from row{idx|col}. In
100  // practice, this knowledge is not particularly useful, as packing for batched
101  // processing is done at the same time as the permutation from LID to index
102  // space. But it's easy to detect, so it's recorded in case an optimization
103  // can be made based on it.
104  bool row_contiguous;
105 
106  local_ordinal_type max_partsz;
107  local_ordinal_type max_subpartsz;
108  local_ordinal_type n_subparts_per_part;
109  local_ordinal_type nparts;
110  };
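 // Illustrative sketch (not part of this header): walking the rows of one part
 // in the LID, index, and packed index spaces described above. Only the
 // PartInterface members are real; the function and its other names are
 // hypothetical.
 //
 //   template <typename PartInterfaceType>
 //   void visit_part(const PartInterfaceType &interf, const int partidx, const int vector_length) {
 //     for (auto rowidx = interf.partptr(partidx); rowidx < interf.partptr(partidx + 1); ++rowidx) {
 //       const auto lid  = interf.lclrow(rowidx);                  // LID space
 //       const auto pri  = interf.part2packrowidx0(partidx)
 //                       + (rowidx - interf.partptr(partidx));     // packed row index
 //       const auto lane = partidx % vector_length;                // SIMD lane in the pack
 //       (void)lid; (void)pri; (void)lane;
 //     }
 //   }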
111 
115  static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
116  const int team_size) {
117  int total_team_size(0);
118  if (blksize <= 5) total_team_size = 32;
119  else if (blksize <= 9) total_team_size = 32; // 64
120  else if (blksize <= 12) total_team_size = 96;
121  else if (blksize <= 16) total_team_size = 128;
122  else if (blksize <= 20) total_team_size = 160;
123  else total_team_size = 160;
124  return total_team_size/team_size;
125  }
126 
127  static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
128  const int team_size) {
129  int total_team_size(0);
130  if (blksize <= 5) total_team_size = 32;
131  else if (blksize <= 9) total_team_size = 32; // 64
132  else if (blksize <= 12) total_team_size = 96;
133  else if (blksize <= 16) total_team_size = 128;
134  else if (blksize <= 20) total_team_size = 160;
135  else total_team_size = 160;
136  return total_team_size/team_size;
137  }
138 
139  static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
140  const int team_size) {
141  int total_team_size(0);
142  if (blksize <= 5) total_team_size = 32;
143  else if (blksize <= 9) total_team_size = 32; // 64
144  else if (blksize <= 12) total_team_size = 96;
145  else if (blksize <= 16) total_team_size = 128;
146  else if (blksize <= 20) total_team_size = 160;
147  else total_team_size = 160;
148  return total_team_size/team_size;
149  }
150 
151  template<typename T>
152  static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
153  const int team_size) {
154  if ( is_cuda<T>::value )
155  return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
156  if ( is_hip<T>::value )
157  return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
158  if ( is_sycl<T>::value )
159  return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
160  return -1;
161  }
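 // Worked example of the heuristic above: run() below launches teams with
 // team_size = 8, so for blocksize 7 the table gives total_team_size = 32 and
 // a recommended vector size of 32/8 = 4; for blocksize 16 it gives
 // 128/8 = 16. The -1 fallback is only returned for execution spaces other
 // than Cuda, HIP, and SYCL.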
162 
163  template<typename MatrixType>
164  struct ComputeResidualVector {
165  public:
166  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
167  using node_device_type = typename impl_type::node_device_type;
168  using execution_space = typename impl_type::execution_space;
169  using memory_space = typename impl_type::memory_space;
170 
171  using local_ordinal_type = typename impl_type::local_ordinal_type;
172  using size_type = typename impl_type::size_type;
173  using impl_scalar_type = typename impl_type::impl_scalar_type;
174  using magnitude_type = typename impl_type::magnitude_type;
175  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
176  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
178  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
179  using size_type_1d_view = typename impl_type::size_type_1d_view;
180  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
181  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
182  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
183  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
184  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
185  static constexpr int vector_length = impl_type::vector_length;
186 
188  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
189 
190  // enum for max blocksize (used to size the temporary y buffers below)
191  enum : int { max_blocksize = 32 };
192 
193  private:
194  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
195  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
196  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
197  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
198  Unmanaged<vector_type_3d_view> y_packed;
199  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
200 
201  // AmD information
202  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
203  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
204  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
205 
206  // block crs graph information
207  // for cuda (kokkos crs graph uses a different size_type from size_t)
208  const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_block_rowptr;
209  const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_point_rowptr;
210  const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;
211 
212  // blocksize
213  const local_ordinal_type blocksize_requested;
214 
215  // part interface
216  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
217  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
218  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
219  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
220  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
221  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
222  const bool is_dm2cm_active;
223  const bool hasBlockCrsMatrix;
224 
225  public:
226  template<typename LocalCrsGraphType>
227  ComputeResidualVector(const AmD<MatrixType> &amd,
228  const LocalCrsGraphType &block_graph,
229  const LocalCrsGraphType &point_graph,
230  const local_ordinal_type &blocksize_requested_,
231  const PartInterface<MatrixType> &interf,
232  const local_ordinal_type_1d_view &dm2cm_)
233  : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
234  colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
235  tpetra_values(amd.tpetra_values),
236  A_block_rowptr(block_graph.row_map),
237  A_point_rowptr(point_graph.row_map),
238  A_colind(block_graph.entries),
239  blocksize_requested(blocksize_requested_),
240  part2packrowidx0(interf.part2packrowidx0),
241  part2rowidx0(interf.part2rowidx0),
242  rowidx2part(interf.rowidx2part),
243  partptr(interf.partptr),
244  lclrow(interf.lclrow),
245  dm2cm(dm2cm_),
246  is_dm2cm_active(dm2cm_.span() > 0),
247  hasBlockCrsMatrix(A_block_rowptr.extent(0) == A_point_rowptr.extent(0))
248  { }
249 
250  inline
251  void
252  SerialDot(const local_ordinal_type &blocksize,
253  const local_ordinal_type &lclRowID,
254  const local_ordinal_type &lclColID,
255  const local_ordinal_type &ii,
256  const ConstUnmanaged<local_ordinal_type_1d_view>colindsub_,
257  const impl_scalar_type * const KOKKOS_RESTRICT xx,
258  /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
259  const size_type Aj_c = colindsub_(lclColID);
260  auto point_row_offset = A_point_rowptr(lclRowID*blocksize + ii) + Aj_c*blocksize;
261  impl_scalar_type val = 0;
262 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
263 # pragma ivdep
264 #endif
265 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
266 # pragma unroll
267 #endif
268  for (local_ordinal_type k1=0;k1<blocksize;++k1)
269  val += tpetra_values(point_row_offset + k1) * xx[k1];
270  yy[ii] -= val;
271  }
272 
273  inline
274  void
275  SerialGemv(const local_ordinal_type &blocksize,
276  const impl_scalar_type * const KOKKOS_RESTRICT AA,
277  const impl_scalar_type * const KOKKOS_RESTRICT xx,
278  /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
279  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
280  for (local_ordinal_type k0=0;k0<blocksize;++k0) {
281  impl_scalar_type val = 0;
282 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
283 # pragma ivdep
284 #endif
285 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
286 # pragma unroll
287 #endif
288  for (local_ordinal_type k1=0;k1<blocksize;++k1)
289  val += AA[tlb::getFlatIndex(k0,k1,blocksize)]*xx[k1];
290  yy[k0] -= val;
291  }
292  }
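 // Minimal standalone sketch of the update SerialGemv performs (yy -= AA*xx
 // for one little block), assuming the block is stored row-major so that
 // getFlatIndex(k0, k1, blocksize) reduces to k0*blocksize + k1 (the
 // tpetra_block_access_view_type typedef above notes layout right). Plain C++
 // for illustration only.
 //
 //   #include <vector>
 //   void block_gemv_minus(const std::vector<double> &AA, const std::vector<double> &xx,
 //                         std::vector<double> &yy, const int blocksize) {
 //     for (int k0 = 0; k0 < blocksize; ++k0) {
 //       double val = 0;
 //       for (int k1 = 0; k1 < blocksize; ++k1)
 //         val += AA[k0 * blocksize + k1] * xx[k1];
 //       yy[k0] -= val;
 //     }
 //   }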
293 
294  template<typename bbViewType, typename yyViewType>
295  KOKKOS_INLINE_FUNCTION
296  void
297  VectorCopy(const member_type &member,
298  const local_ordinal_type &blocksize,
299  const bbViewType &bb,
300  const yyViewType &yy) const {
301  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
302  yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
303  });
304  }
305 
306  template<typename AAViewType, typename xxViewType, typename yyViewType>
307  KOKKOS_INLINE_FUNCTION
308  void
309  TeamVectorGemv(const member_type &member,
310  const local_ordinal_type &blocksize,
311  const AAViewType &AA,
312  const xxViewType &xx,
313  const yyViewType &yy) const {
314  Kokkos::parallel_for
315  (Kokkos::TeamThreadRange(member, blocksize),
316  [&](const local_ordinal_type &k0) {
317  impl_scalar_type val = 0;
318  Kokkos::parallel_for
319  (Kokkos::ThreadVectorRange(member, blocksize),
320  [&](const local_ordinal_type &k1) {
321  val += AA(k0,k1)*xx(k1);
322  });
323  Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
324  });
325  }
326 
327  template<typename xxViewType, typename yyViewType>
328  KOKKOS_INLINE_FUNCTION
329  void
330  VectorDot(const member_type &member,
331  const local_ordinal_type &blocksize,
332  const local_ordinal_type &lclRowID,
333  const local_ordinal_type &lclColID,
334  const local_ordinal_type &ii,
335  const ConstUnmanaged<local_ordinal_type_1d_view>colindsub_,
336  const xxViewType &xx,
337  const yyViewType &yy) const {
338  const size_type Aj_c = colindsub_(lclColID);
339  auto point_row_offset = A_point_rowptr(lclRowID*blocksize + ii) + Aj_c*blocksize;
340  impl_scalar_type val = 0;
341  Kokkos::parallel_for
342  (Kokkos::ThreadVectorRange(member, blocksize),
343  [&](const local_ordinal_type &k1) {
344  val += tpetra_values(point_row_offset + k1) *xx(k1);
345  });
346  Kokkos::atomic_fetch_add(&yy(ii), typename yyViewType::const_value_type(-val));
347  }
348 
349  template<typename AAViewType, typename xxViewType, typename yyViewType>
350  KOKKOS_INLINE_FUNCTION
351  void
352  VectorGemv(const member_type &member,
353  const local_ordinal_type &blocksize,
354  const AAViewType &AA,
355  const xxViewType &xx,
356  const yyViewType &yy) const {
357  Kokkos::parallel_for
358  (Kokkos::ThreadVectorRange(member, blocksize),
359  [&](const local_ordinal_type &k0) {
360  impl_scalar_type val(0);
361  for (local_ordinal_type k1=0;k1<blocksize;++k1) {
362  val += AA(k0,k1)*xx(k1);
363  }
364  Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
365  });
366  }
367 
368  // template<typename AAViewType, typename xxViewType, typename yyViewType>
369  // KOKKOS_INLINE_FUNCTION
370  // void
371  // VectorGemv(const member_type &member,
372  // const local_ordinal_type &blocksize,
373  // const AAViewType &AA,
374  // const xxViewType &xx,
375  // const yyViewType &yy) const {
376  // for (local_ordinal_type k0=0;k0<blocksize;++k0) {
377  // impl_scalar_type val = 0;
378  // Kokkos::parallel_for
379  // (Kokkos::ThreadVectorRange(member, blocksize),
380  // [&](const local_ordinal_type &k1) {
381  // val += AA(k0,k1)*xx(k1);
382  // });
383  // Kokkos::atomic_fetch_add(&yy(k0), -val);
384  // }
385  // }
386 
387  struct SeqTag {};
388 
389  // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
390  KOKKOS_INLINE_FUNCTION
391  void
392  operator() (const SeqTag &, const local_ordinal_type& i) const {
393  const local_ordinal_type blocksize = blocksize_requested;
394  const local_ordinal_type blocksize_square = blocksize*blocksize;
395 
396  // constants
397  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
398  const local_ordinal_type num_vectors = y.extent(1);
399  const local_ordinal_type row = i*blocksize;
400  for (local_ordinal_type col=0;col<num_vectors;++col) {
401  // y := b
402  impl_scalar_type *yy = &y(row, col);
403  const impl_scalar_type * const bb = &b(row, col);
404  memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);
405 
406  // y -= Rx
407  const size_type A_k0 = A_block_rowptr[i];
408  for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
409  const size_type j = A_k0 + colindsub[k];
410  const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
411  if (hasBlockCrsMatrix) {
412  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
413  SerialGemv(blocksize,AA,xx,yy);
414  }
415  else {
416  for (local_ordinal_type k0=0;k0<blocksize;++k0)
417  SerialDot(blocksize, i, k, k0, colindsub, xx, yy);
418  }
419  }
420  }
421  }
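 // In outline, the SeqTag kernels compute y = b - R x one block row at a time:
 // y(i) starts as b(i) and the selected off-diagonal blocks are subtracted via
 // SerialGemv/SerialDot. A compact serial sketch of the same recurrence on a
 // hypothetical point-CRS container (illustration only, not the packed data
 // structures used here):
 //
 //   #include <cstddef>
 //   #include <vector>
 //   struct PointCrs { std::vector<std::size_t> rowptr;
 //                     std::vector<int> colind;
 //                     std::vector<double> values; };
 //   void residual(const PointCrs &R, const std::vector<double> &b,
 //                 const std::vector<double> &x, std::vector<double> &y) {
 //     for (std::size_t i = 0; i + 1 < R.rowptr.size(); ++i) {
 //       y[i] = b[i];
 //       for (std::size_t k = R.rowptr[i]; k < R.rowptr[i + 1]; ++k)
 //         y[i] -= R.values[k] * x[R.colind[k]];
 //     }
 //   }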
422 
423  KOKKOS_INLINE_FUNCTION
424  void
425  operator() (const SeqTag &, const member_type &member) const {
426 
427  // constants
428  const local_ordinal_type blocksize = blocksize_requested;
429  const local_ordinal_type blocksize_square = blocksize*blocksize;
430 
431  const local_ordinal_type lr = member.league_rank();
432  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
433  const local_ordinal_type num_vectors = y.extent(1);
434 
435  // subview pattern
436  auto bb = Kokkos::subview(b, block_range, 0);
437  auto xx = bb;
438  auto yy = Kokkos::subview(y, block_range, 0);
439  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
440 
441  const local_ordinal_type row = lr*blocksize;
442  for (local_ordinal_type col=0;col<num_vectors;++col) {
443  // y := b
444  yy.assign_data(&y(row, col));
445  bb.assign_data(&b(row, col));
446  if (member.team_rank() == 0)
447  VectorCopy(member, blocksize, bb, yy);
448  member.team_barrier();
449 
450  // y -= Rx
451  const size_type A_k0 = A_block_rowptr[lr];
452 
453  if(hasBlockCrsMatrix) {
454  Kokkos::parallel_for
455  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
456  [&](const local_ordinal_type &k) {
457  const size_type j = A_k0 + colindsub[k];
458  xx.assign_data( &x(A_colind[j]*blocksize, col) );
459  A_block_cst.assign_data( &tpetra_values(j*blocksize_square) );
460  VectorGemv(member, blocksize, A_block_cst, xx, yy);
461  });
462  }
463  else {
464  Kokkos::parallel_for
465  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
466  [&](const local_ordinal_type &k) {
467 
468  const size_type j = A_k0 + colindsub[k];
469  xx.assign_data( &x(A_colind[j]*blocksize, col) );
470 
471  for (local_ordinal_type k0=0;k0<blocksize;++k0)
472  VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
473  });
474  }
475  }
476  }
477 
478  template<int B>
479  struct AsyncTag {};
480 
481  template<int B>
482  // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
483  KOKKOS_INLINE_FUNCTION
484  void
485  operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
486  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
487  const local_ordinal_type blocksize_square = blocksize*blocksize;
488 
489  // constants
490  const local_ordinal_type partidx = rowidx2part(rowidx);
491  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
492  const local_ordinal_type v = partidx % vector_length;
493 
494  const local_ordinal_type num_vectors = y_packed.extent(2);
495  const local_ordinal_type num_local_rows = lclrow.extent(0);
496 
497  // temporary buffer for y flat
498  impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
499 
500  const local_ordinal_type lr = lclrow(rowidx);
501  const local_ordinal_type row = lr*blocksize;
502  for (local_ordinal_type col=0;col<num_vectors;++col) {
503  // y := b
504  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
505 
506  // y -= Rx
507  const size_type A_k0 = A_block_rowptr[lr];
508  for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
509  const size_type j = A_k0 + colindsub[k];
510  const local_ordinal_type A_colind_at_j = A_colind[j];
511  if (hasBlockCrsMatrix) {
512  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
513 
514  if (A_colind_at_j < num_local_rows) {
515  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
516  const impl_scalar_type * const xx = &x(loc*blocksize, col);
517  SerialGemv(blocksize, AA,xx,yy);
518  } else {
519  const auto loc = A_colind_at_j - num_local_rows;
520  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
521  SerialGemv(blocksize, AA,xx_remote,yy);
522  }
523  }
524  else {
525  if (A_colind_at_j < num_local_rows) {
526  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
527  const impl_scalar_type * const xx = &x(loc*blocksize, col);
528  for (local_ordinal_type k0=0;k0<blocksize;++k0)
529  SerialDot(blocksize, lr, k, k0, colindsub, xx, yy);
530  } else {
531  const auto loc = A_colind_at_j - num_local_rows;
532  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
533  for (local_ordinal_type k0=0;k0<blocksize;++k0)
534  SerialDot(blocksize, lr, k, k0, colindsub, xx_remote, yy);
535  }
536  }
537  }
538  // move yy to y_packed
539  for (local_ordinal_type k=0;k<blocksize;++k)
540  y_packed(pri, k, col)[v] = yy[k];
541  }
542  }
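 // The owned/remote split used in the AsyncTag kernels, in isolation: a local
 // column id below num_local_rows addresses the owned multivector x
 // (optionally remapped through dm2cm), while anything at or beyond
 // num_local_rows addresses x_remote after subtracting the offset.
 // Hypothetical helper for illustration:
 //
 //   struct ColumnLocation { bool owned; int index; };
 //   inline ColumnLocation locate_column(const int A_colind_at_j,
 //                                       const int num_local_rows,
 //                                       const int *dm2cm /* may be null */) {
 //     if (A_colind_at_j < num_local_rows)
 //       return {true, dm2cm ? dm2cm[A_colind_at_j] : A_colind_at_j};
 //     return {false, A_colind_at_j - num_local_rows};
 //   }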
543 
544  template<int B>
545  KOKKOS_INLINE_FUNCTION
546  void
547  operator() (const AsyncTag<B> &, const member_type &member) const {
548  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
549  const local_ordinal_type blocksize_square = blocksize*blocksize;
550 
551  // constants
552  const local_ordinal_type rowidx = member.league_rank();
553  const local_ordinal_type partidx = rowidx2part(rowidx);
554  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
555  const local_ordinal_type v = partidx % vector_length;
556 
557  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
558  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
559  const local_ordinal_type num_local_rows = lclrow.extent(0);
560 
561  // subview pattern
562  using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
563  subview_1D_right_t bb(nullptr, blocksize);
564  subview_1D_right_t xx(nullptr, blocksize);
565  subview_1D_right_t xx_remote(nullptr, blocksize);
566  using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
567  subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
568  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
569 
570  const local_ordinal_type lr = lclrow(rowidx);
571  const local_ordinal_type row = lr*blocksize;
572  for (local_ordinal_type col=0;col<num_vectors;++col) {
573  // y := b
574  bb.assign_data(&b(row, col));
575  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
576  if (member.team_rank() == 0)
577  VectorCopy(member, blocksize, bb, yy);
578  member.team_barrier();
579 
580  // y -= Rx
581  const size_type A_k0 = A_block_rowptr[lr];
582  if(hasBlockCrsMatrix) {
583  Kokkos::parallel_for
584  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
585  [&](const local_ordinal_type &k) {
586  const size_type j = A_k0 + colindsub[k];
587  A_block_cst.assign_data( &tpetra_values(j*blocksize_square) );
588 
589  const local_ordinal_type A_colind_at_j = A_colind[j];
590  if (A_colind_at_j < num_local_rows) {
591  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
592  xx.assign_data( &x(loc*blocksize, col) );
593  VectorGemv(member, blocksize, A_block_cst, xx, yy);
594  } else {
595  const auto loc = A_colind_at_j - num_local_rows;
596  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
597  VectorGemv(member, blocksize, A_block_cst, xx_remote, yy);
598  }
599  });
600  }
601  else {
602  Kokkos::parallel_for
603  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
604  [&](const local_ordinal_type &k) {
605  const size_type j = A_k0 + colindsub[k];
606 
607  const local_ordinal_type A_colind_at_j = A_colind[j];
608  if (A_colind_at_j < num_local_rows) {
609  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
610  xx.assign_data( &x(loc*blocksize, col) );
611  for (local_ordinal_type k0=0;k0<blocksize;++k0)
612  VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
613  } else {
614  const auto loc = A_colind_at_j - num_local_rows;
615  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
616  for (local_ordinal_type k0=0;k0<blocksize;++k0)
617  VectorDot(member, blocksize, lr, k, k0, colindsub, xx_remote, yy);
618  }
619  });
620  }
621  }
622  }
623 
624  template <int P, int B> struct OverlapTag {};
625 
626  template<int P, int B>
627  // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
628  KOKKOS_INLINE_FUNCTION
629  void
630  operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
631  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
632  const local_ordinal_type blocksize_square = blocksize*blocksize;
633 
634  // constants
635  const local_ordinal_type partidx = rowidx2part(rowidx);
636  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
637  const local_ordinal_type v = partidx % vector_length;
638 
639  const local_ordinal_type num_vectors = y_packed.extent(2);
640  const local_ordinal_type num_local_rows = lclrow.extent(0);
641 
642  // temporary buffer for y flat
643  impl_scalar_type yy[max_blocksize] = {};
644 
645  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
646  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
647 
648  const local_ordinal_type lr = lclrow(rowidx);
649  const local_ordinal_type row = lr*blocksize;
650  for (local_ordinal_type col=0;col<num_vectors;++col) {
651  if (P == 0) {
652  // y := b
653  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
654  } else {
655  // y (temporary) := 0
656  memset((void*) yy, 0, sizeof(impl_scalar_type)*blocksize);
657  }
658 
659  // y -= Rx
660  const size_type A_k0 = A_block_rowptr[lr];
661  for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
662  const size_type j = A_k0 + colindsub_used[k];
663  const local_ordinal_type A_colind_at_j = A_colind[j];
664  if (hasBlockCrsMatrix) {
665  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
666  if (P == 0) {
667  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
668  const impl_scalar_type * const xx = &x(loc*blocksize, col);
669  SerialGemv(blocksize,AA,xx,yy);
670  } else {
671  const auto loc = A_colind_at_j - num_local_rows;
672  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
673  SerialGemv(blocksize,AA,xx_remote,yy);
674  }
675  }
676  else {
677  if (P == 0) {
678  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
679  const impl_scalar_type * const xx = &x(loc*blocksize, col);
680  for (local_ordinal_type k0=0;k0<blocksize;++k0)
681  SerialDot(blocksize, lr, k, k0, colindsub_used, xx, yy);
682  } else {
683  const auto loc = A_colind_at_j - num_local_rows;
684  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
685  for (local_ordinal_type k0=0;k0<blocksize;++k0)
686  SerialDot(blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
687  }
688  }
689  }
690  // move yy to y_packed
691  if (P == 0) {
692  for (local_ordinal_type k=0;k<blocksize;++k)
693  y_packed(pri, k, col)[v] = yy[k];
694  } else {
695  for (local_ordinal_type k=0;k<blocksize;++k)
696  y_packed(pri, k, col)[v] += yy[k];
697  }
698  }
699  }
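 // The two OverlapTag phases above combine as follows: phase P = 0 initializes
 // y from b and subtracts the owned contribution, and phase P = 1 accumulates
 // the remote contribution into the same packed vector, so that in total
 //
 //   y_packed  = b - R_owned  * x          (P = 0, overlaps the halo exchange)
 //   y_packed += 0 - R_remote * x_remote   (P = 1, after communication)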
700 
701  template<int P, int B>
702  KOKKOS_INLINE_FUNCTION
703  void
704  operator() (const OverlapTag<P,B> &, const member_type &member) const {
705  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
706  const local_ordinal_type blocksize_square = blocksize*blocksize;
707 
708  // constants
709  const local_ordinal_type rowidx = member.league_rank();
710  const local_ordinal_type partidx = rowidx2part(rowidx);
711  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
712  const local_ordinal_type v = partidx % vector_length;
713 
714  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
715  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
716  const local_ordinal_type num_local_rows = lclrow.extent(0);
717 
718  // subview pattern
719  using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
720  subview_1D_right_t bb(nullptr, blocksize);
721  subview_1D_right_t xx(nullptr, blocksize);
722  subview_1D_right_t xx_remote(nullptr, blocksize);
723  using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
724  subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
725  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
726  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
727  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
728 
729  const local_ordinal_type lr = lclrow(rowidx);
730  const local_ordinal_type row = lr*blocksize;
731  for (local_ordinal_type col=0;col<num_vectors;++col) {
732  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
733  if (P == 0) {
734  // y := b
735  bb.assign_data(&b(row, col));
736  if (member.team_rank() == 0)
737  VectorCopy(member, blocksize, bb, yy);
738  member.team_barrier();
739  }
740 
741  // y -= Rx
742  const size_type A_k0 = A_block_rowptr[lr];
743  if(hasBlockCrsMatrix) {
744  Kokkos::parallel_for
745  (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
746  [&](const local_ordinal_type &k) {
747  const size_type j = A_k0 + colindsub_used[k];
748  A_block_cst.assign_data( &tpetra_values(j*blocksize_square) );
749 
750  const local_ordinal_type A_colind_at_j = A_colind[j];
751  if (P == 0) {
752  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
753  xx.assign_data( &x(loc*blocksize, col) );
754  VectorGemv(member, blocksize, A_block_cst, xx, yy);
755  } else {
756  const auto loc = A_colind_at_j - num_local_rows;
757  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
758  VectorGemv(member, blocksize, A_block_cst, xx_remote, yy);
759  }
760  });
761  }
762  else {
763  Kokkos::parallel_for
764  (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
765  [&](const local_ordinal_type &k) {
766  const size_type j = A_k0 + colindsub_used[k];
767 
768  const local_ordinal_type A_colind_at_j = A_colind[j];
769  if (P == 0) {
770  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
771  xx.assign_data( &x(loc*blocksize, col) );
772  for (local_ordinal_type k0=0;k0<blocksize;++k0)
773  VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx, yy);
774  } else {
775  const auto loc = A_colind_at_j - num_local_rows;
776  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
777  for (local_ordinal_type k0=0;k0<blocksize;++k0)
778  VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
779  }
780  });
781  }
782  }
783  }
784 
785  // y = b - Rx; seq method
786  template<typename MultiVectorLocalViewTypeY,
787  typename MultiVectorLocalViewTypeB,
788  typename MultiVectorLocalViewTypeX>
789  void run(const MultiVectorLocalViewTypeY &y_,
790  const MultiVectorLocalViewTypeB &b_,
791  const MultiVectorLocalViewTypeX &x_) {
792  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
793  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<SeqTag>", ComputeResidual0, execution_space);
794 
795  y = y_; b = b_; x = x_;
796  if constexpr (is_device<execution_space>::value) {
797  const local_ordinal_type blocksize = blocksize_requested;
798  const local_ordinal_type team_size = 8;
799  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
800  const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
801  Kokkos::parallel_for
802  ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
803  } else {
804  const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
805  Kokkos::parallel_for
806  ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
807  }
808  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
809  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
810  }
811 
812  // y = b - R (x , x_remote)
813  template<typename MultiVectorLocalViewTypeB,
814  typename MultiVectorLocalViewTypeX,
815  typename MultiVectorLocalViewTypeX_Remote>
816  void run(const vector_type_3d_view &y_packed_,
817  const MultiVectorLocalViewTypeB &b_,
818  const MultiVectorLocalViewTypeX &x_,
819  const MultiVectorLocalViewTypeX_Remote &x_remote_) {
820  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
821  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<AsyncTag>", ComputeResidual0, execution_space);
822 
823  b = b_; x = x_; x_remote = x_remote_;
824  if constexpr (is_device<execution_space>::value) {
825  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
826  y_packed_.extent(0),
827  y_packed_.extent(1),
828  y_packed_.extent(2),
829  vector_length);
830  } else {
831  y_packed = y_packed_;
832  }
833 
834  if constexpr(is_device<execution_space>::value) {
835  const local_ordinal_type blocksize = blocksize_requested;
836  const local_ordinal_type team_size = 8;
837  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
838  // local_ordinal_type vl_power_of_two = 1;
839  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
840  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
841  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
842 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
843  const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > \
844  policy(rowidx2part.extent(0), team_size, vector_size); \
845  Kokkos::parallel_for \
846  ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
847  policy, *this); } break
848  switch (blocksize_requested) {
849  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
850  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
851  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
852  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
853  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
854  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
855  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
856  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
857  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
858  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
859  }
860 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
861  } else {
862 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
863  const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
864  Kokkos::parallel_for \
865  ("ComputeResidual::RangePolicy::run<AsyncTag>", \
866  policy, *this); } break
867  switch (blocksize_requested) {
868  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
869  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
870  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
871  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
872  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
873  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
874  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
875  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
876  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
877  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
878  }
879 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
880  }
881  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
882  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
883  }
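 // The switch/macro pattern above picks a compile-time blocksize so the inner
 // loops can be fully unrolled: each case instantiates the functor with B equal
 // to that blocksize, and the default case uses B = 0, which the tag operators
 // read as "use blocksize_requested at run time" (the trailing break lives
 // inside the macro). A generic sketch of the same dispatch idea, with
 // hypothetical names:
 //
 //   template <int B> void kernel(const int runtime_blocksize) {
 //     const int blocksize = (B == 0 ? runtime_blocksize : B);
 //     (void)blocksize; // loops over blocksize; unrollable when B > 0
 //   }
 //   inline void dispatch(const int blocksize) {
 //     switch (blocksize) {
 //       case 3:  kernel<3>(blocksize);  break;
 //       case 5:  kernel<5>(blocksize);  break;
 //       default: kernel<0>(blocksize);  break;
 //     }
 //   }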
884 
885  // y = b - R (x , x_remote); overlapped variant, owned and remote contributions computed in separate phases
886  template<typename MultiVectorLocalViewTypeB,
887  typename MultiVectorLocalViewTypeX,
888  typename MultiVectorLocalViewTypeX_Remote>
889  void run(const vector_type_3d_view &y_packed_,
890  const MultiVectorLocalViewTypeB &b_,
891  const MultiVectorLocalViewTypeX &x_,
892  const MultiVectorLocalViewTypeX_Remote &x_remote_,
893  const bool compute_owned) {
894  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
895  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<OverlapTag>", ComputeResidual0, execution_space);
896 
897  b = b_; x = x_; x_remote = x_remote_;
898  if constexpr (is_device<execution_space>::value) {
899  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
900  y_packed_.extent(0),
901  y_packed_.extent(1),
902  y_packed_.extent(2),
903  vector_length);
904  } else {
905  y_packed = y_packed_;
906  }
907 
908  if constexpr (is_device<execution_space>::value) {
909  const local_ordinal_type blocksize = blocksize_requested;
910  const local_ordinal_type team_size = 8;
911  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
912  // local_ordinal_type vl_power_of_two = 1;
913  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
914  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
915  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
916 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
917  if (compute_owned) { \
918  const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
919  policy(rowidx2part.extent(0), team_size, vector_size); \
920  Kokkos::parallel_for \
921  ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
922  } else { \
923  const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
924  policy(rowidx2part.extent(0), team_size, vector_size); \
925  Kokkos::parallel_for \
926  ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
927  } break
928  switch (blocksize_requested) {
929  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
930  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
931  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
932  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
933  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
934  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
935  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
936  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
937  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
938  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
939  }
940 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
941  } else {
942 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
943  if (compute_owned) { \
944  const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > \
945  policy(0, rowidx2part.extent(0)); \
946  Kokkos::parallel_for \
947  ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
948  } else { \
949  const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > \
950  policy(0, rowidx2part.extent(0)); \
951  Kokkos::parallel_for \
952  ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
953  } break
954 
955  switch (blocksize_requested) {
956  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
957  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
958  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
959  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
960  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
961  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
962  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
963  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
964  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
965  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
966  }
967 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
968  }
969  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
970  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
971  }
972  };
973 
974 
975  } // namespace BlockHelperDetails
976 
977 } // namespace Ifpack2
978 
979 #endif