Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Stokhos_Cuda_SymmetricDiagonalSpec.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef KOKKOS_CUDA_SYMMETRIC_DIAGONAL_SPEC_HPP
11 #define KOKKOS_CUDA_SYMMETRIC_DIAGONAL_SPEC_HPP
12 
13 #include <stdexcept>
14 
15 #include "Kokkos_Core.hpp"
16 
17 #include "Stokhos_Multiply.hpp"
20 
21 //----------------------------------------------------------------------------
22 
23 namespace Stokhos {
24 
25 template<>
26 class BlockMultiply< SymmetricDiagonalSpec< Kokkos::Cuda > >
27 {
28 public:
29  typedef Kokkos::Cuda execution_space ;
30  typedef execution_space::size_type size_type ;
32 
33  __host__
34  static dim3 thread_block( const block_type & block )
35  {
36  const int d = block.dimension();
37  const int warp_size = Kokkos::Impl::CudaTraits::WarpSize;
38  auto const maxWarpCount = std::min<unsigned>(
39  execution_space().cuda_device_prop().maxThreadsPerBlock / warp_size,
40  warp_size);
41  const int y = ( maxWarpCount * warp_size ) / d ;
42 
43  if ( 0 == y ) {
44  throw std::runtime_error( std::string("Stokhos::Multiply< SymmetricDiagonalSpec<Cuda> > ERROR: block too large") );
45  }
46 
47  // dimension X #diagonals to concurrently process
48  return dim3( d , std::min( y , ( 1 + d ) / 2 ) , 1 );
49  }
50 
51  template< typename VectorValue >
52  __host__
53  static size_type shmem_size( const block_type & block )
54  {
55  const dim3 d = thread_block( block );
56 
57  return sizeof(VectorValue) * d.x * d.y ;
58  }
59 
60  __host__
61  static size_type matrix_size( const block_type & block )
62  { return block.matrix_size(); }
63 
64  // Required: blockDim.x == block.dimension()
65  // Required: blockDim.y > 1
66  // Computing: Y[ threadIdx.x ]
67  template< typename MatrixValue , typename VectorValue >
68  __device__
69  static VectorValue apply( const block_type & block ,
70  const MatrixValue * const a ,
71  const VectorValue * const x )
72  {
73  const int dimension = block.dimension();
74  const int dim_half = ( dimension + 1 ) >> 1 ;
75 
76  VectorValue * const shX = kokkos_impl_cuda_shared_memory<VectorValue>();
77 
78  int ia = -1 ;
79  int ix = -1 ;
80 
81  VectorValue y = 0 ;
82 
83  if ( 0 == threadIdx.y ) {
84  // Load vector and multiply the main diagonal (first diagonal)
85 
86  shX[ threadIdx.x ] = x[ threadIdx.x ]; // Load 'x'
87 
88  y = shX[ threadIdx.x ] * a[ threadIdx.x ];
89  }
90 
91  __syncthreads(); // Wait for load of 'x'
92 
93  if ( 0 == threadIdx.y && ! ( dimension & 01 ) ) {
94 
95  // If even number of diagonals then the last diagonal is half-length
96  // and only used once.
97 
98  ix = threadIdx.x ;
99  ia = threadIdx.x + dim_half * dimension ;
100 
101  if ( threadIdx.x < dim_half ) {
102  ix += dim_half ;
103  }
104  else {
105  ix -= dim_half ;
106  ia -= dim_half ;
107  }
108  y += shX[ ix ] * a[ ia ];
109  }
110 
111  //------------------------------------
112 
113  const int A_stride = blockDim.y * dimension ;
114 
115  int d = 1 + threadIdx.y ;
116 
117  const MatrixValue * A = a + d * dimension ;
118 
119  for ( ; d < dim_half ; d += blockDim.y , A += A_stride ) {
120 
121  ix = threadIdx.x + d ; if ( dimension <= ix ) ix -= dimension ;
122  ia = threadIdx.x - d ; if ( ia < 0 ) ia += dimension ;
123 
124  // A 'threadIdx.y' group accesses the matrix diagonal
125  // A[ 0 .. dimension - 1 ] twice in the following statement.
126  // Spatial and temporal locality are excellent for L1 cache reuse
127  // for the second access.
128 
129  y += shX[ ix ] * A[ threadIdx.x ] +
130  shX[ ia ] * A[ ia ];
131  }
132 
133  if ( 0 < threadIdx.y ) {
134  shX[ threadIdx.x + threadIdx.y * dimension ] = y ;
135  }
136 
137  __syncthreads(); // Wait for load of contributions to 'y'
138 
139  for ( ix = 1 ; ix < blockDim.y ; ++ix ) {
140  y += shX[ threadIdx.x + ix * dimension ];
141  }
142 
143  return y ;
144  }
145 };
146 
147 } /* namespace Stokhos */
148 
149 //----------------------------------------------------------------------------
150 
151 #endif /* #ifndef STOKHOS_CUDA_SYMMETRIC_DIAGONAL_SPEC_HPP */
KOKKOS_INLINE_FUNCTION unsigned matrix_size() const
Storage size for block coefficients.
Kokkos::DefaultExecutionSpace execution_space
Symmetric diagonal storage for a dense matrix.
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
static __device__ VectorValue apply(const block_type &block, const MatrixValue *const a, const VectorValue *const x)
KOKKOS_INLINE_FUNCTION unsigned dimension() const
Dimension of vector block.