Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Stokhos_Cuda_SymmetricDiagonalSpec.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #ifndef KOKKOS_CUDA_SYMMETRIC_DIAGONAL_SPEC_HPP
43 #define KOKKOS_CUDA_SYMMETRIC_DIAGONAL_SPEC_HPP
44 
45 #include <stdexcept>
46 
47 #include "Kokkos_Core.hpp"
48 
49 #include "Stokhos_Multiply.hpp"
52 
53 //----------------------------------------------------------------------------
54 
55 namespace Stokhos {
56 
57 template<>
58 class BlockMultiply< SymmetricDiagonalSpec< Kokkos::Cuda > >
59 {
60 public:
61  typedef Kokkos::Cuda execution_space ;
62  typedef execution_space::size_type size_type ;
64 
65  __host__
66  static dim3 thread_block( const block_type & block )
67  {
68  const int d = block.dimension();
69  const int warp_size = Kokkos::Impl::CudaTraits::WarpSize;
70  const int y = ( Kokkos::Impl::cuda_internal_maximum_warp_count() * warp_size ) / d ;
71 
72  if ( 0 == y ) {
73  throw std::runtime_error( std::string("Stokhos::Multiply< SymmetricDiagonalSpec<Cuda> > ERROR: block too large") );
74  }
75 
76  // dimension X #diagonals to concurrently process
77  return dim3( d , std::min( y , ( 1 + d ) / 2 ) , 1 );
78  }
79 
80  template< typename VectorValue >
81  __host__
82  static size_type shmem_size( const block_type & block )
83  {
84  const dim3 d = thread_block( block );
85 
86  return sizeof(VectorValue) * d.x * d.y ;
87  }
88 
89  __host__
90  static size_type matrix_size( const block_type & block )
91  { return block.matrix_size(); }
92 
93  // Required: blockDim.x == block.dimension()
94  // Required: blockDim.y > 1
95  // Computing: Y[ threadIdx.x ]
96  template< typename MatrixValue , typename VectorValue >
97  __device__
98  static VectorValue apply( const block_type & block ,
99  const MatrixValue * const a ,
100  const VectorValue * const x )
101  {
102  const int dimension = block.dimension();
103  const int dim_half = ( dimension + 1 ) >> 1 ;
104 
105  VectorValue * const shX = kokkos_impl_cuda_shared_memory<VectorValue>();
106 
107  int ia = -1 ;
108  int ix = -1 ;
109 
110  VectorValue y = 0 ;
111 
112  if ( 0 == threadIdx.y ) {
113  // Load vector and multiply the main diagonal (first diagonal)
114 
115  shX[ threadIdx.x ] = x[ threadIdx.x ]; // Load 'x'
116 
117  y = shX[ threadIdx.x ] * a[ threadIdx.x ];
118  }
119 
120  __syncthreads(); // Wait for load of 'x'
121 
122  if ( 0 == threadIdx.y && ! ( dimension & 01 ) ) {
123 
124  // If even number of diagonals then the last diagonal is half-length
125  // and only used once.
126 
127  ix = threadIdx.x ;
128  ia = threadIdx.x + dim_half * dimension ;
129 
130  if ( threadIdx.x < dim_half ) {
131  ix += dim_half ;
132  }
133  else {
134  ix -= dim_half ;
135  ia -= dim_half ;
136  }
137  y += shX[ ix ] * a[ ia ];
138  }
139 
140  //------------------------------------
141 
142  const int A_stride = blockDim.y * dimension ;
143 
144  int d = 1 + threadIdx.y ;
145 
146  const MatrixValue * A = a + d * dimension ;
147 
148  for ( ; d < dim_half ; d += blockDim.y , A += A_stride ) {
149 
150  ix = threadIdx.x + d ; if ( dimension <= ix ) ix -= dimension ;
151  ia = threadIdx.x - d ; if ( ia < 0 ) ia += dimension ;
152 
153  // A 'threadIdx.y' group accesses the matrix diagonal
154  // A[ 0 .. dimension - 1 ] twice in the following statement.
155  // Spatial and temporal locality are excellent for L1 cache reuse
156  // for the second access.
157 
158  y += shX[ ix ] * A[ threadIdx.x ] +
159  shX[ ia ] * A[ ia ];
160  }
161 
162  if ( 0 < threadIdx.y ) {
163  shX[ threadIdx.x + threadIdx.y * dimension ] = y ;
164  }
165 
166  __syncthreads(); // Wait for load of contributions to 'y'
167 
168  for ( ix = 1 ; ix < blockDim.y ; ++ix ) {
169  y += shX[ threadIdx.x + ix * dimension ];
170  }
171 
172  return y ;
173  }
174 };
175 
176 } /* namespace Stokhos */
177 
178 //----------------------------------------------------------------------------
179 
180 #endif /* #ifndef STOKHOS_CUDA_SYMMETRIC_DIAGONAL_SPEC_HPP */
KOKKOS_INLINE_FUNCTION unsigned matrix_size() const
Storage size for block coefficients.
Symmetric diagonal storage for a dense matrix.
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
static __device__ VectorValue apply(const block_type &block, const MatrixValue *const a, const VectorValue *const x)
KOKKOS_INLINE_FUNCTION unsigned dimension() const
Dimension of vector block.