42 #ifndef KOKKOS_CUDA_SYMMETRIC_DIAGONAL_SPEC_HPP
43 #define KOKKOS_CUDA_SYMMETRIC_DIAGONAL_SPEC_HPP
47 #include "Kokkos_Core.hpp"
69 const int warp_size = Kokkos::Impl::CudaTraits::WarpSize;
70 const int y = ( Kokkos::Impl::cuda_internal_maximum_warp_count() * warp_size ) / d ;
73 throw std::runtime_error( std::string(
"Stokhos::Multiply< SymmetricDiagonalSpec<Cuda> > ERROR: block too large") );
77 return dim3( d ,
std::min( y , ( 1 + d ) / 2 ) , 1 );
80 template<
typename VectorValue >
84 const dim3 d = thread_block( block );
86 return sizeof(VectorValue) * d.x * d.y ;
96 template<
typename MatrixValue ,
typename VectorValue >
99 const MatrixValue *
const a ,
100 const VectorValue *
const x )
103 const int dim_half = ( dimension + 1 ) >> 1 ;
105 VectorValue *
const shX = kokkos_impl_cuda_shared_memory<VectorValue>();
112 if ( 0 == threadIdx.y ) {
115 shX[ threadIdx.x ] = x[ threadIdx.x ];
117 y = shX[ threadIdx.x ] * a[ threadIdx.x ];
122 if ( 0 == threadIdx.y && ! ( dimension & 01 ) ) {
128 ia = threadIdx.x + dim_half * dimension ;
130 if ( threadIdx.x < dim_half ) {
137 y += shX[ ix ] * a[ ia ];
142 const int A_stride = blockDim.y * dimension ;
144 int d = 1 + threadIdx.y ;
146 const MatrixValue *
A = a + d * dimension ;
148 for ( ; d < dim_half ; d += blockDim.y , A += A_stride ) {
150 ix = threadIdx.x + d ;
if ( dimension <= ix ) ix -= dimension ;
151 ia = threadIdx.x - d ;
if ( ia < 0 ) ia += dimension ;
158 y += shX[ ix ] * A[ threadIdx.x ] +
162 if ( 0 < threadIdx.y ) {
163 shX[ threadIdx.x + threadIdx.y * dimension ] = y ;
168 for ( ix = 1 ; ix < blockDim.y ; ++ix ) {
169 y += shX[ threadIdx.x + ix * dimension ];
Kokkos::Cuda execution_space
static __host__ size_type matrix_size(const block_type &block)
KOKKOS_INLINE_FUNCTION unsigned matrix_size() const
Storage size for block coefficients.
Symmetric diagonal storage for a dense matrix.
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
SymmetricDiagonalSpec< Kokkos::Cuda > block_type
static __device__ VectorValue apply(const block_type &block, const MatrixValue *const a, const VectorValue *const x)
static __host__ size_type shmem_size(const block_type &block)
KOKKOS_INLINE_FUNCTION unsigned dimension() const
Dimension of vector block.
static __host__ dim3 thread_block(const block_type &block)
execution_space::size_type size_type