10 #ifndef KOKKOS_CUDA_SYMMETRIC_DIAGONAL_SPEC_HPP
11 #define KOKKOS_CUDA_SYMMETRIC_DIAGONAL_SPEC_HPP
15 #include "Kokkos_Core.hpp"
// Fragment of the host-side routine that chooses the CUDA thread-block shape
// (presumably `thread_block(const block_type&)`, per the symbol index at the
// end of this listing -- confirm).
// NOTE(review): this is an excerpt. The function header, the definition of
// `d`, the arguments of std::min and the condition guarding the throw are on
// lines that are not shown (the embedded listing numbers skip).

// Warp width as published by Kokkos' CUDA traits.
37 const int warp_size = Kokkos::Impl::CudaTraits::WarpSize;
// Smaller of two unsigned values; both operands are on lines not shown.
// Presumably a cap on the number of warps per block -- confirm.
38 auto const maxWarpCount = std::min<unsigned>(
// Integer quotient of (maxWarpCount * warp_size) by `d`.
// `d` is defined on a line not shown -- presumably the block dimension; confirm.
41 const int y = ( maxWarpCount * warp_size ) / d ;
// Reports an oversized block by throwing std::runtime_error; the test that
// triggers it is not visible in this excerpt.
44 throw std::runtime_error( std::string(
"Stokhos::Multiply< SymmetricDiagonalSpec<Cuda> > ERROR: block too large") );
// Resulting block shape: x = d, y = min( y , (1 + d) / 2 ), z = 1.
48 return dim3( d ,
std::min( y , ( 1 + d ) / 2 ) , 1 );
// Fragment of a routine templated on VectorValue that returns a byte count
// (presumably `shmem_size(const block_type&)`, per the symbol index at the
// end of this listing -- confirm; the function header is not shown).
51 template<
typename VectorValue >
// Query the thread-block shape chosen by thread_block() for this block.
55 const dim3 d = thread_block( block );
// One VectorValue per (x, y) position of that shape: sizeof(VectorValue) * d.x * d.y bytes.
57 return sizeof(VectorValue) * d.x * d.y ;
// Fragment of a device routine templated on MatrixValue / VectorValue that
// accumulates products of matrix coefficients `a` with vector entries `x`
// into a per-thread value `y` (presumably `apply(block, a, x)`, per the
// symbol index at the end of this listing -- confirm).
// NOTE(review): this is an excerpt. The declarations of `y`, `ix`, `ia` and
// `dimension`, all closing braces, the second product term of the loop body
// and the final return are on lines not shown. No barrier is visible between
// the writes to and reads from `shX`; the omitted lines presumably contain
// them -- confirm against the full source before relying on this listing.
67 template<
typename MatrixValue ,
typename VectorValue >
// `a`: matrix coefficients (read-only); `x`: input vector (read-only).
70 const MatrixValue *
const a ,
71 const VectorValue *
const x )
// (dimension + 1) / 2 via a right shift, i.e. half the dimension rounded up
// for a non-negative `dimension`.
74 const int dim_half = ( dimension + 1 ) >> 1 ;
// Scratch buffer obtained from kokkos_impl_cuda_shared_memory<VectorValue>()
// (presumably the block's shared-memory allocation -- confirm).
76 VectorValue *
const shX = kokkos_impl_cuda_shared_memory<VectorValue>();
// Only the threadIdx.y == 0 row takes this branch; how far the branch extends
// is not visible because the closing brace is not shown.
83 if ( 0 == threadIdx.y ) {
// Stage x[threadIdx.x] into the scratch buffer.
86 shX[ threadIdx.x ] = x[ threadIdx.x ];
// Start the accumulator with the product taken from the first `dimension`-sized
// row of `a` (offset 0).
88 y = shX[ threadIdx.x ] * a[ threadIdx.x ];
// Extra term handled by row threadIdx.y == 0 only when `dimension` is even
// (low bit clear; `01` is an octal literal equal to 1).
93 if ( 0 == threadIdx.y && ! ( dimension & 01 ) ) {
// Coefficient index inside the row of `a` at offset dim_half * dimension.
99 ia = threadIdx.x + dim_half * dimension ;
// Only the first dim_half threads in x add this term; `ix` is set on a line
// not shown.
101 if ( threadIdx.x < dim_half ) {
108 y += shX[ ix ] * a[ ia ];
// Distance, in elements of `a`, between rows visited by successive iterations
// of the loop below.
113 const int A_stride = blockDim.y * dimension ;
// Each threadIdx.y starts at its own row index 1 + threadIdx.y ...
115 int d = 1 + threadIdx.y ;
// ... and `A` points at the start of that row of `a`.
117 const MatrixValue *
A = a + d * dimension ;
// Visit rows d, d + blockDim.y, ... while d < dim_half, advancing `A` in step.
119 for ( ; d < dim_half ; d += blockDim.y , A += A_stride ) {
// ix = threadIdx.x + d, wrapped back into [0, dimension) by one subtraction.
121 ix = threadIdx.x + d ;
if ( dimension <= ix ) ix -= dimension ;
// ia = threadIdx.x - d, wrapped back into [0, dimension) by one addition.
122 ia = threadIdx.x - d ;
if ( ia < 0 ) ia += dimension ;
// Accumulate shX[ix] * A[threadIdx.x] plus a second term that is on a line
// not shown (presumably the one using `ia` -- confirm).
129 y += shX[ ix ] * A[ threadIdx.x ] +
// Rows with threadIdx.y > 0 store their partial sum in their own
// `dimension`-sized row of the scratch buffer.
133 if ( 0 < threadIdx.y ) {
134 shX[ threadIdx.x + threadIdx.y * dimension ] = y ;
// Add the partial sums stored in rows 1 .. blockDim.y - 1 for this
// threadIdx.x; which threads run this loop is not visible in the excerpt.
139 for ( ix = 1 ; ix < blockDim.y ; ++ix ) {
140 y += shX[ threadIdx.x + ix * dimension ];
Kokkos::Cuda execution_space
static __host__ size_type matrix_size(const block_type &block)
KOKKOS_INLINE_FUNCTION unsigned matrix_size() const
Storage size for block coefficients.
Kokkos::DefaultExecutionSpace execution_space
Symmetric diagonal storage for a dense matrix.
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
SymmetricDiagonalSpec< Kokkos::Cuda > block_type
static __device__ VectorValue apply(const block_type &block, const MatrixValue *const a, const VectorValue *const x)
static __host__ size_type shmem_size(const block_type &block)
KOKKOS_INLINE_FUNCTION unsigned dimension() const
Dimension of vector block.
static __host__ dim3 thread_block(const block_type &block)
execution_space::size_type size_type