#ifndef STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP
#define STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

// Declarations used below (Multiply, BlockCrsMatrix, LinearSparse3Tensor)
#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_LinearSparse3Tensor.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {
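//----------------------------------------------------------------------------
// Specialization of Multiply for a block-CRS matrix whose dense blocks are
// contractions with a linear sparse 3-tensor. Each CUDA thread block covers
// blockDim.y spatial rows, with one warp (blockDim.x lanes) striding over
// the stochastic rows of each block.
//----------------------------------------------------------------------------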
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar,
          int BlockSize >
class Multiply<
  BlockCrsMatrix< LinearSparse3Tensor< TensorScalar, Kokkos::Cuda, BlockSize >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda >,
  Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > >
{
public:
  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef LinearSparse3Tensor< TensorScalar, execution_space, BlockSize > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;
  template <int MAX_COL>
  class ApplyKernelSymmetric {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ApplyKernelSymmetric( const matrix_type & A,
                          const vector_type & x,
                          const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}
    __device__
    void operator()(void) const
    {
      // Stochastic dimension: number of rows in each dense block
      const size_type dim = m_A.block.dimension();
      // Carve the dynamic shared-memory buffer into per-warp slices
      // (one slice per threadIdx.y, i.e. per spatial row).
      volatile VectorScalar * const sh =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y0 =
        sh + blockDim.x*threadIdx.y;
      volatile VectorScalar * const sh_a0 =
        sh + blockDim.x*blockDim.y + MAX_COL*threadIdx.y;
      volatile VectorScalar * const sh_x0 =
        sh + blockDim.x*blockDim.y + MAX_COL*blockDim.y + MAX_COL*threadIdx.y;
      volatile size_type * const sh_col =
        (size_type*)(sh + blockDim.x*blockDim.y + 2*MAX_COL*blockDim.y) +
        MAX_COL*threadIdx.y;
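      // Resulting shared-memory map for the whole thread block:
      //   VectorScalar sh_y0 [ blockDim.x * blockDim.y ]  -- y0 reduction scratch
      //   VectorScalar sh_a0 [ MAX_COL    * blockDim.y ]  -- cached A(0,col) per column
      //   VectorScalar sh_x0 [ MAX_COL    * blockDim.y ]  -- cached x(0,col) per column
      //   size_type    sh_col[ MAX_COL    * blockDim.y ]  -- cached column indices
      // The shmem size requested in apply() below must match this layout.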
      // Spatial row handled by this warp
      const size_type row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row < m_A.graph.row_map.extent(0)-1) {
        const size_type colBeg = m_A.graph.row_map[ row ];
        const size_type colEnd = m_A.graph.row_map[ row + 1 ];

        // The symmetric linear sparse 3-tensor has two distinct values
        const TensorScalar c0 = m_A.block.value(0);
        const TensorScalar c1 = m_A.block.value(1);
        VectorScalar y0 = 0.0;

        // Stage this row's column indices in shared memory
        for ( size_type lcol = threadIdx.x; lcol < colEnd-colBeg;
              lcol += blockDim.x )
          sh_col[lcol] = m_A.graph.entries( lcol+colBeg );
        // Each warp lane strides over the stochastic rows
        for ( size_type stoch_row = threadIdx.x; stoch_row < dim;
              stoch_row += blockDim.x ) {

          VectorScalar yi = 0.0;

          // Loop over the nonzero columns of this spatial row
          for ( size_type col_offset = colBeg; col_offset < colEnd;
                ++col_offset ) {

            const size_type lcol = col_offset-colBeg;
            const size_type col = sh_col[lcol];

            // Matrix and input-vector entries for this lane
            const MatrixScalar ai = m_A.values( stoch_row, col_offset );
            const VectorScalar xi = m_x( stoch_row, col );
            // The lane holding stochastic row 0 caches a0 and x0 so the
            // whole warp can read them from shared memory
            if (stoch_row == 0) {
              sh_a0[lcol] = ai;
              sh_x0[lcol] = xi;
            }

            const MatrixScalar a0 = sh_a0[lcol];
            const VectorScalar x0 = sh_x0[lcol];

            // Mean (stochastic row 0) contribution, reduced across the warp below
            if (stoch_row == 0) y0 += (c0-3.0*c1)*a0*x0;

            yi += c1*(a0*xi + ai*x0);
          }

          m_y( stoch_row, row ) = yi;
        }
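        // Warp-synchronous tree reduction of y0 across the warp's lanes;
        // correctness relies on blockDim.x being the warp size and on the
        // volatile shared-memory scratch. On newer CUDA the same reduction
        // could be sketched with warp shuffles instead, e.g.
        //   for (int w = 16; w > 0; w /= 2)
        //     y0 += __shfl_down_sync(0xffffffff, y0, w);
        // (illustrative alternative only, not what this file does).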
        sh_y0[ threadIdx.x ] = y0;
        if ( threadIdx.x + 16 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+16];
        if ( threadIdx.x +  8 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 8];
        if ( threadIdx.x +  4 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 4];
        if ( threadIdx.x +  2 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 2];
        if ( threadIdx.x +  1 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 1];

        // Lane 0 accumulates the reduced mean contribution
        if ( threadIdx.x == 0 ) m_y( 0, row ) += sh_y0[0];
      }
    }
  };
  template <int MAX_COL>
  class ApplyKernelAsymmetric {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ApplyKernelAsymmetric( const matrix_type & A,
                           const vector_type & x,
                           const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}

    __device__
    void operator()(void) const
    {
      // Stochastic dimension: number of rows in each dense block
      const size_type dim = m_A.block.dimension();
      // Shared-memory carving identical to ApplyKernelSymmetric above
      volatile VectorScalar * const sh =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y0 =
        sh + blockDim.x*threadIdx.y;
      volatile VectorScalar * const sh_a0 =
        sh + blockDim.x*blockDim.y + MAX_COL*threadIdx.y;
      volatile VectorScalar * const sh_x0 =
        sh + blockDim.x*blockDim.y + MAX_COL*blockDim.y + MAX_COL*threadIdx.y;
      volatile size_type * const sh_col =
        (size_type*)(sh + blockDim.x*blockDim.y + 2*MAX_COL*blockDim.y) +
        MAX_COL*threadIdx.y;
      // Spatial row handled by this warp
      const size_type row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row < m_A.graph.row_map.extent(0)-1) {
        const size_type colBeg = m_A.graph.row_map[ row ];
        const size_type colEnd = m_A.graph.row_map[ row + 1 ];

        // The asymmetric linear sparse 3-tensor has three distinct values
        const TensorScalar c0 = m_A.block.value(0);
        const TensorScalar c1 = m_A.block.value(1);
        const TensorScalar c2 = m_A.block.value(2);

        VectorScalar y0 = 0.0;
        // Stage this row's column indices in shared memory
        for ( size_type lcol = threadIdx.x; lcol < colEnd-colBeg;
              lcol += blockDim.x )
          sh_col[lcol] = m_A.graph.entries( lcol+colBeg );
        // Each warp lane strides over the stochastic rows
        for ( size_type stoch_row = threadIdx.x; stoch_row < dim;
              stoch_row += blockDim.x ) {

          VectorScalar yi = 0.0;

          // Loop over the nonzero columns of this spatial row
          for ( size_type col_offset = colBeg; col_offset < colEnd;
                ++col_offset ) {

            const size_type lcol = col_offset-colBeg;
            const size_type col = sh_col[lcol];

            // Matrix and input-vector entries for this lane
            const MatrixScalar ai = m_A.values( stoch_row, col_offset );
            const VectorScalar xi = m_x( stoch_row, col );
            // The lane holding stochastic row 0 caches a0 and x0 so the
            // whole warp can read them from shared memory
            if (stoch_row == 0) {
              sh_a0[lcol] = ai;
              sh_x0[lcol] = xi;
            }

            const MatrixScalar a0 = sh_a0[lcol];
            const VectorScalar x0 = sh_x0[lcol];

            // Mean (stochastic row 0) contribution, reduced across the warp below
            if (stoch_row == 0) y0 += (c0-3.0*c1-c2)*a0*x0;

            yi += c1*(a0*xi + ai*x0) + c2*ai*xi;
          }

          m_y( stoch_row, row ) = yi;
        }
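        // Note the extra c2*ai*xi term relative to ApplyKernelSymmetric:
        // the asymmetric tensor carries a third distinct value, c2, which
        // couples ai and xi of the same stochastic row, and the mean update
        // uses (c0 - 3*c1 - c2) in place of (c0 - 3*c1).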
        sh_y0[ threadIdx.x ] = y0;
        if ( threadIdx.x + 16 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+16];
        if ( threadIdx.x +  8 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 8];
        if ( threadIdx.x +  4 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 4];
        if ( threadIdx.x +  2 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 2];
        if ( threadIdx.x +  1 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 1];

        // Lane 0 accumulates the reduced mean contribution
        if ( threadIdx.x == 0 ) m_y( 0, row ) += sh_y0[0];
      }
    }
  };
  //--------------------------------------------------------------------------

  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
    const size_type num_spatial_rows = A.graph.row_map.extent(0) - 1;
    const size_type num_stoch_rows = A.block.dimension();

    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    // Spatial rows per thread block: a tunable shape; 8 is an assumed value
    const size_type num_rows_per_block = 8;
    size_type num_blocks = num_spatial_rows / num_rows_per_block;
    if (num_blocks * num_rows_per_block < num_spatial_rows)
      ++num_blocks;
    const dim3 dBlock( warp_size, num_rows_per_block );
    const dim3 dGrid( num_blocks, 1, 1 );

    const int MAX_COL = 32;  // Maximum number of nonzero columns per row
    const size_type shmem =
      sizeof(VectorScalar) * (dBlock.x*dBlock.y + 2*MAX_COL*dBlock.y) +
      sizeof(size_type) * (MAX_COL*dBlock.y);
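    // The request above mirrors the kernels' shared-memory carving:
    //   dBlock.x*dBlock.y  VectorScalars for the y0 reduction scratch,
    //   2*MAX_COL*dBlock.y VectorScalars for the cached a0/x0 rows, and
    //   MAX_COL*dBlock.y   size_types for the cached column indices.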
    std::cout << "Multiply< BlockCrsMatrix< LinearSparse3Tensor ... > >::apply"
              << std::endl
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << ","
              << dBlock.z << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  num_spatial_rows(" << num_spatial_rows << ")" << std::endl
              << "  num_stoch_rows(" << num_stoch_rows << ")" << std::endl;
    // Dispatch to the kernel matching the tensor's symmetry
    if (A.block.symmetric())
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
        ( ApplyKernelSymmetric<MAX_COL>( A, x, y ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
        ( ApplyKernelAsymmetric<MAX_COL>( A, x, y ) );
  }
};
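//----------------------------------------------------------------------------
// A minimal usage sketch (not part of the original source; "A", "x", "y",
// the scalar types, and BlockSize = 4 are hypothetical):
//
//   typedef Stokhos::LinearSparse3Tensor<double, Kokkos::Cuda, 4> tensor_type;
//   typedef Stokhos::BlockCrsMatrix<tensor_type, double, Kokkos::Cuda> matrix_type;
//   typedef Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::Cuda> vector_type;
//
//   matrix_type A;  // assembled elsewhere
//   vector_type x( "x", num_stoch_rows, num_spatial_rows );
//   vector_type y( "y", num_stoch_rows, num_spatial_rows );
//   Stokhos::Multiply<matrix_type, vector_type, vector_type>::apply( A, x, y );
//----------------------------------------------------------------------------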

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP */