#ifndef STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP
#define STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

// Stokhos declarations used below: Multiply, BlockCrsMatrix, LinearSparse3Tensor
#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_LinearSparse3Tensor.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar,
          int BlockSize >
class Multiply<
  BlockCrsMatrix< LinearSparse3Tensor< TensorScalar, Kokkos::Cuda, BlockSize >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda >,
  Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef LinearSparse3Tensor< TensorScalar, execution_space, BlockSize > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;
  template <int MAX_COL>
  class ApplyKernelSymmetric {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ApplyKernelSymmetric( const matrix_type & A,
                          const vector_type & x,
                          const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}
    __device__
    void operator()(void) const
    {
      // Stochastic (block) dimension
      const size_type dim = m_A.block.dimension();
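      // Shared memory layout, per thread row (threadIdx.y):
      //   sh_y0  : blockDim.x slots used to reduce this row's y0 contribution
      //   sh_a0  : up to MAX_COL cached matrix entries for stochastic row 0
      //   sh_x0  : up to MAX_COL cached input entries for stochastic row 0
      //   sh_col : up to MAX_COL column indices of this spatial row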
      volatile VectorScalar * const sh =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y0 =
        sh + blockDim.x*threadIdx.y;
      volatile VectorScalar * const sh_a0 =
        sh + blockDim.x*blockDim.y + MAX_COL*threadIdx.y;
      volatile VectorScalar * const sh_x0 =
        sh + blockDim.x*blockDim.y + MAX_COL*blockDim.y + MAX_COL*threadIdx.y;
      volatile size_type * const sh_col =
        (size_type*)(sh + blockDim.x*blockDim.y + 2*MAX_COL*blockDim.y) + MAX_COL*threadIdx.y;
      // Spatial row handled by this thread row
      const size_type row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row < m_A.graph.row_map.extent(0)-1) {
        const size_type colBeg = m_A.graph.row_map[ row ];
        const size_type colEnd = m_A.graph.row_map[ row + 1 ];

        // Values of the symmetric linear sparse 3-tensor
        const TensorScalar c0 = m_A.block.value(0);
        const TensorScalar c1 = m_A.block.value(1);

        VectorScalar y0 = 0.0;
        // Cooperatively read this row's column indices into shared memory
        for ( size_type lcol = threadIdx.x; lcol < colEnd-colBeg;
              lcol += blockDim.x )
          sh_col[lcol] = m_A.graph.entries( lcol+colBeg );
        for ( size_type stoch_row = threadIdx.x; stoch_row < dim;
              stoch_row += blockDim.x ) {

          VectorScalar yi = 0.0;
          // Loop over the columns of this spatial row
          for ( size_type col_offset = colBeg; col_offset < colEnd;
                ++col_offset ) {

            const size_type lcol = col_offset-colBeg;
            const size_type col  = sh_col[lcol];

            const MatrixScalar ai = m_A.values( stoch_row, col_offset );
            const VectorScalar xi = m_x( stoch_row, col );
            // The thread owning stochastic row 0 stashes its entries so the
            // whole warp can read a0 and x0 from shared memory
            if (stoch_row == 0) {
              sh_a0[lcol] = ai;
              sh_x0[lcol] = xi;
            }
            const MatrixScalar a0 = sh_a0[lcol];
            const VectorScalar x0 = sh_x0[lcol];

            // Tensor contraction; the (c0-3.0*c1) factor compensates for the
            // stoch_row == 0 terms also picked up by the updates below
            if (stoch_row == 0) y0 += (c0-3.0*c1)*a0*x0;
            y0 += c1*ai*xi;
            yi += c1*(a0*xi + ai*x0);
          }

          m_y( stoch_row, row ) = yi;
        }
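        // y0 holds each thread's partial contribution to the mean (stochastic
        // row 0) result; reduce it across the warp-wide blockDim.x using the
        // volatile shared-memory buffer, then let thread 0 update m_y(0,row).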
        sh_y0[ threadIdx.x ] = y0;
        if ( threadIdx.x + 16 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+16];
        if ( threadIdx.x + 8 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 8];
        if ( threadIdx.x + 4 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 4];
        if ( threadIdx.x + 2 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 2];
        if ( threadIdx.x + 1 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 1];

        if ( threadIdx.x == 0 ) m_y( 0, row ) += sh_y0[0];
      }
    }
  };
  template <int MAX_COL>
  class ApplyKernelAsymmetric {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ApplyKernelAsymmetric( const matrix_type & A,
                           const vector_type & x,
                           const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}
    __device__
    void operator()(void) const
    {
      // Stochastic (block) dimension
      const size_type dim = m_A.block.dimension();
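      // Shared memory layout matches ApplyKernelSymmetric above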
      volatile VectorScalar * const sh =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y0 =
        sh + blockDim.x*threadIdx.y;
      volatile VectorScalar * const sh_a0 =
        sh + blockDim.x*blockDim.y + MAX_COL*threadIdx.y;
      volatile VectorScalar * const sh_x0 =
        sh + blockDim.x*blockDim.y + MAX_COL*blockDim.y + MAX_COL*threadIdx.y;
      volatile size_type * const sh_col =
        (size_type*)(sh + blockDim.x*blockDim.y + 2*MAX_COL*blockDim.y) + MAX_COL*threadIdx.y;
      // Spatial row handled by this thread row
      const size_type row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row < m_A.graph.row_map.extent(0)-1) {
        const size_type colBeg = m_A.graph.row_map[ row ];
        const size_type colEnd = m_A.graph.row_map[ row + 1 ];

        // Values of the asymmetric linear sparse 3-tensor
        const TensorScalar c0 = m_A.block.value(0);
        const TensorScalar c1 = m_A.block.value(1);
        const TensorScalar c2 = m_A.block.value(2);

        VectorScalar y0 = 0.0;
        // Cooperatively read this row's column indices into shared memory
        for ( size_type lcol = threadIdx.x; lcol < colEnd-colBeg;
              lcol += blockDim.x )
          sh_col[lcol] = m_A.graph.entries( lcol+colBeg );
        for ( size_type stoch_row = threadIdx.x; stoch_row < dim;
              stoch_row += blockDim.x ) {

          VectorScalar yi = 0.0;
          for ( size_type col_offset = colBeg; col_offset < colEnd;
                ++col_offset ) {

            const size_type lcol = col_offset-colBeg;
            const size_type col  = sh_col[lcol];

            const MatrixScalar ai = m_A.values( stoch_row, col_offset );
            const VectorScalar xi = m_x( stoch_row, col );
            if (stoch_row == 0) {
              sh_a0[lcol] = ai;
              sh_x0[lcol] = xi;
            }
            const MatrixScalar a0 = sh_a0[lcol];
            const VectorScalar x0 = sh_x0[lcol];

            if (stoch_row == 0) y0 += (c0-3.0*c1-c2)*a0*x0;
            y0 += c1*ai*xi;
            yi += c1*(a0*xi + ai*x0) + c2*ai*xi;
          }

          m_y( stoch_row, row ) = yi;
        }
        // Warp-synchronous reduction of y0, as in the symmetric kernel
        sh_y0[ threadIdx.x ] = y0;
        if ( threadIdx.x + 16 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+16];
        if ( threadIdx.x + 8 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 8];
        if ( threadIdx.x + 4 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 4];
        if ( threadIdx.x + 2 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 2];
        if ( threadIdx.x + 1 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 1];

        if ( threadIdx.x == 0 ) m_y( 0, row ) += sh_y0[0];
      }
    }
  };
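  // Usage sketch (hypothetical sizes; this specialization is normally selected
  // by the generic Stokhos::Multiply dispatch rather than called directly):
  //
  //   matrix_type A = /* assembled block CRS matrix */;
  //   vector_type x( "x", num_stoch_rows, num_spatial_rows );
  //   vector_type y( "y", num_stoch_rows, num_spatial_rows );
  //   Multiply< matrix_type, vector_type, vector_type >::apply( A, x, y );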
  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
    const size_type num_spatial_rows = A.graph.row_map.extent(0) - 1;
    const size_type num_stoch_rows = A.block.dimension();

    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    // Spatial rows handled per thread block; the exact heuristic is not shown
    // in this listing, so a fixed value is assumed here.
    const size_type num_rows_per_block = 8;
    size_type num_blocks = num_spatial_rows / num_rows_per_block;
    if (num_blocks * num_rows_per_block < num_spatial_rows)
      ++num_blocks;

    const dim3 dBlock( warp_size, num_rows_per_block );
    const dim3 dGrid( num_blocks, 1, 1 );

    const int MAX_COL = 32;
    // Dynamic shared memory: y0-reduction buffer plus a0/x0 caches
    // (VectorScalar) and the column-index cache (size_type), matching the
    // per-thread-row layout used by the kernels
    const size_type shmem =
      sizeof(VectorScalar) * (dBlock.x*dBlock.y + 2*MAX_COL*dBlock.y) +
      sizeof(size_type) * (MAX_COL*dBlock.y);
    std::cout << "Multiply< BlockCrsMatrix< LinearSparse3Tensor ... > >::apply"
              << std::endl
              << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << " block(" << dBlock.x << "," << dBlock.y << "," << dBlock.z << ")" << std::endl
              << " shmem(" << shmem/1024 << " kB)" << std::endl
              << " num_spatial_rows(" << num_spatial_rows << ")" << std::endl
              << " num_stoch_rows(" << num_stoch_rows << ")" << std::endl;
    if (A.block.symmetric())
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
        ( ApplyKernelSymmetric<MAX_COL>( A, x, y ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
        ( ApplyKernelAsymmetric<MAX_COL>( A, x, y ) );
  }
};

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP */