#ifndef STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP
#define STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_LinearSparse3Tensor.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar,
          int BlockSize >
class Multiply<
  BlockCrsMatrix< LinearSparse3Tensor< TensorScalar, Kokkos::Cuda, BlockSize >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:
  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef LinearSparse3Tensor< TensorScalar, execution_space, BlockSize > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**,
                        Kokkos::LayoutLeft,
                        Kokkos::Cuda > vector_type;
  template <int MAX_COL>
  class ApplyKernelSymmetric {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ApplyKernelSymmetric( const matrix_type & A,
                          const vector_type & x,
                          const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}
 
    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic basis
      const size_type dim = m_A.block.dimension();

      // Carve up dynamic shared memory: per-thread y(0) partial sums,
      // then per-spatial-row caches of a(0), x(0), and the column
      // indices, each MAX_COL entries wide
      volatile VectorScalar * const sh =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y0 =
        sh + blockDim.x*threadIdx.y;
      volatile VectorScalar * const sh_a0 =
        sh + blockDim.x*blockDim.y + MAX_COL*threadIdx.y;
      volatile VectorScalar * const sh_x0 =
        sh + blockDim.x*blockDim.y + MAX_COL*blockDim.y + MAX_COL*threadIdx.y;
      volatile size_type * const sh_col =
        (size_type*)(sh + blockDim.x*blockDim.y + 2*MAX_COL*blockDim.y) +
        MAX_COL*threadIdx.y;
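      // Offset map for one thread-block (comment sketch; each thread row
      // threadIdx.y gets its own MAX_COL-wide slice):
      //   sh    [0 .. x*y)               : y0 partial sums (x = blockDim.x, y = blockDim.y)
      //   sh_a0 [x*y .. x*y + MAX_COL*y) : a(0) cache per column
      //   sh_x0 [.. x*y + 2*MAX_COL*y)   : x(0) cache per column
      //   sh_col[beyond, as size_type]   : column indices
      // The byte total for this carve-up is computed in apply() below.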
 
      // Each threadIdx.y slice handles one spatial (FEM) row
      const size_type row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row < m_A.graph.row_map.extent(0)-1) {
        const size_type colBeg = m_A.graph.row_map[ row ];
        const size_type colEnd = m_A.graph.row_map[ row + 1 ];

        // Read tensor entries
        const TensorScalar c0 = m_A.block.value(0);
        const TensorScalar c1 = m_A.block.value(1);

        // Zero this thread's partial sum for y(0)
        VectorScalar y0 = 0.0;

        // Read column indices for this row into shared memory
        for ( size_type lcol = threadIdx.x; lcol < colEnd-colBeg;
              lcol += blockDim.x )
          sh_col[lcol] = m_A.graph.entries( lcol+colBeg );

        // Loop over stochastic rows
        for ( size_type stoch_row = threadIdx.x; stoch_row < dim;
              stoch_row += blockDim.x) {

          VectorScalar yi = 0.0;

          // Loop over columns of this spatial row
          for ( size_type col_offset = colBeg; col_offset < colEnd;
                ++col_offset ) {

            // Get column index
            const size_type lcol = col_offset-colBeg;
            const size_type col = sh_col[lcol];

            // Read a(stoch_row) and x(stoch_row) for this column (coalesced)
            const MatrixScalar ai = m_A.values( stoch_row, col_offset );
            const VectorScalar xi = m_x( stoch_row, col );

            // Cache a(0) and x(0) for this column in shared memory
            if (stoch_row == 0) {
              sh_a0[lcol] = ai;
              sh_x0[lcol] = xi;
            }

            const MatrixScalar a0 = sh_a0[lcol];
            const VectorScalar x0 = sh_x0[lcol];

            // Correct for the 3*c1*a0*x0 accumulated by the stoch_row == 0
            // iteration (see the note after this loop)
            if (stoch_row == 0) y0 += (c0-3.0*c1)*a0*x0;

            yi += c1*(a0*xi + ai*x0);
            y0 += c1*ai*xi;
          }

          // Store y(stoch_row) for this spatial row
          m_y( stoch_row, row ) = yi;
        }
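        // Why (c0 - 3*c1): per matrix column the symmetric linear tensor
        // realizes
        //   y(0) = c0*a(0)*x(0) + c1 * sum_{i>0} a(i)*x(i)
        //   y(i) = c1*( a(0)*x(i) + a(i)*x(0) ),  i > 0.
        // At stoch_row == 0 the generic updates add 2*c1*a0*x0 to yi and
        // c1*a0*x0 to y0, i.e. 3*c1*a0*x0 in total, so adding
        // (c0 - 3*c1)*a0*x0 leaves exactly c0*a0*x0 once y0 is folded into
        // m_y(0, row) below.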
 
        // Warp-synchronous tree reduction of the y(0) partial sums across
        // threadIdx.x (assumes blockDim.x <= warp size, so no __syncthreads)
        sh_y0[ threadIdx.x ] = y0;
        if ( threadIdx.x + 16 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+16];
        if ( threadIdx.x +  8 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 8];
        if ( threadIdx.x +  4 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 4];
        if ( threadIdx.x +  2 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 2];
        if ( threadIdx.x +  1 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 1];

        // Fold the reduced y(0) into the mean (stochastic row 0) entry
        if ( threadIdx.x == 0 ) m_y( 0, row ) += sh_y0[0];
      }
    }
  };
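  // Aside (sketch only, not in the original): on architectures with warp
  // shuffles the same y0 reduction could avoid shared memory entirely,
  // assuming blockDim.x is exactly one full 32-lane warp:
  //
  //   VectorScalar v = y0;
  //   for (int delta = 16; delta > 0; delta >>= 1)
  //     v += __shfl_down_sync(0xffffffffu, v, delta);
  //   if (threadIdx.x == 0) m_y( 0, row ) += v;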
 
  template <int MAX_COL>
  class ApplyKernelAsymmetric {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ApplyKernelAsymmetric( const matrix_type & A,
                           const vector_type & x,
                           const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}
 
    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic basis
      const size_type dim = m_A.block.dimension();

      // Shared memory layout is identical to ApplyKernelSymmetric
      volatile VectorScalar * const sh =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y0 =
        sh + blockDim.x*threadIdx.y;
      volatile VectorScalar * const sh_a0 =
        sh + blockDim.x*blockDim.y + MAX_COL*threadIdx.y;
      volatile VectorScalar * const sh_x0 =
        sh + blockDim.x*blockDim.y + MAX_COL*blockDim.y + MAX_COL*threadIdx.y;
      volatile size_type * const sh_col =
        (size_type*)(sh + blockDim.x*blockDim.y + 2*MAX_COL*blockDim.y) +
        MAX_COL*threadIdx.y;
 
      // Each threadIdx.y slice handles one spatial (FEM) row
      const size_type row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row < m_A.graph.row_map.extent(0)-1) {
        const size_type colBeg = m_A.graph.row_map[ row ];
        const size_type colEnd = m_A.graph.row_map[ row + 1 ];

        // Read tensor entries; the asymmetric tensor carries a third value
        const TensorScalar c0 = m_A.block.value(0);
        const TensorScalar c1 = m_A.block.value(1);
        const TensorScalar c2 = m_A.block.value(2);

        // Zero this thread's partial sum for y(0)
        VectorScalar y0 = 0.0;

        // Read column indices for this row into shared memory
        for ( size_type lcol = threadIdx.x; lcol < colEnd-colBeg;
              lcol += blockDim.x )
          sh_col[lcol] = m_A.graph.entries( lcol+colBeg );

        // Loop over stochastic rows
        for ( size_type stoch_row = threadIdx.x; stoch_row < dim;
              stoch_row += blockDim.x) {

          VectorScalar yi = 0.0;

          // Loop over columns of this spatial row
          for ( size_type col_offset = colBeg; col_offset < colEnd;
                ++col_offset ) {

            // Get column index
            const size_type lcol = col_offset-colBeg;
            const size_type col = sh_col[lcol];

            // Read a(stoch_row) and x(stoch_row) for this column (coalesced)
            const MatrixScalar ai = m_A.values( stoch_row, col_offset );
            const VectorScalar xi = m_x( stoch_row, col );

            // Cache a(0) and x(0) for this column in shared memory
            if (stoch_row == 0) {
              sh_a0[lcol] = ai;
              sh_x0[lcol] = xi;
            }

            const MatrixScalar a0 = sh_a0[lcol];
            const VectorScalar x0 = sh_x0[lcol];

            // Correct for the (3*c1 + c2)*a0*x0 accumulated by the
            // stoch_row == 0 iteration
            if (stoch_row == 0) y0 += (c0-3.0*c1-c2)*a0*x0;

            yi += c1*(a0*xi + ai*x0) + c2*ai*xi;
            y0 += c1*ai*xi;
          }

          // Store y(stoch_row) for this spatial row
          m_y( stoch_row, row ) = yi;
        }
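        // Compared with the symmetric kernel, the asymmetric tensor adds a
        // diagonal c2 term:
        //   y(i) = c1*( a(0)*x(i) + a(i)*x(0) ) + c2*a(i)*x(i),  i > 0,
        // so the stoch_row == 0 iteration now over-counts y(0) by
        // (3*c1 + c2)*a0*x0, which the correction above removes.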
 
        // Warp-synchronous tree reduction of the y(0) partial sums across
        // threadIdx.x (assumes blockDim.x <= warp size)
        sh_y0[ threadIdx.x ] = y0;
        if ( threadIdx.x + 16 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+16];
        if ( threadIdx.x +  8 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 8];
        if ( threadIdx.x +  4 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 4];
        if ( threadIdx.x +  2 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 2];
        if ( threadIdx.x +  1 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 1];

        // Fold the reduced y(0) into the mean (stochastic row 0) entry
        if ( threadIdx.x == 0 ) m_y( 0, row ) += sh_y0[0];
      }
    }
  };
 
  //------------------------------------

  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
    const size_type num_spatial_rows = A.graph.row_map.extent(0) - 1;
    const size_type num_stoch_rows = A.block.dimension();

    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    const size_type num_rows_per_block = 8; // spatial rows per thread-block
                                            // (assumed value; elided in the
                                            // original listing)
    size_type num_blocks = num_spatial_rows / num_rows_per_block;
    if (num_blocks * num_rows_per_block < num_spatial_rows)
      ++num_blocks;
    const dim3 dBlock( warp_size, num_rows_per_block );
    const dim3 dGrid( num_blocks, 1, 1 );

    const int MAX_COL = 32;  // Maximum number of nonzero columns per row
    const size_type shmem =
      sizeof(VectorScalar) * (dBlock.x*dBlock.y + 2*MAX_COL*dBlock.y) +
      sizeof(size_type) * (MAX_COL*dBlock.y);
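    // Worked sizing example (comment only; assumes warp_size = 32, the
    // num_rows_per_block = 8 above, double scalars, 4-byte size_type):
    //   VectorScalar part: (32*8 + 2*32*8) * 8 B = 768 * 8 B = 6144 B
    //   size_type part:    (32*8) * 4 B          = 256 * 4 B = 1024 B
    // i.e. 7168 B of dynamic shared memory per block, comfortably under
    // the 48 kB default limit.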
 
    std::cout << "Multiply< BlockCrsMatrix< LinearSparse3Tensor ... > >::apply"
              << std::endl
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << "," << dBlock.z
              << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  num_spatial_rows(" << num_spatial_rows << ")" << std::endl
              << "  num_stoch_rows(" << num_stoch_rows << ")" << std::endl;
 
    // Launch the kernel matching the tensor's symmetry
    if (A.block.symmetric())
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
        ( ApplyKernelSymmetric<MAX_COL>( A, x, y ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
        ( ApplyKernelAsymmetric<MAX_COL>( A, x, y ) );
  }
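  // Usage sketch (illustrative, not part of the original header): this
  // specialization is reached through the generic Stokhos::multiply()
  // entry point; the concrete scalar types and block size below are
  // assumptions.
  //
  //   typedef Stokhos::LinearSparse3Tensor<double, Kokkos::Cuda, 2> tensor;
  //   typedef Stokhos::BlockCrsMatrix<tensor, double, Kokkos::Cuda> matrix;
  //   typedef Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::Cuda> vec;
  //
  //   matrix A;  // graph, values, and tensor block filled elsewhere
  //   vec x("x", num_stoch_rows, num_spatial_rows);
  //   vec y("y", num_stoch_rows, num_spatial_rows);
  //   Stokhos::multiply( A, x, y );  // dispatches to this apply()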
 
};

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP */