#ifndef STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP
#define STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP

#include "Kokkos_Core.hpp"

#include "cuda_profiler_api.h"
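// CUDA implementations of Multiply for a BlockCrsMatrix whose blocks are
// TiledCrsProductTensor product tensors.  Three kernel strategies follow:
// (1) one CUDA block per matrix block row, looping over every tensor tile and
//     processing m_block_size block columns per pass,
// (2) one kernel launch per tensor tile, with that tile's data staged in
//     shared memory, and
// (3) a three-dimensional thread block in which a group of block columns
//     shares tensor entries staged cooperatively by each warp.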
template< typename TensorScalar ,
          typename MatrixScalar ,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda >  vector_type ;
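  // Functor for strategy (1): the CUDA grid has one block per matrix block
  // row; each block loops over every tile of the product tensor and, within a
  // tile, processes m_block_size block columns of the matrix per pass.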
 
  class ProductTensorLoop {
  public:

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y,
                       const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}
 
    __device__
    void operator()(void) const
    {
      const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;

      const size_type dim = m_A.block.dimension();

      const size_type n_tile = m_A.block.num_tiles();
      const size_type tile_size = m_A.block.tile_size();
      const size_type tile_dim = n_tile == 1 ? dim : tile_size;
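      // Shared memory layout for this kernel: interleaved x and A values for
      // the j- and k-ranges of the current tile (m_block_size columns each),
      // a tile-sized accumulator sh_y, and one scratch slot per thread (sh_t)
      // used by the intra-warp reduction.  With a single tile the j and k
      // buffers alias each other.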
 
      VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_x_j =
        n_tile == 1 ? sh_x_k : sh_x_k + m_block_size*tile_dim;
      VectorScalar * const sh_A_k =
        sh_x_j + m_block_size*tile_dim;
      VectorScalar * const sh_A_j =
        n_tile == 1 ? sh_A_k : sh_A_k + m_block_size*tile_dim;
      VectorScalar * const sh_y   = sh_A_j + m_block_size*tile_dim;
      volatile VectorScalar * const sh_t = sh_y + tile_dim;
 
      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
      if (remBlock > 0) ++numBlock;
 
      // Zero this block row of the output vector:
      for (size_type i=tid; i<dim; i+=nid) {
        m_y(i,blockIdx.x) = 0.0;
      }
 
      for (size_type tile = 0; tile<n_tile; ++tile) {

        const size_type i_offset = m_A.block.offset(tile, 0);
        const size_type j_offset = m_A.block.offset(tile, 1);
        const size_type k_offset = m_A.block.offset(tile, 2);
        const size_type i_range  = m_A.block.range(tile, 0);
        const size_type j_range  = m_A.block.range(tile, 1);
        const size_type k_range  = m_A.block.range(tile, 2);
        const size_type n_row    = m_A.block.num_rows(tile);
 
        // Zero the shared accumulator for this tile:
        for (size_type i=tid; i<i_range; i+=nid) {
          sh_y[i] = 0.0;
        }
 
        // Process the block columns of this block row in groups of
        // m_block_size (the final group may hold only remBlock columns):
        size_type iBlockEntry = iBlockEntryBeg;
        for (size_type block=0; block<numBlock;
             ++block, iBlockEntry+=m_block_size) {

          const size_type block_size =
            (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;
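          // Stage this group's slices of x and the matrix blocks A in shared
          // memory, interleaved so that column col of the group sits at
          // stride m_block_size: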
 
          for (size_type col=0; col<block_size; ++col) {

            const size_type iBlockColumn =
              m_A.graph.entries( iBlockEntry + col );

            const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
            const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
            const MatrixScalar * const A_k = & m_A.values( k_offset , iBlockEntry + col );
            const MatrixScalar * const A_j = & m_A.values( j_offset , iBlockEntry + col );

            for (size_type j=tid; j<j_range; j+=nid) {
              sh_x_j[col+j*m_block_size] = x_j[j];
              sh_A_j[col+j*m_block_size] = A_j[j];
            }

              for (size_type k=tid; k<k_range; k+=nid) {
                sh_x_k[col+k*m_block_size] = x_k[k];
                sh_A_k[col+k*m_block_size] = A_k[k];
              }
 
          for (size_type i=threadIdx.y; i<i_range; i+=blockDim.y) {

            const size_type lBeg = m_A.block.entry_begin(tile, i);
            const size_type lEnd = m_A.block.entry_end(tile, i);

            for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
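              // Each tensor entry packs its (j,k) coordinate pair into one
              // word: the low 16 bits hold j and the high bits hold k; both
              // are scaled by m_block_size here to index the interleaved
              // shared arrays loaded above.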
 
              const size_type kj   = m_A.block.coord( l );
              const TensorScalar v = m_A.block.value( l );
              const size_type j    = ( kj & 0x0ffff ) * m_block_size ;
              const size_type k    = ( kj >> 16     ) * m_block_size ;

              for ( size_type col = 0; col < block_size; ++col ) {
                s += v * ( sh_A_j[col+j] * sh_x_k[col+k] + sh_A_k[col+k] * sh_x_j[col+j] );
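            // Reduce each warp's partial sums in the shared scratch sh_t
            // (one partial per thread); lane 0 then folds the result into the
            // tile accumulator sh_y[i]: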
 
            if ( threadIdx.x + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
            if ( threadIdx.x +  8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
            if ( threadIdx.x +  4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
            if ( threadIdx.x +  2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
            if ( threadIdx.x +  1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
            if ( threadIdx.x == 0 ) sh_y[i] += sh_t[tid];
 
        for (size_type i=tid; i<i_range; i+=nid) {
          m_y( i+i_offset , blockIdx.x ) += sh_y[i];
 
  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const dim3 dBlock( Kokkos::Impl::CudaTraits::WarpSize , nWarp , 1 );
    const dim3 dGrid( row_count , 1 , 1 );

    const size_type shmem_factor = num_tiles == 1 ? 2 : 4;
    const size_type tensor_align = num_tiles == 1 ? tensor_dimension : tile_dim;
    const size_type shcap =
      Kokkos::Impl::CudaTraits::SharedMemoryCapacity / 2;
    size_type bs =
      ((shcap / sizeof(VectorScalar) - dBlock.x*dBlock.y) / tensor_align - 1) / shmem_factor;

    const size_type shmem =
      sizeof(VectorScalar) * ((shmem_factor*block_size+1) * tensor_align + dBlock.x*dBlock.y);
 
    std::cout << "Multiply< BlockCrsMatrix< CrsProductTensor ... > >::apply"
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << "  block_size(" << block_size << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  row_count(" << row_count << ")" << std::endl
              << "  tensor_dimension(" << tensor_dimension << ")" << std::endl
              << "  tile_size(" << tile_size << ")" << std::endl
              << "  num_tiles(" << num_tiles << ")" << std::endl
 
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
      ( ProductTensorLoop( A , x , y, block_size ) );
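// Usage sketch (illustrative): with A of the matrix_type above and x, y
// LayoutLeft device views whose columns correspond to matrix block columns
// and block rows respectively, a host caller invokes this specialization as
//
//   Multiply< matrix_type, vector_type, vector_type >::apply( A , x , y );
//
// which dispatches to the static apply() defined above.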
 
template< typename TensorScalar ,
          typename MatrixScalar ,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space ;
  typedef execution_space::size_type size_type ;

  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda >  vector_type ;
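  // Functor for strategy (2): apply() below zeroes y once, then launches this
  // kernel once per tensor tile.  Each launch walks the block columns of its
  // block row one at a time, staging that column's j- and k-range slices of x
  // and A in shared memory and accumulating the tile's contribution into y.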
 
  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_tile;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y,
                       const size_type tile )
      : m_A( A ), m_x( x ), m_y( y ), m_tile(tile) {}
 
    __device__
    void operator()(void) const
    {
      const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;

      const size_type dim = m_A.block.dimension();
      const size_type tile_size = m_A.block.tile_size();

      VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_x_j = sh_x_k + tile_size;
      VectorScalar * const sh_A_k = sh_x_j + tile_size;
      VectorScalar * const sh_A_j = sh_A_k + tile_size;
      VectorScalar * const sh_y   = sh_A_j + tile_size;
      volatile VectorScalar * const sh_t = sh_y + tile_size;

      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
 
      const size_type i_offset = m_A.block.offset(m_tile, 0);
      const size_type j_offset = m_A.block.offset(m_tile, 1);
      const size_type k_offset = m_A.block.offset(m_tile, 2);
      const size_type i_range  = m_A.block.range(m_tile, 0);
      const size_type j_range  = m_A.block.range(m_tile, 1);
      const size_type k_range  = m_A.block.range(m_tile, 2);
      const size_type n_row    = m_A.block.num_rows(m_tile);

      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
 
      // Zero the shared accumulator for this tile:
      for (size_type i=tid; i<i_range; i+=nid) {
        sh_y[i] = 0.0;
      }
 
      const size_type * __restrict cijk_j = &m_A.block.coord(0,0);
      const size_type * __restrict cijk_k = &m_A.block.coord(0,1);
      const TensorScalar * __restrict cijk_v = &m_A.block.value(0);
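      // Raw __restrict pointers to the tensor coordinates and values; the
      // entries belonging to row i of this tile are the range
      // [ entry_begin(m_tile,i), entry_end(m_tile,i) ) used below.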
 
      // Loop over the block columns of this block row, one at a time:
      for (size_type iBlockEntry = iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
           ++iBlockEntry) {
 
        const size_type iBlockColumn = m_A.graph.entries( iBlockEntry );

        const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
        const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
        const MatrixScalar * const A_k = & m_A.values( k_offset , iBlockEntry );
        const MatrixScalar * const A_j = & m_A.values( j_offset , iBlockEntry );
 
        // Stage the j- and k-range slices of x and A for this block column:
        for (size_type j=tid; j<j_range; j+=nid) {
          sh_x_j[j] = x_j[j];
          sh_A_j[j] = A_j[j];
        }
        for (size_type k=tid; k<k_range; k+=nid) {
          sh_x_k[k] = x_k[k];
          sh_A_k[k] = A_k[k];
        }
 
        for (size_type i=threadIdx.y; i<i_range; i+=blockDim.y) {

          const size_type lBeg = m_A.block.entry_begin(m_tile, i);
          const size_type lEnd = m_A.block.entry_end(m_tile, i);

          for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
 
            const size_type j = cijk_j[l];
            const size_type k = cijk_k[l];
            const TensorScalar v = cijk_v[l];

            s += v * ( sh_A_j[j] * sh_x_k[k] + sh_A_k[k] * sh_x_j[j] );
 
          if ( threadIdx.x + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
          if ( threadIdx.x +  8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
          if ( threadIdx.x +  4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
          if ( threadIdx.x +  2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
          if ( threadIdx.x +  1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
          if ( threadIdx.x == 0 ) sh_y[i] += sh_t[tid];
 
      for (size_type i=tid; i<i_range; i+=nid) {
        m_y( i+i_offset , blockIdx.x ) += sh_y[i];
 
  // Functor used by apply() below to zero the output vector, one column of y
  // per work item:
  class Zero {
  public:
    typedef typename execution_space::size_type size_type;

    Zero( const vector_type & x ) : m_x( x ), m_d(x.extent(0)) {}

    KOKKOS_INLINE_FUNCTION
    void operator()( const size_type j ) const {
      for (size_type i=0; i<m_d; ++i)
        m_x(i,j) = 0.0;
    }

    const vector_type m_x;
    const size_type m_d;
  };
 
  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tile_size = A.block.tile_size();
    const size_type num_tiles = A.block.num_tiles();

    const size_type nWarp = 16;
    const dim3 dBlock( Kokkos::Impl::CudaTraits::WarpSize , nWarp , 1 );
    const dim3 dGrid( row_count , 1 , 1 );

    const size_type shmem =
      sizeof(VectorScalar) * (5*tile_size + dBlock.x*dBlock.y);
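    // Shared memory per block: five tile_size arrays (sh_x_k, sh_x_j, sh_A_k,
    // sh_A_j, sh_y) plus one reduction slot per thread (sh_t).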
 
    std::cout << "Multiply< BlockCrsMatrix< CrsProductTensor ... > >::apply"
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  row_count(" << row_count << ")" << std::endl
              << "  tensor_dimension(" << tensor_dimension << ")" << std::endl
              << "  tile_size(" << tile_size << ")" << std::endl
              << "  num_tiles(" << num_tiles << ")" << std::endl
 
    Kokkos::parallel_for( row_count , Zero(y) );

    for (size_type tile = 0; tile<num_tiles; ++tile) {
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
        ( ProductTensorLoop( A , x , y, tile ) );
 
template< typename TensorScalar ,
          typename MatrixScalar ,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space ;
  typedef execution_space::size_type size_type ;

  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda >  vector_type ;
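  // Functor for strategy (3): a three-dimensional thread block in which
  // blockDim.x matrix block columns are processed together; each warp stages
  // tensor coordinates and values in shared memory (sh_j, sh_k, sh_t) so that
  // every column in the group can reuse them.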
 
  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}
 
    __device__
    void operator()(void) const
    {
      const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;

      const size_type dim = m_A.block.dimension();
      const size_type tile_size = m_A.block.tile_size();
      const size_type max_num_rows = m_A.block.max_num_rows();

      const size_type nid = blockDim.x * blockDim.y * blockDim.z;
      const size_type tid =
        threadIdx.x + blockDim.x * ( threadIdx.y + blockDim.y * threadIdx.z );
      const size_type thread_lane = threadIdx.x + blockDim.x * threadIdx.y;
 
      VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_x_j = sh_x_k;
      VectorScalar * const sh_A_k = sh_x_j + blockDim.x*tile_size;
      VectorScalar * const sh_A_j = sh_A_k;
      VectorScalar * const sh_y   = sh_A_j + blockDim.x*tile_size;
      volatile VectorScalar * const sh_t = sh_y + tile_size;

      size_type * const sh_j = (size_type*) (sh_t + nid);
      size_type * const sh_k = (size_type*) (sh_j + nid);
 
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / blockDim.x;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % blockDim.x;
      if (remBlock > 0) ++numBlock;

      for (size_type i=tid; i<dim; i+=nid) {
        m_y(i,blockIdx.x) = 0.0;
 
      const size_type n_tile = m_A.block.num_tiles();

      for (size_type tile = 0; tile<n_tile; ++tile) {

        const size_type i_offset = m_A.block.offset(tile, 0);
        const size_type j_offset = m_A.block.offset(tile, 1);
        const size_type k_offset = m_A.block.offset(tile, 2);
        const size_type i_range  = m_A.block.range(tile, 0);
        const size_type j_range  = m_A.block.range(tile, 1);
        const size_type k_range  = m_A.block.range(tile, 2);
        const size_type n_row    = m_A.block.num_rows(tile);
 
        // Zero the shared accumulator for this tile:
        for (size_type i=tid; i<i_range; i+=nid) {
          sh_y[i] = 0.0;
        }
 
        size_type iBlockEntry = iBlockEntryBeg;
        for (size_type block=0; block<numBlock;
             ++block, iBlockEntry+=blockDim.x) {

          const size_type block_size =
            (block == numBlock-1 && remBlock > 0) ? remBlock : blockDim.x;
 
          for (size_type col=0; col<block_size; ++col) {

            const size_type iBlockColumn =
              m_A.graph.entries( iBlockEntry + col );

            const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
            const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
            const MatrixScalar * const A_k =
              & m_A.values( k_offset , iBlockEntry + col );
            const MatrixScalar * const A_j =
              & m_A.values( j_offset , iBlockEntry + col );

            for (size_type j=tid; j<j_range; j+=nid) {
              sh_x_j[col+j*blockDim.x] = x_j[j];
              sh_A_j[col+j*blockDim.x] = A_j[j];
            }
 
          for (size_type i=threadIdx.z; i<i_range; i+=blockDim.z) {

            const size_type lBeg = m_A.block.entry_begin(tile, i);
            const size_type lEnd = m_A.block.entry_end(tile, i);
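            // Each warp stages up to WarpSize tensor entries (coordinates and
            // value) in shared memory; every column in the group of blockDim.x
            // block columns then consumes the staged entries: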
 
            for (size_type l=lBeg; l<lEnd; l+=WarpSize) {

              if (l+thread_lane < lEnd) {
                sh_j[tid] = m_A.block.coord(l+thread_lane,0);
                sh_k[tid] = m_A.block.coord(l+thread_lane,1);
                sh_t[tid] = m_A.block.value(l+thread_lane);
              }

              const int mm = (l+WarpSize<lEnd) ? WarpSize : lEnd-l;
 
              for (size_type m=threadIdx.y; m<mm; m+=blockDim.y) {

                if (threadIdx.x < block_size) {

                  const size_type j = sh_j[m+WarpSize*threadIdx.z];
                  const size_type k = sh_k[m+WarpSize*threadIdx.z];
                  const TensorScalar v = sh_t[m+WarpSize*threadIdx.z];
                  const size_type jj = threadIdx.x + j*blockDim.x;
                  const size_type kk = threadIdx.x + k*blockDim.x;
                  s += v * ( sh_A_j[jj] * sh_x_k[kk] + sh_A_k[kk] * sh_x_j[jj] );
 
            if ( thread_lane + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
            if ( thread_lane +  8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
            if ( thread_lane +  4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
            if ( thread_lane +  2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
            if ( thread_lane +  1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
            if ( thread_lane == 0 ) sh_y[i] += sh_t[tid];
 
        for (size_type i=tid; i<i_range; i+=nid) {
          m_y( i+i_offset , blockIdx.x ) += sh_y[i];
 
  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tile_size = A.block.tile_size();
    const size_type num_tiles = A.block.num_tiles();
    const size_type max_num_rows = A.block.max_num_rows();

    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    const size_type block_size = 8;
    const size_type nWarp = 16;
    const dim3 dBlock( block_size, warp_size / block_size, nWarp );
    const dim3 dGrid( row_count , 1 , 1 );

    const size_type shmem =
      sizeof(VectorScalar) * ( (2*block_size+1)*tile_size +
                               dBlock.x*dBlock.y*dBlock.z ) +
      sizeof(size_type) * 2 * dBlock.x*dBlock.y*dBlock.z;

    const size_type cmem = sizeof(size_type)*(max_num_rows+1)*num_tiles;
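    // Shared memory per block: 2*block_size*tile_size values for the staged
    // x and A columns, tile_size for sh_y, one VectorScalar reduction slot and
    // two size_type coordinate slots per thread (sh_t, sh_j, sh_k).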
 
    std::cout << "Multiply< BlockCrsMatrix< CrsProductTensor ... > >::apply"
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << "," << dBlock.z
              << ")" << std::endl
              << "  block_size(" << block_size << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  row_count(" << row_count << ")" << std::endl
              << "  tensor_dimension(" << tensor_dimension << ")" << std::endl
              << "  tile_size(" << tile_size << ")" << std::endl
              << "  num_tiles(" << num_tiles << ")" << std::endl
              << "  cmem(" << cmem/1024 << " kB)" << std::endl
 
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
      ( ProductTensorLoop( A , x , y ) );
 