#ifndef STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
#define STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP

#include "Kokkos_Core.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< SimpleTiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef SimpleTiledCrsProductTensor< TensorScalar, execution_space > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;
  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_block_size;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y,
                       const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}
    __device__
    void operator()(void) const
    {
      const size_type dim = m_A.block.dimension();
      const size_type max_tile_size = m_A.block.max_jk_tile_size();

      volatile VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_x_j = sh_x_k + m_block_size*max_tile_size;
      volatile VectorScalar * const sh_A_k = sh_x_j + m_block_size*max_tile_size;
      volatile VectorScalar * const sh_A_j = sh_A_k + m_block_size*max_tile_size;
      volatile VectorScalar * const sh_y   = sh_A_j + m_block_size*max_tile_size;
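
      // Layout of the dynamic shared memory sized in apply() below:
      // four tiles of m_block_size*max_tile_size entries (sh_x_k,
      // sh_x_j, sh_A_k, sh_A_j) followed by one i-tile accumulator
      // (sh_y).  The tiles are column-interleaved (index
      // col + j*m_block_size), so the strided j/k loops below give
      // coalesced global reads while each column of the batch lands at
      // a fixed offset in shared memory.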

      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // blockIdx.x == row of the deterministic (finite element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
      if (remBlock > 0) ++numBlock;
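
      // The row's FE column entries are processed in batches of
      // m_block_size columns so that one shared tile of x and A is
      // reused against every tensor entry; numBlock counts the
      // batches, rounded up when a remainder batch of remBlock
      // columns exists.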

      // Loop over i tiles
      const size_type n_i_tile = m_A.block.num_i_tiles();
      for (size_type i_tile = 0; i_tile<n_i_tile; ++i_tile) {
        const size_type i_begin = m_A.block.i_begin(i_tile);
        const size_type i_size  = m_A.block.i_size(i_tile);

        // Zero this tile's y accumulator
        for (size_type i=tid; i<i_size; i+=nid) {
          sh_y[i] = 0.0;
        }

        // Loop over finite element column blocks
        size_type iBlockEntry = iBlockEntryBeg;
        for (size_type block=0; block<numBlock; ++block,
               iBlockEntry+=m_block_size) {
          const size_type block_size =
            (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;

          // Loop over j tiles
          const size_type n_j_tile = m_A.block.num_j_tiles(i_tile);
          for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
            const size_type j_begin = m_A.block.j_begin(i_tile, j_tile);
            const size_type j_size  = m_A.block.j_size(i_tile, j_tile);
            // Wait for the previous iteration's reads of the shared
            // tiles to complete before overwriting them
            __syncthreads();

            // Coalesced read of j-blocks of X and A into shared memory
            for (size_type col=0; col<block_size; ++col) {
              const size_type iBlockColumn =
                m_A.graph.entries(iBlockEntry + col);
              const VectorScalar * const x_j =
                &m_x(j_begin, iBlockColumn);
              const MatrixScalar * const A_j =
                &m_A.values(j_begin, iBlockEntry + col);
              for (size_type j=tid; j<j_size; j+=nid) {
                sh_x_j[col+j*m_block_size] = x_j[j];
                sh_A_j[col+j*m_block_size] = A_j[j];
              }
            }

            // Loop over k tiles
            const size_type n_k_tile = m_A.block.num_k_tiles(i_tile, j_tile);
            for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
              const size_type k_begin =
                m_A.block.k_begin(i_tile, j_tile, k_tile);
              const size_type k_size =
                m_A.block.k_size(i_tile, j_tile, k_tile);

              // Wait for the previous iteration's reads before refilling
              __syncthreads();

              // Coalesced read of k-blocks of X and A into shared memory
              for (size_type col=0; col<block_size; ++col) {
                const size_type iBlockColumn =
                  m_A.graph.entries(iBlockEntry + col);
                const VectorScalar * const x_k =
                  &m_x(k_begin, iBlockColumn);
                const MatrixScalar * const A_k =
                  &m_A.values(k_begin, iBlockEntry + col);
                for (size_type k=tid; k<k_size; k+=nid) {
                  sh_x_k[col+k*m_block_size] = x_k[k];
                  sh_A_k[col+k*m_block_size] = A_k[k];
                }
              }

              // Wait for the shared tiles to be filled before using them
              __syncthreads();
              // Loop over the stochastic rows of this i-tile; each
              // warp row (threadIdx.y) handles a strided subset
              for (size_type i=threadIdx.y; i<i_size; i+=blockDim.y) {
                VectorScalar s = 0;

                // Product tensor entries which this warp will iterate:
                const size_type lBeg =
                  m_A.block.entry_begin(i_tile, j_tile, k_tile, i);
                const size_type lEnd =
                  m_A.block.entry_end(i_tile, j_tile, k_tile, i);

                // Loop through sparse tensor contributions with
                // coalesced reads
                for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
                  const size_type kj = m_A.block.coord( l );
                  const TensorScalar v = m_A.block.value( l );
                  const size_type j = ( kj & 0x0ffff ) * m_block_size;
                  const size_type k = ( kj >> 16 ) * m_block_size;

                  for ( size_type col = 0; col < block_size; ++col ) {
                    s += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
                               sh_A_k[col+k] * sh_x_j[col+j] );
                  }
                }
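
                // Each tensor entry packs its (k,j) coordinates into
                // one word (j in the low 16 bits, k in the high 16),
                // and the update above adds v*(A_j*x_k + A_k*x_j):
                // one stored entry contributes through both symmetric
                // orderings of (j,k).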

                // Reduce s across the threads_per_row lanes
                if (blockDim.x >= 2)  s += shfl_down(s, 1,  blockDim.x);
                if (blockDim.x >= 4)  s += shfl_down(s, 2,  blockDim.x);
                if (blockDim.x >= 8)  s += shfl_down(s, 4,  blockDim.x);
                if (blockDim.x >= 16) s += shfl_down(s, 8,  blockDim.x);
                if (blockDim.x >= 32) s += shfl_down(s, 16, blockDim.x);
                if ( threadIdx.x == 0 ) sh_y[i] += s;
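
                // The shfl_down cascade above is a warp-level
                // reduction over the threads_per_row lanes: offsets
                // 1,2,4,... combine partial sums without touching
                // shared memory, and after log2(blockDim.x) steps
                // lane 0 holds the total for row i, so only
                // threadIdx.x == 0 accumulates into sh_y.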
              }
            }
          }
        }

        // Wait for all warps' contributions to sh_y
        __syncthreads();

        // Store this i-tile's sum back to global memory
        for (size_type i=tid; i<i_size; i+=nid) {
          m_y( i+i_begin, blockIdx.x ) = sh_y[i];
        }
      }
    }
  };

  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type i_tile_size = A.block.max_i_tile_size();
    const size_type jk_tile_size = A.block.max_jk_tile_size();
#ifdef STOKHOS_DEBUG
    const size_type nWarp = 4;  // use fewer warps in debug builds
#else
    const size_type nWarp = 16;
#endif
    const size_type threads_per_row = 16;
    const size_type rows_per_warp =
      Kokkos::Impl::CudaTraits::WarpSize / threads_per_row;
    const dim3 dBlock( threads_per_row, rows_per_warp*nWarp, 1 );
    const dim3 dGrid( row_count, 1, 1 );

    // Use half the available shared memory per block
    const size_type shcap =
      Kokkos::Cuda().impl_internal_space_instance()->m_deviceProp.sharedMemPerBlock / 2;
    size_type bs =
      (shcap / sizeof(VectorScalar) - i_tile_size) / (4*jk_tile_size);
    if (bs % 2 == 0) --bs;  // force an odd block size
    const size_type block_size_max = 31;
    const size_type block_size = std::min(bs, block_size_max);
    const size_type shmem =
      sizeof(VectorScalar) * (4*jk_tile_size*block_size + i_tile_size);
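
    // Capacity arithmetic: the kernel's five shared arrays occupy
    //   sizeof(VectorScalar)*(4*jk_tile_size*bs + i_tile_size)
    // bytes, so bs is the largest column batch fitting in shcap.
    // Forcing bs odd makes the shared-memory stride m_block_size odd,
    // which with 32 banks is likely intended to avoid systematic bank
    // conflicts in the col + j*m_block_size indexing.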

#if 0
    // Debug output (tile_size and num_tiles are not computed in this
    // method, so this block stays disabled)
    std::cout << "Multiply< BlockCrsMatrix< CrsProductTensor ... > >::apply"
              << std::endl
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << "  block_size(" << block_size << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  row_count(" << row_count << ")" << std::endl
              << "  tensor_dimension(" << tensor_dimension << ")" << std::endl
              << "  tile_size(" << tile_size << ")" << std::endl
              << "  num_tiles(" << num_tiles << ")" << std::endl;
#endif

    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
      ( ProductTensorLoop( A, x, y, block_size ) );
  }
};

template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< SimpleTiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef SimpleTiledCrsProductTensor< TensorScalar, execution_space > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda >
    vector_type;

  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_block_size;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y,
                       const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}
    __device__
    void operator()(void) const
    {
      const size_type dim = m_A.block.dimension();
      const size_type max_tile_size = m_A.block.max_jk_tile_size();

      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      volatile VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_x_j = sh_x_k + m_block_size*max_tile_size;
      volatile VectorScalar * const sh_A_k = sh_x_j + m_block_size*max_tile_size;
      volatile VectorScalar * const sh_A_j = sh_A_k + m_block_size*max_tile_size;
      // Block column indices for the current column batch; sized for
      // block_size_max+1 == 32 entries
      __shared__ volatile size_type sh_c[32];
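
      // Unlike the first kernel there is no sh_y accumulator here:
      // each stochastic row's partial sum lives in a register (s1
      // below) and is written straight to global memory after the
      // final warp reduction.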

      // blockIdx.y == row of the deterministic (finite element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.y ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.y + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
      if (remBlock > 0) ++numBlock;

      // blockIdx.x == i-tile; each thread row owns one stochastic row
      const size_type i_tile = blockIdx.x;
      const size_type i_begin = m_A.block.i_begin(i_tile);
      const size_type i_size  = m_A.block.i_size(i_tile);
      const size_type i1 = threadIdx.y;
      if (i1 >= i_size)
        return;

      VectorScalar s1 = 0;

      // Loop over finite element column blocks
      size_type iBlockEntry = iBlockEntryBeg;
      for (size_type block=0; block<numBlock; ++block,
             iBlockEntry+=m_block_size) {
        const size_type block_size =
          (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;

        // Wait for the previous batch's reads of sh_c before refilling
        __syncthreads();

        // Read this batch's block column indices into shared memory
        for (size_type col=0; col<block_size; ++col)
          sh_c[col] = m_A.graph.entries(iBlockEntry + col);

        // Loop over j tiles
        const size_type n_j_tile = m_A.block.num_j_tiles(i_tile);
        for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
          const size_type j_begin = m_A.block.j_begin(i_tile, j_tile);
          const size_type j_size  = m_A.block.j_size(i_tile, j_tile);

          // Wait for the previous iteration's reads before overwriting
          __syncthreads();

          // Coalesced read of j-blocks of X and A into shared memory
          for (size_type col=0; col<block_size; ++col) {
            const VectorScalar * const x_j = &m_x(j_begin, sh_c[col]);
            const MatrixScalar * const A_j = &m_A.values(j_begin, iBlockEntry + col);
            for (size_type j=tid; j<j_size; j+=nid) {
              sh_x_j[col+j*m_block_size] = x_j[j];
              sh_A_j[col+j*m_block_size] = A_j[j];
            }
          }

          // Loop over k tiles
          const size_type n_k_tile = m_A.block.num_k_tiles(i_tile, j_tile);
          for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
            const size_type k_begin =
              m_A.block.k_begin(i_tile, j_tile, k_tile);
            const size_type k_size =
              m_A.block.k_size(i_tile, j_tile, k_tile);

            // Wait for the previous iteration's reads before overwriting
            __syncthreads();

            // Coalesced read of k-blocks of X and A into shared memory
            for (size_type col=0; col<block_size; ++col) {
              const VectorScalar * const x_k =
                &m_x(k_begin, sh_c[col]);
              const MatrixScalar * const A_k =
                &m_A.values(k_begin, iBlockEntry + col);
              for (size_type k=tid; k<k_size; k+=nid) {
                sh_x_k[col+k*m_block_size] = x_k[k];
                sh_A_k[col+k*m_block_size] = A_k[k];
              }
            }

            // Wait for the shared tiles to be filled before using them
            __syncthreads();
            // Product tensor entries which this warp will iterate:
            const size_type lBeg =
              m_A.block.entry_begin(i_tile, j_tile, k_tile, i1);
            const size_type lEnd =
              m_A.block.entry_end(i_tile, j_tile, k_tile, i1);

            // Loop through sparse tensor contributions with
            // coalesced reads
            for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
              const size_type kj = m_A.block.coord( l );
              const TensorScalar v = m_A.block.value( l );
              const size_type j = ( kj & 0x0ffff ) * m_block_size;
              const size_type k = ( kj >> 16 ) * m_block_size;

              for ( size_type col = 0; col < block_size; ++col ) {
                s1 += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
                            sh_A_k[col+k] * sh_x_j[col+j] );
              }
            }
          }
        }
      }

      // Reduce s1 across the threads_per_row lanes; lane 0 writes y
      if (blockDim.x >= 2)  s1 += shfl_down(s1, 1,  blockDim.x);
      if (blockDim.x >= 4)  s1 += shfl_down(s1, 2,  blockDim.x);
      if (blockDim.x >= 8)  s1 += shfl_down(s1, 4,  blockDim.x);
      if (blockDim.x >= 16) s1 += shfl_down(s1, 8,  blockDim.x);
      if (blockDim.x >= 32) s1 += shfl_down(s1, 16, blockDim.x);
      if ( threadIdx.x == 0 ) m_y( i1+i_begin, blockIdx.y ) = s1;
    }
  };

  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tile_size = A.block.max_jk_tile_size();
    const size_type n_i_tile = A.block.num_i_tiles();

#ifdef STOKHOS_DEBUG
    const size_type nWarp = 4;  // use fewer warps in debug builds
#else
    const size_type nWarp = 16;
#endif
    const size_type threads_per_row = 16;
    const size_type rows_per_warp =
      Kokkos::Impl::CudaTraits::WarpSize / threads_per_row;
    const dim3 dBlock( threads_per_row, rows_per_warp*nWarp, 1 );
    const dim3 dGrid( n_i_tile, row_count, 1 );
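
    // Grid mapping differs from the first kernel: the i-tile loop is
    // hoisted onto the grid, with one block per (i_tile, FE row) pair
    // (blockIdx.x, blockIdx.y), trading more blocks for less shared
    // memory and register-resident row sums.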

    // Use half the available shared memory per block
    const size_type shcap =
      Kokkos::Cuda().impl_internal_space_instance()->m_deviceProp.sharedMemPerBlock / 2;
    size_type bs = ((shcap / sizeof(VectorScalar)) / tile_size) / 4;
    if (bs % 2 == 0) --bs;  // force an odd block size
    const size_type block_size_max = 31;
    const size_type block_size = std::min(bs, block_size_max);
    const size_type shmem =
      sizeof(VectorScalar) * (4*block_size*tile_size);

#if 0
    std::cout << "Multiply< BlockCrsMatrix< CrsProductTensor ... > >::apply"
              << std::endl
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << "  block_size(" << block_size << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  row_count(" << row_count << ")" << std::endl
              << "  tensor_dimension(" << tensor_dimension << ")" << std::endl
              << "  tile_size(" << tile_size << ")" << std::endl
              << "  num_i_tiles(" << n_i_tile << ")" << std::endl;
#endif

    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
      ( ProductTensorLoop( A, x, y, block_size ) );
  }
};

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP */