#ifndef STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
#define STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP

#include <algorithm>

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_SimpleTiledCrsProductTensor.hpp"
#include "Stokhos_Cuda_WarpShuffle.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {
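//----------------------------------------------------------------------------
// Specializations of Stokhos::Multiply implementing y = A*x on Cuda, where
// A is a BlockCrsMatrix whose blocks are SimpleTiledCrsProductTensor
// stochastic Galerkin product-tensor blocks.  One CUDA thread block is
// assigned per block row of the matrix; x, A, and the tensor tiles are
// staged through shared memory tile by tile.
//----------------------------------------------------------------------------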
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< SimpleTiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef SimpleTiledCrsProductTensor< TensorScalar, execution_space > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;
  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_block_size;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y,
                       const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}
    __device__
    void operator()(void) const
    {
      // Tensor dimension and maximum j/k tile size for this tensor:
      const size_type dim = m_A.block.dimension();
      const size_type max_tile_size = m_A.block.max_jk_tile_size();
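      // Shared memory layout (sized in apply() as
      // 4*max_jk_tile_size*block_size + max_i_tile_size scalars):
      //   sh_x_k, sh_x_j : staged entries of x for the current k- and j-tiles
      //   sh_A_k, sh_A_j : staged entries of the matrix block for those tiles
      //   sh_y           : per-i-tile accumulator for the block-row result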
      volatile VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_x_j = sh_x_k + m_block_size*max_tile_size;
      volatile VectorScalar * const sh_A_k = sh_x_j + m_block_size*max_tile_size;
      volatile VectorScalar * const sh_A_j = sh_A_k + m_block_size*max_tile_size;
      volatile VectorScalar * const sh_y   = sh_A_j + m_block_size*max_tile_size;
      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // Block-CRS entries of the block row assigned to this thread block:
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
      if (remBlock > 0) ++numBlock;
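      // The block row is processed in strips of m_block_size block columns;
      // the final strip may be shorter (remBlock) when the row length is not
      // a multiple of m_block_size.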
      // Loop over i-tiles of the tensor
      const size_type n_i_tile = m_A.block.num_i_tiles();
      for (size_type i_tile = 0; i_tile<n_i_tile; ++i_tile) {
        const size_type i_begin = m_A.block.i_begin(i_tile);
        const size_type i_size = m_A.block.i_size(i_tile);

        // Zero the shared accumulator for this i-tile
        for (size_type i=tid; i<i_size; i+=nid) {
          sh_y[i] = 0.0;
        }
        // Loop over strips of block columns
        size_type iBlockEntry = iBlockEntryBeg;
        for (size_type block=0; block<numBlock; ++block,
               iBlockEntry+=m_block_size) {
          const size_type block_size =
            (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;
          // Loop over j-tiles
          const size_type n_j_tile = m_A.block.num_j_tiles(i_tile);
          for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
            const size_type j_begin = m_A.block.j_begin(i_tile, j_tile);
            const size_type j_size = m_A.block.j_size(i_tile, j_tile);
            // Wait until previous uses of the shared x/A tiles are complete
            __syncthreads();

            // Coalesced read of the j-tile slices of x and A into shared memory
            for (size_type col=0; col<block_size; ++col) {
              const size_type iBlockColumn =
                m_A.graph.entries(iBlockEntry + col);
              const VectorScalar * const x_j = &m_x(j_begin, iBlockColumn);
              const MatrixScalar * const A_j = &m_A.values(j_begin, iBlockEntry + col);
              for (size_type j=tid; j<j_size; j+=nid) {
                sh_x_j[col+j*m_block_size] = x_j[j];
                sh_A_j[col+j*m_block_size] = A_j[j];
              }
            }
            // Loop over k-tiles
            const size_type n_k_tile = m_A.block.num_k_tiles(i_tile, j_tile);
            for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
              const size_type k_begin =
                m_A.block.k_begin(i_tile, j_tile, k_tile);
              const size_type k_size =
                m_A.block.k_size(i_tile, j_tile, k_tile);
              // Wait until previous uses of the shared x/A tiles are complete
              __syncthreads();

              // Coalesced read of the k-tile slices of x and A into shared memory
              for (size_type col=0; col<block_size; ++col) {
                const size_type iBlockColumn =
                  m_A.graph.entries(iBlockEntry + col);
                const VectorScalar * const x_k = &m_x(k_begin, iBlockColumn);
                const MatrixScalar * const A_k = &m_A.values(k_begin, iBlockEntry + col);
                for (size_type k=tid; k<k_size; k+=nid) {
                  sh_x_k[col+k*m_block_size] = x_k[k];
                  sh_A_k[col+k*m_block_size] = A_k[k];
                }
              }

              // Wait for the shared tiles to be fully populated
              __syncthreads();
              // Each group of blockDim.x threads accumulates one stochastic
              // row i of the current i-tile
              for (size_type i=threadIdx.y; i<i_size; i+=blockDim.y) {
                VectorScalar s = 0;

                // Tensor entries of row i within this (j,k) tile:
                const size_type lBeg =
                  m_A.block.entry_begin(i_tile, j_tile, k_tile, i);
                const size_type lEnd =
                  m_A.block.entry_end(i_tile, j_tile, k_tile, i);
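                // Each tensor entry packs its (k,j) coordinates into a single
                // integer, j in the low 16 bits and k in the high 16 bits;
                // both are scaled by m_block_size on unpacking so they index
                // directly into the column-interleaved shared arrays.  The
                // two products below evidently account for the stored entry
                // and its (j,k)-transposed counterpart.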
                for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
                  const size_type kj = m_A.block.coord( l );
                  const TensorScalar v = m_A.block.value( l );
                  const size_type j = ( kj & 0x0ffff ) * m_block_size ;
                  const size_type k = ( kj >> 16 ) * m_block_size ;

                  for ( size_type col = 0; col < block_size; ++col ) {
                    s += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
                               sh_A_k[col+k] * sh_x_j[col+j] );
                  }
                }
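                // Reduce the partial sums of the blockDim.x lanes working on
                // row i down to lane 0 via shfl_down; lane 0 then adds into
                // the shared accumulator for the i-tile.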
                if (blockDim.x >= 2)  s += shfl_down(s, 1,  blockDim.x);
                if (blockDim.x >= 4)  s += shfl_down(s, 2,  blockDim.x);
                if (blockDim.x >= 8)  s += shfl_down(s, 4,  blockDim.x);
                if (blockDim.x >= 16) s += shfl_down(s, 8,  blockDim.x);
                if (blockDim.x >= 32) s += shfl_down(s, 16, blockDim.x);
                if ( threadIdx.x == 0 ) sh_y[i] += s;
              } // row loop within the i-tile
            } // k-tile loop
          } // j-tile loop
        } // block-column strip loop

        // Wait for all contributions to sh_y before writing the tile out
        __syncthreads();

        // Store the result for this i-tile
        for (size_type i=tid; i<i_size; i+=nid) {
          m_y( i+i_begin , blockIdx.x ) = sh_y[i];
        }
      } // i-tile loop
    }

  };
  //------------------------------------

  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type i_tile_size = A.block.max_i_tile_size();
    const size_type jk_tile_size = A.block.max_jk_tile_size();
#ifdef STOKHOS_DEBUG
    const size_type nWarp = 4;  // use fewer warps in a debug build
#else
    const size_type nWarp = 16;
#endif
    const size_type threads_per_row = 16;
    const size_type rows_per_warp =
      Kokkos::Impl::CudaTraits::WarpSize / threads_per_row;
    const dim3 dBlock( threads_per_row , rows_per_warp*nWarp , 1 );
    const dim3 dGrid( row_count , 1 , 1 );
    // Choose the block-column strip size from the shared-memory budget
    // (at most half of the hardware capacity)
    const size_type shcap =
      Kokkos::Impl::CudaTraits::SharedMemoryCapacity / 2;
    size_type bs =
      (shcap / sizeof(VectorScalar) - i_tile_size) / (4*jk_tile_size);
    if (bs % 2 == 0) --bs;
    const size_type block_size_max = 31;
    const size_type block_size = std::min(bs, block_size_max);
    const size_type shmem =
      sizeof(VectorScalar) * (4*jk_tile_size*block_size + i_tile_size);
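    // The request covers the four x/A staging arrays (block_size scalars per
    // tile entry, jk_tile_size entries each) plus the i-tile accumulator.
    // Budgeting only half the capacity above presumably allows a second
    // resident block per SM, and the odd block_size is presumably meant to
    // avoid shared-memory bank conflicts in the column-interleaved arrays.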
#if 0
    // Optional debug output describing the launch configuration
    std::cout << "Multiply< BlockCrsMatrix< SimpleTiledCrsProductTensor ... > >::apply"
              << std::endl
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << "  block_size(" << block_size << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  row_count(" << row_count << ")" << std::endl
              << "  tensor_dimension(" << tensor_dimension << ")" << std::endl
              << "  tile_size(" << jk_tile_size << ")" << std::endl
              << "  num_i_tiles(" << A.block.num_i_tiles() << ")" << std::endl;
#endif
    // Launch the kernel: one thread block per block row of the matrix
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
      ( ProductTensorLoop( A , x , y, block_size ) );
  }
};
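//----------------------------------------------------------------------------
// Variant of the kernel above that uses a two-dimensional grid over
// (i-tile, block row): each thread block handles a single i-tile of a single
// block row, each thread row accumulates one stochastic row in a register,
// and the result is written directly to y without a shared-memory
// accumulator.  Only one of the two identical Multiply specializations can
// be compiled at a time; the original source presumably selects between
// them with a preprocessor switch.
//----------------------------------------------------------------------------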
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< SimpleTiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef SimpleTiledCrsProductTensor< TensorScalar, execution_space > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;

  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_block_size;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y,
                       const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}
    __device__
    void operator()(void) const
    {
      const size_type dim = m_A.block.dimension();
      const size_type max_tile_size = m_A.block.max_jk_tile_size();

      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
      volatile VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_x_j = sh_x_k + m_block_size*max_tile_size;
      volatile VectorScalar * const sh_A_k = sh_x_j + m_block_size*max_tile_size;
      volatile VectorScalar * const sh_A_j = sh_A_k + m_block_size*max_tile_size;
      // Block-CRS entries of the block row assigned to this thread block;
      // the block row is the y-coordinate of the grid
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.y ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.y + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
      if (remBlock > 0) ++numBlock;

      // The i-tile handled by this thread block is the grid x-coordinate;
      // each thread row handles one stochastic row of that tile
      const size_type i_tile = blockIdx.x;
      const size_type i_begin = m_A.block.i_begin(i_tile);
      const size_type i_size = m_A.block.i_size(i_tile);

      const size_type i1 = threadIdx.y;
      if (i1 >= i_size)
        return;
      VectorScalar s1 = 0;

      // Loop over strips of block columns
      size_type iBlockEntry = iBlockEntryBeg;
      for (size_type block=0; block<numBlock; ++block,
             iBlockEntry+=m_block_size) {
        const size_type block_size =
          (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;

        // Cache the block-column indices of this strip in shared memory
        for (size_type col=0; col<block_size; ++col)
          sh_c[col] = m_A.graph.entries(iBlockEntry + col);
        // Loop over j-tiles
        const size_type n_j_tile = m_A.block.num_j_tiles(i_tile);
        for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
          const size_type j_begin = m_A.block.j_begin(i_tile, j_tile);
          const size_type j_size = m_A.block.j_size(i_tile, j_tile);
          // Wait until previous uses of the shared x/A tiles are complete
          __syncthreads();

          // Coalesced read of the j-tile slices of x and A into shared memory
          for (size_type col=0; col<block_size; ++col) {
            const VectorScalar * const x_j = &m_x(j_begin, sh_c[col]);
            const MatrixScalar * const A_j = &m_A.values(j_begin, iBlockEntry + col);
            for (size_type j=tid; j<j_size; j+=nid) {
              sh_x_j[col+j*m_block_size] = x_j[j];
              sh_A_j[col+j*m_block_size] = A_j[j];
            }
          }
          // Loop over k-tiles
          const size_type n_k_tile = m_A.block.num_k_tiles(i_tile, j_tile);
          for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
            const size_type k_begin =
              m_A.block.k_begin(i_tile, j_tile, k_tile);
            const size_type k_size =
              m_A.block.k_size(i_tile, j_tile, k_tile);
            // Wait until previous uses of the shared x/A tiles are complete
            __syncthreads();

            // Coalesced read of the k-tile slices of x and A into shared memory
            for (size_type col=0; col<block_size; ++col) {
              const VectorScalar * const x_k = &m_x(k_begin, sh_c[col]);
              const MatrixScalar * const A_k = &m_A.values(k_begin, iBlockEntry + col);
              for (size_type k=tid; k<k_size; k+=nid) {
                sh_x_k[col+k*m_block_size] = x_k[k];
                sh_A_k[col+k*m_block_size] = A_k[k];
              }
            }

            // Wait for the shared tiles to be fully populated
            __syncthreads();
            // Tensor entries of row i1 within this (j,k) tile:
            const size_type lBeg =
              m_A.block.entry_begin(i_tile, j_tile, k_tile, i1);
            const size_type lEnd =
              m_A.block.entry_end(i_tile, j_tile, k_tile, i1);
            // Loop over tensor entries with coalesced reads; the (k,j)
            // coordinates are packed into one integer as in the kernel above
            for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
              const size_type kj = m_A.block.coord( l );
              const TensorScalar v = m_A.block.value( l );
              const size_type j = ( kj & 0x0ffff ) * m_block_size ;
              const size_type k = ( kj >> 16 ) * m_block_size ;

              for ( size_type col = 0; col < block_size; ++col ) {
                s1 += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
                            sh_A_k[col+k] * sh_x_j[col+j] );
              }
            }
          } // k-tile loop
        } // j-tile loop
      } // block-column strip loop

      // Reduce the partial sums of the blockDim.x lanes for row i1 to lane 0
      if (blockDim.x >= 2)  s1 += shfl_down(s1, 1,  blockDim.x);
      if (blockDim.x >= 4)  s1 += shfl_down(s1, 2,  blockDim.x);
      if (blockDim.x >= 8)  s1 += shfl_down(s1, 4,  blockDim.x);
      if (blockDim.x >= 16) s1 += shfl_down(s1, 8,  blockDim.x);
      if (blockDim.x >= 32) s1 += shfl_down(s1, 16, blockDim.x);
      if ( threadIdx.x == 0 ) m_y( i1+i_begin , blockIdx.y ) = s1;
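      // Because each thread row owns exactly one output row of one i-tile,
      // lane 0 can assign the reduced value straight to global memory; no
      // shared-memory accumulation or extra synchronization is needed here.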
    } // operator()

  };

  //------------------------------------

  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tile_size = A.block.max_jk_tile_size();
    const size_type n_i_tile = A.block.num_i_tiles();

    // Launch shape mirrors the kernel above: 16 lanes per stochastic row
    const size_type nWarp = 16;
    const size_type threads_per_row = 16;
    const size_type rows_per_warp =
      Kokkos::Impl::CudaTraits::WarpSize / threads_per_row;
    const dim3 dBlock( threads_per_row , rows_per_warp*nWarp , 1 );
    const dim3 dGrid( n_i_tile , row_count , 1 );
    // Choose the block-column strip size from the shared-memory budget
    // (at most half of the hardware capacity)
    const size_type shcap =
      Kokkos::Impl::CudaTraits::SharedMemoryCapacity / 2;
    size_type bs = ((shcap / sizeof(VectorScalar)) / tile_size) / 4;
    if (bs % 2 == 0) --bs;
    const size_type block_size_max = 31;
    const size_type block_size = std::min(bs, block_size_max);
    const size_type shmem =
      sizeof(VectorScalar) * (4*block_size*tile_size);
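    // Only the four x/A staging arrays are needed here; row results live in
    // registers, so there is no i-tile accumulator term in the request.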
#if 0
    // Optional debug output describing the launch configuration
    std::cout << "Multiply< BlockCrsMatrix< SimpleTiledCrsProductTensor ... > >::apply"
              << std::endl
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << "  block_size(" << block_size << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  row_count(" << row_count << ")" << std::endl
              << "  tensor_dimension(" << tensor_dimension << ")" << std::endl
              << "  tile_size(" << tile_size << ")" << std::endl
              << "  num_i_tiles(" << n_i_tile << ")" << std::endl;
#endif
    // Launch the kernel: grid = (i-tile, block row)
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
      ( ProductTensorLoop( A , x , y, block_size ) );
  }
};

//----------------------------------------------------------------------------

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP */