#ifndef STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP
#define STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

// Declarations of Multiply, BlockCrsMatrix and TiledCrsProductTensor
#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_TiledCrsProductTensor.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {
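//----------------------------------------------------------------------------
// CUDA specializations of Multiply for a BlockCrsMatrix whose blocks are
// TiledCrsProductTensor objects, i.e. block products y = A*x in which every
// block multiply contracts a tiled sparse 3-tensor.  The first specialization
// below is the active one; two alternative kernel variants follow inside an
// "#if 0" block so that they do not collide with it.
//----------------------------------------------------------------------------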
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;

  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_block_size;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y,
                       const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}
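    // One CUDA block handles one block-row of A.  Within each tile the
    // threads cooperatively stage m_block_size block columns of x and A into
    // shared memory; each warp then accumulates one row i of the tile,
    // contracting the tile's sparse (i,j,k) entries against the staged data
    // and finishing with a warp-level reduction.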
    __device__
    void operator()( void ) const
    {
      const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;

      const size_type dim       = m_A.block.dimension();
      const size_type n_tile    = m_A.block.num_tiles();
      const size_type tile_size = m_A.block.tile_size();
      const size_type tile_dim  = n_tile == 1 ? dim : tile_size;
      VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_x_j =
        n_tile == 1 ? sh_x_k : sh_x_k + m_block_size*tile_dim;
      VectorScalar * const sh_A_k = sh_x_j + m_block_size*tile_dim;
      VectorScalar * const sh_A_j =
        n_tile == 1 ? sh_A_k : sh_A_k + m_block_size*tile_dim;
      VectorScalar * const sh_y = sh_A_j + m_block_size*tile_dim;
      volatile VectorScalar * const sh_t = sh_y + tile_dim;
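      // Shared-memory layout: staging panels for x and A (the j and k panels
      // alias each other when there is only one tile), tile_dim entries for
      // the per-tile accumulator sh_y, and one reduction slot per thread in
      // sh_t.  This mirrors the shmem size computed in apply() below.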
      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
      if (remBlock > 0) ++numBlock;

      // Zero this block-row of y
      for (size_type i=tid; i<dim; i+=nid) {
        m_y(i,blockIdx.x) = 0.0;
      }
      // Loop over tiles of the product tensor
      for (size_type tile = 0; tile<n_tile; ++tile) {

        const size_type i_offset = m_A.block.offset(tile, 0);
        const size_type j_offset = m_A.block.offset(tile, 1);
        const size_type k_offset = m_A.block.offset(tile, 2);
        const size_type i_range  = m_A.block.range(tile, 0);
        const size_type j_range  = m_A.block.range(tile, 1);
        const size_type k_range  = m_A.block.range(tile, 2);
        const size_type n_row    = m_A.block.num_rows(tile);

        // Zero the shared-memory accumulator for this tile
        for (size_type i=tid; i<i_range; i+=nid) {
          sh_y[i] = 0.0;
        }
        // Loop over the block columns of this block-row in chunks of m_block_size
        size_type iBlockEntry = iBlockEntryBeg;
        for (size_type block=0; block<numBlock;
             ++block, iBlockEntry+=m_block_size) {

          const size_type block_size =
            (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;

          // Stage x and A for this chunk of block columns into shared memory
          for (size_type col=0; col<block_size; ++col) {

            const size_type iBlockColumn =
              m_A.graph.entries( iBlockEntry + col );

            const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
            const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
            const MatrixScalar * const A_k = & m_A.values( k_offset , iBlockEntry + col );
            const MatrixScalar * const A_j = & m_A.values( j_offset , iBlockEntry + col );
            for (size_type j=tid; j<j_range; j+=nid) {
              sh_x_j[col+j*m_block_size] = x_j[j];
              sh_A_j[col+j*m_block_size] = A_j[j];
            }
            for (size_type k=tid; k<k_range; k+=nid) {
              sh_x_k[col+k*m_block_size] = x_k[k];
              sh_A_k[col+k*m_block_size] = A_k[k];
            }
          }

          __syncthreads(); // wait for x and A to be staged
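          // Each warp (one value of threadIdx.y) accumulates one tensor row i;
          // its lanes (threadIdx.x) stride across the sparse entries of that row.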
          for (size_type i=threadIdx.y; i<i_range; i+=blockDim.y) {

            VectorScalar s = 0;

            const size_type lBeg = m_A.block.entry_begin(tile, i);
            const size_type lEnd = m_A.block.entry_end(tile, i);

            for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {

              // Each entry packs (j,k): the low 16 bits hold j, the high bits hold k
              const size_type kj = m_A.block.coord( l );
              const TensorScalar v = m_A.block.value( l );
              const size_type j = ( kj & 0x0ffff ) * m_block_size;
              const size_type k = ( kj >> 16 ) * m_block_size;

              for ( size_type col = 0; col < block_size; ++col ) {
                s += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
                           sh_A_k[col+k] * sh_x_j[col+j] );
              }
            }

            // Store this lane's partial sum, then reduce within the warp
            sh_t[tid] = s;
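            // Warp-synchronous tree reduction across the lanes: sh_t is
            // volatile so each step re-reads the updated partial sums (this
            // relies on implicit intra-warp synchronization, as on pre-Volta
            // hardware).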
            if ( threadIdx.x + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
            if ( threadIdx.x +  8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
            if ( threadIdx.x +  4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
            if ( threadIdx.x +  2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
            if ( threadIdx.x +  1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
            if ( threadIdx.x == 0 ) sh_y[i] += sh_t[tid];
          }

          __syncthreads(); // wait before the next chunk overwrites the staged x and A
        }

        // Accumulate this tile's contribution into the output block-row
        for (size_type i=tid; i<i_range; i+=nid) {
          m_y( i+i_offset , blockIdx.x ) += sh_y[i];
        }
      }
    }
  };
  //------------------------------------

  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count        = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tile_size        = A.block.tile_size();
    const size_type num_tiles        = A.block.num_tiles();
    const size_type tile_dim = num_tiles == 1 ? tensor_dimension : tile_size;

    const size_type nWarp = 16; // assumed warps per block, matching the variants below
    const dim3 dBlock( Kokkos::Impl::CudaTraits::WarpSize , nWarp , 1 );
    const dim3 dGrid( row_count , 1 , 1 );

    // Choose the block-column chunk size so the staged data fits in (half of)
    // the shared memory available per CUDA block
    const size_type shmem_factor = num_tiles == 1 ? 2 : 4;
    const size_type tensor_align = num_tiles == 1 ? tensor_dimension : tile_dim;
    const size_type shcap =
      Kokkos::Cuda().impl_internal_space_instance()->m_deviceProp.sharedMemPerBlock / 2;
    size_type bs =
      ((shcap / sizeof(VectorScalar) - dBlock.x*dBlock.y) / tensor_align - 1) / shmem_factor;
    const size_type block_size = bs;
    const size_type shmem =
      sizeof(VectorScalar) * ((shmem_factor*block_size+1) * tensor_align + dBlock.x*dBlock.y);
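    // Shared memory per CUDA block: shmem_factor*block_size*tensor_align
    // scalars for the staged x/A panels, tensor_align for the accumulator
    // sh_y, and one scalar per thread (dBlock.x*dBlock.y) for the reduction
    // buffer sh_t.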
    std::cout << "Multiply< BlockCrsMatrix< TiledCrsProductTensor ... > >::apply"
              << std::endl
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << "  block_size(" << block_size << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  row_count(" << row_count << ")" << std::endl
              << "  tensor_dimension(" << tensor_dimension << ")" << std::endl
              << "  tile_size(" << tile_size << ")" << std::endl
              << "  num_tiles(" << num_tiles << ")" << std::endl;
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
      ( ProductTensorLoop( A , x , y , block_size ) );
  }
};
//----------------------------------------------------------------------------
// Alternative kernel variants.  They use the same partial specialization
// signature as the class above, so they are compiled out here to avoid a
// duplicate definition.
//----------------------------------------------------------------------------
#if 0

template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;
  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_tile;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y,
                       const size_type tile )
      : m_A( A ), m_x( x ), m_y( y ), m_tile(tile) {}
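    // Variant strategy: each kernel launch handles a single tensor tile
    // (m_tile).  One CUDA block still owns one block-row of A, but block
    // columns are visited one at a time, so the shared-memory footprint is
    // just five tile_size panels plus the per-thread reduction buffer.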
    __device__
    void operator()( void ) const
    {
      const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;

      const size_type dim       = m_A.block.dimension();
      const size_type tile_size = m_A.block.tile_size();

      VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_x_j = sh_x_k + tile_size;
      VectorScalar * const sh_A_k = sh_x_j + tile_size;
      VectorScalar * const sh_A_j = sh_A_k + tile_size;
      VectorScalar * const sh_y   = sh_A_j + tile_size;
      volatile VectorScalar * const sh_t = sh_y + tile_size;

      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      const size_type i_offset = m_A.block.offset(m_tile, 0);
      const size_type j_offset = m_A.block.offset(m_tile, 1);
      const size_type k_offset = m_A.block.offset(m_tile, 2);
      const size_type i_range  = m_A.block.range(m_tile, 0);
      const size_type j_range  = m_A.block.range(m_tile, 1);
      const size_type k_range  = m_A.block.range(m_tile, 2);
      const size_type n_row    = m_A.block.num_rows(m_tile);

      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      // Zero the shared-memory accumulator for this tile
      for (size_type i=tid; i<i_range; i+=nid) {
        sh_y[i] = 0.0;
      }

      const size_type    * __restrict cijk_j = &m_A.block.coord(0,0);
      const size_type    * __restrict cijk_k = &m_A.block.coord(0,1);
      const TensorScalar * __restrict cijk_v = &m_A.block.value(0);
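      // Raw __restrict pointers into the tensor's coordinate and value arrays:
      // this avoids repeated View indexing in the inner loop and tells the
      // compiler the three streams do not alias.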
      // Loop over the block columns of this block-row, one at a time
      for (size_type iBlockEntry = iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
           ++iBlockEntry) {

        const size_type iBlockColumn = m_A.graph.entries( iBlockEntry );

        const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
        const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
        const MatrixScalar * const A_k = & m_A.values( k_offset , iBlockEntry );
        const MatrixScalar * const A_j = & m_A.values( j_offset , iBlockEntry );
        // Stage the j- and k-ranges of x and A into shared memory
        for (size_type j=tid; j<j_range; j+=nid) {
          sh_x_j[j] = x_j[j];
          sh_A_j[j] = A_j[j];
        }
        for (size_type k=tid; k<k_range; k+=nid) {
          sh_x_k[k] = x_k[k];
          sh_A_k[k] = A_k[k];
        }

        __syncthreads(); // wait for x and A to be staged
        // Each warp (one value of threadIdx.y) accumulates one tensor row i
        for (size_type i=threadIdx.y; i<i_range; i+=blockDim.y) {

          VectorScalar s = 0;

          const size_type lBeg = m_A.block.entry_begin(m_tile, i);
          const size_type lEnd = m_A.block.entry_end(m_tile, i);

          for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {

            const size_type j = cijk_j[l];
            const size_type k = cijk_k[l];
            const TensorScalar v = cijk_v[l];

            s += v * ( sh_A_j[j] * sh_x_k[k] + sh_A_k[k] * sh_x_j[j] );
          }

          // Store this lane's partial sum, then reduce within the warp
          sh_t[tid] = s;
          if ( threadIdx.x + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
          if ( threadIdx.x +  8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
          if ( threadIdx.x +  4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
          if ( threadIdx.x +  2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
          if ( threadIdx.x +  1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
          if ( threadIdx.x == 0 ) sh_y[i] += sh_t[tid];
        }

        __syncthreads(); // wait before the next block column overwrites the staged x and A
      }

      // Accumulate this tile's contribution into the output block-row
      for (size_type i=tid; i<i_range; i+=nid) {
        m_y( i+i_offset , blockIdx.x ) += sh_y[i];
      }
    }
  };
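  // Host-side functor that zeroes the result vector once, before the per-tile
  // kernels accumulate into it.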
  class Zero {
  public:
    typedef typename vector_type::execution_space execution_space;
    typedef typename execution_space::size_type size_type;

    Zero( const vector_type & x ) : m_x( x ), m_d(x.extent(0)) {}

    KOKKOS_INLINE_FUNCTION
    void operator()( const size_type j ) const {
      for (size_type i=0; i<m_d; ++i)
        m_x(i,j) = 0.0;
    }

  private:
    const vector_type m_x;
    const size_type m_d;
  };
  //------------------------------------

  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count        = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tile_size        = A.block.tile_size();
    const size_type num_tiles        = A.block.num_tiles();

    const size_type nWarp = 16;
    const dim3 dBlock( Kokkos::Impl::CudaTraits::WarpSize , nWarp , 1 );
    const dim3 dGrid( row_count , 1 , 1 );

    const size_type shmem =
      sizeof(VectorScalar) * (5*tile_size + dBlock.x*dBlock.y);
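    // 5*tile_size covers sh_x_k, sh_x_j, sh_A_k, sh_A_j and sh_y; the extra
    // dBlock.x*dBlock.y scalars are the per-thread reduction buffer sh_t.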
    std::cout << "Multiply< BlockCrsMatrix< TiledCrsProductTensor ... > >::apply"
              << std::endl
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  row_count(" << row_count << ")" << std::endl
              << "  tensor_dimension(" << tensor_dimension << ")" << std::endl
              << "  tile_size(" << tile_size << ")" << std::endl
              << "  num_tiles(" << num_tiles << ")" << std::endl;
    Kokkos::parallel_for( row_count , Zero(y) );
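    // One kernel launch per tensor tile; each launch accumulates its tile's
    // contribution into y, which is why y was zeroed once up front.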
    for (size_type tile = 0; tile<num_tiles; ++tile) {
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
        ( ProductTensorLoop( A , x , y , tile ) );
    }
  }
};
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;

  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}
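    // Variant strategy: a 3-D thread block in which threadIdx.x indexes the
    // matrix block columns staged per chunk, (threadIdx.x, threadIdx.y)
    // together form one warp, and threadIdx.z selects the warp and hence the
    // tensor row being processed.  Tensor coordinates are first read into
    // shared memory by whole warps to keep the global loads coalesced.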
    __device__
    void operator()( void ) const
    {
      const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;

      const size_type dim          = m_A.block.dimension();
      const size_type tile_size    = m_A.block.tile_size();
      const size_type max_num_rows = m_A.block.max_num_rows();

      const size_type nid = blockDim.x * blockDim.y * blockDim.z;
      const size_type tid =
        threadIdx.x + blockDim.x * ( threadIdx.y + blockDim.y * threadIdx.z );
      const size_type thread_lane = threadIdx.x + blockDim.x * threadIdx.y;

      VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_x_j = sh_x_k;
      VectorScalar * const sh_A_k = sh_x_j + blockDim.x*tile_size;
      VectorScalar * const sh_A_j = sh_A_k;
      VectorScalar * const sh_y   = sh_A_j + blockDim.x*tile_size;
      volatile VectorScalar * const sh_t = sh_y + tile_size;

      size_type * const sh_j = (size_type*) (sh_t + nid);
      size_type * const sh_k = (size_type*) (sh_j + nid);
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / blockDim.x;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % blockDim.x;
      if (remBlock > 0) ++numBlock;

      for (size_type i=tid; i<dim; i+=nid) {
        m_y(i,blockIdx.x) = 0.0;
      }

      const size_type n_tile = m_A.block.num_tiles();

      for (size_type tile = 0; tile<n_tile; ++tile) {

        const size_type i_offset = m_A.block.offset(tile, 0);
        const size_type j_offset = m_A.block.offset(tile, 1);
        const size_type k_offset = m_A.block.offset(tile, 2);
        const size_type i_range  = m_A.block.range(tile, 0);
        const size_type j_range  = m_A.block.range(tile, 1);
        const size_type k_range  = m_A.block.range(tile, 2);
        const size_type n_row    = m_A.block.num_rows(tile);
        // Zero the shared-memory accumulator for this tile
        for (size_type i=tid; i<i_range; i+=nid) {
          sh_y[i] = 0.0;
        }

        // Loop over the block columns of this block-row in chunks of blockDim.x
        size_type iBlockEntry = iBlockEntryBeg;
        for (size_type block=0; block<numBlock;
             ++block, iBlockEntry+=blockDim.x) {

          const size_type block_size =
            (block == numBlock-1 && remBlock > 0) ? remBlock : blockDim.x;

          // Stage x and A for this chunk of block columns into shared memory
          for (size_type col=0; col<block_size; ++col) {

            const size_type iBlockColumn =
              m_A.graph.entries( iBlockEntry + col );

            const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
            const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
            const MatrixScalar * const A_k =
              & m_A.values( k_offset , iBlockEntry + col );
            const MatrixScalar * const A_j =
              & m_A.values( j_offset , iBlockEntry + col );
            for (size_type j=tid; j<j_range; j+=nid) {
              sh_x_j[col+j*blockDim.x] = x_j[j];
              sh_A_j[col+j*blockDim.x] = A_j[j];
            }
          }

          __syncthreads(); // wait for x and A to be staged
          // Each warp (indexed by threadIdx.z) accumulates one tensor row i
          for (size_type i=threadIdx.z; i<i_range; i+=blockDim.z) {

            VectorScalar s = 0;

            const size_type lBeg = m_A.block.entry_begin(tile, i);
            const size_type lEnd = m_A.block.entry_end(tile, i);

            // Process the row's entries in chunks of WarpSize
            for (size_type l=lBeg; l<lEnd; l+=WarpSize) {

              // Coalesced, warp-wide read of the next chunk of tensor entries
              if (l+thread_lane < lEnd) {
                sh_j[tid] = m_A.block.coord(l+thread_lane,0);
                sh_k[tid] = m_A.block.coord(l+thread_lane,1);
                sh_t[tid] = m_A.block.value(l+thread_lane);
              }

              const int mm = (l+WarpSize<lEnd) ? WarpSize : lEnd-l;
              for (size_type m=threadIdx.y; m<mm; m+=blockDim.y) {

                if (threadIdx.x < block_size) {
                  const size_type j = sh_j[m+WarpSize*threadIdx.z];
                  const size_type k = sh_k[m+WarpSize*threadIdx.z];
                  const TensorScalar v = sh_t[m+WarpSize*threadIdx.z];
                  const size_type jj = threadIdx.x + j*blockDim.x;
                  const size_type kk = threadIdx.x + k*blockDim.x;
                  s += v * ( sh_A_j[jj] * sh_x_k[kk] + sh_A_k[kk] * sh_x_j[jj] );
                }
              }
            }

            // Store this thread's partial sum, then reduce within the warp
            sh_t[tid] = s;
            if ( thread_lane + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
            if ( thread_lane +  8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
            if ( thread_lane +  4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
            if ( thread_lane +  2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
            if ( thread_lane +  1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
            if ( thread_lane == 0 ) sh_y[i] += sh_t[tid];
          }

          __syncthreads(); // wait before the next chunk overwrites the staged x and A
        }

        // Accumulate this tile's contribution into the output block-row
        for (size_type i=tid; i<i_range; i+=nid) {
          m_y( i+i_offset , blockIdx.x ) += sh_y[i];
        }
      }
    }
  };
  //------------------------------------

  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count        = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tile_size        = A.block.tile_size();
    const size_type num_tiles        = A.block.num_tiles();
    const size_type max_num_rows     = A.block.max_num_rows();

    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    const size_type block_size = 8;
    const size_type nWarp = 16;
    const dim3 dBlock( block_size, warp_size / block_size, nWarp );
    const dim3 dGrid( row_count , 1 , 1 );
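    // 8 x 4 x 16 = 512 threads per CUDA block: 8 block columns per chunk
    // (threadIdx.x), 8*4 = 32 threads forming each warp, and 16 warps.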
    const size_type shmem =
      sizeof(VectorScalar) * ( (2*block_size+1)*tile_size +
                               dBlock.x*dBlock.y*dBlock.z ) +
      sizeof(size_type) * 2 * dBlock.x*dBlock.y*dBlock.z;

    const size_type cmem = sizeof(size_type)*(max_num_rows+1)*num_tiles;
    std::cout << "Multiply< BlockCrsMatrix< TiledCrsProductTensor ... > >::apply"
              << std::endl
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << "," << dBlock.z
              << ")" << std::endl
              << "  block_size(" << block_size << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  row_count(" << row_count << ")" << std::endl
              << "  tensor_dimension(" << tensor_dimension << ")" << std::endl
              << "  tile_size(" << tile_size << ")" << std::endl
              << "  num_tiles(" << num_tiles << ")" << std::endl
              << "  cmem(" << cmem/1024 << " kB)" << std::endl;
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
      ( ProductTensorLoop( A , x , y ) );
  }
};

#endif // disabled alternative kernel variants

//----------------------------------------------------------------------------

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP */