#ifndef STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP
#define STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_TiledCrsProductTensor.hpp"

#include "cuda_profiler_api.h"
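// Specializations of Multiply for a BlockCrsMatrix whose block is a
// TiledCrsProductTensor, applied to LayoutLeft Cuda views.  Three variants of
// the same specialization follow: the first stages several matrix-block
// columns per kernel pass, the second launches one kernel per tensor tile,
// and the third uses a 3-D thread block that cooperates on tensor reads.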
template< typename TensorScalar ,
          typename MatrixScalar ,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space ;
  typedef execution_space::size_type size_type ;

  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda >
    vector_type ;

  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_block_size;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y,
                       const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}
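    // One CUDA block per block-row of the CRS graph.  Each pass of the kernel
    // stages m_block_size matrix-block columns into shared memory and applies
    // the tiled product tensor to all of them before moving on.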
    __device__
    void operator()( void ) const
    {
      const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;

      // Number of bases in the stochastic system
      const size_type dim = m_A.block.dimension();

      // Number of Cijk tiles and their size
      const size_type n_tile = m_A.block.num_tiles();
      const size_type tile_size = m_A.block.tile_size();
      const size_type tile_dim = n_tile == 1 ? dim : tile_size;

      VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_x_j =
        n_tile == 1 ? sh_x_k : sh_x_k + m_block_size*tile_dim;
      VectorScalar * const sh_A_k =
        sh_x_j + m_block_size*tile_dim;
      VectorScalar * const sh_A_j =
        n_tile == 1 ? sh_A_k : sh_A_k + m_block_size*tile_dim;
      VectorScalar * const sh_y = sh_A_j + m_block_size*tile_dim;
      volatile VectorScalar * const sh_t = sh_y + tile_dim;
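      // Shared-memory layout, carved out of one dynamic allocation:
      //   sh_x_k, sh_x_j : m_block_size*tile_dim staged vector entries for
      //                    the k- and j-ranges of the current tile
      //   sh_A_k, sh_A_j : the matching matrix-block entries
      //   sh_y           : tile_dim partial sums for the current tile
      //   sh_t           : blockDim.x*blockDim.y scratch slots for the
      //                    warp-level reduction below
      // With a single tile the j- and k-ranges coincide, so the j-pointers
      // alias the k-pointers and half of the staging space is saved.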
      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // blockIdx.x == row of the deterministic (finite-element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
      if (remBlock > 0) ++numBlock;

      // Zero this block-row of the output vector
      for (size_type i=tid; i<dim; i+=nid) {
        m_y(i,blockIdx.x) = 0.0;
      }
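      // The matrix-block columns of this row are processed m_block_size at a
      // time: numBlock counts the passes, and the final pass handles the
      // remBlock leftover columns when the row length is not a multiple of
      // m_block_size.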
      // Loop over Cijk tiles
      for (size_type tile = 0; tile<n_tile; ++tile) {

        const size_type i_offset = m_A.block.offset(tile, 0);
        const size_type j_offset = m_A.block.offset(tile, 1);
        const size_type k_offset = m_A.block.offset(tile, 2);
        const size_type i_range = m_A.block.range(tile, 0);
        const size_type j_range = m_A.block.range(tile, 1);
        const size_type k_range = m_A.block.range(tile, 2);
        const size_type n_row = m_A.block.num_rows(tile);

        // Zero the tile's partial sums
        for (size_type i=tid; i<i_range; i+=nid) {
          sh_y[i] = 0.0;
        }

        // Loop over the matrix-block columns of this row, m_block_size at a time
        size_type iBlockEntry = iBlockEntryBeg;
        for (size_type block=0; block<numBlock;
             ++block, iBlockEntry+=m_block_size) {

          const size_type block_size =
            (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;
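          // The staged values below are interleaved by column: entry j of
          // column col lands at index col + j*m_block_size, so the inner loop
          // over col in the tensor contraction reads consecutive
          // shared-memory words.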
          for (size_type col=0; col<block_size; ++col) {

            const size_type iBlockColumn =
              m_A.graph.entries( iBlockEntry + col );

            const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
            const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
            const MatrixScalar * const A_k = & m_A.values( k_offset , iBlockEntry + col );
            const MatrixScalar * const A_j = & m_A.values( j_offset , iBlockEntry + col );

            for (size_type j=tid; j<j_range; j+=nid) {
              sh_x_j[col+j*m_block_size] = x_j[j];
              sh_A_j[col+j*m_block_size] = A_j[j];
            }
            for (size_type k=tid; k<k_range; k+=nid) {
              sh_x_k[col+k*m_block_size] = x_k[k];
              sh_A_k[col+k*m_block_size] = A_k[k];
            }
          }

          __syncthreads(); // wait for the staged x and A values
          // Loop over the stochastic rows of this tile; one warp row
          // (fixed threadIdx.y) per stochastic row i
          for (size_type i=threadIdx.y; i<i_range; i+=blockDim.y) {
            VectorScalar s = 0;

            // Tensor entries for row i of this tile
            const size_type lBeg = m_A.block.entry_begin(tile, i);
            const size_type lEnd = m_A.block.entry_end(tile, i);

            // Loop over tensor entries with coalesced reads
            for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {

              // The (j,k) coordinates are packed into one word:
              // low 16 bits = j, high 16 bits = k
              const size_type kj = m_A.block.coord( l );
              const TensorScalar v = m_A.block.value( l );
              const size_type j = ( kj & 0x0ffff ) * m_block_size ;
              const size_type k = ( kj >> 16 ) * m_block_size ;

              for ( size_type col = 0; col < block_size; ++col ) {
                s += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
                           sh_A_k[col+k] * sh_x_j[col+j] );
              }
            }

            // Reduce the per-thread partial sums across threadIdx.x
            sh_t[tid] = s;
            if ( threadIdx.x + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
            if ( threadIdx.x + 8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
            if ( threadIdx.x + 4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
            if ( threadIdx.x + 2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
            if ( threadIdx.x + 1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
            if ( threadIdx.x == 0 ) sh_y[i] += sh_t[tid];
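            // This reduction is warp-synchronous: blockDim.x equals the warp
            // size, sh_t is declared volatile, and no barrier separates the
            // strided adds.  On GPUs with independent thread scheduling
            // (Volta and newer) the same reduction is normally written with
            // __syncwarp() between steps or with __shfl_down_sync lane
            // shuffles instead of shared memory.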
          }

          __syncthreads(); // wait for the staged x and A to be consumed
        }

        // Accumulate this tile's partial sums into global memory
        for (size_type i=tid; i<i_range; i+=nid) {
          m_y( i+i_offset , blockIdx.x ) += sh_y[i];
        }
      }
    }
  };
  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tile_size = A.block.tile_size();
    const size_type num_tiles = A.block.num_tiles();
    const size_type tile_dim = num_tiles == 1 ? tensor_dimension : tile_size;

    const size_type nWarp = 16;
    const dim3 dBlock( Kokkos::Impl::CudaTraits::WarpSize , nWarp , 1 );
    const dim3 dGrid( row_count , 1 , 1 );

    const size_type shmem_factor = num_tiles == 1 ? 2 : 4;
    const size_type tensor_align = num_tiles == 1 ? tensor_dimension : tile_dim;
    const size_type shcap =
      Kokkos::Impl::CudaTraits::SharedMemoryCapacity / 2;
    size_type bs =
      ((shcap / sizeof(VectorScalar) - dBlock.x*dBlock.y) / tensor_align - 1) /
      shmem_factor;
    const size_type shmem =
      sizeof(VectorScalar) * ((shmem_factor*block_size+1) * tensor_align + dBlock.x*dBlock.y);
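    // bs inverts the shmem formula just above: it is the largest column
    // count for which sizeof(VectorScalar) *
    // ((shmem_factor*bs+1)*tensor_align + dBlock.x*dBlock.y) still fits in
    // shcap, i.e. half of the device's shared-memory capacity so that two
    // blocks can be resident per SM.  block_size, the number of matrix-block
    // columns staged per kernel pass, is chosen from bs.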
    std::cout << "Multiply< BlockCrsMatrix< TiledCrsProductTensor ... > >::apply"
              << std::endl
              << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << " block_size(" << block_size << ")" << std::endl
              << " shmem(" << shmem/1024 << " kB)" << std::endl
              << " row_count(" << row_count << ")" << std::endl
              << " tensor_dimension(" << tensor_dimension << ")" << std::endl
              << " tile_size(" << tile_size << ")" << std::endl
              << " num_tiles(" << num_tiles << ")" << std::endl
              ;
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
      ( ProductTensorLoop( A , x , y, block_size ) );
  }
};
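// Alternative variant of the same specialization: instead of looping over the
// tensor tiles inside one kernel, the host first zeroes y with a parallel_for
// and then launches one kernel per tile, passing the tile index to the
// functor; each launch accumulates its tile's contribution into y.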
template< typename TensorScalar ,
          typename MatrixScalar ,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space ;
  typedef execution_space::size_type size_type ;

  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;

  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_tile;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y,
                       const size_type tile )
      : m_A( A ), m_x( x ), m_y( y ), m_tile(tile) {}
    __device__
    void operator()( void ) const
    {
      const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;

      // Number of bases in the stochastic system
      const size_type dim = m_A.block.dimension();
      const size_type tile_size = m_A.block.tile_size();

      VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_x_j = sh_x_k + tile_size;
      VectorScalar * const sh_A_k = sh_x_j + tile_size;
      VectorScalar * const sh_A_j = sh_A_k + tile_size;
      VectorScalar * const sh_y   = sh_A_j + tile_size;
      volatile VectorScalar * const sh_t = sh_y + tile_size;
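      // Five tile_size regions (x and A staging for the j- and k-ranges plus
      // the tile's partial sums) followed by one reduction slot per thread;
      // this matches the 5*tile_size + dBlock.x*dBlock.y shared-memory
      // request made in apply() below.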
      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // This kernel instance handles the single tile selected on the host
      const size_type i_offset = m_A.block.offset(m_tile, 0);
      const size_type j_offset = m_A.block.offset(m_tile, 1);
      const size_type k_offset = m_A.block.offset(m_tile, 2);
      const size_type i_range = m_A.block.range(m_tile, 0);
      const size_type j_range = m_A.block.range(m_tile, 1);
      const size_type k_range = m_A.block.range(m_tile, 2);
      const size_type n_row = m_A.block.num_rows(m_tile);

      // blockIdx.x == row of the deterministic (finite-element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];

      // Zero the tile's partial sums
      for (size_type i=tid; i<i_range; i+=nid) {
        sh_y[i] = 0.0;
      }

      // Raw pointers into the tensor's (j,k) coordinates and values
      const size_type * __restrict cijk_j = &m_A.block.coord(0,0);
      const size_type * __restrict cijk_k = &m_A.block.coord(0,1);
      const TensorScalar * __restrict cijk_v = &m_A.block.value(0);
      // Loop over the matrix-block columns of this row, one at a time
      for (size_type iBlockEntry = iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
           ++iBlockEntry) {

        const size_type iBlockColumn = m_A.graph.entries( iBlockEntry );

        const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
        const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
        const MatrixScalar * const A_k = & m_A.values( k_offset , iBlockEntry );
        const MatrixScalar * const A_j = & m_A.values( j_offset , iBlockEntry );

        // Stage x and A for the tile's j- and k-ranges into shared memory
        for (size_type j=tid; j<j_range; j+=nid) {
          sh_x_j[j] = x_j[j];
          sh_A_j[j] = A_j[j];
        }
        for (size_type k=tid; k<k_range; k+=nid) {
          sh_x_k[k] = x_k[k];
          sh_A_k[k] = A_k[k];
        }

        __syncthreads(); // wait for the staged x and A values
        // Loop over the stochastic rows of this tile; one warp row
        // (fixed threadIdx.y) per stochastic row i
        for (size_type i=threadIdx.y; i<i_range; i+=blockDim.y) {
          VectorScalar s = 0;

          // Tensor entries for row i of this tile
          const size_type lBeg = m_A.block.entry_begin(m_tile, i);
          const size_type lEnd = m_A.block.entry_end(m_tile, i);

          // Loop over tensor entries with coalesced reads
          for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {

            const size_type j = cijk_j[l];
            const size_type k = cijk_k[l];
            const TensorScalar v = cijk_v[l];

            s += v * ( sh_A_j[j] * sh_x_k[k] + sh_A_k[k] * sh_x_j[j] );
          }

          // Reduce the per-thread partial sums across threadIdx.x
          sh_t[tid] = s;
          if ( threadIdx.x + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
          if ( threadIdx.x + 8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
          if ( threadIdx.x + 4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
          if ( threadIdx.x + 2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
          if ( threadIdx.x + 1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
          if ( threadIdx.x == 0 ) sh_y[i] += sh_t[tid];
        }

        __syncthreads(); // wait for the staged x and A to be consumed
      }
      // Accumulate this tile's partial sums into global memory
      for (size_type i=tid; i<i_range; i+=nid) {
        m_y( i+i_offset , blockIdx.x ) += sh_y[i];
      }
    }
  };
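  // Functor used to zero the output vector on the device before the per-tile
  // kernels accumulate into it; applied with one parallel_for index per
  // column of y.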
  class Zero {
  public:
    typedef typename execution_space::size_type size_type;

    Zero( const vector_type & x ) : m_x( x ), m_d(x.extent(0)) {}

    KOKKOS_INLINE_FUNCTION
    void operator()( const size_type j ) const {
      for (size_type i=0; i<m_d; ++i)
        m_x(i,j) = 0.0;
    }

    const vector_type m_x;
    const size_type m_d;
  };
  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tile_size = A.block.tile_size();
    const size_type num_tiles = A.block.num_tiles();

    const size_type nWarp = 16;
    const dim3 dBlock( Kokkos::Impl::CudaTraits::WarpSize , nWarp , 1 );
    const dim3 dGrid( row_count , 1 , 1 );

    const size_type shmem =
      sizeof(VectorScalar) * (5*tile_size + dBlock.x*dBlock.y);
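    // 5*tile_size covers sh_x_k, sh_x_j, sh_A_k, sh_A_j, and sh_y in the
    // kernel; dBlock.x*dBlock.y covers the per-thread reduction slots sh_t.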
    std::cout << "Multiply< BlockCrsMatrix< TiledCrsProductTensor ... > >::apply"
              << std::endl
              << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << " shmem(" << shmem/1024 << " kB)" << std::endl
              << " row_count(" << row_count << ")" << std::endl
              << " tensor_dimension(" << tensor_dimension << ")" << std::endl
              << " tile_size(" << tile_size << ")" << std::endl
              << " num_tiles(" << num_tiles << ")" << std::endl
              ;
    // Zero y on the device, then launch one kernel per tile; each launch
    // accumulates its tile's contribution into y.
    Kokkos::parallel_for( row_count , Zero(y) );

    for (size_type tile = 0; tile<num_tiles; ++tile) {
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
        ( ProductTensorLoop( A , x , y, tile ) );
    }
  }
};
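// A further variant of the same specialization: a 3-D thread block is used so
// that threadIdx.x indexes the staged matrix-block columns, each
// (threadIdx.x, threadIdx.y) slice forms one warp that reads tensor entries
// cooperatively, and threadIdx.z distributes the stochastic rows of a tile
// across warps.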
template< typename TensorScalar ,
          typename MatrixScalar ,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space ;
  typedef execution_space::size_type size_type ;

  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;

  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}
    __device__
    void operator()( void ) const
    {
      const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;

      // Number of bases in the stochastic system
      const size_type dim = m_A.block.dimension();
      const size_type tile_size = m_A.block.tile_size();
      const size_type max_num_rows = m_A.block.max_num_rows();

      const size_type nid = blockDim.x * blockDim.y * blockDim.z;
      const size_type tid =
        threadIdx.x + blockDim.x * ( threadIdx.y + blockDim.y * threadIdx.z );
      const size_type thread_lane = threadIdx.x + blockDim.x * threadIdx.y;
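      // thread_lane is a thread's position within its z-slice; with
      // blockDim.x*blockDim.y equal to the warp size, each z-slice is exactly
      // one warp, which is what the cooperative tensor reads and the
      // reduction below rely on.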
      VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_x_j = sh_x_k;
      VectorScalar * const sh_A_k = sh_x_j + blockDim.x*tile_size;
      VectorScalar * const sh_A_j = sh_A_k;
      VectorScalar * const sh_y = sh_A_j + blockDim.x*tile_size;
      volatile VectorScalar * const sh_t = sh_y + tile_size;

      size_type * const sh_j = (size_type*) (sh_t + nid);
      size_type * const sh_k = (size_type*) (sh_j + nid);
      // blockIdx.x == row of the deterministic (finite-element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / blockDim.x;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % blockDim.x;
      if (remBlock > 0) ++numBlock;

      // Zero this block-row of the output vector
      for (size_type i=tid; i<dim; i+=nid) {
        m_y(i,blockIdx.x) = 0.0;
      }

      // Number of Cijk tiles
      const size_type n_tile = m_A.block.num_tiles();
      // Loop over Cijk tiles
      for (size_type tile = 0; tile<n_tile; ++tile) {

        const size_type i_offset = m_A.block.offset(tile, 0);
        const size_type j_offset = m_A.block.offset(tile, 1);
        const size_type k_offset = m_A.block.offset(tile, 2);
        const size_type i_range = m_A.block.range(tile, 0);
        const size_type j_range = m_A.block.range(tile, 1);
        const size_type k_range = m_A.block.range(tile, 2);
        const size_type n_row = m_A.block.num_rows(tile);

        // Zero the tile's partial sums
        for (size_type i=tid; i<i_range; i+=nid) {
          sh_y[i] = 0.0;
        }

        // Loop over the matrix-block columns of this row, blockDim.x at a time
        size_type iBlockEntry = iBlockEntryBeg;
        for (size_type block=0; block<numBlock;
             ++block, iBlockEntry+=blockDim.x) {

          const size_type block_size =
            (block == numBlock-1 && remBlock > 0) ? remBlock : blockDim.x;
          // Stage x and A for each column of this pass into shared memory,
          // interleaved by column as in the first variant
          for (size_type col=0; col<block_size; ++col) {

            const size_type iBlockColumn =
              m_A.graph.entries( iBlockEntry + col );

            const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
            const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
            const MatrixScalar * const A_k =
              & m_A.values( k_offset , iBlockEntry + col );
            const MatrixScalar * const A_j =
              & m_A.values( j_offset , iBlockEntry + col );

            for (size_type j=tid; j<j_range; j+=nid) {
              sh_x_j[col+j*blockDim.x] = x_j[j];
              sh_A_j[col+j*blockDim.x] = A_j[j];
            }
          }

          __syncthreads(); // wait for the staged x and A values
          // Loop over the stochastic rows of this tile; one z-slice
          // (one warp) per stochastic row i
          for (size_type i=threadIdx.z; i<i_range; i+=blockDim.z) {
            VectorScalar s = 0;

            // Tensor entries for row i of this tile
            const size_type lBeg = m_A.block.entry_begin(tile, i);
            const size_type lEnd = m_A.block.entry_end(tile, i);

            // Loop over tensor entries, one warp-load at a time
            for (size_type l=lBeg; l<lEnd; l+=WarpSize) {

              // Each lane of the warp reads one (j,k,v) tensor entry into
              // shared memory (coalesced)
              if (l+thread_lane < lEnd) {
                sh_j[tid] = m_A.block.coord(l+thread_lane,0);
                sh_k[tid] = m_A.block.coord(l+thread_lane,1);
                sh_t[tid] = m_A.block.value(l+thread_lane);
              }

              // Each thread then accumulates its column's contribution from
              // the entries just read by its warp
              const int mm = (l+WarpSize<lEnd) ? WarpSize : lEnd-l;
              for (size_type m=threadIdx.y; m<mm; m+=blockDim.y) {

                if (threadIdx.x < block_size) {
                  const size_type j = sh_j[m+WarpSize*threadIdx.z];
                  const size_type k = sh_k[m+WarpSize*threadIdx.z];
                  const TensorScalar v = sh_t[m+WarpSize*threadIdx.z];
                  const size_type jj = threadIdx.x + j*blockDim.x;
                  const size_type kk = threadIdx.x + k*blockDim.x;
                  s += v * ( sh_A_j[jj] * sh_x_k[kk] + sh_A_k[kk] * sh_x_j[jj] );
                }
              }
            }
            // Reduce the per-thread partial sums across the warp
            sh_t[tid] = s;
            if ( thread_lane + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
            if ( thread_lane + 8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
            if ( thread_lane + 4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
            if ( thread_lane + 2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
            if ( thread_lane + 1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
            if ( thread_lane == 0 ) sh_y[i] += sh_t[tid];
          }

          __syncthreads(); // wait for the staged x and A to be consumed
        }

        // Accumulate this tile's partial sums into global memory
        for (size_type i=tid; i<i_range; i+=nid) {
          m_y( i+i_offset , blockIdx.x ) += sh_y[i];
        }
      }
    }
  };
  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tile_size = A.block.tile_size();
    const size_type num_tiles = A.block.num_tiles();
    const size_type max_num_rows = A.block.max_num_rows();

    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    const size_type block_size = 8;
    const size_type nWarp = 16;
    const dim3 dBlock( block_size, warp_size / block_size, nWarp );
    const dim3 dGrid( row_count , 1 , 1 );

    const size_type shmem =
      sizeof(VectorScalar) * ( (2*block_size+1)*tile_size +
                               dBlock.x*dBlock.y*dBlock.z ) +
      sizeof(size_type) * 2 * dBlock.x*dBlock.y*dBlock.z;
    const size_type cmem = sizeof(size_type)*(max_num_rows+1)*num_tiles;
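    // shmem mirrors the kernel's shared-memory layout: (2*block_size+1)
    // tile_size-sized scalar regions for the aliased x/A staging buffers and
    // sh_y, one scalar per thread for sh_t, and two size_type slots per
    // thread for sh_j/sh_k.  cmem is the size of the per-tile row-entry
    // offset table ((max_num_rows+1) offsets per tile), reported in the
    // diagnostic output below.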
    std::cout << "Multiply< BlockCrsMatrix< TiledCrsProductTensor ... > >::apply"
              << std::endl
              << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << " block(" << dBlock.x << "," << dBlock.y << "," << dBlock.z
              << ")" << std::endl
              << " block_size(" << block_size << ")" << std::endl
              << " shmem(" << shmem/1024 << " kB)" << std::endl
              << " row_count(" << row_count << ")" << std::endl
              << " tensor_dimension(" << tensor_dimension << ")" << std::endl
              << " tile_size(" << tile_size << ")" << std::endl
              << " num_tiles(" << num_tiles << ")" << std::endl
              << " cmem(" << cmem/1024 << " kB)" << std::endl
              ;
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
      ( ProductTensorLoop( A , x , y ) );
  }
};
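// Example use (a minimal sketch, not taken from this file): given an
// assembled BlockCrsMatrix 'A' whose block is a TiledCrsProductTensor and
// device views 'x' and 'y' with tensor_dimension rows and one column per
// block-row, any of the specializations above is invoked through its static
// entry point, e.g.
//
//   typedef Stokhos::TiledCrsProductTensor<double, Kokkos::Cuda> tensor_type;
//   typedef Stokhos::BlockCrsMatrix<tensor_type, double, Kokkos::Cuda> matrix_type;
//   typedef Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::Cuda> vector_type;
//
//   Stokhos::Multiply<matrix_type, vector_type, vector_type>::apply(A, x, y);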