42 #ifndef STOKHOS_CUDA_CRS_PRODUCT_TENSOR_HPP
43 #define STOKHOS_CUDA_CRS_PRODUCT_TENSOR_HPP
47 #include "Kokkos_Core.hpp"
58 #include "cuda_profiler_api.h"
70 template<
typename TensorScalar,
71 typename MatrixScalar,
72 typename VectorScalar >
75 MatrixScalar, Kokkos::Cuda >,
76 Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
77 Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
86 typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda >
vector_type;
93 class MultiplyKernel {
105 : m_A( A ), m_x( x ), m_y( y ), BlockSize(block_size) {}
111 const size_type dim = m_A.block.dimension();
114 volatile VectorScalar *
const sh_x =
115 kokkos_impl_cuda_shared_memory<VectorScalar>();
116 volatile MatrixScalar *
const sh_A = sh_x + BlockSize*dim;
117 volatile VectorScalar *
const sh_y = sh_A + BlockSize*dim;
118 #if !HAVE_CUDA_SHUFFLE
119 volatile VectorScalar *
const sh_t = sh_y + dim;
122 const size_type nid = blockDim.x * blockDim.y;
123 const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
126 const int mask = blockDim.x == 32 ? 0xffffffff :
127 ((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;
130 for (
size_type i = tid; i < dim; i += nid ) {
136 const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
137 const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
138 for (
size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
139 iBlockEntry += BlockSize) {
141 (iBlockEntryEnd-iBlockEntry < BlockSize) ?
142 iBlockEntryEnd-iBlockEntry : BlockSize;
149 for (
size_type col = 0; col < block_size; ++col ) {
151 const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
152 const VectorScalar *
const x = & m_x( 0, iBlockColumn );
153 const MatrixScalar *
const A = & m_A.values( 0, iBlockEntry + col );
156 for (
size_type i = tid; i < dim; i += nid ) {
157 sh_x[col + i * BlockSize] = x[i];
158 sh_A[col + i * BlockSize] = A[i];
166 for (
size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
170 const size_type lBeg = m_A.block.entry_begin( i );
171 const size_type lEnd = m_A.block.entry_end( i );
174 for (
size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {
177 const size_type kj = m_A.block.coord( l );
178 const TensorScalar v = m_A.block.value( l );
179 const size_type j = ( kj & 0x0ffff ) * BlockSize ;
180 const size_type k = ( kj >> 16 ) * BlockSize ;
182 for (
size_type col = 0; col < block_size; ++col ) {
183 y += v * ( sh_A[col+
j] * sh_x[col+k] +
184 sh_A[col+k] * sh_x[col+
j] );
190 #if HAVE_CUDA_SHUFFLE
191 if (blockDim.x >= 2) y +=
shfl_down(y, 1, blockDim.x, mask);
192 if (blockDim.x >= 4) y +=
shfl_down(y, 2, blockDim.x, mask);
193 if (blockDim.x >= 8) y +=
shfl_down(y, 4, blockDim.x, mask);
194 if (blockDim.x >= 16) y +=
shfl_down(y, 8, blockDim.x, mask);
195 if (blockDim.x >= 32) y +=
shfl_down(y, 16, blockDim.x, mask);
196 if ( threadIdx.x == 0 ) sh_y[i] += y;
199 if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
201 if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
203 if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
205 if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
207 if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
209 if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
220 for (
size_type i = tid; i < dim; i += nid ) {
221 m_y( i, blockIdx.x ) = sh_y[ i ];
230 class MultiplyKernel {
233 const matrix_type m_A;
234 const vector_type m_x;
235 const vector_type m_y;
236 const size_type BlockSize;
238 MultiplyKernel(
const matrix_type &
A,
239 const vector_type & x,
240 const vector_type & y,
241 const size_type block_size )
242 : m_A( A ), m_x( x ), m_y( y ), BlockSize(block_size) {}
245 void operator()(
void)
const
248 const size_type dim = m_A.block.dimension();
250 volatile VectorScalar *
const sh_x =
251 kokkos_impl_cuda_shared_memory<VectorScalar>();
252 volatile VectorScalar *
const sh_y = sh_x + BlockSize*dim;
253 #if !HAVE_CUDA_SHUFFLE
254 volatile VectorScalar *
const sh_t = sh_y + dim;
257 const size_type nid = blockDim.x * blockDim.y;
258 const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
261 const int mask = blockDim.x == 32 ? 0xffffffff :
262 ((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;
265 for ( size_type i = tid; i < dim; i += nid ) {
271 const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
272 const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
273 for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
274 iBlockEntry += BlockSize) {
275 const size_type block_size =
276 (iBlockEntryEnd-iBlockEntry < BlockSize) ?
277 iBlockEntryEnd-iBlockEntry : BlockSize;
284 for ( size_type col = 0; col < block_size; ++col ) {
286 const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
287 const VectorScalar *
const x = & m_x( 0, iBlockColumn );
290 for ( size_type i = tid; i < dim; i += nid ) {
291 sh_x[col + i * BlockSize] = x[i];
299 for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
303 const size_type lBeg = m_A.block.entry_begin( i );
304 const size_type lEnd = m_A.block.entry_end( i );
307 for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {
310 const size_type kj = m_A.block.coord( l );
311 const TensorScalar v = m_A.block.value( l );
312 const size_type
j = ( kj & 0x0ffff ) ;
313 const size_type k = ( kj >> 16 ) ;
315 for ( size_type col = 0; col < block_size; ++col ) {
316 const size_type bCol = iBlockEntry + col;
317 #if (__CUDA_ARCH__ >= 350)
318 y += v * ( __ldg(&m_A.values(j,bCol)) * sh_x[col+k*BlockSize] +
319 __ldg(&m_A.values(k,bCol)) * sh_x[col+j*BlockSize] );
321 y += v * ( m_A.values(j,bCol) * sh_x[col+k*BlockSize] +
322 m_A.values(k,bCol) * sh_x[col+j*BlockSize] );
329 #if HAVE_CUDA_SHUFFLE
330 if (blockDim.x >= 2) y +=
shfl_down(y, 1, blockDim.x, mask);
331 if (blockDim.x >= 4) y +=
shfl_down(y, 2, blockDim.x, mask);
332 if (blockDim.x >= 8) y +=
shfl_down(y, 4, blockDim.x, mask);
333 if (blockDim.x >= 16) y +=
shfl_down(y, 8, blockDim.x, mask);
334 if (blockDim.x >= 32) y +=
shfl_down(y, 16, blockDim.x, mask);
335 if ( threadIdx.x == 0 ) sh_y[i] += y;
338 if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
340 if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
342 if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
344 if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
346 if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
348 if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
359 for ( size_type i = tid; i < dim; i += nid ) {
360 m_y( i, blockIdx.x ) = sh_y[ i ];
369 struct TensorReadEntry {
380 const size_type tensor_align = tensor_dimension;
381 const size_type avg_tensor_entries_per_row = A.
block.avg_entries_per_row();
407 Kokkos::Impl::cuda_parallel_launch_local_memory<MultiplyKernel>);
409 (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
411 (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
413 (max_regs_per_block/regs_per_warp) & ~(warp_granularity-1);
422 avg_tensor_entries_per_row >= 88 ? 32 : 16;
423 const size_type rows_per_warp = warp_size / threads_per_row;
425 const size_type vec_scalar_size =
sizeof(VectorScalar);
427 const size_type mat_scalar_size =
sizeof(MatrixScalar);
430 #define USE_FIXED_BLOCKSIZE 0
432 #if USE_FIXED_BLOCKSIZE
435 size_type nw = warps_per_sm / num_blocks;
436 while (nw > 1 && num_blocks*nw % warp_granularity) --nw;
438 const size_type sh_per_block = shcap / num_blocks;
440 device_prop.
has_shuffle ? 0 : vec_scalar_size*warp_size*num_warp;
442 size_type bs = ((sh_per_block - sr) / tensor_align - vec_scalar_size) /
445 size_type bs = ((sh_per_block - sr) / tensor_align - vec_scalar_size) /
446 (vec_scalar_size+mat_scalar_size);
448 if (bs % 2 == 0) --bs;
454 ( (vec_scalar_size*block_size+vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
457 ( ((vec_scalar_size+mat_scalar_size)*block_size+vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
472 const size_type half_nnz_per_row = fem_nnz_per_row / 2 + 1;
474 half_nnz_per_row % 2 ? half_nnz_per_row + 1 : half_nnz_per_row;
476 for (
size_type bs = block_size_min; bs<=block_size_max; bs+=2) {
480 device_prop.
has_shuffle ? 0 : vec_scalar_size*warp_size*warps_per_block;
483 (vec_scalar_size*bs+vec_scalar_size)*tensor_align+sr;
486 ((vec_scalar_size+mat_scalar_size)*bs+vec_scalar_size)*tensor_align+sr;
488 shmem = (shmem + sh_granularity-1) & ~(sh_granularity-1);
489 if (shmem <= max_shmem_per_block) {
491 size_type tensor_reads = (fem_nnz_per_row+bs-1) / bs;
494 min_warps_per_block),
495 max_warps_per_block);
496 while (num_warp > 1 && num_blocks*num_warp % warp_granularity)
498 TensorReadEntry entry;
499 entry.block_size = bs;
501 entry.num_blocks = num_blocks;
502 entry.num_warp = num_warp;
506 entry.reads = (
static_cast<double>(tensor_reads) /
507 static_cast<double>(factor*num_blocks*num_warp));
512 reads_per_thread.
size() == 0, std::logic_error,
513 "Stochastic problem dimension is too large to fit in shared memory");
515 double min_reads = reads_per_thread[0].reads;
516 for (
int i=1; i<reads_per_thread.
size(); ++i) {
517 if (reads_per_thread[i].reads < min_reads) {
519 min_reads = reads_per_thread[i].reads;
523 const size_type block_size = reads_per_thread[idx].block_size;
524 const size_type shmem = reads_per_thread[idx].shmem;
525 const size_type num_blocks = reads_per_thread[idx].num_blocks;
526 const size_type num_warp = reads_per_thread[idx].num_warp;
531 const dim3 dBlock( threads_per_row , rows_per_warp*num_warp , 1 );
532 const dim3 dGrid( row_count, 1, 1 );
535 std::cout <<
"block_size = " << block_size
536 <<
" tensor reads = " << (fem_nnz_per_row+block_size-1)/block_size
537 <<
" regs_per_thread = " << regs_per_thread
538 <<
" num blocks = " << num_blocks
539 <<
" num warps = " << num_warp
540 <<
" num rows = " << tensor_dimension
541 <<
" rows/warp = " << tensor_dimension / (num_warp*rows_per_warp)
542 <<
" avg entries/row = " << avg_tensor_entries_per_row
548 Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
549 ( MultiplyKernel( A, x, y, block_size ) );
KOKKOS_INLINE_FUNCTION void sync_warp(const int &mask)
KOKKOS_INLINE_FUNCTION Scalar shfl_down(const Scalar &val, const int &delta, const int &width)
Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type
BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type
size_type max_blocks_per_sm
static void apply(const matrix_type &A, const vector_type &x, const vector_type &y)
size_type warp_granularity
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
const size_type BlockSize
size_type max_regs_per_sm
CrsProductTensor< TensorScalar, execution_space > tensor_type
size_type max_shmem_per_block
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
Sparse product tensor with replicated entries to provide subsets with a given coordinate.
size_type shared_memory_granularity
__device__ void operator()(void) const
KOKKOS_INLINE_FUNCTION PCE< Storage > max(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
size_type shared_memory_capacity
size_type max_threads_per_block
size_type max_warps_per_sm
void push_back(const value_type &x)
size_type max_regs_per_block
MultiplyKernel(const matrix_type &A, const vector_type &x, const vector_type &y, const size_type block_size)
CRS matrix of dense blocks.
execution_space::size_type size_type
size_type get_kernel_registers(Kernel kernel)
Kokkos::Cuda execution_space