42 #ifndef KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP
43 #define KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP
45 #if defined( __CUDACC__)
50 #include "KokkosSparse_CrsMatrix.hpp"
60 #include "Kokkos_Core.hpp"
70 # if (__CUDA_ARCH__ >= 300)
71 # define HAVE_CUDA_SHUFFLE 1
73 # define HAVE_CUDA_SHUFFLE 0
76 # define HAVE_CUDA_SHUFFLE 0
89 template <
typename MatrixStorage,
90 typename MatrixOrdinal,
91 typename MatrixMemory,
93 typename InputStorage,
95 typename OutputStorage,
97 class Multiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
102 Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
104 Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
113 typedef Kokkos::Cuda MatrixDevice;
115 typedef execution_space::size_type size_type;
117 typedef KokkosSparse::CrsMatrix<
const MatrixValue,
121 MatrixSize> matrix_type;
122 typedef typename matrix_type::values_type matrix_values_type;
124 typedef Kokkos::View<
const InputVectorValue*,
125 InputP... > input_vector_type;
126 typedef Kokkos::View< OutputVectorValue*,
127 OutputP... > output_vector_type;
131 typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
132 typedef typename matrix_values_type::array_type matrix_array_type;
133 typedef typename input_vector_type::array_type input_array_type;
134 typedef typename output_vector_type::array_type output_array_type;
141 const matrix_array_type m_A_values ;
142 const matrix_graph_type m_A_graph ;
143 const input_array_type m_x ;
144 const output_array_type m_y ;
145 const tensor_type m_tensor ;
146 const input_scalar m_a ;
147 const output_scalar m_b ;
148 const size_type BlockSize;
150 Multiply(
const matrix_type &
A ,
151 const input_vector_type & x ,
152 const output_vector_type & y ,
153 const input_scalar & a ,
154 const output_scalar & b ,
155 const size_type block_size )
156 : m_A_values( A.values )
157 , m_A_graph( A.graph )
160 , m_tensor( Kokkos::
cijk(A.values) )
163 , BlockSize(block_size)
168 __device__
void operator()(
void)
const
171 const size_type dim = m_tensor.dimension();
174 volatile input_scalar *
const sh_x =
175 kokkos_impl_cuda_shared_memory<input_scalar>();
176 volatile matrix_scalar *
const sh_A = sh_x + BlockSize*dim;
177 volatile output_scalar *
const sh_y = sh_A + BlockSize*dim;
178 #if !HAVE_CUDA_SHUFFLE
179 volatile output_scalar *
const sh_t = sh_y + dim;
182 const size_type nid = blockDim.x * blockDim.y;
183 const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
186 for ( size_type i = tid; i < dim; i += nid ) {
192 const size_type iBlockEntryBeg = m_A_graph.row_map[ blockIdx.x ];
193 const size_type iBlockEntryEnd = m_A_graph.row_map[ blockIdx.x + 1 ];
194 for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
195 iBlockEntry += BlockSize) {
196 const size_type block_size =
197 (iBlockEntryEnd-iBlockEntry < BlockSize) ?
198 iBlockEntryEnd-iBlockEntry : BlockSize;
205 for ( size_type col = 0; col < block_size; ++col ) {
207 const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
208 const input_scalar *
const x = & m_x( 0, iBlockColumn );
209 const matrix_scalar *
const A = & m_A_values( iBlockEntry + col, 0 );
212 for ( size_type i = tid; i < dim; i += nid ) {
213 sh_x[col + i * BlockSize] = x[i];
214 sh_A[col + i * BlockSize] = A[i];
222 for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
226 const size_type lBeg = m_tensor.entry_begin( i );
227 const size_type lEnd = m_tensor.entry_end( i );
230 for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {
233 const size_type kj = m_tensor.coord( l );
234 const tensor_scalar v = m_tensor.value( l );
235 const size_type
j = ( kj & 0x0ffff ) * BlockSize ;
236 const size_type k = ( kj >> 16 ) * BlockSize ;
238 for ( size_type col = 0; col < block_size; ++col ) {
239 y += v * ( sh_A[col+
j] * sh_x[col+k] +
240 sh_A[col+k] * sh_x[col+
j] );
246 #if HAVE_CUDA_SHUFFLE
252 if ( threadIdx.x == 0 ) sh_y[i] += y;
255 if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
256 if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
257 if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
258 if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
259 if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
260 if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
271 if ( m_b == output_scalar(0) )
272 for ( size_type i = tid; i < dim; i += nid )
273 m_y( i, blockIdx.x ) = m_a * sh_y[ i ];
275 for ( size_type i = tid; i < dim; i += nid )
276 m_y( i, blockIdx.x ) = m_a * sh_y[ i ] + m_b * m_y( i, blockIdx.x );
279 struct TensorReadEntry {
280 size_type block_size, shmem, num_blocks, num_warp;
284 static void apply(
const matrix_type & A ,
285 const input_vector_type & x ,
286 const output_vector_type & y ,
287 const input_scalar & a = input_scalar(1) ,
288 const output_scalar & b = output_scalar(0) )
291 const size_type row_count = A.graph.row_map.extent(0) - 1;
292 const size_type tensor_dimension = tensor.dimension();
293 const size_type tensor_align = tensor_dimension;
294 const size_type avg_tensor_entries_per_row = tensor.avg_entries_per_row();
297 const size_type fem_nnz_per_row = 27;
300 DeviceProp device_prop;
301 const size_type shcap = device_prop.shared_memory_capacity;
302 const size_type sh_granularity = device_prop.shared_memory_granularity;
303 const size_type max_shmem_per_block = device_prop.max_shmem_per_block;
304 const size_type max_blocks_per_sm = device_prop.max_blocks_per_sm;
305 const size_type warp_size = device_prop.warp_size;
306 const size_type warp_granularity = device_prop.warp_granularity;
307 const size_type max_warps_per_block =
308 std::min(device_prop.max_threads_per_block / warp_size,
309 device_prop.max_warps_per_sm);
310 const size_type min_warps_per_block = 1;
311 const size_type max_regs_per_sm = device_prop.max_regs_per_sm;
312 const size_type max_regs_per_block = device_prop.max_regs_per_block;
313 const size_type reg_bank_size = device_prop.reg_bank_size;
318 const size_type regs_per_thread =
319 device_prop.get_kernel_registers(
320 Kokkos::Impl::cuda_parallel_launch_local_memory<Multiply>);
321 const size_type regs_per_warp =
322 (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
323 const size_type warps_per_sm =
324 (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
325 const size_type warps_per_block =
326 (max_regs_per_block/regs_per_warp) & ~(warp_granularity-1);
334 const size_type threads_per_row =
335 avg_tensor_entries_per_row >= 88 ? 32 : 16;
336 const size_type rows_per_warp = warp_size / threads_per_row;
338 const size_type in_vec_scalar_size =
sizeof(input_scalar);
339 const size_type out_vec_scalar_size =
sizeof(output_scalar);
340 const size_type mat_scalar_size =
sizeof(matrix_scalar);
342 #define USE_FIXED_BLOCKSIZE 0
344 #if USE_FIXED_BLOCKSIZE
346 const size_type num_blocks = 3;
347 size_type nw = warps_per_sm / num_blocks;
348 while (nw > 1 && num_blocks*nw % warp_granularity) --nw;
349 const size_type num_warp = nw;
350 const size_type sh_per_block = shcap / num_blocks;
352 device_prop.has_shuffle ? 0 : in_vec_scalar_size*warp_size*num_warp;
353 size_type bs = ((sh_per_block - sr) / tensor_align - out_vec_scalar_size) /
354 (in_vec_scalar_size+mat_scalar_size);
355 if (bs % 2 == 0) --bs;
356 const size_type block_size_max = 31;
357 const size_type block_size =
std::min(bs, block_size_max);
359 const size_type shmem =
360 ( ((in_vec_scalar_size+mat_scalar_size)*block_size+out_vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
373 const size_type block_size_min = 3;
374 const size_type half_nnz_per_row = fem_nnz_per_row / 2 + 1;
375 const size_type block_size_max =
376 half_nnz_per_row % 2 ? half_nnz_per_row + 1 : half_nnz_per_row;
378 for (size_type bs = block_size_min; bs<=block_size_max; bs+=2) {
382 device_prop.has_shuffle ? 0 : in_vec_scalar_size*warp_size*warps_per_block;
384 ((in_vec_scalar_size+mat_scalar_size)*bs+out_vec_scalar_size)*tensor_align+sr;
385 shmem = (shmem + sh_granularity-1) & ~(sh_granularity-1);
386 if (shmem <= max_shmem_per_block) {
387 size_type num_blocks =
std::min(shcap / shmem, max_blocks_per_sm);
388 size_type tensor_reads = (fem_nnz_per_row+bs-1) / bs;
391 min_warps_per_block),
392 max_warps_per_block);
393 while (num_warp > 1 && num_blocks*num_warp % warp_granularity)
395 TensorReadEntry entry;
396 entry.block_size = bs;
398 entry.num_blocks = num_blocks;
399 entry.num_warp = num_warp;
402 size_type factor =
std::min(num_blocks,3u);
403 entry.reads = (
static_cast<double>(tensor_reads) /
404 static_cast<double>(factor*num_blocks*num_warp));
409 reads_per_thread.
size() == 0, std::logic_error,
410 "Stochastic problem dimension is too large to fit in shared memory");
412 double min_reads = reads_per_thread[0].reads;
413 for (
int i=1; i<reads_per_thread.
size(); ++i) {
414 if (reads_per_thread[i].reads < min_reads) {
416 min_reads = reads_per_thread[i].reads;
420 const size_type block_size = reads_per_thread[idx].block_size;
421 const size_type shmem = reads_per_thread[idx].shmem;
422 const size_type num_blocks = reads_per_thread[idx].num_blocks;
423 const size_type num_warp = reads_per_thread[idx].num_warp;
428 const dim3 dBlock( threads_per_row , rows_per_warp*num_warp , 1 );
429 const dim3 dGrid( row_count, 1, 1 );
432 std::cout <<
"block_size = " << block_size
433 <<
" tensor reads = " << (fem_nnz_per_row+block_size-1)/block_size
434 <<
" regs_per_thread = " << regs_per_thread
435 <<
" num blocks = " << num_blocks
436 <<
" num warps = " << num_warp
437 <<
" num rows = " << tensor_dimension
438 <<
" rows/warp = " << tensor_dimension / (num_warp*rows_per_warp)
439 <<
" avg entries/row = " << avg_tensor_entries_per_row
445 Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
446 ( Multiply( A, x, y, a, b, block_size ) );
458 template <
typename MatrixStorage,
459 typename MatrixOrdinal,
460 typename MatrixMemory,
462 typename InputStorage,
464 typename OutputStorage,
465 typename ... OutputP>
466 class Multiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
471 Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
473 Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
482 typedef Kokkos::Cuda MatrixDevice;
484 typedef execution_space::size_type size_type;
486 typedef KokkosSparse::CrsMatrix<
const MatrixValue,
490 MatrixSize> matrix_type;
491 typedef Kokkos::View<
const InputVectorValue**,
492 InputP... > input_vector_type;
493 typedef Kokkos::View< OutputVectorValue**,
494 OutputP... > output_vector_type;
500 static void apply(
const matrix_type & A ,
501 const input_vector_type & x ,
502 const output_vector_type & y ,
503 const input_scalar & a = input_scalar(1) ,
504 const output_scalar & b = output_scalar(0) )
506 typedef Kokkos::View<
const InputVectorValue*, InputP... > input_vector_type_1D;
507 typedef Kokkos::View< OutputVectorValue*, OutputP... > output_vector_type_1D;
508 typedef Multiply< matrix_type, input_vector_type_1D,
509 output_vector_type_1D > multiply_type_1D;
511 const size_type num_col = y.extent(1);
512 for (size_type col=0; col<num_col; ++col)
514 multiply_type_1D::apply(
516 Kokkos::subview( x, Kokkos::ALL(), col),
517 Kokkos::subview(y, Kokkos::ALL(), col),
522 template <
typename Kernel>
524 #if __CUDA_ARCH__ >= 300
525 __launch_bounds__(1024,2)
527 MeanFullOccupancyKernelLaunch(Kernel kernel) {
535 template <
typename MatrixStorage,
536 typename MatrixOrdinal,
537 typename MatrixMemory,
539 typename InputStorage,
541 typename OutputStorage,
542 typename ... OutputP>
543 class MeanMultiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
548 Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
550 Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
559 typedef Kokkos::Cuda MatrixDevice;
561 typedef KokkosSparse::CrsMatrix<
const MatrixValue,
565 MatrixSize> matrix_type;
566 typedef typename matrix_type::values_type matrix_values_type;
568 typedef Kokkos::View<
const InputVectorValue*,
569 InputP... > input_vector_type;
570 typedef Kokkos::View< OutputVectorValue*,
571 OutputP... > output_vector_type;
573 typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
578 template <
int BlockSize>
582 typedef typename input_vector_type::array_type input_array_type;
583 typedef typename output_vector_type::array_type output_array_type;
585 const matrix_array_type m_A_values ;
586 const matrix_graph_type m_A_graph ;
587 const output_array_type v_y ;
588 const input_array_type v_x ;
589 const input_scalar m_a ;
590 const output_scalar m_b ;
591 const size_type m_row_count;
592 const size_type dim ;
594 Kernel(
const matrix_type & A ,
595 const input_vector_type & x ,
596 const output_vector_type & y ,
597 const input_scalar & a ,
598 const output_scalar & b )
599 : m_A_values( A.values )
600 , m_A_graph( A.graph )
605 , m_row_count( A.graph.row_map.extent(0)-1 )
609 __device__
void operator()(
void)
const
613 volatile matrix_scalar *
const sh_A =
614 kokkos_impl_cuda_shared_memory<matrix_scalar>();
615 volatile size_type *
const sh_col =
616 reinterpret_cast<volatile size_type*
>(sh_A + BlockSize*blockDim.y);
619 const size_type iBlockRow = blockDim.y*blockIdx.x + threadIdx.y;
620 if (iBlockRow < m_row_count) {
622 const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
623 const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
626 if (m_b == output_scalar(0))
627 for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
628 v_y(pce, iBlockRow) = 0.0;
630 for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
631 v_y(pce, iBlockRow) = m_b*v_y(pce, iBlockRow);
634 for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
635 col_block+=BlockSize) {
636 const size_type num_col = col_block+BlockSize <= iEntryEnd ?
637 BlockSize : iEntryEnd-col_block;
641 for (size_type col=threadIdx.x; col<num_col; col+=blockDim.x) {
642 sh_col[col*blockDim.y+threadIdx.y] =
643 m_A_graph.entries(col_block+col);
644 sh_A[col*blockDim.y+threadIdx.y] =
645 m_A_values(col_block+col);
647 if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
652 for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x ) {
653 output_scalar s = 0.0;
654 for ( size_type col = 0; col < num_col; ++col ) {
655 const size_type iCol = sh_col[col*blockDim.y+threadIdx.y];
656 const matrix_scalar aA = m_a*sh_A[col*blockDim.y+threadIdx.y];
657 s += aA*v_x(pce, iCol);
659 v_y(pce, iBlockRow) += s;
666 static void apply(
const matrix_type & A ,
667 const input_vector_type & x ,
668 const output_vector_type & y ,
669 const input_scalar & a = input_scalar(1) ,
670 const output_scalar & b = output_scalar(0) )
672 const size_t row_count = A.graph.row_map.extent(0) - 1;
678 size_type threads_per_row;
679 size_type rows_per_block;
681 threads_per_row = 32;
685 threads_per_row = 16;
688 const size_type num_blocks =
689 (row_count + rows_per_block -1 ) / rows_per_block;
692 const dim3 dBlock( threads_per_row , rows_per_block , 1 );
693 const dim3 dGrid( num_blocks, 1, 1 );
698 const int BlockSize = 32;
699 if (
sizeof(matrix_scalar) > 4)
700 cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
702 cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
703 const size_t shared =
704 (BlockSize*dBlock.y) * (
sizeof(size_type) +
sizeof(matrix_scalar));
707 MeanFullOccupancyKernelLaunch<<<dGrid, dBlock, shared >>>
708 ( Kernel<BlockSize>( A, x, y, a, b ) );
KOKKOS_INLINE_FUNCTION Scalar shfl_down(const Scalar &val, const int &delta, const int &width)
Kokkos::DefaultExecutionSpace execution_space
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< View< T, P...> >::value, unsigned >::type dimension_scalar(const View< T, P...> &view)
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
KOKKOS_INLINE_FUNCTION PCE< Storage > max(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< view_type >::value, typename CijkType< view_type >::type >::type cijk(const view_type &view)
void push_back(const value_type &x)