#ifndef KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP
#define KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP

#if defined( __CUDACC__)

#include <iostream>

#include "Kokkos_Core.hpp"
// MPMultiply primary template and the update functors it dispatches to
#include "Kokkos_CrsMatrix_MP_Vector.hpp"
// DeviceProp, used by the optional occupancy check below
#include "Stokhos_Cuda_DeviceProp.hpp"
// TEUCHOS_TEST_FOR_EXCEPTION
#include "Teuchos_TestForException.hpp"

namespace Stokhos {

namespace details {
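// Launching kernels through this trivial wrapper with explicit launch
// bounds caps register usage so that NumPerThread == 1 kernels can run at
// full occupancy; for NumPerThread > 1 the kernels are typically limited
// by shared memory and/or registers anyway, so the wrapper is only used
// on the num_per_thread == 1 launch path below.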
template <typename Kernel>
__global__ void
#if __CUDA_ARCH__ >= 300
__launch_bounds__(1024,2)
#endif
FullOccupancyKernelLaunch(Kernel kernel) {
  kernel();
}
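// Kernel implementing y = A * x for the Cuda device, where
//   A is a KokkosSparse::CrsMatrix< const Sacado::MP::Vector<...>, ... >,
//   x, y are rank-1 Kokkos::View's of Sacado::MP::Vector's.
// Each block processes blockDim.y matrix rows, with the blockDim.x threads
// of a row dividing up its ensemble components.  This only works for
// statically sized MP::Vector's.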
template <typename MatrixMemorySpace,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP,
          typename Update>
class MPMultiply< KokkosSparse::CrsMatrix< const Sacado::MP::Vector<MatrixStorage>,
                                           MatrixOrdinal,
                                           Kokkos::Device<Kokkos::Cuda, MatrixMemorySpace>,
                                           MatrixMemory,
                                           MatrixSize >,
                  Kokkos::View< const Sacado::MP::Vector<InputStorage>*,
                                InputP... >,
                  Kokkos::View< Sacado::MP::Vector<OutputStorage>*,
                                OutputP... >,
                  Update >
{
public:

  typedef Sacado::MP::Vector<MatrixStorage> MatrixValue;
  typedef Sacado::MP::Vector<InputStorage> InputVectorValue;
  typedef Sacado::MP::Vector<OutputStorage> OutputVectorValue;
  typedef Kokkos::Device<Kokkos::Cuda, MatrixMemorySpace> Device;
  typedef typename Device::execution_space execution_space;
  typedef typename execution_space::size_type size_type;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   Device,
                                   MatrixMemory,
                                   MatrixSize > matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;
  typedef Update update_type;
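  // Kernel that stages the sparse graph's row offsets and column indices
  // through shared memory for coalesced loads.  NumPerThread is the number
  // of ensemble components each thread processes.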
  template <unsigned NumPerThread>
  struct Kernel {
    typedef typename output_vector_type::value_type output_vector_value;
    typedef typename output_vector_value::value_type scalar_type;

    const matrix_graph_type  m_Agraph;
    const matrix_values_type m_Avals;
    const input_vector_type  m_x;
    const output_vector_type m_y;
    const update_type        m_update;
    const size_type          m_row_count;
    Kernel( const matrix_type & A,
            const input_vector_type & x,
            const output_vector_type & y,
            const update_type & update )
      : m_Agraph( A.graph )
      , m_Avals( A.values )
      , m_x( x )
      , m_y( y )
      , m_update( update )
      , m_row_count( A.graph.row_map.extent(0)-1 )
      {}
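    // Shared memory is laid out as blockDim.y+1 row offsets (sh_row)
    // followed by a blockDim.x*blockDim.y tile of column indices (sh_col);
    // see the matching size computation in launch_impl below.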
    __device__
    inline void operator()(void) const
    {
      volatile size_type * const sh_row =
        kokkos_impl_cuda_shared_memory<size_type>();
      volatile size_type * const sh_col = sh_row + blockDim.y+1;

      const size_type tid = blockDim.x*threadIdx.y + threadIdx.x;
      const size_type nid = blockDim.x*blockDim.y;
      const size_type block_row = blockDim.y*blockIdx.x;

      // Read blockDim.y+1 row offsets in a coalesced manner
      const size_type num_row =
        block_row+blockDim.y+1 <= m_row_count+1 ? blockDim.y+1 :
        m_row_count+1 - block_row;
      for (size_type i=tid; i<num_row; i+=nid)
        sh_row[i] = m_Agraph.row_map[block_row+i];
      __syncthreads();
      const size_type iRow = block_row + threadIdx.y;
      if (iRow < m_row_count) {
        scalar_type sum[NumPerThread];
        const size_type iEntryBegin = sh_row[threadIdx.y];
        const size_type iEntryEnd   = sh_row[threadIdx.y+1];

        for (size_type e=0; e<NumPerThread; ++e)
          sum[e] = 0;
        for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
             col_block+=blockDim.x) {
          const size_type num_col =
            col_block+blockDim.x <= iEntryEnd ?
            blockDim.x : iEntryEnd-col_block;

          // Read blockDim.x column indices at a time to maintain coalesced
          // accesses; no __syncthreads() is needed when blockDim.x does not
          // exceed the warp size
          if (threadIdx.x < num_col)
            sh_col[tid] = m_Agraph.entries(col_block+threadIdx.x);
          if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
            __syncthreads();
          for ( size_type col = 0; col < num_col; ++col ) {
            size_type iCol = sh_col[blockDim.x*threadIdx.y + col];

            for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
                 ++e, ee+=blockDim.x) {
              sum[e] += m_Avals(col_block+col).fastAccessCoeff(ee) *
                        m_x(iCol).fastAccessCoeff(ee);
            }
          }
        }

        // Store the accumulated sums into y through the update functor
        // (assumed here to apply per scalar coefficient, e.g. y = x or
        // y += x)
        for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
             ++e, ee+=blockDim.x) {
          m_update( m_y(iRow).fastAccessCoeff(ee), sum[e] );
        }
      }
    }
  };
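  // Map the multiply onto CUDA launch dimensions and dispatch to the
  // kernel instantiation matching the number of ensemble entries each
  // thread must process.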
  static void apply( const matrix_type & A,
                     const input_vector_type & x,
                     const output_vector_type & y,
                     const update_type & update )
  {
    // Compute CUDA block and grid sizes.  If the user did not specify
    // block dimensions through A.dev_config, default to one thread per
    // ensemble component and a 256-thread block.
    const size_type value_dimension = Kokkos::dimension_scalar(x);
    size_type threads_per_vector = A.dev_config.block_dim.x;
    if (threads_per_vector == 0)
      threads_per_vector = value_dimension;
    size_type rows_per_block = A.dev_config.block_dim.y;
    if (rows_per_block == 0)
      rows_per_block = 256 / threads_per_vector;
    const size_type row_count = A.graph.row_map.extent(0)-1;
    const size_type num_blocks = (row_count+rows_per_block-1)/rows_per_block;
    const dim3 block( threads_per_vector, rows_per_block, 1 );
    const dim3 grid( num_blocks, 1 );
    // Check that threads_per_vector evenly divides the number of vector
    // entries
    size_type num_per_thread = value_dimension / threads_per_vector;
    TEUCHOS_TEST_FOR_EXCEPTION(
      num_per_thread * threads_per_vector != value_dimension, std::logic_error,
      "Entries/thread * threads/vector must equal number of vector entries");
    // Check that threads_per_vector does not exceed the warp size (the
    // kernel assumes this)
    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    TEUCHOS_TEST_FOR_EXCEPTION(
      threads_per_vector > warp_size, std::logic_error,
      "Threads/vector cannot exceed Cuda warp size");
    // Launch the kernel instantiation matching num_per_thread
    if (num_per_thread == 1) {
      launch_impl<1>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 2) {
      launch_impl<2>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 3) {
      launch_impl<3>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 4) {
      launch_impl<4>( A, x, y, update, block, grid );
    }
    else
      TEUCHOS_TEST_FOR_EXCEPTION(
        true, std::logic_error,
        "Invalid num_per_thread == " << num_per_thread);
  }
  template <unsigned num_per_thread>
  static void launch_impl( const matrix_type & A,
                           const input_vector_type & x,
                           const output_vector_type & y,
                           const update_type & update,
                           dim3 block,
                           dim3 grid )
  {
    typedef Kernel<num_per_thread> Krnl;
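    // Optional diagnostic: query the resident warps per SM achievable by
    // the chosen launch mechanism and compare against the hardware maximum
    // (64 warps per SM is 100% occupancy on Kepler).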
    const bool occupancy_check = false;
    if (occupancy_check) {
      DeviceProp device_prop;
      size_type warps_per_sm;
      if (num_per_thread == 1)
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          FullOccupancyKernelLaunch<Krnl>);
      else
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          Kokkos::Impl::cuda_parallel_launch_local_memory<Krnl>);
      std::cout << "warps_per_sm = " << warps_per_sm
                << " max_warps_per_sm = " << device_prop.max_warps_per_sm
                << std::endl;
    }
    const size_t shared = (block.y+1 + block.x*block.y)*sizeof(size_type);
    if (sizeof(size_type) <= 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
    // Launch
    if (num_per_thread == 1)
      FullOccupancyKernelLaunch<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
  }
};
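// Kernel implementing y = A * x for the Cuda device, where x and y are
// rank-2 Kokkos::View's of Sacado::MP::Vector's, i.e., multi-vectors.
// The structure mirrors the rank-1 specialization above, with an extra
// loop over the columns of x and y.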
template <typename MatrixMemorySpace,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP,
          typename Update>
class MPMultiply< KokkosSparse::CrsMatrix< const Sacado::MP::Vector<MatrixStorage>,
                                           MatrixOrdinal,
                                           Kokkos::Device<Kokkos::Cuda, MatrixMemorySpace>,
                                           MatrixMemory,
                                           MatrixSize >,
                  Kokkos::View< const Sacado::MP::Vector<InputStorage>**,
                                InputP... >,
                  Kokkos::View< Sacado::MP::Vector<OutputStorage>**,
                                OutputP... >,
                  Update >
{
public:

  typedef Sacado::MP::Vector<MatrixStorage> MatrixValue;
  typedef Sacado::MP::Vector<InputStorage> InputVectorValue;
  typedef Sacado::MP::Vector<OutputStorage> OutputVectorValue;
  typedef Kokkos::Device<Kokkos::Cuda, MatrixMemorySpace> Device;
  typedef typename Device::execution_space execution_space;
  typedef typename execution_space::size_type size_type;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   Device,
                                   MatrixMemory,
                                   MatrixSize > matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;
  typedef Update update_type;
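  // Kernel for the rank-2 case; identical shared-memory strategy to the
  // rank-1 Kernel, plus the number of multi-vector columns.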
  template <unsigned NumPerThread>
  struct Kernel {
    typedef typename output_vector_type::value_type output_vector_value;
    typedef typename output_vector_value::value_type scalar_type;

    const matrix_graph_type  m_Agraph;
    const matrix_values_type m_Avals;
    const input_vector_type  m_x;
    const output_vector_type m_y;
    const update_type        m_update;
    const size_type          m_row_count;
    const size_type          m_num_vec_cols;
    Kernel( const matrix_type & A,
            const input_vector_type & x,
            const output_vector_type & y,
            const update_type & update )
      : m_Agraph( A.graph )
      , m_Avals( A.values )
      , m_x( x )
      , m_y( y )
      , m_update( update )
      , m_row_count( A.graph.row_map.extent(0)-1 )
      , m_num_vec_cols( x.extent(1) )
      {}
    __device__
    inline void operator()(void) const
    {
      volatile size_type * const sh_row =
        kokkos_impl_cuda_shared_memory<size_type>();
      volatile size_type * const sh_col = sh_row + blockDim.y+1;

      const size_type tid = blockDim.x*threadIdx.y + threadIdx.x;
      const size_type nid = blockDim.x*blockDim.y;
      const size_type block_row = blockDim.y*blockIdx.x;

      // Read blockDim.y+1 row offsets in a coalesced manner
      const size_type num_row =
        block_row+blockDim.y+1 <= m_row_count+1 ? blockDim.y+1 :
        m_row_count+1 - block_row;
      for (size_type i=tid; i<num_row; i+=nid)
        sh_row[i] = m_Agraph.row_map[block_row+i];
      __syncthreads();
      const size_type iRow = block_row + threadIdx.y;
      if (iRow < m_row_count) {
        scalar_type sum[NumPerThread];
        const size_type iEntryBegin = sh_row[threadIdx.y];
        const size_type iEntryEnd   = sh_row[threadIdx.y+1];

        // Loop over the columns of the multi-vector, reusing the cached
        // row offsets for each column
        for (size_type vec_col=0; vec_col<m_num_vec_cols; vec_col++) {

          for (size_type e=0; e<NumPerThread; ++e)
            sum[e] = 0;
          for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
               col_block+=blockDim.x) {
            const size_type num_col =
              col_block+blockDim.x <= iEntryEnd ?
              blockDim.x : iEntryEnd-col_block;

            // Read blockDim.x column indices at a time for coalesced
            // access; no __syncthreads() is needed when blockDim.x does
            // not exceed the warp size
            if (threadIdx.x < num_col)
              sh_col[tid] = m_Agraph.entries(col_block+threadIdx.x);
            if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
              __syncthreads();
            for ( size_type col = 0; col < num_col; ++col ) {
              size_type iCol = sh_col[blockDim.x*threadIdx.y + col];

              for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
                   ++e, ee+=blockDim.x) {
                sum[e] += m_Avals(col_block+col).fastAccessCoeff(ee) *
                          m_x(iCol, vec_col).fastAccessCoeff(ee);
              }
            }
          }

          // Store the accumulated sums into column vec_col of y through
          // the update functor (assumed, as above, to apply per scalar
          // coefficient)
          for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
               ++e, ee+=blockDim.x) {
            m_update( m_y(iRow, vec_col).fastAccessCoeff(ee), sum[e] );
          }
        }
      }
    }
  };
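  // apply() mirrors the rank-1 specialization: pick launch dimensions,
  // verify the ensemble size divides evenly across threads_per_vector,
  // and dispatch on num_per_thread.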
  static void apply( const matrix_type & A,
                     const input_vector_type & x,
                     const output_vector_type & y,
                     const update_type & update )
  {
    // Compute CUDA block and grid sizes as in the rank-1 case
    const size_type value_dimension = Kokkos::dimension_scalar(x);
    size_type threads_per_vector = A.dev_config.block_dim.x;
    if (threads_per_vector == 0)
      threads_per_vector = value_dimension;
    size_type rows_per_block = A.dev_config.block_dim.y;
    if (rows_per_block == 0)
      rows_per_block = 256 / threads_per_vector;
    const size_type row_count = A.graph.row_map.extent(0)-1;
    const size_type num_blocks = (row_count+rows_per_block-1)/rows_per_block;
    const dim3 block( threads_per_vector, rows_per_block, 1 );
    const dim3 grid( num_blocks, 1 );
    // Check that threads_per_vector evenly divides the number of vector
    // entries
    size_type num_per_thread = value_dimension / threads_per_vector;
    TEUCHOS_TEST_FOR_EXCEPTION(
      num_per_thread * threads_per_vector != value_dimension, std::logic_error,
      "Entries/thread * threads/vector must equal number of vector entries");
    // Launch the kernel instantiation matching num_per_thread
    if (num_per_thread == 1) {
      launch_impl<1>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 2) {
      launch_impl<2>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 3) {
      launch_impl<3>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 4) {
      launch_impl<4>( A, x, y, update, block, grid );
    }
    else
      TEUCHOS_TEST_FOR_EXCEPTION(
        true, std::logic_error,
        "Invalid num_per_thread == " << num_per_thread);
  }
  // Launch the kernel, templated on the compile-time number of entries
  // per thread
  template <unsigned num_per_thread>
  static void launch_impl( const matrix_type & A,
                           const input_vector_type & x,
                           const output_vector_type & y,
                           const update_type & update,
                           dim3 block,
                           dim3 grid )
  {
    typedef Kernel<num_per_thread> Krnl;
    // Optional occupancy diagnostic, as in the rank-1 case
    const bool occupancy_check = false;
    if (occupancy_check) {
      DeviceProp device_prop;
      size_type warps_per_sm;
      if (num_per_thread == 1)
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          FullOccupancyKernelLaunch<Krnl>);
      else
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          Kokkos::Impl::cuda_parallel_launch_local_memory<Krnl>);
      std::cout << "warps_per_sm = " << warps_per_sm
                << " max_warps_per_sm = " << device_prop.max_warps_per_sm
                << std::endl;
    }
    const size_t shared = (block.y+1 + block.x*block.y)*sizeof(size_type);
    if (sizeof(size_type) <= 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
    // Launch
    if (num_per_thread == 1)
      FullOccupancyKernelLaunch<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
  }
};

} // namespace details

} // namespace Stokhos

#endif /* #if defined( __CUDACC__) */

#endif /* #ifndef KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP */