#ifndef KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP
#define KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP

#if defined( __CUDACC__)

#include "Kokkos_CrsMatrix_MP_Vector.hpp"
#include "Kokkos_Core.hpp"
#include "Stokhos_Cuda_DeviceProp.hpp"

//----------------------------------------------------------------------------
// Specializations of sparse matrix-vector products for the
// Sacado::MP::Vector scalar type on the Cuda device
//----------------------------------------------------------------------------

namespace Stokhos {

namespace details {
// We can make the ensemble multiply kernel slightly faster for
// NumPerThread == 1 by capping register usage through these launch bounds.
// For NumPerThread > 1 the kernel is typically limited by shared memory and
// can't reach full occupancy anyway, so the bounds don't matter there.
template <typename Kernel>
__global__ void
#if __CUDA_ARCH__ >= 300
__launch_bounds__(1024,2)
#endif
FullOccupancyKernelLaunch(Kernel kernel) {
  kernel();
}
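// Kernel implementing y = A * x on the Cuda device where
//   A == KokkosSparse::CrsMatrix< const Sacado::MP::Vector<...>, ... >,
//   x, y == Kokkos::View< Sacado::MP::Vector<...>*, ... >,
// and x and y are rank 1.  Everything is spelled out explicitly so that the
// ranks and devices must match.  Currently this only works for statically
// sized MP::Vector.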
template <typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP,
          typename Update>
class MPMultiply< KokkosSparse::CrsMatrix<const Sacado::MP::Vector<MatrixStorage>,
                                          MatrixOrdinal,
                                          Kokkos::Device<Kokkos::Cuda, typename Kokkos::Cuda::memory_space>,
                                          MatrixMemory,
                                          MatrixSize>,
                  Kokkos::View< const Sacado::MP::Vector<InputStorage>*,
                                InputP... >,
                  Kokkos::View< Sacado::MP::Vector<OutputStorage>*,
                                OutputP... >,
                  Update >
{
public:

  typedef Sacado::MP::Vector<MatrixStorage> MatrixValue;
  typedef Sacado::MP::Vector<InputStorage> InputVectorValue;
  typedef Sacado::MP::Vector<OutputStorage> OutputVectorValue;

  typedef Kokkos::Device<Kokkos::Cuda, typename Kokkos::Cuda::memory_space> Device;
  typedef typename Device::execution_space execution_space;
  typedef typename execution_space::size_type size_type;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   Device,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;
  typedef Update update_type;
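  // CUDA kernel for the rank-1 multiply.  Shared memory is used for
  // coalesced reads of the sparse-graph row offsets and column indices.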
  template <unsigned NumPerThread>
  struct Kernel {
    typedef typename output_vector_type::value_type output_vector_value;
    typedef typename output_vector_value::value_type scalar_type;

    const matrix_graph_type  m_Agraph;
    const matrix_values_type m_Avals;
    const input_vector_type  m_x;
    const output_vector_type m_y;
    const update_type m_update;
    const size_type m_row_count;

    Kernel( const matrix_type & A,
            const input_vector_type & x,
            const output_vector_type & y,
            const update_type & update )
      : m_Agraph( A.graph )
      , m_Avals( A.values )
      , m_x( x )
      , m_y( y )
      , m_update( update )
      , m_row_count( A.graph.row_map.extent(0)-1 )
      {}
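    // Note: row_map holds row_count+1 offsets, hence the -1 above when
    // computing m_row_count.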
    __device__
    inline void operator()(void) const
    {
      volatile size_type * const sh_row =
        kokkos_impl_cuda_shared_memory<size_type>();
      volatile size_type * const sh_col = sh_row + blockDim.y+1;

      const size_type tid = blockDim.x*threadIdx.y + threadIdx.x;
      const size_type nid = blockDim.x*blockDim.y;

      const size_type block_row = blockDim.y*blockIdx.x;

      // Read blockDim.y+1 row offsets in a coalesced manner
      const size_type num_row =
        block_row+blockDim.y+1 <= m_row_count+1 ? blockDim.y+1 :
        m_row_count+1 - block_row;
      for (size_type i=tid; i<num_row; i+=nid)
        sh_row[i] = m_Agraph.row_map[block_row+i];
      __syncthreads();
      const size_type iRow = block_row + threadIdx.y;
      if (iRow < m_row_count) {
        scalar_type sum[NumPerThread];
        const size_type iEntryBegin = sh_row[threadIdx.y];
        const size_type iEntryEnd   = sh_row[threadIdx.y+1];

        for (size_type e=0; e<NumPerThread; ++e)
          sum[e] = 0;

        for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
             col_block+=blockDim.x) {
          const size_type num_col =
            col_block+blockDim.x <= iEntryEnd ?
              blockDim.x : iEntryEnd-col_block;

          // Read blockDim.x column indices at a time to maintain coalesced
          // accesses (no __syncthreads() needed when blockDim.x <= warp_size)
          if (threadIdx.x < num_col)
            sh_col[tid] = m_Agraph.entries(col_block+threadIdx.x);
          if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
            __syncthreads();

          for ( size_type col = 0; col < num_col; ++col ) {
            size_type iCol = sh_col[blockDim.x*threadIdx.y + col];

            for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
                 ++e, ee+=blockDim.x) {
              sum[e] += m_Avals(col_block+col).fastAccessCoeff(ee) *
                        m_x(iCol).fastAccessCoeff(ee);
            }
          }
        }

        // Apply the update functor (e.g. assign or accumulate) to each
        // computed coefficient of y
        for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
             ++e, ee+=blockDim.x) {
          m_update( m_y(iRow).fastAccessCoeff(ee), sum[e] );
        }
      }
    } // operator()
  }; // struct Kernel
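  // Each thread owns the ensemble components ee = threadIdx.x + e*blockDim.x,
  // e = 0..NumPerThread-1.  For example, with an ensemble of size 32 and
  // blockDim.x == 16, NumPerThread == 2 and thread 3 of a vector handles
  // components 3 and 19, so consecutive threads touch consecutive
  // coefficients and global loads of A and x coalesce.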
  static void apply( const matrix_type & A,
                     const input_vector_type & x,
                     const output_vector_type & y,
                     const update_type & update )
  {
    // Compute CUDA block and grid sizes.  If the user-specified block
    // dimensions are zero, default to one thread per vector entry and
    // 256 threads per block.
    const size_type value_dimension = Kokkos::dimension_scalar(x);

    size_type threads_per_vector = A.dev_config.block_dim.x;
    if (threads_per_vector == 0)
      threads_per_vector = value_dimension;
    size_type rows_per_block = A.dev_config.block_dim.y;
    if (rows_per_block == 0)
      rows_per_block = 256 / threads_per_vector;
    const size_type row_count = A.graph.row_map.extent(0)-1;
    const size_type num_blocks = (row_count+rows_per_block-1)/rows_per_block;
    const dim3 block( threads_per_vector, rows_per_block, 1 );
    const dim3 grid( num_blocks, 1 );

    // Check that threads_per_vector evenly divides the number of vector
    // entries
    size_type num_per_thread = value_dimension / threads_per_vector;
    TEUCHOS_TEST_FOR_EXCEPTION(
      num_per_thread * threads_per_vector != value_dimension, std::logic_error,
      "Entries/thread * threads/vector must equal number of vector entries");

    // The kernels assume threads_per_vector is at most the warp size
    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    TEUCHOS_TEST_FOR_EXCEPTION(
      threads_per_vector > warp_size, std::logic_error,
      "Threads/vector cannot exceed Cuda warp size");
    // Launch a kernel specialized on the static number of entries per thread
    if (num_per_thread == 1) {
      launch_impl<1>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 2) {
      launch_impl<2>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 3) {
      launch_impl<3>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 4) {
      launch_impl<4>( A, x, y, update, block, grid );
    }
    else
      TEUCHOS_TEST_FOR_EXCEPTION(
        true, std::logic_error,
        "Invalid num_per_thread == " << num_per_thread);
  }
private:

  // Launch the kernel, templated on the number of entries per thread
  template <unsigned num_per_thread>
  static void launch_impl( const matrix_type & A,
                           const input_vector_type & x,
                           const output_vector_type & y,
                           const update_type & update,
                           dim3 block,
                           dim3 grid )
  {
    typedef Kernel<num_per_thread> Krnl;

    // Enable this to check occupancy at run time
    const bool occupancy_check = false;
    if (occupancy_check) {
      DeviceProp device_prop;
      size_type warps_per_sm;
      if (num_per_thread == 1)
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          FullOccupancyKernelLaunch<Krnl>);
      else
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          Kokkos::Impl::cuda_parallel_launch_local_memory<Krnl>);
      std::cout << "warps_per_sm = " << warps_per_sm
                << " max_warps_per_sm = " << device_prop.max_warps_per_sm
                << std::endl;
    }

    // Shared memory holds blockDim.y+1 row offsets plus one column index per
    // thread; match the shared-memory bank size to size_type
    const size_t shared = (block.y+1 + block.x*block.y)*sizeof(size_type);
    if (sizeof(size_type) <= 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // Launch
    if (num_per_thread == 1)
      FullOccupancyKernelLaunch<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
  }
};
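// A minimal usage sketch (illustration only; MPMultiply is an implementation
// detail normally reached through higher-level multiply() wrappers rather
// than called directly).  "AssignUpdate" below is a hypothetical update
// functor matching the m_update(y_coeff, sum) call sites in the kernel
// above, assigning each computed scalar coefficient into y:
//
//   struct AssignUpdate {
//     template <typename Scalar>
//     KOKKOS_INLINE_FUNCTION
//     void operator()(Scalar& y, const Scalar& x) const { y = x; }
//   };
//
//   typedef Stokhos::details::MPMultiply<
//     matrix_type, input_vector_type, output_vector_type,
//     AssignUpdate> multiply_type;
//   multiply_type::apply( A, x, y, AssignUpdate() );

// Kernel implementing y = A * x on the Cuda device where x and y are
// rank 2 (multi-vectors), with the same matrix and scalar types as the
// rank-1 specialization above.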
template <typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP,
          typename Update>
class MPMultiply< KokkosSparse::CrsMatrix<const Sacado::MP::Vector<MatrixStorage>,
                                          MatrixOrdinal,
                                          Kokkos::Device<Kokkos::Cuda, typename Kokkos::Cuda::memory_space>,
                                          MatrixMemory,
                                          MatrixSize>,
                  Kokkos::View< const Sacado::MP::Vector<InputStorage>**,
                                InputP... >,
                  Kokkos::View< Sacado::MP::Vector<OutputStorage>**,
                                OutputP... >,
                  Update >
{
public:

  typedef Sacado::MP::Vector<MatrixStorage> MatrixValue;
  typedef Sacado::MP::Vector<InputStorage> InputVectorValue;
  typedef Sacado::MP::Vector<OutputStorage> OutputVectorValue;

  typedef Kokkos::Device<Kokkos::Cuda, typename Kokkos::Cuda::memory_space> Device;
  typedef typename Device::execution_space execution_space;
  typedef typename execution_space::size_type size_type;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   Device,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;
  typedef Update update_type;
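  // CUDA kernel for the rank-2 multiply.  Identical to the rank-1 kernel
  // except that it loops over the columns of x and y.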
  template <unsigned NumPerThread>
  struct Kernel {
    typedef typename output_vector_type::value_type output_vector_value;
    typedef typename output_vector_value::value_type scalar_type;

    const matrix_graph_type  m_Agraph;
    const matrix_values_type m_Avals;
    const input_vector_type  m_x;
    const output_vector_type m_y;
    const update_type m_update;
    const size_type m_row_count;
    const size_type m_num_vec_cols;

    Kernel( const matrix_type & A,
            const input_vector_type & x,
            const output_vector_type & y,
            const update_type & update )
      : m_Agraph( A.graph )
      , m_Avals( A.values )
      , m_x( x )
      , m_y( y )
      , m_update( update )
      , m_row_count( A.graph.row_map.extent(0)-1 )
      , m_num_vec_cols( x.extent(1) )
      {}
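    // m_num_vec_cols is the number of right-hand-side columns; the kernel
    // processes them serially for each matrix row.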
    __device__
    inline void operator()(void) const
    {
      volatile size_type * const sh_row =
        kokkos_impl_cuda_shared_memory<size_type>();
      volatile size_type * const sh_col = sh_row + blockDim.y+1;

      const size_type tid = blockDim.x*threadIdx.y + threadIdx.x;
      const size_type nid = blockDim.x*blockDim.y;

      const size_type block_row = blockDim.y*blockIdx.x;

      // Read blockDim.y+1 row offsets in a coalesced manner
      const size_type num_row =
        block_row+blockDim.y+1 <= m_row_count+1 ? blockDim.y+1 :
        m_row_count+1 - block_row;
      for (size_type i=tid; i<num_row; i+=nid)
        sh_row[i] = m_Agraph.row_map[block_row+i];
      __syncthreads();
      const size_type iRow = block_row + threadIdx.y;
      if (iRow < m_row_count) {
        scalar_type sum[NumPerThread];
        const size_type iEntryBegin = sh_row[threadIdx.y];
        const size_type iEntryEnd   = sh_row[threadIdx.y+1];

        // Loop over the columns of the rank-2 input/output vectors
        for (size_type vec_col=0; vec_col<m_num_vec_cols; vec_col++) {

          for (size_type e=0; e<NumPerThread; ++e)
            sum[e] = 0;

          for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
               col_block+=blockDim.x) {
            const size_type num_col =
              col_block+blockDim.x <= iEntryEnd ?
                blockDim.x : iEntryEnd-col_block;

            // Read blockDim.x column indices at a time to maintain
            // coalesced accesses
            if (threadIdx.x < num_col)
              sh_col[tid] = m_Agraph.entries(col_block+threadIdx.x);
            if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
              __syncthreads();

            for ( size_type col = 0; col < num_col; ++col ) {
              size_type iCol = sh_col[blockDim.x*threadIdx.y + col];

              for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
                   ++e, ee+=blockDim.x) {
                sum[e] += m_Avals(col_block+col).fastAccessCoeff(ee) *
                          m_x(iCol, vec_col).fastAccessCoeff(ee);
              }
            }
          }

          // Apply the update functor to each computed coefficient of y
          for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
               ++e, ee+=blockDim.x) {
            m_update( m_y(iRow, vec_col).fastAccessCoeff(ee), sum[e] );
          }
        }
      }
    } // operator()
  }; // struct Kernel
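  // Note: the sparse row is traversed once per right-hand-side column, so
  // the column indices are re-staged through shared memory on every pass of
  // the vec_col loop.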
  static void apply( const matrix_type & A,
                     const input_vector_type & x,
                     const output_vector_type & y,
                     const update_type & update )
  {
    // Compute CUDA block and grid sizes, as in the rank-1 case
    const size_type value_dimension = Kokkos::dimension_scalar(x);

    size_type threads_per_vector = A.dev_config.block_dim.x;
    if (threads_per_vector == 0)
      threads_per_vector = value_dimension;
    size_type rows_per_block = A.dev_config.block_dim.y;
    if (rows_per_block == 0)
      rows_per_block = 256 / threads_per_vector;
    const size_type row_count = A.graph.row_map.extent(0)-1;
    const size_type num_blocks = (row_count+rows_per_block-1)/rows_per_block;
    const dim3 block( threads_per_vector, rows_per_block, 1 );
    const dim3 grid( num_blocks, 1 );

    // Check that threads_per_vector evenly divides the number of vector
    // entries
    size_type num_per_thread = value_dimension / threads_per_vector;
    TEUCHOS_TEST_FOR_EXCEPTION(
      num_per_thread * threads_per_vector != value_dimension, std::logic_error,
      "Entries/thread * threads/vector must equal number of vector entries");
    // Launch a kernel specialized on the static number of entries per thread
    if (num_per_thread == 1) {
      launch_impl<1>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 2) {
      launch_impl<2>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 3) {
      launch_impl<3>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 4) {
      launch_impl<4>( A, x, y, update, block, grid );
    }
    else
      TEUCHOS_TEST_FOR_EXCEPTION(
        true, std::logic_error,
        "Invalid num_per_thread == " << num_per_thread);
  }
private:

  // Launch the kernel, templated on the number of entries per thread
  template <unsigned num_per_thread>
  static void launch_impl( const matrix_type & A,
                           const input_vector_type & x,
                           const output_vector_type & y,
                           const update_type & update,
                           dim3 block,
                           dim3 grid )
  {
    typedef Kernel<num_per_thread> Krnl;

    // Enable this to check occupancy at run time
    const bool occupancy_check = false;
    if (occupancy_check) {
      DeviceProp device_prop;
      size_type warps_per_sm;
      if (num_per_thread == 1)
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          FullOccupancyKernelLaunch<Krnl>);
      else
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          Kokkos::Impl::cuda_parallel_launch_local_memory<Krnl>);
      std::cout << "warps_per_sm = " << warps_per_sm
                << " max_warps_per_sm = " << device_prop.max_warps_per_sm
                << std::endl;
    }

    // Shared memory holds blockDim.y+1 row offsets plus one column index per
    // thread; match the shared-memory bank size to size_type
    const size_t shared = (block.y+1 + block.x*block.y)*sizeof(size_type);
    if (sizeof(size_type) <= 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // Launch
    if (num_per_thread == 1)
      FullOccupancyKernelLaunch<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
  }
};

} // namespace details

} // namespace Stokhos

#endif /* #if defined( __CUDACC__) */

#endif /* #ifndef KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP */