Kokkos_CrsMatrix_MP_Vector_Cuda.hpp
// @HEADER
// *****************************************************************************
// Stokhos Package
//
// Copyright 2009 NTESS and the Stokhos contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP
#define KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP

#if defined( __CUDACC__)

#include "Kokkos_Core.hpp"

//----------------------------------------------------------------------------
// Specializations of Kokkos::CrsMatrix for Sacado::MP::Vector scalar type
// and Cuda device
//----------------------------------------------------------------------------

namespace Stokhos {

namespace details {

// We can make things slightly faster for the ensemble multiply kernel with
// NumPerThread == 1 by using at most 32 registers per thread for 100%
// occupancy, which we get with these launch bounds on Kepler.  For
// NumPerThread > 1 it is worse though due to too many spilled registers.
template <typename Kernel>
__global__ void
#if __CUDA_ARCH__ >= 300
__launch_bounds__(1024,2)
#endif
FullOccupancyKernelLaunch(Kernel kernel) {
  kernel();
}
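
// Worked occupancy arithmetic behind the launch bounds above (a sketch
// assuming Kepler limits: 64 resident warps and 64K 32-bit registers per SM):
//
//   __launch_bounds__(1024, 2) => 2 blocks x 1024 threads = 2048 threads
//                              => 2048 / 32 = 64 warps per SM (100% occupancy)
//   register budget            => 65536 / 2048 = 32 registers per thread
//
// The NumPerThread == 1 kernel fits in 32 registers; larger NumPerThread
// does not, so it is launched without these bounds to avoid register spills.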

// Kernel implementing y = A * x for Cuda device where
//   A == Kokkos::CrsMatrix< Sacado::MP::Vector<...>,...>,
//   x, y == Kokkos::View< Sacado::MP::Vector<...>*,...>,
//   x and y are rank 1.
// We spell everything out here to make sure the ranks and devices match.
//
// This implementation uses the underlying 2-D view directly.
// Currently it only works for statically sized MP::Vector.
template <typename MatrixMemorySpace,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP,
          typename Update>
class MPMultiply< KokkosSparse::CrsMatrix<const Sacado::MP::Vector<MatrixStorage>,
                                          MatrixOrdinal,
                                          Kokkos::Device<Kokkos::Cuda, MatrixMemorySpace>,
                                          MatrixMemory,
                                          MatrixSize>,
                  Kokkos::View< const Sacado::MP::Vector<InputStorage>*,
                                InputP... >,
                  Kokkos::View< Sacado::MP::Vector<OutputStorage>*,
                                OutputP... >,
                  Update,
                  void
                  >
{
public:

  typedef Sacado::MP::Vector<MatrixStorage> MatrixValue;
  typedef Sacado::MP::Vector<InputStorage> InputVectorValue;
  typedef Sacado::MP::Vector<OutputStorage> OutputVectorValue;

  typedef Kokkos::Device<Kokkos::Cuda, MatrixMemorySpace> Device;
  typedef typename Device::execution_space execution_space;
  typedef typename execution_space::size_type size_type;

  typedef KokkosSparse::CrsMatrix<const MatrixValue,
                                  MatrixOrdinal,
                                  Device,
                                  MatrixMemory,
                                  MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;
  typedef Update update_type;

  // Multiply that uses shared memory for coalesced access of sparse
  // graph row offsets and column indices and Rank-1 vectors
  template <unsigned NumPerThread>
  struct Kernel {
    typedef typename output_vector_type::value_type output_vector_value;
    typedef typename output_vector_value::value_type scalar_type;

    const matrix_graph_type  m_Agraph;
    const matrix_values_type m_Avals;
    const input_vector_type  m_x;
    const output_vector_type m_y;
    const update_type m_update;
    const size_type m_row_count;

    Kernel( const matrix_type & A,
            const input_vector_type & x,
            const output_vector_type & y,
            const update_type& update )
      : m_Agraph( A.graph )
      , m_Avals( A.values )
      , m_x( x )
      , m_y( y )
      , m_update( update )
      , m_row_count( A.graph.row_map.extent(0)-1 )
    {}

    __device__
    inline void operator()(void) const
    {
      volatile size_type * const sh_row =
        kokkos_impl_cuda_shared_memory<size_type>();
      volatile size_type * const sh_col = sh_row + blockDim.y+1;

      const size_type tid = blockDim.x*threadIdx.y + threadIdx.x;
      const size_type nid = blockDim.x*blockDim.y;

      const size_type block_row = blockDim.y*blockIdx.x;

      // Read blockDim.y+1 row offsets in coalesced manner
      const size_type num_row =
        block_row+blockDim.y+1 <= m_row_count+1 ? blockDim.y+1 :
        m_row_count+1 - block_row;
      for (size_type i=tid; i<num_row; i+=nid)
        sh_row[i] = m_Agraph.row_map[block_row+i];
      __syncthreads();

      const size_type iRow = block_row + threadIdx.y;
      if (iRow < m_row_count) {
        scalar_type sum[NumPerThread];
        const size_type iEntryBegin = sh_row[threadIdx.y];
        const size_type iEntryEnd   = sh_row[threadIdx.y+1];

        for (size_type e=0; e<NumPerThread; ++e)
          sum[e] = 0;

        for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
             col_block+=blockDim.x) {
          const size_type num_col =
            col_block+blockDim.x <= iEntryEnd ?
              blockDim.x : iEntryEnd-col_block;

          // Read blockDim.x column indices at a time to maintain coalesced
          // accesses (we don't need __syncthreads() assuming
          // blockDim.x <= warp_size).
          // Note: it might be a little faster if we ensured aligned access
          // to m_A.graph.entries() and m_A.values() below.
          if (threadIdx.x < num_col)
            sh_col[tid] = m_Agraph.entries(col_block+threadIdx.x);
          if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
            __syncthreads();

          for ( size_type col = 0; col < num_col; ++col ) {
            size_type iCol = sh_col[blockDim.x*threadIdx.y + col];

            for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
                 ++e, ee+=blockDim.x) {
              sum[e] += m_Avals(col_block+col).fastAccessCoeff(ee) *
                        m_x(iCol).fastAccessCoeff(ee);
            }
          }

        }

        for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
             ++e, ee+=blockDim.x) {
          m_update( m_y(iRow).fastAccessCoeff(ee), sum[e] );
        }
      }
    } // operator()
  };
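
  // Worked example of the thread-to-data mapping in Kernel::operator()
  // (a sketch assuming ensemble size 16, blockDim = (16,16,1), and
  // NumPerThread == 1):
  //
  //   iRow = 16*blockIdx.x + threadIdx.y  -> each of the 16 thread rows
  //                                          owns one matrix row
  //   ee   = threadIdx.x                  -> each of the 16 lanes owns one
  //                                          ensemble coefficient of that row
  //
  // With NumPerThread == 2 (e.g. ensemble size 32 spread over 16 lanes),
  // each lane instead accumulates coefficients threadIdx.x and
  // threadIdx.x + 16 into sum[0] and sum[1].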

  static void apply( const matrix_type & A,
                     const input_vector_type & x,
                     const output_vector_type & y,
                     const update_type & update )
  {
    // Compute CUDA block and grid sizes.
    //
    // For Kepler, the best block size appears to be 256 threads with
    // 16 threads per vector for double precision, yielding 16 rows per
    // block.  Due to register usage, this gives 64 or 48 warps per SM
    // and thus 8 or 6 blocks per SM.  We use these values by default if
    // the user-specified block dimensions are zero.

    const size_type value_dimension = Kokkos::dimension_scalar(x);

    size_type threads_per_vector = A.dev_config.block_dim.x;
    if (threads_per_vector == 0)
      threads_per_vector = value_dimension;
    size_type rows_per_block = A.dev_config.block_dim.y;
    if (rows_per_block == 0)
      rows_per_block = 256 / threads_per_vector;
    const size_type row_count = A.graph.row_map.extent(0)-1;
    const size_type num_blocks = (row_count+rows_per_block-1)/rows_per_block;
    const dim3 block( threads_per_vector, rows_per_block, 1 );
    const dim3 grid( num_blocks, 1 );

    // Check threads_per_vector evenly divides number of vector entries
    size_type num_per_thread = value_dimension / threads_per_vector;
    TEUCHOS_TEST_FOR_EXCEPTION(
      num_per_thread * threads_per_vector != value_dimension, std::logic_error,
      "Entries/thread * threads/vector must equal number of vector entries");

    // Check threads_per_vector is not greater than warp size (kernels assume
    // this)
    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    TEUCHOS_TEST_FOR_EXCEPTION(
      threads_per_vector > warp_size, std::logic_error,
      "Threads/vector cannot exceed Cuda warp size");

    // Launch kernel based on static number of entries per thread
    if (num_per_thread == 1) {
      launch_impl<1>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 2) {
      launch_impl<2>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 3) {
      launch_impl<3>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 4) {
      launch_impl<4>( A, x, y, update, block, grid );
    }
    else
      TEUCHOS_TEST_FOR_EXCEPTION(
        true, std::logic_error, "Invalid num_per_thread == " << num_per_thread);
  }
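
  // Worked example of the default sizing above (a sketch assuming an
  // ensemble size of 16, dev_config.block_dim = (0,0,0), and 10000 matrix
  // rows):
  //
  //   threads_per_vector = 16
  //   rows_per_block     = 256 / 16 = 16
  //   num_blocks         = (10000 + 16 - 1) / 16 = 625
  //   block              = dim3(16, 16, 1)   -> 256 threads per block
  //   num_per_thread     = 16 / 16 = 1       -> dispatches to launch_impl<1>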

private:

  // Function to launch our kernel, templated on number of entries per thread
  // and shared memory choice
  template <unsigned num_per_thread>
  static void launch_impl( const matrix_type & A,
                           const input_vector_type & x,
                           const output_vector_type & y,
                           const update_type & update,
                           dim3 block,
                           dim3 grid)
  {
    typedef Kernel<num_per_thread> Krnl;

    // Use this to check occupancy, 64 is 100% on Kepler
    const bool occupancy_check = false;
    if (occupancy_check) {
      DeviceProp device_prop;
      size_type warps_per_sm;
      if (num_per_thread == 1)
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          FullOccupancyKernelLaunch<Krnl>);
      else
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          Kokkos::Impl::cuda_parallel_launch_local_memory<Krnl>);
      std::cout << "warps_per_sm = " << warps_per_sm
                << " max_warps_per_sm = " << device_prop.max_warps_per_sm
                << std::endl;
    }

    const size_t shared = (block.y+1 + block.x*block.y)*sizeof(size_type);
    if (sizeof(size_type) <= 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // Launch
    if (num_per_thread == 1)
      FullOccupancyKernelLaunch<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
  }
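
  // Worked example of the dynamic shared-memory request in launch_impl
  // (a sketch assuming block = (16,16,1) and a 4-byte size_type):
  //
  //   sh_row : block.y + 1       = 17   staged row offsets
  //   sh_col : block.x * block.y = 256  staged column indices
  //   shared = (17 + 256) * 4    = 1092 bytes per block,
  //
  // small enough that occupancy is limited by registers rather than by
  // shared memory.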
};
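
// A minimal usage sketch for the rank-1 specialization above.  The
// AssignUpdate functor below is hypothetical; it only illustrates the
// update_type contract the kernels rely on, namely
// operator()(y_coefficient, accumulated_sum).  The type names refer to the
// specialization's own typedefs.
//
//   struct AssignUpdate {
//     template <typename Scalar>
//     __device__ void operator()(Scalar& y, const Scalar& s) const { y = s; }
//   };
//
//   typedef MPMultiply< matrix_type, input_vector_type, output_vector_type,
//                       AssignUpdate, void > multiply_type;
//   multiply_type::apply( A, x, y, AssignUpdate() );  // y = A*x on Cuda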

// Kernel implementing y = A * x for Cuda device where
//   A == Kokkos::CrsMatrix< Sacado::MP::Vector<...>,...>,
//   x, y == Kokkos::View< Sacado::MP::Vector<...>**,...>,
//   x and y are rank 2.
// We spell everything out here to make sure the ranks and devices match.
//
// This implementation uses the underlying 2-D view directly.
// Currently it only works for statically sized MP::Vector.
template <typename MatrixMemorySpace,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP,
          typename Update>
class MPMultiply< KokkosSparse::CrsMatrix<const Sacado::MP::Vector<MatrixStorage>,
                                          MatrixOrdinal,
                                          Kokkos::Device<Kokkos::Cuda, MatrixMemorySpace>,
                                          MatrixMemory,
                                          MatrixSize>,
                  Kokkos::View< const Sacado::MP::Vector<InputStorage>**,
                                InputP... >,
                  Kokkos::View< Sacado::MP::Vector<OutputStorage>**,
                                OutputP... >,
                  Update,
                  void
                  >
{
public:

  typedef Sacado::MP::Vector<MatrixStorage> MatrixValue;
  typedef Sacado::MP::Vector<InputStorage> InputVectorValue;
  typedef Sacado::MP::Vector<OutputStorage> OutputVectorValue;

  typedef Kokkos::Device<Kokkos::Cuda, MatrixMemorySpace> Device;
  typedef typename Device::execution_space execution_space;
  typedef typename execution_space::size_type size_type;

  typedef KokkosSparse::CrsMatrix<const MatrixValue,
                                  MatrixOrdinal,
                                  Device,
                                  MatrixMemory,
                                  MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;
  typedef Update update_type;

  // Specialization that uses shared memory for coalesced access of sparse
  // graph row offsets and column indices and Rank-2 vectors
  template <unsigned NumPerThread>
  struct Kernel {
    typedef typename output_vector_type::value_type output_vector_value;
    typedef typename output_vector_value::value_type scalar_type;

    const matrix_graph_type  m_Agraph;
    const matrix_values_type m_Avals;
    const input_vector_type  m_x;
    const output_vector_type m_y;
    const update_type m_update;
    const size_type m_row_count;
    const size_type m_num_vec_cols;

    Kernel( const matrix_type & A,
            const input_vector_type & x,
            const output_vector_type & y,
            const update_type& update )
      : m_Agraph( A.graph )
      , m_Avals( A.values )
      , m_x( x )
      , m_y( y )
      , m_update( update )
      , m_row_count( A.graph.row_map.extent(0)-1 )
      , m_num_vec_cols( x.extent(1) )
    {}

    __device__
    inline void operator()(void) const
    {
      volatile size_type * const sh_row =
        kokkos_impl_cuda_shared_memory<size_type>();
      volatile size_type * const sh_col = sh_row + blockDim.y+1;

      const size_type tid = blockDim.x*threadIdx.y + threadIdx.x;
      const size_type nid = blockDim.x*blockDim.y;

      const size_type block_row = blockDim.y*blockIdx.x;

      // Read blockDim.y+1 row offsets in coalesced manner
      const size_type num_row =
        block_row+blockDim.y+1 <= m_row_count+1 ? blockDim.y+1 :
        m_row_count+1 - block_row;
      for (size_type i=tid; i<num_row; i+=nid)
        sh_row[i] = m_Agraph.row_map[block_row+i];
      __syncthreads();

      const size_type iRow = block_row + threadIdx.y;
      if (iRow < m_row_count) {
        scalar_type sum[NumPerThread];
        const size_type iEntryBegin = sh_row[threadIdx.y];
        const size_type iEntryEnd   = sh_row[threadIdx.y+1];

        // We could potentially improve performance by putting this loop
        // inside the matrix column loop; however, that would require storing
        // an array for sum over vector columns, which would require shared
        // memory.
        for (size_type vec_col=0; vec_col<m_num_vec_cols; vec_col++) {

          for (size_type e=0; e<NumPerThread; ++e)
            sum[e] = 0;

          for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
               col_block+=blockDim.x) {
            const size_type num_col =
              col_block+blockDim.x <= iEntryEnd ?
                blockDim.x : iEntryEnd-col_block;

            // Read blockDim.x column indices at a time to maintain coalesced
            // accesses (we don't need __syncthreads() assuming
            // blockDim.x <= warp_size).
            // Note: it might be a little faster if we ensured aligned access
            // to m_A.graph.entries() and m_A.values() below.
            if (threadIdx.x < num_col)
              sh_col[tid] = m_Agraph.entries(col_block+threadIdx.x);
            if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
              __syncthreads();

            for ( size_type col = 0; col < num_col; ++col ) {
              size_type iCol = sh_col[blockDim.x*threadIdx.y + col];

              for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
                   ++e, ee+=blockDim.x) {
                sum[e] += m_Avals(col_block+col).fastAccessCoeff(ee) *
                          m_x(iCol, vec_col).fastAccessCoeff(ee);
              }
            }

          }

          for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
               ++e, ee+=blockDim.x) {
            m_update( m_y(iRow, vec_col).fastAccessCoeff(ee), sum[e] );
          }

        }
      }
    } // operator()
  };

  static void apply( const matrix_type & A,
                     const input_vector_type & x,
                     const output_vector_type & y,
                     const update_type & update )
  {
    // Compute CUDA block and grid sizes.
    //
    // For Kepler, the best block size appears to be 256 threads with
    // 16 threads per vector for double precision, yielding 16 rows per
    // block.  Due to register usage, this gives 64 or 48 warps per SM
    // and thus 8 or 6 blocks per SM.  We use these values by default if
    // the user-specified block dimensions are zero.

    const size_type value_dimension = dimension_scalar(x);

    size_type threads_per_vector = A.dev_config.block_dim.x;
    if (threads_per_vector == 0)
      threads_per_vector = value_dimension;
    size_type rows_per_block = A.dev_config.block_dim.y;
    if (rows_per_block == 0)
      rows_per_block = 256 / threads_per_vector;
    const size_type row_count = A.graph.row_map.extent(0)-1;
    const size_type num_blocks = (row_count+rows_per_block-1)/rows_per_block;
    const dim3 block( threads_per_vector, rows_per_block, 1 );
    const dim3 grid( num_blocks, 1 );

    // Check threads_per_vector evenly divides number of vector entries
    size_type num_per_thread = value_dimension / threads_per_vector;
    TEUCHOS_TEST_FOR_EXCEPTION(
      num_per_thread * threads_per_vector != value_dimension, std::logic_error,
      "Entries/thread * threads/vector must equal number of vector entries");

    // Launch kernel based on static number of entries per thread
    if (num_per_thread == 1) {
      launch_impl<1>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 2) {
      launch_impl<2>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 3) {
      launch_impl<3>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 4) {
      launch_impl<4>( A, x, y, update, block, grid );
    }
    else
      TEUCHOS_TEST_FOR_EXCEPTION(
        true, std::logic_error, "Invalid num_per_thread == " << num_per_thread);
  }

private:

  // Function to launch our kernel, templated on number of entries per thread
  // and shared memory choice
  template <unsigned num_per_thread>
  static void launch_impl( const matrix_type & A,
                           const input_vector_type & x,
                           const output_vector_type & y,
                           const update_type & update,
                           dim3 block,
                           dim3 grid)
  {
    typedef Kernel<num_per_thread> Krnl;

    // Use this to check occupancy, 64 is 100% on Kepler
    const bool occupancy_check = false;
    if (occupancy_check) {
      DeviceProp device_prop;
      size_type warps_per_sm;
      if (num_per_thread == 1)
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          FullOccupancyKernelLaunch<Krnl>);
      else
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          Kokkos::Impl::cuda_parallel_launch_local_memory<Krnl>);
      std::cout << "warps_per_sm = " << warps_per_sm
                << " max_warps_per_sm = " << device_prop.max_warps_per_sm
                << std::endl;
    }

    const size_t shared = (block.y+1 + block.x*block.y)*sizeof(size_type);
    if (sizeof(size_type) <= 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // Launch
    if (num_per_thread == 1)
      FullOccupancyKernelLaunch<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
  }
};

} // namespace details

} // namespace Stokhos

#endif /* #if defined( __CUDACC__) */

#endif /* #ifndef KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP */