Kokkos_CrsMatrix_MP_Vector_Cuda.hpp
// @HEADER
// ***********************************************************************
//
// Stokhos Package
// Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
//
// ***********************************************************************
// @HEADER

#ifndef KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP
#define KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP

#if defined( __CUDACC__)

#include "Kokkos_Core.hpp"

//----------------------------------------------------------------------------
// Specializations of Kokkos::CrsMatrix for Sacado::MP::Vector scalar type
// and Cuda device
//----------------------------------------------------------------------------

namespace Stokhos {

namespace details {

// We can make things slightly faster for the ensemble multiply kernel with
// NumPerThread == 1 by using at most 32 registers per thread for 100%
// occupancy, which we get with these launch bounds on Kepler. For
// NumPerThread > 1 it is worse though due to too many spilled registers.
template <typename Kernel>
__global__ void
#if __CUDA_ARCH__ >= 300
__launch_bounds__(1024,2)
#endif
FullOccupancyKernelLaunch(Kernel kernel) {
  kernel();
}
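
// Rough arithmetic behind those launch bounds (a sanity check using Kepler
// limits, not a guarantee): requesting 2 resident blocks of 1024 threads
// means 2048 threads = 64 warps per SM, the Kepler maximum, which in turn
// forces the compiler to hold each thread to at most
// 65536 / 2048 = 32 registers.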

// Kernel implementing y = A * x for Cuda device where
// A == Kokkos::CrsMatrix< Sacado::MP::Vector<...>,...>,
// x, y == Kokkos::View< Sacado::MP::Vector<...>*,...>,
// x and y are rank 1
// We spell everything out here to make sure the ranks and devices match.
//
// This implementation uses the underlying 2-D view directly.
// Currently only works for statically sized MP::Vector
template <typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP,
          typename Update>
class MPMultiply< KokkosSparse::CrsMatrix<Sacado::MP::Vector<MatrixStorage>,
                                          MatrixOrdinal,
                                          Kokkos::Cuda,
                                          MatrixMemory,
                                          MatrixSize>,
                  Kokkos::View< Sacado::MP::Vector<InputStorage>*,
                                InputP... >,
                  Kokkos::View< Sacado::MP::Vector<OutputStorage>*,
                                OutputP... >,
                  Update,
                  void
                  >
{
public:

  typedef Sacado::MP::Vector<MatrixStorage> MatrixValue;
  typedef Sacado::MP::Vector<InputStorage> InputVectorValue;
  typedef Sacado::MP::Vector<OutputStorage> OutputVectorValue;

  typedef typename Kokkos::Cuda Device;
  typedef Device execution_space;
  typedef typename execution_space::size_type size_type;

  typedef KokkosSparse::CrsMatrix<MatrixValue,
                                  MatrixOrdinal,
                                  Device,
                                  MatrixMemory,
                                  MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef Kokkos::View< InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;
  typedef Update update_type;

  // Multiply that uses shared memory for coalesced access of sparse
  // graph row offsets and column indices and Rank-1 vectors
  template <unsigned NumPerThread>
  struct Kernel {
    typedef typename output_vector_type::value_type output_vector_value;
    typedef typename output_vector_value::value_type scalar_type;

    const matrix_graph_type m_Agraph;
    const matrix_values_type m_Avals;
    const input_vector_type m_x;
    const output_vector_type m_y;
    const update_type m_update;
    const size_type m_row_count;

    Kernel( const matrix_type & A,
            const input_vector_type & x,
            const output_vector_type & y,
            const update_type& update )
      : m_Agraph( A.graph )
      , m_Avals( A.values )
      , m_x( x )
      , m_y( y )
      , m_update( update )
      , m_row_count( A.graph.row_map.extent(0)-1 )
      {}

    __device__
    inline void operator()(void) const
    {
      volatile size_type * const sh_row =
        kokkos_impl_cuda_shared_memory<size_type>();
      volatile size_type * const sh_col = sh_row + blockDim.y+1;
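      // Shared-memory layout (mirroring the 'shared' size computed in
      // launch_impl below): blockDim.y+1 row offsets in sh_row, immediately
      // followed by blockDim.x*blockDim.y column indices in sh_col, all of
      // type size_type.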

      const size_type tid = blockDim.x*threadIdx.y + threadIdx.x;
      const size_type nid = blockDim.x*blockDim.y;

      const size_type block_row = blockDim.y*blockIdx.x;

      // Read blockDim.y+1 row offsets in coalesced manner
      const size_type num_row =
        block_row+blockDim.y+1 <= m_row_count+1 ? blockDim.y+1 :
        m_row_count+1 - block_row;
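      // (For example, with m_row_count = 100 and blockDim.y = 16, the last
      // block starts at block_row = 96 and reads only 101 - 96 = 5 offsets.)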
      for (size_type i=tid; i<num_row; i+=nid)
        sh_row[i] = m_Agraph.row_map[block_row+i];
      __syncthreads();

      const size_type iRow = block_row + threadIdx.y;
      if (iRow < m_row_count) {
        scalar_type sum[NumPerThread];
        const size_type iEntryBegin = sh_row[threadIdx.y];
        const size_type iEntryEnd = sh_row[threadIdx.y+1];

        for (size_type e=0; e<NumPerThread; ++e)
          sum[e] = 0;

        for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
             col_block+=blockDim.x) {
          const size_type num_col =
            col_block+blockDim.x <= iEntryEnd ?
            blockDim.x : iEntryEnd-col_block;

          // Read blockDim.x column indices at a time to maintain
          // coalesced accesses (don't need __syncthreads() assuming
          // blockDim.x <= warp_size).
          // Note: it might be a little faster if we ensured aligned access
          // to m_A.graph.entries() and m_A.values() below.
          if (threadIdx.x < num_col)
            sh_col[tid] = m_Agraph.entries(col_block+threadIdx.x);
          if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
            __syncthreads();

          for ( size_type col = 0; col < num_col; ++col ) {
            size_type iCol = sh_col[blockDim.x*threadIdx.y + col];

            for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
                 ++e, ee+=blockDim.x) {
              sum[e] += m_Avals(col_block+col).fastAccessCoeff(ee) *
                        m_x(iCol).fastAccessCoeff(ee);
            }
          }

        }

        for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
             ++e, ee+=blockDim.x) {
          m_update( m_y(iRow).fastAccessCoeff(ee), sum[e] );
        }
      }
    } // operator()
  };

  static void apply( const matrix_type & A,
                     const input_vector_type & x,
                     const output_vector_type & y,
                     const update_type & update )
  {
    // Compute CUDA block and grid sizes.
    //
    // For Kepler, the best block size appears to be 256 threads with
    // 16 threads per vector for double precision, yielding 16 rows per
    // block. Due to register usage, this gives 64 or 48 warps per SM
    // and thus 8 or 6 blocks per SM. We use these values by default if
    // the user-specified block dimensions are zero.
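    //
    // For example, for a double-precision ensemble of size 16 with
    // dev_config.block_dim left at zero, the defaults below give
    // threads_per_vector = 16 and rows_per_block = 256/16 = 16, i.e.
    // 16x16 = 256-thread blocks with one ensemble entry per thread.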

    const size_type value_dimension = Kokkos::dimension_scalar(x);

    size_type threads_per_vector = A.dev_config.block_dim.x;
    if (threads_per_vector == 0)
      threads_per_vector = value_dimension;
    size_type rows_per_block = A.dev_config.block_dim.y;
    if (rows_per_block == 0)
      rows_per_block = 256 / threads_per_vector;
    const size_type row_count = A.graph.row_map.extent(0)-1;
    const size_type num_blocks = (row_count+rows_per_block-1)/rows_per_block;
    const dim3 block( threads_per_vector, rows_per_block, 1 );
    const dim3 grid( num_blocks, 1 );

    // Check threads_per_vector evenly divides number of vector entries
    size_type num_per_thread = value_dimension / threads_per_vector;
    TEUCHOS_TEST_FOR_EXCEPTION(
      num_per_thread * threads_per_vector != value_dimension, std::logic_error,
      "Entries/thread * threads/vector must equal number of vector entries");
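    // (E.g. value_dimension = 32 with threads_per_vector = 16 gives
    // num_per_thread = 2, while a non-multiple such as value_dimension = 24
    // fails the check above.)
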
248 
249  // Check threads_per_vector is not greater than warp size (kernels assume
250  // this)
251  const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
253  threads_per_vector > warp_size, std::logic_error,
254  "Threads/vector cannont exceed Cuda warp size");
255 
256  // Launch kernel based on static number of entries per thread
257  if (num_per_thread == 1) {
258  launch_impl<1>( A, x, y, update, block, grid );
259  }
260  else if (num_per_thread == 2) {
261  launch_impl<2>( A, x, y, update, block, grid );
262  }
263  else if (num_per_thread == 3) {
264  launch_impl<3>( A, x, y, update, block, grid );
265  }
266  else if (num_per_thread == 4) {
267  launch_impl<4>( A, x, y, update, block, grid );
268  }
269  else
271  true, std::logic_error, "Invalid num_per_thread == " << num_per_thread);
272  }

private:

  // Function to launch our kernel, templated on number of entries per thread
  // and shared memory choice
  template <unsigned num_per_thread>
  static void launch_impl( const matrix_type & A,
                           const input_vector_type & x,
                           const output_vector_type & y,
                           const update_type & update,
                           dim3 block,
                           dim3 grid)
  {
    typedef Kernel<num_per_thread> Krnl;

    // Use this to check occupancy, 64 is 100% on Kepler
    const bool occupancy_check = false;
    if (occupancy_check) {
      DeviceProp device_prop;
      size_type warps_per_sm;
      if (num_per_thread == 1)
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          FullOccupancyKernelLaunch<Krnl>);
      else
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          Kokkos::Impl::cuda_parallel_launch_local_memory<Krnl>);
      std::cout << "warps_per_sm = " << warps_per_sm
                << " max_warps_per_sm = " << device_prop.max_warps_per_sm
                << std::endl;
    }

    const size_t shared = (block.y+1 + block.x*block.y)*sizeof(size_type);
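    // (For the default 16x16 block this is (17 + 256)*sizeof(size_type),
    // about 1.1 KB of dynamic shared memory per block for a 4-byte size_type.)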
    if (sizeof(size_type) <= 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // Launch
    if (num_per_thread == 1)
      FullOccupancyKernelLaunch<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
  }
};

// Kernel implementing y = A * x for Cuda device where
// A == Kokkos::CrsMatrix< Sacado::MP::Vector<...>,...>,
// x, y == Kokkos::View< Sacado::MP::Vector<...>**,...>,
// x and y are rank 2
// We spell everything out here to make sure the ranks and devices match.
//
// This implementation uses the underlying 2-D view directly.
// Currently only works for statically sized MP::Vector
template <typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP,
          typename Update>
class MPMultiply< KokkosSparse::CrsMatrix<Sacado::MP::Vector<MatrixStorage>,
                                          MatrixOrdinal,
                                          Kokkos::Cuda,
                                          MatrixMemory,
                                          MatrixSize>,
                  Kokkos::View< Sacado::MP::Vector<InputStorage>**,
                                InputP... >,
                  Kokkos::View< Sacado::MP::Vector<OutputStorage>**,
                                OutputP... >,
                  Update,
                  void
                  >
{
public:

  typedef Sacado::MP::Vector<MatrixStorage> MatrixValue;
  typedef Sacado::MP::Vector<InputStorage> InputVectorValue;
  typedef Sacado::MP::Vector<OutputStorage> OutputVectorValue;

  typedef typename Kokkos::Cuda Device;
  typedef Device execution_space;
  typedef typename execution_space::size_type size_type;

  typedef KokkosSparse::CrsMatrix<MatrixValue,
                                  MatrixOrdinal,
                                  Device,
                                  MatrixMemory,
                                  MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef Kokkos::View< InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;
  typedef Update update_type;

  // Specialization that uses shared memory for coalesced access of sparse
  // graph row offsets and column indices and Rank-2 vectors
  template <unsigned NumPerThread>
  struct Kernel {
    typedef typename output_vector_type::value_type output_vector_value;
    typedef typename output_vector_value::value_type scalar_type;

    const matrix_graph_type m_Agraph;
    const matrix_values_type m_Avals;
    const input_vector_type m_x;
    const output_vector_type m_y;
    const update_type m_update;
    const size_type m_row_count;
    const size_type m_num_vec_cols;

    Kernel( const matrix_type & A,
            const input_vector_type & x,
            const output_vector_type & y,
            const update_type& update )
      : m_Agraph( A.graph )
      , m_Avals( A.values )
      , m_x( x )
      , m_y( y )
      , m_update( update )
      , m_row_count( A.graph.row_map.extent(0)-1 )
      , m_num_vec_cols( x.extent(1) )
      {}

    __device__
    inline void operator()(void) const
    {
      volatile size_type * const sh_row =
        kokkos_impl_cuda_shared_memory<size_type>();
      volatile size_type * const sh_col = sh_row + blockDim.y+1;

      const size_type tid = blockDim.x*threadIdx.y + threadIdx.x;
      const size_type nid = blockDim.x*blockDim.y;

      const size_type block_row = blockDim.y*blockIdx.x;

      // Read blockDim.y+1 row offsets in coalesced manner
      const size_type num_row =
        block_row+blockDim.y+1 <= m_row_count+1 ? blockDim.y+1 :
        m_row_count+1 - block_row;
      for (size_type i=tid; i<num_row; i+=nid)
        sh_row[i] = m_Agraph.row_map[block_row+i];
      __syncthreads();

      const size_type iRow = block_row + threadIdx.y;
      if (iRow < m_row_count) {
        scalar_type sum[NumPerThread];
        const size_type iEntryBegin = sh_row[threadIdx.y];
        const size_type iEntryEnd = sh_row[threadIdx.y+1];

        // We could potentially improve performance by putting this loop
        // inside the matrix column loop; however, that would require storing
        // an array of sums over the vector columns, which would require
        // shared memory.
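        // (Such an array would hold NumPerThread * m_num_vec_cols partial
        // sums per thread, i.e. blockDim.x * blockDim.y * NumPerThread *
        // m_num_vec_cols scalars per block.)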
        for (size_type vec_col=0; vec_col<m_num_vec_cols; vec_col++) {

          for (size_type e=0; e<NumPerThread; ++e)
            sum[e] = 0;

          for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
               col_block+=blockDim.x) {
            const size_type num_col =
              col_block+blockDim.x <= iEntryEnd ?
              blockDim.x : iEntryEnd-col_block;

            // Read blockDim.x column indices at a time to maintain
            // coalesced accesses (don't need __syncthreads() assuming
            // blockDim.x <= warp_size).
            // Note: it might be a little faster if we ensured aligned access
            // to m_A.graph.entries() and m_A.values() below.
            if (threadIdx.x < num_col)
              sh_col[tid] = m_Agraph.entries(col_block+threadIdx.x);
            if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
              __syncthreads();

            for ( size_type col = 0; col < num_col; ++col ) {
              size_type iCol = sh_col[blockDim.x*threadIdx.y + col];

              for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
                   ++e, ee+=blockDim.x) {
                sum[e] += m_Avals(col_block+col).fastAccessCoeff(ee) *
                          m_x(iCol, vec_col).fastAccessCoeff(ee);
              }
            }

          }

          for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
               ++e, ee+=blockDim.x) {
            m_update( m_y(iRow, vec_col).fastAccessCoeff(ee), sum[e] );
          }

        }
      }
    } // operator()
  };

  static void apply( const matrix_type & A,
                     const input_vector_type & x,
                     const output_vector_type & y,
                     const update_type & update )
  {
    // Compute CUDA block and grid sizes.
    //
    // For Kepler, the best block size appears to be 256 threads with
    // 16 threads per vector for double precision, yielding 16 rows per
    // block. Due to register usage, this gives 64 or 48 warps per SM
    // and thus 8 or 6 blocks per SM. We use these values by default if
    // the user-specified block dimensions are zero.

    const size_type value_dimension = dimension_scalar(x);

    size_type threads_per_vector = A.dev_config.block_dim.x;
    if (threads_per_vector == 0)
      threads_per_vector = value_dimension;
    size_type rows_per_block = A.dev_config.block_dim.y;
    if (rows_per_block == 0)
      rows_per_block = 256 / threads_per_vector;
    const size_type row_count = A.graph.row_map.extent(0)-1;
    const size_type num_blocks = (row_count+rows_per_block-1)/rows_per_block;
    const dim3 block( threads_per_vector, rows_per_block, 1 );
    const dim3 grid( num_blocks, 1 );

    // Check threads_per_vector evenly divides number of vector entries
    size_type num_per_thread = value_dimension / threads_per_vector;
    TEUCHOS_TEST_FOR_EXCEPTION(
      num_per_thread * threads_per_vector != value_dimension, std::logic_error,
      "Entries/thread * threads/vector must equal number of vector entries");

    // Launch kernel based on static number of entries per thread
    if (num_per_thread == 1) {
      launch_impl<1>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 2) {
      launch_impl<2>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 3) {
      launch_impl<3>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 4) {
      launch_impl<4>( A, x, y, update, block, grid );
    }
    else
      TEUCHOS_TEST_FOR_EXCEPTION(
        true, std::logic_error, "Invalid num_per_thread == " << num_per_thread);
  }

private:

  // Function to launch our kernel, templated on number of entries per thread
  // and shared memory choice
  template <unsigned num_per_thread>
  static void launch_impl( const matrix_type & A,
                           const input_vector_type & x,
                           const output_vector_type & y,
                           const update_type & update,
                           dim3 block,
                           dim3 grid)
  {
    typedef Kernel<num_per_thread> Krnl;

    // Use this to check occupancy, 64 is 100% on Kepler
    const bool occupancy_check = false;
    if (occupancy_check) {
      DeviceProp device_prop;
      size_type warps_per_sm;
      if (num_per_thread == 1)
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          FullOccupancyKernelLaunch<Krnl>);
      else
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          Kokkos::Impl::cuda_parallel_launch_local_memory<Krnl>);
      std::cout << "warps_per_sm = " << warps_per_sm
                << " max_warps_per_sm = " << device_prop.max_warps_per_sm
                << std::endl;
    }

    const size_t shared = (block.y+1 + block.x*block.y)*sizeof(size_type);
    if (sizeof(size_type) <= 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // Launch
    if (num_per_thread == 1)
      FullOccupancyKernelLaunch<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
  }
};

} // namespace details

} // namespace Stokhos

#endif /* #if defined( __CUDACC__) */

#endif /* #ifndef KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP */
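
// A minimal usage sketch (illustrative only, not part of this header). The
// MatrixType/InputType/OutputType/UpdateType names below stand in for
// whatever concrete MP::Vector-based Kokkos types a caller has built, and we
// assume the MPMultiply primary template defaults its last (enable)
// parameter to void, as the specializations above suggest. The update functor
// combines each computed coefficient sum into the corresponding entry of y.
//
//   MatrixType A = ...;             // KokkosSparse::CrsMatrix of MP::Vector on Cuda
//   InputType  x(...);              // Kokkos::View of MP::Vector, rank 1 or 2
//   OutputType y(...);
//   A.dev_config.block_dim.x = 16;  // threads per vector; 0 selects the defaults above
//   A.dev_config.block_dim.y = 16;  // rows per block (16*16 = 256-thread blocks)
//   Stokhos::details::MPMultiply<MatrixType, InputType, OutputType,
//                                UpdateType>::apply(A, x, y, update);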