Kokkos_CrsMatrix_UQ_PCE_Cuda.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP
11 #define KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP
12 
13 #if defined( __CUDACC__)
14 
15 #include "Sacado_UQ_PCE.hpp"
16 #include "Kokkos_View_UQ_PCE.hpp"
18 #include "KokkosSparse_CrsMatrix.hpp"
19 
20 // MD 08/2017 Note: I commented out the include below, as that file has been
21 // removed from KokkosKernels entirely. The included header does not appear
22 // to be used anywhere in this file.
23 //#include "Kokkos_MV.hpp" // for some utilities
24 
25 #include "Stokhos_Multiply.hpp"
27 
28 #include "Kokkos_Core.hpp"
29 
31 //#include "Stokhos_Cuda_WarpShuffle.hpp"
32 
34 
35 //#include "cuda_profiler_api.h"
36 
37 #ifdef __CUDA_ARCH__
38 # if (__CUDA_ARCH__ >= 300)
39 # define HAVE_CUDA_SHUFFLE 1
40 # else
41 # define HAVE_CUDA_SHUFFLE 0
42 # endif
43 #else
44 # define HAVE_CUDA_SHUFFLE 0
45 #endif
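// HAVE_CUDA_SHUFFLE selects between the two intra-warp reduction strategies
// used by the rank-1 Multiply kernel below: register-to-register warp
// shuffles (Kokkos::shfl_down) when compiling for compute capability 3.0 or
// newer, and a shared-memory tree reduction otherwise.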
46 
47 namespace Stokhos {
48 
49 //----------------------------------------------------------------------------
50 // Specialization of Kokkos CrsMatrix math functions
51 //----------------------------------------------------------------------------
52 
53 // Kernel implementing y = A * x where
54 // A == Kokkos::CrsMatrix< Sacado::UQ::PCE<...>,...>,
55 // x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
56 // x and y are rank 1
57 template <typename MatrixStorage,
58  typename MatrixOrdinal,
59  typename MatrixMemory,
60  typename MatrixSize,
61  typename InputStorage,
62  typename ... InputP,
63  typename OutputStorage,
64  typename ... OutputP>
65 class Multiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
66  MatrixOrdinal,
67  Kokkos::Cuda,
68  MatrixMemory,
69  MatrixSize>,
70  Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
71  InputP... >,
72  Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
73  OutputP... >
74  >
75 {
76 public:
77  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
78  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
79  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
80 
81  typedef Kokkos::Cuda MatrixDevice;
82  typedef MatrixDevice execution_space;
83  typedef execution_space::size_type size_type;
84 
85  typedef KokkosSparse::CrsMatrix< const MatrixValue,
86  MatrixOrdinal,
87  MatrixDevice,
88  MatrixMemory,
89  MatrixSize> matrix_type;
90  typedef typename matrix_type::values_type matrix_values_type;
91  typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
92  typedef Kokkos::View< const InputVectorValue*,
93  InputP... > input_vector_type;
94  typedef Kokkos::View< OutputVectorValue*,
95  OutputP... > output_vector_type;
96 
97 private:
98 
99  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
100  typedef typename matrix_values_type::array_type matrix_array_type;
101  typedef typename input_vector_type::array_type input_array_type;
102  typedef typename output_vector_type::array_type output_array_type;
103 
104  typedef typename MatrixValue::value_type matrix_scalar;
105  typedef typename InputVectorValue::value_type input_scalar;
106  typedef typename OutputVectorValue::value_type output_scalar;
107  typedef typename tensor_type::value_type tensor_scalar;
108 
109  const matrix_array_type m_A_values ;
110  const matrix_graph_type m_A_graph ;
111  const input_array_type m_x ;
112  const output_array_type m_y ;
113  const tensor_type m_tensor ;
114  const input_scalar m_a ;
115  const output_scalar m_b ;
116  const size_type BlockSize;
117 
118  Multiply( const matrix_type & A ,
119  const input_vector_type & x ,
120  const output_vector_type & y ,
121  const input_scalar & a ,
122  const output_scalar & b ,
123  const size_type block_size )
124  : m_A_values( A.values )
125  , m_A_graph( A.graph )
126  , m_x( x )
127  , m_y( y )
128  , m_tensor( Kokkos::cijk(A.values) )
129  , m_a( a )
130  , m_b( b )
131  , BlockSize(block_size)
132  {}
133 
134 public:
135 
136  __device__ void operator()(void) const
137  {
138  // Number of bases in the stochastic system:
139  const size_type dim = m_tensor.dimension();
140 
141  // Get shared memory for loading x, A, and y
142  volatile input_scalar * const sh_x =
143  kokkos_impl_cuda_shared_memory<input_scalar>();
144  volatile matrix_scalar * const sh_A = sh_x + BlockSize*dim;
145  volatile output_scalar * const sh_y = sh_A + BlockSize*dim;
146 #if !HAVE_CUDA_SHUFFLE
147  volatile output_scalar * const sh_t = sh_y + dim;
148 #endif
149 
150  const size_type nid = blockDim.x * blockDim.y;
151  const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
152 
153  // Zero y
154  for ( size_type i = tid; i < dim; i += nid ) {
155  sh_y[i] = 0.0;
156  }
157 
158  // Loop over columns in the discrete (finite element) system.
159  // blockIdx.x == row in the deterministic (finite element) system
160  const size_type iBlockEntryBeg = m_A_graph.row_map[ blockIdx.x ];
161  const size_type iBlockEntryEnd = m_A_graph.row_map[ blockIdx.x + 1 ];
162  for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
163  iBlockEntry += BlockSize) {
164  const size_type block_size =
165  (iBlockEntryEnd-iBlockEntry < BlockSize) ?
166  iBlockEntryEnd-iBlockEntry : BlockSize;
167 
168  // Wait for X and A to be used in the previous iteration
169  // before reading new values.
170  __syncthreads();
171 
172  // Coalesced read blocks of X and A into shared memory
173  for ( size_type col = 0; col < block_size; ++col ) {
174 
175  const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
176  const input_scalar * const x = & m_x( 0, iBlockColumn );
177  const matrix_scalar * const A = & m_A_values( iBlockEntry + col, 0 );
178 
179  // Coalesced read by the whole block from global memory:
180  for ( size_type i = tid; i < dim; i += nid ) {
181  sh_x[col + i * BlockSize] = x[i]; // m_x( i, iBlockColumn );
182  sh_A[col + i * BlockSize] = A[i]; // m_A_values( iBlockEntry + col , i );
183  }
184 
185  }
186 
187  __syncthreads(); // wait for X and A to be read before using them
188 
189  // This cuda block is responsible for computing all values of 'y'
190  for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
191  output_scalar y = 0;
192 
193  // Product tensor entries which this warp will iterate:
194  const size_type lBeg = m_tensor.entry_begin( i );
195  const size_type lEnd = m_tensor.entry_end( i );
196 
197  // Loop through sparse tensor contributions with coalesced reads.
198  for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {
199 
200  // Read 'blockDim.x' entries from the tensor (coalesced)
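 // (Each packed coordinate 'kj' holds two 16-bit indices: the low half is j,
 // the high half is k; both are scaled by BlockSize below to index the
 // column-interleaved shared-memory blocks.)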
201  const size_type kj = m_tensor.coord( l );
202  const tensor_scalar v = m_tensor.value( l );
203  const size_type j = ( kj & 0x0ffff ) * BlockSize ;
204  const size_type k = ( kj >> 16 ) * BlockSize ;
205 
206  for ( size_type col = 0; col < block_size; ++col ) {
207  y += v * ( sh_A[col+j] * sh_x[col+k] +
208  sh_A[col+k] * sh_x[col+j] );
209  }
210 
211  }
212 
213  // Reduction of 'y' within 'blockDim.x'
214 #if HAVE_CUDA_SHUFFLE
215  if (blockDim.x >= 2) y += Kokkos::shfl_down(y, 1, blockDim.x);
216  if (blockDim.x >= 4) y += Kokkos::shfl_down(y, 2, blockDim.x);
217  if (blockDim.x >= 8) y += Kokkos::shfl_down(y, 4, blockDim.x);
218  if (blockDim.x >= 16) y += Kokkos::shfl_down(y, 8, blockDim.x);
219  if (blockDim.x >= 32) y += Kokkos::shfl_down(y, 16, blockDim.x);
220  if ( threadIdx.x == 0 ) sh_y[i] += y;
221 #else
222  sh_t[ tid ] = y;
223  if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
224  if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
225  if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
226  if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
227  if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
228  if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
229 #endif
230 
231  }
232 
233  }
234 
235  // Wait for all contributions of y to be completed
236  __syncthreads();
237 
238  // Store result back in global memory
239  if ( m_b == output_scalar(0) )
240  for ( size_type i = tid; i < dim; i += nid )
241  m_y( i, blockIdx.x ) = m_a * sh_y[ i ];
242  else
243  for ( size_type i = tid; i < dim; i += nid )
244  m_y( i, blockIdx.x ) = m_a * sh_y[ i ] + m_b * m_y( i, blockIdx.x );
245  }
246 
247  struct TensorReadEntry {
248  size_type block_size, shmem, num_blocks, num_warp;
249  double reads;
250  };
251 
252  static void apply( const matrix_type & A ,
253  const input_vector_type & x ,
254  const output_vector_type & y ,
255  const input_scalar & a = input_scalar(1) ,
256  const output_scalar & b = output_scalar(0) )
257  {
258  const tensor_type tensor = Kokkos::cijk(A.values);
259  const size_type row_count = A.graph.row_map.extent(0) - 1;
260  const size_type tensor_dimension = tensor.dimension();
261  const size_type tensor_align = tensor_dimension;
262  const size_type avg_tensor_entries_per_row = tensor.avg_entries_per_row();
263 
264  // Should compute this from FEM graph
265  const size_type fem_nnz_per_row = 27;
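 // (27 corresponds to the 27-point stencil of a trilinear hexahedral mesh,
 // presumably the intended use case.)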
266 
267  // Get device properties we need for whatever device is currently selected
268  DeviceProp device_prop;
269  const size_type shcap = device_prop.shared_memory_capacity;
270  const size_type sh_granularity = device_prop.shared_memory_granularity;
271  const size_type max_shmem_per_block = device_prop.max_shmem_per_block;
272  const size_type max_blocks_per_sm = device_prop.max_blocks_per_sm;
273  const size_type warp_size = device_prop.warp_size;
274  const size_type warp_granularity = device_prop.warp_granularity;
275  const size_type max_warps_per_block =
276  std::min(device_prop.max_threads_per_block / warp_size,
277  device_prop.max_warps_per_sm);
278  const size_type min_warps_per_block = 1;
279  const size_type max_regs_per_sm = device_prop.max_regs_per_sm;
280  const size_type max_regs_per_block = device_prop.max_regs_per_block;
281  const size_type reg_bank_size = device_prop.reg_bank_size;
282 
283  // Compute number of warps we can fit on each SM based on register limits
284  // Use Cuda introspection to determine number of registers per thread
285  //const size_type regs_per_thread = 46;
286  const size_type regs_per_thread =
287  device_prop.get_kernel_registers(
288  Kokkos::Impl::cuda_parallel_launch_local_memory<Multiply>);
289  const size_type regs_per_warp =
290  (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
291  const size_type warps_per_sm =
292  (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
293  const size_type warps_per_block =
294  (max_regs_per_block/regs_per_warp) & ~(warp_granularity-1);
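 // For example, with hypothetical values warp_size = 32, regs_per_thread = 46
 // and reg_bank_size = 256, regs_per_warp = 1472 rounded up to 1536; with
 // max_regs_per_sm = 65536 and warp_granularity = 2 this yields
 // warps_per_sm = 42.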
295 
296  // Compute number of threads per stochastic row based on number of
297  // nonzero entries per row.
298  // For double, 16 threads/row is still coalesced, but not for float.
299  // We should reorder the tensor values for a given vector width to
300  // maintain coalesced reads. This would help smaller problems too by
301  // allowing fewer threads/row.
302  const size_type threads_per_row =
303  avg_tensor_entries_per_row >= 88 ? 32 : 16;
304  const size_type rows_per_warp = warp_size / threads_per_row;
305 
306  const size_type in_vec_scalar_size = sizeof(input_scalar);
307  const size_type out_vec_scalar_size = sizeof(output_scalar);
308  const size_type mat_scalar_size = sizeof(matrix_scalar);
309 
310 #define USE_FIXED_BLOCKSIZE 0
311 
312 #if USE_FIXED_BLOCKSIZE
313 
314  const size_type num_blocks = 3;
315  size_type nw = warps_per_sm / num_blocks;
316  while (nw > 1 && num_blocks*nw % warp_granularity) --nw;
317  const size_type num_warp = nw;
318  const size_type sh_per_block = shcap / num_blocks;
319  const size_type sr =
320  device_prop.has_shuffle ? 0 : in_vec_scalar_size*warp_size*num_warp;
321  size_type bs = ((sh_per_block - sr) / tensor_align - out_vec_scalar_size) /
322  (in_vec_scalar_size+mat_scalar_size);
323  if (bs % 2 == 0) --bs;
324  const size_type block_size_max = 31;
325  const size_type block_size = std::min(bs, block_size_max);
326  //const size_type block_size = 7;
327  const size_type shmem =
328  ( ((in_vec_scalar_size+mat_scalar_size)*block_size+out_vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
329 
330 #else
331 
332  // We want to maximize the number of blocks per SM (to maximize throughput)
333  // as well as the block_size (to minimize tensor reads), subject to
334  // shared memory and register constraints. Here we try to do this by computing
335  // the number of tensor reads per block per thread for each choice of
336  // block_size, and then choose the configuration with the smallest value.
337  // This isn't perfect, but seems to generally work OK. It could be
338  // improved with a better model of:
339  // * Number of blocks versus warps per block (to minimize synchronization)
340  // * Thread efficiency for small numbers of rows per thread
341  const size_type block_size_min = 3;
342  const size_type half_nnz_per_row = fem_nnz_per_row / 2 + 1;
343  const size_type block_size_max =
344  half_nnz_per_row % 2 ? half_nnz_per_row + 1 : half_nnz_per_row;
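 // (With fem_nnz_per_row = 27 this gives half_nnz_per_row = 14 and
 // block_size_max = 14, so the odd block sizes 3, 5, ..., 13 are tried.)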
345  Teuchos::Array<TensorReadEntry> reads_per_thread;
346  for (size_type bs = block_size_min; bs<=block_size_max; bs+=2) {
347  // We don't know the number of warps yet, so we just have to bound
348  // sr by the maximum number possible (which is all warps in 1 block)
349  const size_type sr =
350  device_prop.has_shuffle ? 0 : in_vec_scalar_size*warp_size*warps_per_block;
351  size_type shmem =
352  ((in_vec_scalar_size+mat_scalar_size)*bs+out_vec_scalar_size)*tensor_align+sr;
353  shmem = (shmem + sh_granularity-1) & ~(sh_granularity-1);
354  if (shmem <= max_shmem_per_block) {
355  size_type num_blocks = std::min(shcap / shmem, max_blocks_per_sm);
356  size_type tensor_reads = (fem_nnz_per_row+bs-1) / bs;
357  size_type num_warp =
358  std::min(std::max(std::min(warps_per_sm/num_blocks, warps_per_block),
359  min_warps_per_block),
360  max_warps_per_block);
361  while (num_warp > 1 && num_blocks*num_warp % warp_granularity)
362  --num_warp;
363  TensorReadEntry entry;
364  entry.block_size = bs;
365  entry.shmem = shmem;
366  entry.num_blocks = num_blocks;
367  entry.num_warp = num_warp;
368 
369  // Prefer at least 3 blocks
370  size_type factor = std::min(num_blocks,3u);
371  entry.reads = (static_cast<double>(tensor_reads) /
372  static_cast<double>(factor*num_blocks*num_warp));
373  reads_per_thread.push_back(entry);
374  }
375  }
376  TEUCHOS_TEST_FOR_EXCEPTION(
377  reads_per_thread.size() == 0, std::logic_error,
378  "Stochastic problem dimension is too large to fit in shared memory");
379  size_type idx = 0;
380  double min_reads = reads_per_thread[0].reads;
381  for (int i=1; i<reads_per_thread.size(); ++i) {
382  if (reads_per_thread[i].reads < min_reads) {
383  idx = i;
384  min_reads = reads_per_thread[i].reads;
385  }
386  }
387 
388  const size_type block_size = reads_per_thread[idx].block_size;
389  const size_type shmem = reads_per_thread[idx].shmem;
390  const size_type num_blocks = reads_per_thread[idx].num_blocks;
391  const size_type num_warp = reads_per_thread[idx].num_warp;
392 
393 #endif
394 
395  // Setup thread blocks and grid
396  const dim3 dBlock( threads_per_row , rows_per_warp*num_warp , 1 );
397  const dim3 dGrid( row_count, 1, 1 );
398 
399 #if 0
400  std::cout << "block_size = " << block_size
401  << " tensor reads = " << (fem_nnz_per_row+block_size-1)/block_size
402  << " regs_per_thread = " << regs_per_thread
403  << " num blocks = " << num_blocks
404  << " num warps = " << num_warp
405  << " num rows = " << tensor_dimension
406  << " rows/warp = " << tensor_dimension / (num_warp*rows_per_warp)
407  << " avg entries/row = " << avg_tensor_entries_per_row
408  << std::endl;
409 #endif
410 
411  // Finally launch our kernel
412  //cudaProfilerStart();
413  Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
414  ( Multiply( A, x, y, a, b, block_size ) );
415  //cudaProfilerStop();
416  }
417 };
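// The block-size selection above can be hard to follow inline. The following
// is a simplified, standalone sketch of the same idea (kept inside "#if 0" so
// it is never compiled): enumerate odd block sizes, keep the ones whose
// shared-memory footprint fits, and pick the one that minimizes tensor reads
// per resident block. It assumes all three scalar types have the same size
// and shuffle-capable hardware (no extra scratch term); the device limits are
// placeholder values, whereas the real code queries them through DeviceProp
// and also accounts for registers, warps, and launch granularities.
#if 0
#include <algorithm>
#include <cstddef>

struct BlockChoice { std::size_t block_size, shmem, num_blocks; double reads; };

inline BlockChoice choose_block_size(std::size_t tensor_dim,
                                     std::size_t fem_nnz_per_row,
                                     std::size_t scalar_size) {
  const std::size_t shcap               = 48 * 1024; // shared memory per SM (placeholder)
  const std::size_t max_shmem_per_block = 48 * 1024; // placeholder
  const std::size_t max_blocks_per_sm   = 8;         // placeholder
  const std::size_t sh_granularity      = 256;       // placeholder

  const std::size_t half_nnz = fem_nnz_per_row / 2 + 1;
  const std::size_t bs_max   = (half_nnz % 2) ? half_nnz + 1 : half_nnz;

  BlockChoice best = { 0, 0, 0, 1e300 };
  for (std::size_t bs = 3; bs <= bs_max; bs += 2) {   // odd sizes, as above
    // One x-block and one A-block of 'bs' columns plus one y vector, each of
    // length tensor_dim, rounded up to the shared-memory allocation granularity.
    std::size_t shmem = (2 * bs + 1) * scalar_size * tensor_dim;
    shmem = (shmem + sh_granularity - 1) & ~(sh_granularity - 1);
    if (shmem > max_shmem_per_block) continue;

    const std::size_t num_blocks   = std::min(shcap / shmem, max_blocks_per_sm);
    const std::size_t tensor_reads = (fem_nnz_per_row + bs - 1) / bs;
    const double reads = static_cast<double>(tensor_reads) /
                         static_cast<double>(num_blocks);
    if (reads < best.reads)
      best = BlockChoice{ bs, shmem, num_blocks, reads };
  }
  return best;  // best.block_size == 0 means nothing fit in shared memory
}
#endif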
418 
419 // Kernel implementing y = A * x where
420 // A == Kokkos::CrsMatrix< Sacado::UQ::PCE<...>,...>,
421 // x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
422 // x and y are rank 2
423 //
424 // Note: Unlike the rank-1 version, this version has not been
425 // optimized, and doesn't even include the block-column implementation
426 template <typename MatrixStorage,
427  typename MatrixOrdinal,
428  typename MatrixMemory,
429  typename MatrixSize,
430  typename InputStorage,
431  typename ... InputP,
432  typename OutputStorage,
433  typename ... OutputP>
434 class Multiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
435  MatrixOrdinal,
436  Kokkos::Cuda,
437  MatrixMemory,
438  MatrixSize>,
439  Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
440  InputP... >,
441  Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
442  OutputP... >
443  >
444 {
445 public:
446  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
447  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
448  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
449 
450  typedef Kokkos::Cuda MatrixDevice;
451  typedef MatrixDevice execution_space;
452  typedef execution_space::size_type size_type;
453 
454  typedef KokkosSparse::CrsMatrix< const MatrixValue,
455  MatrixOrdinal,
456  MatrixDevice,
457  MatrixMemory,
458  MatrixSize> matrix_type;
459  typedef Kokkos::View< const InputVectorValue**,
460  InputP... > input_vector_type;
461  typedef Kokkos::View< OutputVectorValue**,
462  OutputP... > output_vector_type;
463  typedef typename InputVectorValue::value_type input_scalar;
464  typedef typename OutputVectorValue::value_type output_scalar;
465 
466 public:
467 
468  static void apply( const matrix_type & A ,
469  const input_vector_type & x ,
470  const output_vector_type & y ,
471  const input_scalar & a = input_scalar(1) ,
472  const output_scalar & b = output_scalar(0) )
473  {
474  typedef Kokkos::View< const InputVectorValue*, InputP... > input_vector_type_1D;
475  typedef Kokkos::View< OutputVectorValue*, OutputP... > output_vector_type_1D;
476  typedef Multiply< matrix_type, input_vector_type_1D,
477  output_vector_type_1D > multiply_type_1D;
478 
479  const size_type num_col = y.extent(1);
480  for (size_type col=0; col<num_col; ++col)
481 
482  multiply_type_1D::apply(
483  A,
484  Kokkos::subview( x, Kokkos::ALL(), col),
485  Kokkos::subview(y, Kokkos::ALL(), col),
486  a, b );
487  }
488 };
489 
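// __launch_bounds__(1024, 2) below asks the compiler to limit register usage
// so that blocks of up to 1024 threads can run with at least two blocks
// resident per multiprocessor, which is presumably what motivates the
// "FullOccupancy" name.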
490 template <typename Kernel>
491 __global__ void
492 #if __CUDA_ARCH__ >= 300
493 __launch_bounds__(1024,2)
494 #endif
495 MeanFullOccupancyKernelLaunch(Kernel kernel) {
496  kernel();
497 }
498 
499 // Kernel implementing y = A * x where PCE size of A is 1
500 // A == Kokkos::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
501 // x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
502 // x and y are rank 1
503 template <typename MatrixStorage,
504  typename MatrixOrdinal,
505  typename MatrixMemory,
506  typename MatrixSize,
507  typename InputStorage,
508  typename ... InputP,
509  typename OutputStorage,
510  typename ... OutputP>
511 class MeanMultiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
512  MatrixOrdinal,
513  Kokkos::Cuda,
514  MatrixMemory,
515  MatrixSize >,
516  Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
517  InputP... >,
518  Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
519  OutputP... >
520  >
521 {
522 public:
523  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
524  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
525  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
526 
527  typedef Kokkos::Cuda MatrixDevice;
528  typedef MatrixDevice execution_space;
529  typedef KokkosSparse::CrsMatrix< const MatrixValue,
530  MatrixOrdinal,
531  MatrixDevice,
532  MatrixMemory,
533  MatrixSize> matrix_type;
534  typedef typename matrix_type::values_type matrix_values_type;
535  typedef typename MatrixValue::ordinal_type size_type;
536  typedef Kokkos::View< const InputVectorValue*,
537  InputP... > input_vector_type;
538  typedef Kokkos::View< OutputVectorValue*,
539  OutputP... > output_vector_type;
540 
541  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
542  typedef typename MatrixValue::value_type matrix_scalar;
543  typedef typename InputVectorValue::value_type input_scalar;
544  typedef typename OutputVectorValue::value_type output_scalar;
545 
546  template <int BlockSize>
547  struct Kernel {
548  typedef MatrixDevice execution_space;
549  typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
550  typedef typename input_vector_type::array_type input_array_type;
551  typedef typename output_vector_type::array_type output_array_type;
552 
553  const matrix_array_type m_A_values ;
554  const matrix_graph_type m_A_graph ;
555  const output_array_type v_y ;
556  const input_array_type v_x ;
557  const input_scalar m_a ;
558  const output_scalar m_b ;
559  const size_type m_row_count;
560  const size_type dim ;
561 
562  Kernel( const matrix_type & A ,
563  const input_vector_type & x ,
564  const output_vector_type & y ,
565  const input_scalar & a ,
566  const output_scalar & b )
567  : m_A_values( A.values )
568  , m_A_graph( A.graph )
569  , v_y( y )
570  , v_x( x )
571  , m_a( a )
572  , m_b( b )
573  , m_row_count( A.graph.row_map.extent(0)-1 )
574  , dim( dimension_scalar(x) )
575  {}
576 
577  __device__ void operator()(void) const
578  {
579  // Store matrix values and column indices in shared memory
580  // to reduce global memory reads
581  volatile matrix_scalar * const sh_A =
582  kokkos_impl_cuda_shared_memory<matrix_scalar>();
583  volatile size_type * const sh_col =
584  reinterpret_cast<volatile size_type*>(sh_A + BlockSize*blockDim.y);
585 
586  // Check for valid row
587  const size_type iBlockRow = blockDim.y*blockIdx.x + threadIdx.y;
588  if (iBlockRow < m_row_count) {
589 
590  const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
591  const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
592 
593  // Initialize result
594  if (m_b == output_scalar(0))
595  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
596  v_y(pce, iBlockRow) = 0.0;
597  else
598  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
599  v_y(pce, iBlockRow) = m_b*v_y(pce, iBlockRow);
600 
601  // Loop over columns in chunks of size BlockSize
602  for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
603  col_block+=BlockSize) {
604  const size_type num_col = col_block+BlockSize <= iEntryEnd ?
605  BlockSize : iEntryEnd-col_block;
606 
607  // Read BlockSize column indices and matrix values at a time to
608  // maintain coalesced accesses
609  for (size_type col=threadIdx.x; col<num_col; col+=blockDim.x) {
610  sh_col[col*blockDim.y+threadIdx.y] =
611  m_A_graph.entries(col_block+col);
612  sh_A[col*blockDim.y+threadIdx.y] =
613  m_A_values(col_block+col);
614  }
615  if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
616  __syncthreads();
617 
618  // Do the partial mat-vec for each PCE coefficient over the columns
619  // within this block
620  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x ) {
621  output_scalar s = 0.0;
622  for ( size_type col = 0; col < num_col; ++col ) {
623  const size_type iCol = sh_col[col*blockDim.y+threadIdx.y];
624  const matrix_scalar aA = m_a*sh_A[col*blockDim.y+threadIdx.y];
625  s += aA*v_x(pce, iCol);
626  }
627  v_y(pce, iBlockRow) += s;
628  }
629  }
630  }
631  }
632  };
633 
634  static void apply( const matrix_type & A ,
635  const input_vector_type & x ,
636  const output_vector_type & y ,
637  const input_scalar & a = input_scalar(1) ,
638  const output_scalar & b = output_scalar(0) )
639  {
640  const size_t row_count = A.graph.row_map.extent(0) - 1;
641  const size_type dim = dimension_scalar(x);
642 
643  // Compute number of threads for PCE coefficients and number of
644  // matrix rows processed by each CUDA block. A total of 256 threads
645  // gives best occupancy
646  size_type threads_per_row;
647  size_type rows_per_block;
648  if (dim >= 32) {
649  threads_per_row = 32;
650  rows_per_block = 8;
651  }
652  else {
653  threads_per_row = 16;
654  rows_per_block = 16;
655  }
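 // (either choice gives 32*8 = 16*16 = 256 threads per block)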
656  const size_type num_blocks =
657  (row_count + rows_per_block -1 ) / rows_per_block;
658 
659  // Setup thread blocks and grid
660  const dim3 dBlock( threads_per_row , rows_per_block , 1 );
661  const dim3 dGrid( num_blocks, 1, 1 );
662 
663  // Setup shared memory for storing matrix values and column indices
664  // Number of columns we process at a time -- making this bigger
665  // requires more shared memory and reduces occupancy
666  const int BlockSize = 32;
667  if (sizeof(matrix_scalar) > 4)
668  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
669  else
670  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
671  const size_t shared =
672  (BlockSize*dBlock.y) * (sizeof(size_type) + sizeof(matrix_scalar));
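 // (e.g. with BlockSize = 32, rows_per_block = 8, a 4-byte size_type and an
 // 8-byte matrix scalar -- hypothetical sizes -- this is 32*8*(4+8) = 3072
 // bytes per block)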
673 
674  // Launch kernel
675  MeanFullOccupancyKernelLaunch<<<dGrid, dBlock, shared >>>
676  ( Kernel<BlockSize>( A, x, y, a, b ) );
677  }
678 };
679 
680 /*
681  * Disable this specialization as it is actually slower than the default
682  * implementation of launching the single-column kernel one column at a time.
683  * It looks like it uses a few more registers, which reduces occupancy.
684 
685 // Kernel implementing y = A * x where PCE size of A is 1
686 // A == Kokkos::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
687 // x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
688 // x and y are rank 2
689 template <typename MatrixStorage,
690  typename MatrixOrdinal,
691  typename MatrixMemory,
692  typename MatrixSize,
693  typename InputStorage,
694  typename ... InputP,
695  typename OutputStorage,
696  typename ... OutputP>
697 class MeanMultiply< Kokkos::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
698  MatrixOrdinal,
699  Kokkos::Cuda,
700  MatrixMemory,
701  MatrixSize >,
702  Kokkos::View< Sacado::UQ::PCE<InputStorage>**,
703  InputP... >,
704  Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
705  OutputP... >,
706  >
707 {
708 public:
709  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
710  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
711  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
712 
713  typedef Kokkos::Cuda MatrixDevice;
714  typedef MatrixDevice execution_space;
715  typedef Kokkos::CrsMatrix< const MatrixValue,
716  MatrixOrdinal,
717  MatrixDevice,
718  MatrixMemory,
719  MatrixSize> matrix_type;
720  typedef typename matrix_type::values_type matrix_values_type;
721  typedef typename MatrixValue::ordinal_type size_type;
722  typedef Kokkos::View< InputVectorValue**,
723  InputP... > input_vector_type;
724  typedef Kokkos::View< OutputVectorValue**,
725  OutputP... > output_vector_type;
726 
727  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
728  typedef typename MatrixValue::value_type matrix_scalar;
729  typedef typename InputVectorValue::value_type input_scalar;
730  typedef typename OutputVectorValue::value_type output_scalar;
731 
732  template <int BlockSize>
733  struct Kernel {
734  typedef Device execution_space;
735  typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
736  typedef typename input_vector_type::array_type input_array_type;
737  typedef typename output_vector_type::array_type output_array_type;
738 
739  const matrix_array_type m_A_values ;
740  const matrix_graph_type m_A_graph ;
741  const output_array_type v_y ;
742  const input_array_type v_x ;
743  const input_scalar m_a ;
744  const output_scalar m_b ;
745  const size_type m_row_count;
746  const size_type m_num_col;
747  const size_type dim ;
748 
749  Kernel( const matrix_type & A ,
750  const input_vector_type & x ,
751  const output_vector_type & y ,
752  const input_scalar & a ,
753  const output_scalar & b )
754  : m_A_values( A.values )
755  , m_A_graph( A.graph )
756  , v_y( y )
757  , v_x( x )
758  , m_a( a )
759  , m_b( b )
760  , m_row_count( A.graph.row_map.extent(0)-1 )
761  , m_num_col( x.extent(1) )
762  , dim( dimension_scalar(x) )
763  {}
764 
765  __device__ void operator()(void) const
766  {
767  // Store matrix values and column indices in shared memory
768  // to reduce global memory reads
769  volatile matrix_scalar * const sh_A =
770  kokkos_impl_cuda_shared_memory<matrix_scalar>();
771  volatile size_type * const sh_col =
772  reinterpret_cast<volatile size_type*>(sh_A + BlockSize*blockDim.y);
773 
774  // Check for valid row
775  const size_type iBlockRow = blockDim.y*blockIdx.x + threadIdx.y;
776  if (iBlockRow < m_row_count) {
777 
778  const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
779  const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
780 
781  // Initialize result
782  if (m_b == output_scalar(0))
783  for ( size_type xcol = 0; xcol < m_num_col; xcol++)
784  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
785  v_y(pce, iBlockRow, xcol) = 0.0;
786  else
787  for ( size_type xcol = 0; xcol < m_num_col; xcol++)
788  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
789  v_y(pce, iBlockRow, xcol) = m_b*v_y(pce, iBlockRow, xcol);
790 
791  // Loop over columns in chunks of size BlockSize
792  for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
793  col_block+=BlockSize) {
794  const size_type num_col = col_block+BlockSize <= iEntryEnd ?
795  BlockSize : iEntryEnd-col_block;
796 
797  // Read BlockSize entries column indices at a time to maintain
798  // coalesced accesses
799  for (size_type col=threadIdx.x; col<num_col; col+=blockDim.x) {
800  sh_col[col*blockDim.y+threadIdx.y] =
801  m_A_graph.entries(col_block+col);
802  sh_A[col*blockDim.y+threadIdx.y] =
803  m_A_values(col_block+col);
804  }
805  if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
806  __syncthreads();
807 
808  // Do portion mat-vec for each PCE coefficient and for columns
809  // within this block
810  for ( size_type xcol = 0; xcol < m_num_col; xcol++) {
811  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x ) {
812  output_scalar s = 0.0;
813  for ( size_type col = 0; col < num_col; ++col ) {
814  const size_type iCol = sh_col[col*blockDim.y+threadIdx.y];
815  const matrix_scalar aA = m_a*sh_A[col*blockDim.y+threadIdx.y];
816  s += aA*v_x(pce, iCol, xcol);
817  }
818  v_y(pce, iBlockRow, xcol) += s;
819  }
820  }
821  }
822  }
823  }
824  };
825 
826  static void apply( const matrix_type & A ,
827  const input_vector_type & x ,
828  const output_vector_type & y ,
829  const input_scalar & a = input_scalar(1) ,
830  const output_scalar & b = output_scalar(0) )
831  {
832  const size_t row_count = A.graph.row_map.extent(0) - 1;
833  const size_type dim = dimension_scalar(x);
834 
835  // Compute number of threads for PCE coefficients and number of
836  // matrix rows processed by each CUDA block. A total of 256 threads
837  // gives best occupancy
838  size_type threads_per_row;
839  size_type rows_per_block;
840  if (dim >= 32) {
841  threads_per_row = 32;
842  rows_per_block = 6;
843  }
844  else {
845  threads_per_row = 16;
846  rows_per_block = 12;
847  }
848  const size_type num_blocks =
849  (row_count + rows_per_block -1 ) / rows_per_block;
850 
851  // Setup thread blocks and grid
852  const dim3 dBlock( threads_per_row , rows_per_block , 1 );
853  const dim3 dGrid( num_blocks, 1, 1 );
854 
855  // Setup shared memory for storing matrix values and column indices
856  // Number of columns we process at a time -- making this bigger
857  // requires more shared memory and reduces occupancy
858  const int BlockSize = 32;
859  if (sizeof(matrix_scalar) > 4)
860  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
861  else
862  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
863  const size_t shared =
864  (BlockSize*dBlock.y) * (sizeof(size_type) + sizeof(matrix_scalar));
865 
866  // Launch kernel
867  // MeanFullOccupancyKernelLaunch<<<dGrid, dBlock, shared >>>
868  // ( Kernel<BlockSize>( A, x, y, a, b ) );
869 
870  Kokkos::Impl::cuda_parallel_launch_local_memory<<<dGrid, dBlock, shared >>>
871  ( Kernel<BlockSize>( A, x, y, a, b ) );
872  }
873 };
874 */
875 
876 } // namespace Stokhos
877 
878 #endif /* #if defined( __CUDACC__) */
879 
880 #endif /* #ifndef KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP */