Kokkos_CrsMatrix_UQ_PCE_Cuda.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #ifndef KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP
43 #define KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP
44 
45 #if defined( __CUDACC__)
46 
47 #include "Sacado_UQ_PCE.hpp"
48 #include "Kokkos_View_UQ_PCE.hpp"
50 #include "KokkosSparse_CrsMatrix.hpp"
51 
52 // MD 08/2017 Note: the include below is commented out because the
53 // Kokkos_MV.hpp header has been removed from KokkosKernels entirely,
54 // and it does not appear to be used anywhere in this file.
55 //#include "Kokkos_MV.hpp" // for some utilities
56 
57 #include "Stokhos_Multiply.hpp"
59 
60 #include "Kokkos_Core.hpp"
61 
63 //#include "Stokhos_Cuda_WarpShuffle.hpp"
64 
66 
67 //#include "cuda_profiler_api.h"
68 
69 #ifdef __CUDA_ARCH__
70 # if (__CUDA_ARCH__ >= 300)
71 # define HAVE_CUDA_SHUFFLE 1
72 # else
73 # define HAVE_CUDA_SHUFFLE 0
74 # endif
75 #else
76 # define HAVE_CUDA_SHUFFLE 0
77 #endif
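// Note: warp-shuffle intrinsics (__shfl_down and friends) require compute
// capability 3.0 (Kepler) or newer; on older devices the shared-memory
// reduction path further below is used instead.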
78 
79 namespace Stokhos {
80 
81 //----------------------------------------------------------------------------
82 // Specialization of Kokkos CrsMatrix math functions
83 //----------------------------------------------------------------------------
84 
85 // Kernel implementing y = A * x where
86 // A == Kokkos::CrsMatrix< Sacado::UQ::PCE<...>,...>,
87 // x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
88 // x and y are rank 1
89 template <typename MatrixStorage,
90  typename MatrixOrdinal,
91  typename MatrixMemory,
92  typename MatrixSize,
93  typename InputStorage,
94  typename ... InputP,
95  typename OutputStorage,
96  typename ... OutputP>
97 class Multiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
98  MatrixOrdinal,
99  Kokkos::Cuda,
100  MatrixMemory,
101  MatrixSize>,
102  Kokkos::View< Sacado::UQ::PCE<InputStorage>*,
103  InputP... >,
104  Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
105  OutputP... >
106  >
107 {
108 public:
109  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
110  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
111  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
112 
113  typedef Kokkos::Cuda MatrixDevice;
114  typedef MatrixDevice execution_space;
115  typedef execution_space::size_type size_type;
116 
117  typedef KokkosSparse::CrsMatrix< MatrixValue,
118  MatrixOrdinal,
119  MatrixDevice,
120  MatrixMemory,
121  MatrixSize> matrix_type;
122  typedef typename matrix_type::values_type matrix_values_type;
123  typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
124  typedef Kokkos::View< InputVectorValue*,
125  InputP... > input_vector_type;
126  typedef Kokkos::View< OutputVectorValue*,
127  OutputP... > output_vector_type;
128 
129 private:
130 
131  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
132  typedef typename matrix_values_type::array_type matrix_array_type;
133  typedef typename input_vector_type::array_type input_array_type;
134  typedef typename output_vector_type::array_type output_array_type;
135 
136  typedef typename MatrixValue::value_type matrix_scalar;
137  typedef typename InputVectorValue::value_type input_scalar;
138  typedef typename OutputVectorValue::value_type output_scalar;
139  typedef typename tensor_type::value_type tensor_scalar;
140 
141  const matrix_array_type m_A_values ;
142  const matrix_graph_type m_A_graph ;
143  const input_array_type m_x ;
144  const output_array_type m_y ;
145  const tensor_type m_tensor ;
146  const input_scalar m_a ;
147  const output_scalar m_b ;
148  const size_type BlockSize;
149 
150  Multiply( const matrix_type & A ,
151  const input_vector_type & x ,
152  const output_vector_type & y ,
153  const input_scalar & a ,
154  const output_scalar & b ,
155  const size_type block_size )
156  : m_A_values( A.values )
157  , m_A_graph( A.graph )
158  , m_x( x )
159  , m_y( y )
160  , m_tensor( Kokkos::cijk(A.values) )
161  , m_a( a )
162  , m_b( b )
163  , BlockSize(block_size)
164  {}
165 
166 public:
167 
168  __device__ void operator()(void) const
169  {
170  // Number of bases in the stochastic system:
171  const size_type dim = m_tensor.dimension();
172 
173  // Get shared memory for loading x, A, and y
174  volatile input_scalar * const sh_x =
175  kokkos_impl_cuda_shared_memory<input_scalar>();
176  volatile matrix_scalar * const sh_A = sh_x + BlockSize*dim;
177  volatile output_scalar * const sh_y = sh_A + BlockSize*dim;
178 #if !HAVE_CUDA_SHUFFLE
179  volatile output_scalar * const sh_t = sh_y + dim;
180 #endif
181 
182  const size_type nid = blockDim.x * blockDim.y;
183  const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
184 
185  // Zero y
186  for ( size_type i = tid; i < dim; i += nid ) {
187  sh_y[i] = 0.0;
188  }
189 
190  // Loop over columns in the discrete (finite element) system.
191  // blockIdx.x == row in the deterministic (finite element) system
192  const size_type iBlockEntryBeg = m_A_graph.row_map[ blockIdx.x ];
193  const size_type iBlockEntryEnd = m_A_graph.row_map[ blockIdx.x + 1 ];
194  for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
195  iBlockEntry += BlockSize) {
196  const size_type block_size =
197  (iBlockEntryEnd-iBlockEntry < BlockSize) ?
198  iBlockEntryEnd-iBlockEntry : BlockSize;
199 
200  // Wait for X and A to be used in the previous iteration
201  // before reading new values.
202  __syncthreads();
203 
204  // Coalesced read blocks of X and A into shared memory
205  for ( size_type col = 0; col < block_size; ++col ) {
206 
207  const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
208  const input_scalar * const x = & m_x( 0, iBlockColumn );
209  const matrix_scalar * const A = & m_A_values( iBlockEntry + col, 0 );
210 
211  // Coalesced read by the whole block from global memory:
212  for ( size_type i = tid; i < dim; i += nid ) {
213  sh_x[col + i * BlockSize] = x[i]; // m_x( i, iBlockColumn );
214  sh_A[col + i * BlockSize] = A[i]; // m_A_values( iBlockEntry + col , i );
215  }
216 
217  }
218 
219  __syncthreads(); // wait for X and A to be read before using them
220 
221  // This cuda block is responsible for computing all values of 'y'
222  for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
223  output_scalar y = 0;
224 
225  // Product tensor entries which this warp will iterate:
226  const size_type lBeg = m_tensor.entry_begin( i );
227  const size_type lEnd = m_tensor.entry_end( i );
228 
229  // Loop through sparse tensor contributions with coalesced reads.
230  for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {
231 
232  // Read 'blockDim.x' entries from the tensor (coalesced)
233  const size_type kj = m_tensor.coord( l );
234  const tensor_scalar v = m_tensor.value( l );
235  const size_type j = ( kj & 0x0ffff ) * BlockSize ;
236  const size_type k = ( kj >> 16 ) * BlockSize ;
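// The pair of tensor coordinates is packed into one 32-bit word: the low
// 16 bits hold 'j' and the high 16 bits hold 'k', each pre-scaled by
// BlockSize so it can index the interleaved shared arrays directly.  Each
// entry is applied symmetrically below (A[j]*x[k] + A[k]*x[j]), so each
// symmetric (j,k) pair only needs to be stored once in the tensor.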
237 
238  for ( size_type col = 0; col < block_size; ++col ) {
239  y += v * ( sh_A[col+j] * sh_x[col+k] +
240  sh_A[col+k] * sh_x[col+j] );
241  }
242 
243  }
244 
245  // Reduction of 'y' within 'blockDim.x'
246 #if HAVE_CUDA_SHUFFLE
247  if (blockDim.x >= 2) y += Kokkos::shfl_down(y, 1, blockDim.x);
248  if (blockDim.x >= 4) y += Kokkos::shfl_down(y, 2, blockDim.x);
249  if (blockDim.x >= 8) y += Kokkos::shfl_down(y, 4, blockDim.x);
250  if (blockDim.x >= 16) y += Kokkos::shfl_down(y, 8, blockDim.x);
251  if (blockDim.x >= 32) y += Kokkos::shfl_down(y, 16, blockDim.x);
252  if ( threadIdx.x == 0 ) sh_y[i] += y;
253 #else
254  sh_t[ tid ] = y;
255  if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
256  if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
257  if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
258  if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
259  if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
260  if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
261 #endif
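// Both reduction variants assume blockDim.x is a power of two no larger
// than the warp size (threads_per_row is 16 or 32), so the partial sums
// can be combined without __syncthreads(): a classic warp-synchronous
// pattern that relies on the threads of a warp executing in lock-step.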
262 
263  }
264 
265  }
266 
267  // Wait for all contributions of y to be completed
268  __syncthreads();
269 
270  // Store result back in global memory
271  if ( m_b == output_scalar(0) )
272  for ( size_type i = tid; i < dim; i += nid )
273  m_y( i, blockIdx.x ) = m_a * sh_y[ i ];
274  else
275  for ( size_type i = tid; i < dim; i += nid )
276  m_y( i, blockIdx.x ) = m_a * sh_y[ i ] + m_b * m_y( i, blockIdx.x );
277  }
278 
279  struct TensorReadEntry {
280  size_type block_size, shmem, num_blocks, num_warp;
281  double reads;
282  };
283 
284  static void apply( const matrix_type & A ,
285  const input_vector_type & x ,
286  const output_vector_type & y ,
287  const input_scalar & a = input_scalar(1) ,
288  const output_scalar & b = output_scalar(0) )
289  {
290  const tensor_type tensor = Kokkos::cijk(A.values);
291  const size_type row_count = A.graph.row_map.extent(0) - 1;
292  const size_type tensor_dimension = tensor.dimension();
293  const size_type tensor_align = tensor_dimension;
294  const size_type avg_tensor_entries_per_row = tensor.avg_entries_per_row();
295 
296  // Should compute this from FEM graph
297  const size_type fem_nnz_per_row = 27;
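// (27 nonzeros per row corresponds to, e.g., the 27-point stencil of a
// structured 3-D trilinear-hexahedral discretization.)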
298 
299  // Get device properties we need for whatever device is currently selected
300  DeviceProp device_prop;
301  const size_type shcap = device_prop.shared_memory_capacity;
302  const size_type sh_granularity = device_prop.shared_memory_granularity;
303  const size_type max_shmem_per_block = device_prop.max_shmem_per_block;
304  const size_type max_blocks_per_sm = device_prop.max_blocks_per_sm;
305  const size_type warp_size = device_prop.warp_size;
306  const size_type warp_granularity = device_prop.warp_granularity;
307  const size_type max_warps_per_block =
308  std::min(device_prop.max_threads_per_block / warp_size,
309  device_prop.max_warps_per_sm);
310  const size_type min_warps_per_block = 1;
311  const size_type max_regs_per_sm = device_prop.max_regs_per_sm;
312  const size_type max_regs_per_block = device_prop.max_regs_per_block;
313  const size_type reg_bank_size = device_prop.reg_bank_size;
314 
315  // Compute number of warps we can fit on each SM based on register limits
316  // Use Cuda introspection to determine number of registers per thread
317  //const size_type regs_per_thread = 46;
318  const size_type regs_per_thread =
319  device_prop.get_kernel_registers(
320  Kokkos::Impl::cuda_parallel_launch_local_memory<Multiply>);
321  const size_type regs_per_warp =
322  (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
323  const size_type warps_per_sm =
324  (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
325  const size_type warps_per_block =
326  (max_regs_per_block/regs_per_warp) & ~(warp_granularity-1);
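// Illustrative arithmetic only (the real values are queried from the
// device): with 46 registers/thread, a 32-thread warp needs 32*46 = 1472
// registers; rounded up to a 256-register bank granularity that is 1536
// registers/warp, so a 65536-register SM supports 65536/1536 = 42 warps
// by this bound, which is then rounded down to the warp allocation
// granularity.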
327 
328  // Compute number of threads per stochastic row based on number of
329  // nonzero entries per row.
330  // For double, 16 threads/row is still coalesced, but not for float.
331  // We should reorder the tensor values for a given vector width to
332  // maintain coalesced reads. This would help smaller problems too by
333  // allowing fewer threads/row.
334  const size_type threads_per_row =
335  avg_tensor_entries_per_row >= 88 ? 32 : 16;
336  const size_type rows_per_warp = warp_size / threads_per_row;
337 
338  const size_type in_vec_scalar_size = sizeof(input_scalar);
339  const size_type out_vec_scalar_size = sizeof(output_scalar);
340  const size_type mat_scalar_size = sizeof(matrix_scalar);
341 
342 #define USE_FIXED_BLOCKSIZE 0
343 
344 #if USE_FIXED_BLOCKSIZE
345 
346  const size_type num_blocks = 3;
347  size_type nw = warps_per_sm / num_blocks;
348  while (nw > 1 && num_blocks*nw % warp_granularity) --nw;
349  const size_type num_warp = nw;
350  const size_type sh_per_block = shcap / num_blocks;
351  const size_type sr =
352  device_prop.has_shuffle ? 0 : in_vec_scalar_size*warp_size*num_warp;
353  size_type bs = ((sh_per_block - sr) / tensor_align - out_vec_scalar_size) /
354  (in_vec_scalar_size+mat_scalar_size);
355  if (bs % 2 == 0) --bs;
356  const size_type block_size_max = 31;
357  const size_type block_size = std::min(bs, block_size_max);
358  //const size_type block_size = 7;
359  const size_type shmem =
360  ( ((in_vec_scalar_size+mat_scalar_size)*block_size+out_vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
361 
362 #else
363 
364  // We want to maximize the number of blocks per SM (to maximize throughput)
365  // as well as the block_size (to minimize tensor reads), subject to
366 // shared memory and register constraints. Here we try to do this by computing
367  // the number of tensor reads per block per thread for each choice of
368  // block_size, and then choose the configuration with the smallest value.
369  // This isn't perfect, but seems to generally work OK. It could be
370  // improved with a better model of:
371  // * Number of blocks versus warps per block (to minimize synchronization)
372  // * Thread efficiency for small numbers of rows per thread
373  const size_type block_size_min = 3;
374  const size_type half_nnz_per_row = fem_nnz_per_row / 2 + 1;
375  const size_type block_size_max =
376  half_nnz_per_row % 2 ? half_nnz_per_row + 1 : half_nnz_per_row;
377  Teuchos::Array<TensorReadEntry> reads_per_thread;
378  for (size_type bs = block_size_min; bs<=block_size_max; bs+=2) {
379  // We don't know the number of warps yet, so we just have to bound
380  // sr by the maximum number possible (which is all warps in 1 block)
381  const size_type sr =
382  device_prop.has_shuffle ? 0 : in_vec_scalar_size*warp_size*warps_per_block;
383  size_type shmem =
384  ((in_vec_scalar_size+mat_scalar_size)*bs+out_vec_scalar_size)*tensor_align+sr;
385  shmem = (shmem + sh_granularity-1) & ~(sh_granularity-1);
386  if (shmem <= max_shmem_per_block) {
387  size_type num_blocks = std::min(shcap / shmem, max_blocks_per_sm);
388  size_type tensor_reads = (fem_nnz_per_row+bs-1) / bs;
389  size_type num_warp =
390  std::min(std::max(std::min(warps_per_sm/num_blocks, warps_per_block),
391  min_warps_per_block),
392  max_warps_per_block);
393  while (num_warp > 1 && num_blocks*num_warp % warp_granularity)
394  --num_warp;
395  TensorReadEntry entry;
396  entry.block_size = bs;
397  entry.shmem = shmem;
398  entry.num_blocks = num_blocks;
399  entry.num_warp = num_warp;
400 
401  // Prefer at least 3 blocks
402  size_type factor = std::min(num_blocks,3u);
403  entry.reads = (static_cast<double>(tensor_reads) /
404  static_cast<double>(factor*num_blocks*num_warp));
405  reads_per_thread.push_back(entry);
406  }
407  }
408  TEUCHOS_TEST_FOR_EXCEPTION(
409  reads_per_thread.size() == 0, std::logic_error,
410  "Stochastic problem dimension is too large to fit in shared memory");
411  size_type idx = 0;
412  double min_reads = reads_per_thread[0].reads;
413  for (int i=1; i<reads_per_thread.size(); ++i) {
414  if (reads_per_thread[i].reads < min_reads) {
415  idx = i;
416  min_reads = reads_per_thread[i].reads;
417  }
418  }
419 
420  const size_type block_size = reads_per_thread[idx].block_size;
421  const size_type shmem = reads_per_thread[idx].shmem;
422  const size_type num_blocks = reads_per_thread[idx].num_blocks;
423  const size_type num_warp = reads_per_thread[idx].num_warp;
424 
425 #endif
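// Worked example of the shared-memory sizing above, with illustrative
// numbers only: for 8-byte scalars, tensor dimension 64 and block size
// bs = 5, one block needs ((8+8)*5 + 8)*64 = 5632 bytes for sh_x, sh_A
// and sh_y, plus the sh_t scratch term when warp shuffle is unavailable,
// rounded up to the shared-memory allocation granularity.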
426 
427  // Setup thread blocks and grid
428  const dim3 dBlock( threads_per_row , rows_per_warp*num_warp , 1 );
429  const dim3 dGrid( row_count, 1, 1 );
430 
431 #if 0
432  std::cout << "block_size = " << block_size
433  << " tensor reads = " << (fem_nnz_per_row+block_size-1)/block_size
434  << " regs_per_thread = " << regs_per_thread
435  << " num blocks = " << num_blocks
436  << " num warps = " << num_warp
437  << " num rows = " << tensor_dimension
438  << " rows/warp = " << tensor_dimension / (num_warp*rows_per_warp)
439  << " avg entries/row = " << avg_tensor_entries_per_row
440  << std::endl;
441 #endif
442 
443  // Finally launch our kernel
444  //cudaProfilerStart();
445  Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
446  ( Multiply( A, x, y, a, b, block_size ) );
447  //cudaProfilerStop();
448  }
449 };
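// A minimal usage sketch (assumes a CRS matrix 'A' of Sacado::UQ::PCE
// values and PCE vectors 'x' and 'y' built with a matching Cijk tensor;
// the typedefs refer to the member typedefs of the specialization above,
// which is normally reached through the generic Stokhos::Multiply
// interface declared in Stokhos_Multiply.hpp):
//
//   typedef Stokhos::Multiply< matrix_type, input_vector_type,
//                              output_vector_type > multiply_type;
//   multiply_type::apply( A, x, y );             // y = A*x
//   multiply_type::apply( A, x, y, 2.0, 1.0 );   // y = 2*(A*x) + y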
450 
451 // Kernel implementing y = A * x where
452 // A == Kokkos::CrsMatrix< Sacado::UQ::PCE<...>,...>,
453 // x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
454 // x and y are rank 2
455 //
456 // Note: Unlike the rank-1 version, this version has not been
457 // optimized, and doesn't even include the block-column implementation
458 template <typename MatrixStorage,
459  typename MatrixOrdinal,
460  typename MatrixMemory,
461  typename MatrixSize,
462  typename InputStorage,
463  typename ... InputP,
464  typename OutputStorage,
465  typename ... OutputP>
466 class Multiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
467  MatrixOrdinal,
468  Kokkos::Cuda,
469  MatrixMemory,
470  MatrixSize>,
471  Kokkos::View< Sacado::UQ::PCE<InputStorage>**,
472  InputP... >,
473  Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
474  OutputP... >
475  >
476 {
477 public:
478  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
479  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
480  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
481 
482  typedef Kokkos::Cuda MatrixDevice;
483  typedef MatrixDevice execution_space;
484  typedef execution_space::size_type size_type;
485 
486  typedef KokkosSparse::CrsMatrix< MatrixValue,
487  MatrixOrdinal,
488  MatrixDevice,
489  MatrixMemory,
490  MatrixSize> matrix_type;
491  typedef Kokkos::View< InputVectorValue**,
492  InputP... > input_vector_type;
493  typedef Kokkos::View< OutputVectorValue**,
494  OutputP... > output_vector_type;
495  typedef typename InputVectorValue::value_type input_scalar;
496  typedef typename OutputVectorValue::value_type output_scalar;
497 
498 public:
499 
500  static void apply( const matrix_type & A ,
501  const input_vector_type & x ,
502  const output_vector_type & y ,
503  const input_scalar & a = input_scalar(1) ,
504  const output_scalar & b = output_scalar(0) )
505  {
506  typedef Kokkos::View< InputVectorValue*, InputP... > input_vector_type_1D;
507  typedef Kokkos::View< OutputVectorValue*, OutputP... > output_vector_type_1D;
508  typedef Multiply< matrix_type, input_vector_type_1D,
509  output_vector_type_1D > multiply_type_1D;
510 
511  const size_type num_col = y.extent(1);
512  for (size_type col=0; col<num_col; ++col)
513 
514  multiply_type_1D::apply(
515  A,
516  Kokkos::subview( x, Kokkos::ALL(), col),
517  Kokkos::subview(y, Kokkos::ALL(), col),
518  a, b );
519  }
520 };
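// (Each column of the rank-2 multivectors above is dispatched to the
// optimized rank-1 kernel via Kokkos::subview, so a multivector apply
// costs one kernel launch per column.)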
521 
522 template <typename Kernel>
523 __global__ void
524 #if __CUDA_ARCH__ >= 300
525 __launch_bounds__(1024,2)
526 #endif
527 MeanFullOccupancyKernelLaunch(Kernel kernel) {
528  kernel();
529 }
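// __launch_bounds__(1024, 2) asks the compiler to cap register usage so
// that a 1024-thread block can run with at least two blocks resident per
// multiprocessor, trading registers for occupancy in the mean-matrix
// kernel below.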
530 
531 // Kernel implementing y = A * x where PCE size of A is 1
532 // A == Kokkos::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
533 // x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
534 // x and y are rank 1
535 template <typename MatrixStorage,
536  typename MatrixOrdinal,
537  typename MatrixMemory,
538  typename MatrixSize,
539  typename InputStorage,
540  typename ... InputP,
541  typename OutputStorage,
542  typename ... OutputP>
543 class MeanMultiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
544  MatrixOrdinal,
545  Kokkos::Cuda,
546  MatrixMemory,
547  MatrixSize >,
548  Kokkos::View< Sacado::UQ::PCE<InputStorage>*,
549  InputP... >,
550  Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
551  OutputP... >
552  >
553 {
554 public:
555  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
556  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
557  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
558 
559  typedef Kokkos::Cuda MatrixDevice;
560  typedef MatrixDevice execution_space;
561  typedef KokkosSparse::CrsMatrix< MatrixValue,
562  MatrixOrdinal,
563  MatrixDevice,
564  MatrixMemory,
565  MatrixSize> matrix_type;
566  typedef typename matrix_type::values_type matrix_values_type;
567  typedef typename MatrixValue::ordinal_type size_type;
568  typedef Kokkos::View< InputVectorValue*,
569  InputP... > input_vector_type;
570  typedef Kokkos::View< OutputVectorValue*,
571  OutputP... > output_vector_type;
572 
573  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
574  typedef typename MatrixValue::value_type matrix_scalar;
575  typedef typename InputVectorValue::value_type input_scalar;
576  typedef typename OutputVectorValue::value_type output_scalar;
577 
578  template <int BlockSize>
579  struct Kernel {
580  typedef MatrixDevice execution_space;
581  typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
582  typedef typename input_vector_type::array_type input_array_type;
583  typedef typename output_vector_type::array_type output_array_type;
584 
585  const matrix_array_type m_A_values ;
586  const matrix_graph_type m_A_graph ;
587  const output_array_type v_y ;
588  const input_array_type v_x ;
589  const input_scalar m_a ;
590  const output_scalar m_b ;
591  const size_type m_row_count;
592  const size_type dim ;
593 
594  Kernel( const matrix_type & A ,
595  const input_vector_type & x ,
596  const output_vector_type & y ,
597  const input_scalar & a ,
598  const output_scalar & b )
599  : m_A_values( A.values )
600  , m_A_graph( A.graph )
601  , v_y( y )
602  , v_x( x )
603  , m_a( a )
604  , m_b( b )
605  , m_row_count( A.graph.row_map.extent(0)-1 )
606  , dim( dimension_scalar(x) )
607  {}
608 
609  __device__ void operator()(void) const
610  {
611  // Store matrix values and column indices in shared memory
612  // to reduce global memory reads
613  volatile matrix_scalar * const sh_A =
614  kokkos_impl_cuda_shared_memory<matrix_scalar>();
615  volatile size_type * const sh_col =
616  reinterpret_cast<volatile size_type*>(sh_A + BlockSize*blockDim.y);
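// Shared-memory layout: BlockSize*blockDim.y matrix values (sh_A)
// followed by BlockSize*blockDim.y column indices (sh_col).  The matrix
// row handled by threadIdx.y owns slots col*blockDim.y + threadIdx.y,
// col = 0..BlockSize-1, so rows within a block never collide.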
617 
618  // Check for valid row
619  const size_type iBlockRow = blockDim.y*blockIdx.x + threadIdx.y;
620  if (iBlockRow < m_row_count) {
621 
622  const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
623  const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
624 
625  // Initialize result
626  if (m_b == output_scalar(0))
627  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
628  v_y(pce, iBlockRow) = 0.0;
629  else
630  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
631  v_y(pce, iBlockRow) = m_b*v_y(pce, iBlockRow);
632 
633  // Loop over columns in chunks of size BlockSize
634  for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
635  col_block+=BlockSize) {
636  const size_type num_col = col_block+BlockSize <= iEntryEnd ?
637  BlockSize : iEntryEnd-col_block;
638 
639  // Read BlockSize column indices and matrix values at a time to
640  // maintain coalesced accesses
641  for (size_type col=threadIdx.x; col<num_col; col+=blockDim.x) {
642  sh_col[col*blockDim.y+threadIdx.y] =
643  m_A_graph.entries(col_block+col);
644  sh_A[col*blockDim.y+threadIdx.y] =
645  m_A_values(col_block+col);
646  }
647  if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
648  __syncthreads();
649 
650  // Do the portion of the mat-vec for each PCE coefficient over the
651  // columns within this block
652  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x ) {
653  output_scalar s = 0.0;
654  for ( size_type col = 0; col < num_col; ++col ) {
655  const size_type iCol = sh_col[col*blockDim.y+threadIdx.y];
656  const matrix_scalar aA = m_a*sh_A[col*blockDim.y+threadIdx.y];
657  s += aA*v_x(pce, iCol);
658  }
659  v_y(pce, iBlockRow) += s;
660  }
661  }
662  }
663  }
664  };
665 
666  static void apply( const matrix_type & A ,
667  const input_vector_type & x ,
668  const output_vector_type & y ,
669  const input_scalar & a = input_scalar(1) ,
670  const output_scalar & b = output_scalar(0) )
671  {
672  const size_t row_count = A.graph.row_map.extent(0) - 1;
673  const size_type dim = dimension_scalar(x);
674 
675  // Compute number of threads for PCE coefficients and number of
676  // matrix rows processed by each CUDA block. A total of 256 threads
677  // gives best occupancy
678  size_type threads_per_row;
679  size_type rows_per_block;
680  if (dim >= 32) {
681  threads_per_row = 32;
682  rows_per_block = 8;
683  }
684  else {
685  threads_per_row = 16;
686  rows_per_block = 16;
687  }
688  const size_type num_blocks =
689  (row_count + rows_per_block -1 ) / rows_per_block;
690 
691  // Setup thread blocks and grid
692  const dim3 dBlock( threads_per_row , rows_per_block , 1 );
693  const dim3 dGrid( num_blocks, 1, 1 );
694 
695  // Setup shared memory for storing matrix values and column indices
696  // Number of columns we process at a time -- making this bigger
697  // requires more shared memory and reduces occupancy
698  const int BlockSize = 32;
699  if (sizeof(matrix_scalar) > 4)
700  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
701  else
702  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
703  const size_t shared =
704  (BlockSize*dBlock.y) * (sizeof(size_type) + sizeof(matrix_scalar));
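// E.g. with 8 rows per block, 4-byte ordinals and 8-byte matrix values
// this is 32*8*(4+8) = 3072 bytes of shared memory per block
// (illustrative sizes only; the actual types are template parameters).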
705 
706  // Launch kernel
707  MeanFullOccupancyKernelLaunch<<<dGrid, dBlock, shared >>>
708  ( Kernel<BlockSize>( A, x, y, a, b ) );
709  }
710 };
711 
712 /*
713  * This specialization is disabled because it is actually slower than the
714  * default implementation of launching the single-column kernel one column at a time.
715  * It appears to use a few more registers, which reduces occupancy.
716 
717 // Kernel implementing y = A * x where PCE size of A is 1
718 // A == Kokkos::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
719 // x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
720 // x and y are rank 2
721 template <typename MatrixStorage,
722  typename MatrixOrdinal,
723  typename MatrixMemory,
724  typename MatrixSize,
725  typename InputStorage,
726  typename ... InputP,
727  typename OutputStorage,
728  typename ... OutputP>
729 class MeanMultiply< Kokkos::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
730  MatrixOrdinal,
731  Kokkos::Cuda,
732  MatrixMemory,
733  MatrixSize >,
734  Kokkos::View< Sacado::UQ::PCE<InputStorage>**,
735  InputP... >,
736  Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
737  OutputP... >
738  >
739 {
740 public:
741  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
742  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
743  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
744 
745  typedef Kokkos::Cuda MatrixDevice;
746  typedef MatrixDevice execution_space;
747  typedef Kokkos::CrsMatrix< MatrixValue,
748  MatrixOrdinal,
749  MatrixDevice,
750  MatrixMemory,
751  MatrixSize> matrix_type;
752  typedef typename matrix_type::values_type matrix_values_type;
753  typedef typename MatrixValue::ordinal_type size_type;
754  typedef Kokkos::View< InputVectorValue**,
755  InputP... > input_vector_type;
756  typedef Kokkos::View< OutputVectorValue**,
757  OutputP... > output_vector_type;
758 
759  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
760  typedef typename MatrixValue::value_type matrix_scalar;
761  typedef typename InputVectorValue::value_type input_scalar;
762  typedef typename OutputVectorValue::value_type output_scalar;
763 
764  template <int BlockSize>
765  struct Kernel {
766  typedef MatrixDevice execution_space;
767  typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
768  typedef typename input_vector_type::array_type input_array_type;
769  typedef typename output_vector_type::array_type output_array_type;
770 
771  const matrix_array_type m_A_values ;
772  const matrix_graph_type m_A_graph ;
773  const output_array_type v_y ;
774  const input_array_type v_x ;
775  const input_scalar m_a ;
776  const output_scalar m_b ;
777  const size_type m_row_count;
778  const size_type m_num_col;
779  const size_type dim ;
780 
781  Kernel( const matrix_type & A ,
782  const input_vector_type & x ,
783  const output_vector_type & y ,
784  const input_scalar & a ,
785  const output_scalar & b )
786  : m_A_values( A.values )
787  , m_A_graph( A.graph )
788  , v_y( y )
789  , v_x( x )
790  , m_a( a )
791  , m_b( b )
792  , m_row_count( A.graph.row_map.extent(0)-1 )
793  , m_num_col( x.extent(1) )
794  , dim( dimension_scalar(x) )
795  {}
796 
797  __device__ void operator()(void) const
798  {
799  // Store matrix values and column indices in shared memory
800  // to reduce global memory reads
801  volatile matrix_scalar * const sh_A =
802  kokkos_impl_cuda_shared_memory<matrix_scalar>();
803  volatile size_type * const sh_col =
804  reinterpret_cast<volatile size_type*>(sh_A + BlockSize*blockDim.y);
805 
806  // Check for valid row
807  const size_type iBlockRow = blockDim.y*blockIdx.x + threadIdx.y;
808  if (iBlockRow < m_row_count) {
809 
810  const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
811  const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
812 
813  // Initialize result
814  if (m_b == output_scalar(0))
815  for ( size_type xcol = 0; xcol < m_num_col; xcol++)
816  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
817  v_y(pce, iBlockRow, xcol) = 0.0;
818  else
819  for ( size_type xcol = 0; xcol < m_num_col; xcol++)
820  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
821  v_y(pce, iBlockRow, xcol) = m_b*v_y(pce, iBlockRow, xcol);
822 
823  // Loop over columns in chunks of size BlockSize
824  for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
825  col_block+=BlockSize) {
826  const size_type num_col = col_block+BlockSize <= iEntryEnd ?
827  BlockSize : iEntryEnd-col_block;
828 
829  // Read BlockSize entries column indices at a time to maintain
830  // coalesced accesses
831  for (size_type col=threadIdx.x; col<num_col; col+=blockDim.x) {
832  sh_col[col*blockDim.y+threadIdx.y] =
833  m_A_graph.entries(col_block+col);
834  sh_A[col*blockDim.y+threadIdx.y] =
835  m_A_values(col_block+col);
836  }
837  if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
838  __syncthreads();
839 
840  // Do portion mat-vec for each PCE coefficient and for columns
841  // within this block
842  for ( size_type xcol = 0; xcol < m_num_col; xcol++) {
843  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x ) {
844  output_scalar s = 0.0;
845  for ( size_type col = 0; col < num_col; ++col ) {
846  const size_type iCol = sh_col[col*blockDim.y+threadIdx.y];
847  const matrix_scalar aA = m_a*sh_A[col*blockDim.y+threadIdx.y];
848  s += aA*v_x(pce, iCol, xcol);
849  }
850  v_y(pce, iBlockRow, xcol) += s;
851  }
852  }
853  }
854  }
855  }
856  };
857 
858  static void apply( const matrix_type & A ,
859  const input_vector_type & x ,
860  const output_vector_type & y ,
861  const input_scalar & a = input_scalar(1) ,
862  const output_scalar & b = output_scalar(0) )
863  {
864  const size_t row_count = A.graph.row_map.extent(0) - 1;
865  const size_type dim = dimension_scalar(x);
866 
867  // Compute number of threads for PCE coefficients and number of
868  // matrix rows processed by each CUDA block. A total of 256 threads
869  // gives best occupancy
870  size_type threads_per_row;
871  size_type rows_per_block;
872  if (dim >= 32) {
873  threads_per_row = 32;
874  rows_per_block = 6;
875  }
876  else {
877  threads_per_row = 16;
878  rows_per_block = 12;
879  }
880  const size_type num_blocks =
881  (row_count + rows_per_block -1 ) / rows_per_block;
882 
883  // Setup thread blocks and grid
884  const dim3 dBlock( threads_per_row , rows_per_block , 1 );
885  const dim3 dGrid( num_blocks, 1, 1 );
886 
887  // Setup shared memory for storing matrix values and column indices
888  // Number of columns we process at a time -- making this bigger
889  // requires more shared memory and reduces occupancy
890  const int BlockSize = 32;
891  if (sizeof(matrix_scalar) > 4)
892  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
893  else
894  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
895  const size_t shared =
896  (BlockSize*dBlock.y) * (sizeof(size_type) + sizeof(matrix_scalar));
897 
898  // Launch kernel
899  // MeanFullOccupancyKernelLaunch<<<dGrid, dBlock, shared >>>
900  // ( Kernel<BlockSize>( A, x, y, a, b ) );
901 
902  Kokkos::Impl::cuda_parallel_launch_local_memory<<<dGrid, dBlock, shared >>>
903  ( Kernel<BlockSize>( A, x, y, a, b ) );
904  }
905 };
906 */
907 
908 } // namespace Stokhos
909 
910 #endif /* #if defined( __CUDACC__) */
911 
912 #endif /* #ifndef KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP */