Stokhos_Cuda_CrsProductTensor.hpp
// @HEADER
// *****************************************************************************
// Stokhos Package
//
// Copyright 2009 NTESS and the Stokhos contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef STOKHOS_CUDA_CRS_PRODUCT_TENSOR_HPP
#define STOKHOS_CUDA_CRS_PRODUCT_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_CrsProductTensor.hpp"

#include "Stokhos_Cuda_DeviceProp.hpp"
#include "Stokhos_Cuda_WarpShuffle.hpp"

#include "Teuchos_TestForException.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {

//----------------------------------------------------------------------------

// Matrix-vector product specialization for CrsProductTensor layout
// To do:
//   * Incorporate texture/read-only cache
//   * Change tensor layout to allow coalesced reads with smaller
//     threads/row (will probably only help smaller problem sizes)
//   * Get average FEM entries/row from FEM graph (hardcoded to 27)
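//
// Sketch of the algorithm implemented by the kernels below: each CUDA block
// owns one row of the deterministic (finite element) graph.  It streams
// blocks of the matrix values A and the input vector x for that row into
// shared memory, and for every stochastic basis index i accumulates
//
//   y(i) += c(i,j,k) * ( A(j)*x(k) + A(k)*x(j) )
//
// over the sparse product-tensor entries (j,k) belonging to i, applying each
// stored entry to both orderings of the (j,k) pair.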
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< CrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef CrsProductTensor< TensorScalar, execution_space > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;

#define USE_LDG 0
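// USE_LDG chooses between the two kernel variants below: 0 stages both A and
// x in shared memory, 1 stages only x and reads A from global memory through
// the read-only data cache (__ldg).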

#if USE_LDG == 0

  // The multiply kernel
  class MultiplyKernel {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type BlockSize;

    MultiplyKernel( const matrix_type & A,
                    const vector_type & x,
                    const vector_type & y,
                    const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), BlockSize(block_size) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();

      // Get shared memory for loading x, A, and y
      volatile VectorScalar * const sh_x =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile MatrixScalar * const sh_A = sh_x + BlockSize*dim;
      volatile VectorScalar * const sh_y = sh_A + BlockSize*dim;
#if !HAVE_CUDA_SHUFFLE
      volatile VectorScalar * const sh_t = sh_y + dim;
#endif
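      // Shared-memory layout for this block:
      //   sh_x : BlockSize*dim entries of x, interleaved so that the
      //          BlockSize FEM columns for stochastic index i sit at
      //          sh_x[col + i*BlockSize]
      //   sh_A : BlockSize*dim entries of A with the same interleaving
      //   sh_y : dim accumulators, one per stochastic basis
      //   sh_t : (only when warp shuffles are unavailable) one scratch
      //          slot per thread for the intra-warp reduction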

      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // Mask used for shuffle/warp-sync operations
      const int mask = blockDim.x == 32 ? 0xffffffff :
        ((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;
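      // Each group of blockDim.x threads forms a logical sub-warp.  When
      // blockDim.x < 32 the mask selects exactly the hardware-warp lanes
      // occupied by this thread's sub-warp, so shuffles and warp syncs
      // involve only the threads cooperating on the same stochastic rows.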

      // Zero y
      for ( size_type i = tid; i < dim; i += nid ) {
        sh_y[i] = 0.0;
      }

      // Loop over columns in the discrete (finite element) system.
      // blockIdx.x == row in the deterministic (finite element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
           iBlockEntry += BlockSize) {
        const size_type block_size =
          (iBlockEntryEnd-iBlockEntry < BlockSize) ?
            iBlockEntryEnd-iBlockEntry : BlockSize;

        // Wait for X and A to be used in the previous iteration
        // before reading new values.
        __syncthreads();

        // Coalesced read of blocks of X and A into shared memory
        for ( size_type col = 0; col < block_size; ++col ) {

          const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
          const VectorScalar * const x = & m_x( 0, iBlockColumn );
          const MatrixScalar * const A = & m_A.values( 0, iBlockEntry + col );

          // Coalesced read by the whole block from global memory:
          for ( size_type i = tid; i < dim; i += nid ) {
            sh_x[col + i * BlockSize] = x[i]; // m_x( i, iBlockColumn );
            sh_A[col + i * BlockSize] = A[i]; // m_A.values( i, iBlockEntry );
          }

        }

        __syncthreads(); // wait for X and A to be read before using them

        // This cuda block is responsible for computing all values of 'y'
        for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
          VectorScalar y = 0;

          // Product tensor entries which this warp will iterate:
          const size_type lBeg = m_A.block.entry_begin( i );
          const size_type lEnd = m_A.block.entry_end( i );

          // Loop through sparse tensor contributions with coalesced reads.
          for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {

            // Read 'blockDim.x' entries from the tensor (coalesced)
            const size_type kj = m_A.block.coord( l );
            const TensorScalar v = m_A.block.value( l );
            const size_type j = ( kj & 0x0ffff ) * BlockSize ;
            const size_type k = ( kj >> 16 ) * BlockSize ;
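            // The two tensor coordinates are packed into one 32-bit word:
            // the low 16 bits hold j, the high 16 bits hold k.  Both are
            // pre-scaled by BlockSize so they serve directly as base
            // offsets into the interleaved sh_A / sh_x arrays.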

            for ( size_type col = 0; col < block_size; ++col ) {
              y += v * ( sh_A[col+j] * sh_x[col+k] +
                         sh_A[col+k] * sh_x[col+j] );
            }

          }

          // Reduction of 'y' within 'blockDim.x'
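          // Each of the blockDim.x threads of the sub-warp now holds a
          // partial sum for row i.  The tree reduction below halves the
          // number of active lanes at each step, using warp shuffles when
          // available and the sh_t scratch array otherwise; lane 0 ends up
          // with the total and adds it into sh_y[i].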
#if HAVE_CUDA_SHUFFLE
          if (blockDim.x >= 2)  y += shfl_down(y, 1, blockDim.x, mask);
          if (blockDim.x >= 4)  y += shfl_down(y, 2, blockDim.x, mask);
          if (blockDim.x >= 8)  y += shfl_down(y, 4, blockDim.x, mask);
          if (blockDim.x >= 16) y += shfl_down(y, 8, blockDim.x, mask);
          if (blockDim.x >= 32) y += shfl_down(y, 16, blockDim.x, mask);
          if ( threadIdx.x == 0 ) sh_y[i] += y;
#else
          sh_t[ tid ] = y;
          if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
          sync_warp(mask);
          if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
          sync_warp(mask);
          if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
          sync_warp(mask);
          if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
          sync_warp(mask);
          if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
          sync_warp(mask);
          if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
#endif

        }

      }

      // Wait for all contributions of y to be completed
      __syncthreads();

      // Store result back in global memory
      for ( size_type i = tid; i < dim; i += nid ) {
        m_y( i, blockIdx.x ) = sh_y[ i ];
      }
    }
  };

#elif USE_LDG == 1

  // The multiply kernel -- using the read-only data cache for A.
  // Currently slower than the one above.
  class MultiplyKernel {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type BlockSize;

    MultiplyKernel( const matrix_type & A,
                    const vector_type & x,
                    const vector_type & y,
                    const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), BlockSize(block_size) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();

      volatile VectorScalar * const sh_x =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y = sh_x + BlockSize*dim;
#if !HAVE_CUDA_SHUFFLE
      volatile VectorScalar * const sh_t = sh_y + dim;
#endif

      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // Mask used for shuffle/warp-sync operations
      const int mask = blockDim.x == 32 ? 0xffffffff :
        ((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;

      // Zero y
      for ( size_type i = tid; i < dim; i += nid ) {
        sh_y[i] = 0.0;
      }

      // Loop over columns in the discrete (finite element) system.
      // blockIdx.x == row in the deterministic (finite element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
           iBlockEntry += BlockSize) {
        const size_type block_size =
          (iBlockEntryEnd-iBlockEntry < BlockSize) ?
            iBlockEntryEnd-iBlockEntry : BlockSize;

        // Wait for X and A to be used in the previous iteration
        // before reading new values.
        __syncthreads();

        // Coalesced read of blocks of X into shared memory
        for ( size_type col = 0; col < block_size; ++col ) {

          const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
          const VectorScalar * const x = & m_x( 0, iBlockColumn );

          // Coalesced read by the whole block from global memory:
          for ( size_type i = tid; i < dim; i += nid ) {
            sh_x[col + i * BlockSize] = x[i]; // m_x( i, iBlockColumn );
          }

        }

        __syncthreads(); // wait for X to be read before using it

        // This cuda block is responsible for computing all values of 'y'
        for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
          VectorScalar y = 0;

          // Product tensor entries which this warp will iterate:
          const size_type lBeg = m_A.block.entry_begin( i );
          const size_type lEnd = m_A.block.entry_end( i );

          // Loop through sparse tensor contributions with coalesced reads.
          for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {

            // Read 'blockDim.x' entries from the tensor (coalesced)
            const size_type kj = m_A.block.coord( l );
            const TensorScalar v = m_A.block.value( l );
            const size_type j = ( kj & 0x0ffff ) ;
            const size_type k = ( kj >> 16 ) ;
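            // Same 16-bit packing as in the shared-memory kernel, but here
            // j and k are left unscaled: A is read directly from global
            // memory (through __ldg on sm_35+ so it passes through the
            // read-only data cache), and only x comes from shared memory.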

            for ( size_type col = 0; col < block_size; ++col ) {
              const size_type bCol = iBlockEntry + col;
#if (__CUDA_ARCH__ >= 350)
              y += v * ( __ldg(&m_A.values(j,bCol)) * sh_x[col+k*BlockSize] +
                         __ldg(&m_A.values(k,bCol)) * sh_x[col+j*BlockSize] );
#else
              y += v * ( m_A.values(j,bCol) * sh_x[col+k*BlockSize] +
                         m_A.values(k,bCol) * sh_x[col+j*BlockSize] );
#endif
            }

          }

          // Reduction of 'y' within 'blockDim.x'
#if HAVE_CUDA_SHUFFLE
          if (blockDim.x >= 2)  y += shfl_down(y, 1, blockDim.x, mask);
          if (blockDim.x >= 4)  y += shfl_down(y, 2, blockDim.x, mask);
          if (blockDim.x >= 8)  y += shfl_down(y, 4, blockDim.x, mask);
          if (blockDim.x >= 16) y += shfl_down(y, 8, blockDim.x, mask);
          if (blockDim.x >= 32) y += shfl_down(y, 16, blockDim.x, mask);
          if ( threadIdx.x == 0 ) sh_y[i] += y;
#else
          sh_t[ tid ] = y;
          if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
          sync_warp(mask);
          if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
          sync_warp(mask);
          if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
          sync_warp(mask);
          if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
          sync_warp(mask);
          if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
          sync_warp(mask);
          if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
#endif

        }

      }

      // Wait for all contributions of y to be completed
      __syncthreads();

      // Store result back in global memory
      for ( size_type i = tid; i < dim; i += nid ) {
        m_y( i, blockIdx.x ) = sh_y[ i ];
      }
    }
  };

#endif

  //------------------------------------

  struct TensorReadEntry {
    size_type block_size, shmem, num_blocks, num_warp;
    double reads;
  };
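  // One candidate kernel launch configuration considered by apply() below;
  // 'reads' is the estimated number of tensor reads per thread for that
  // configuration, and the candidate with the smallest value is chosen.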

  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tensor_align = tensor_dimension;
    const size_type avg_tensor_entries_per_row = A.block.avg_entries_per_row();

    // Should compute this from the FEM graph
    const size_type fem_nnz_per_row = 27;

    // Get the device properties we need for whatever device is currently selected
    DeviceProp device_prop;
    const size_type shcap = device_prop.shared_memory_capacity;
    const size_type sh_granularity = device_prop.shared_memory_granularity;
    const size_type max_shmem_per_block = device_prop.max_shmem_per_block;
    const size_type max_blocks_per_sm = device_prop.max_blocks_per_sm;
    const size_type warp_size = device_prop.warp_size;
    const size_type warp_granularity = device_prop.warp_granularity;
    const size_type max_warps_per_block =
      std::min(device_prop.max_threads_per_block / warp_size,
               device_prop.max_warps_per_sm);
    const size_type min_warps_per_block = 1;
    const size_type max_regs_per_sm = device_prop.max_regs_per_sm;
    const size_type max_regs_per_block = device_prop.max_regs_per_block;
    const size_type reg_bank_size = device_prop.reg_bank_size;

    // Compute the number of warps we can fit on each SM based on register
    // limits.  Use Cuda introspection to determine the number of registers
    // per thread.
    //const size_type regs_per_thread = 46;
    const size_type regs_per_thread =
      device_prop.get_kernel_registers(
        Kokkos::Impl::cuda_parallel_launch_local_memory<MultiplyKernel>);
    const size_type regs_per_warp =
      (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
    const size_type warps_per_sm =
      (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
    const size_type warps_per_block =
      (max_regs_per_block/regs_per_warp) & ~(warp_granularity-1);

    // Compute the number of threads per stochastic row based on the number
    // of nonzero entries per row.
    // For double, 16 threads/row is still coalesced, but not for float.
    // We should reorder the tensor values for a given vector width to
    // maintain coalesced reads.  This would help smaller problems too by
    // allowing fewer threads/row.
    const size_type threads_per_row =
      avg_tensor_entries_per_row >= 88 ? 32 : 16;
    const size_type rows_per_warp = warp_size / threads_per_row;

    const size_type vec_scalar_size = sizeof(VectorScalar);
#if USE_LDG == 0
    const size_type mat_scalar_size = sizeof(MatrixScalar);
#endif

#define USE_FIXED_BLOCKSIZE 0

#if USE_FIXED_BLOCKSIZE

    const size_type num_blocks = 3;
    size_type nw = warps_per_sm / num_blocks;
    while (nw > 1 && num_blocks*nw % warp_granularity) --nw;
    const size_type num_warp = nw;
    const size_type sh_per_block = shcap / num_blocks;
    const size_type sr =
      device_prop.has_shuffle ? 0 : vec_scalar_size*warp_size*num_warp;
#if USE_LDG == 1
    size_type bs = ((sh_per_block - sr) / tensor_align - vec_scalar_size) /
      vec_scalar_size;
#else
    size_type bs = ((sh_per_block - sr) / tensor_align - vec_scalar_size) /
      (vec_scalar_size+mat_scalar_size);
#endif
    if (bs % 2 == 0) --bs;
    const size_type block_size_max = 31;
    const size_type block_size = std::min(bs, block_size_max);
    //const size_type block_size = 7;
#if USE_LDG == 1
    const size_type shmem =
      ( (vec_scalar_size*block_size+vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
#else
    const size_type shmem =
      ( ((vec_scalar_size+mat_scalar_size)*block_size+vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
#endif

#else

    // We want to maximize the number of blocks per SM (to maximize
    // throughput) as well as the block_size (to minimize tensor reads),
    // subject to shared memory and register constraints.  Here we try to do
    // this by computing the number of tensor reads per block per thread for
    // each choice of block_size, and then choosing the configuration with
    // the smallest value.  This isn't perfect, but seems to generally work
    // OK.  It could be improved with a better model of:
    //   * Number of blocks versus warps per block (to minimize synchronization)
    //   * Thread efficiency for small numbers of rows per thread
    const size_type block_size_min = 3;
    const size_type half_nnz_per_row = fem_nnz_per_row / 2 + 1;
    const size_type block_size_max =
      half_nnz_per_row % 2 ? half_nnz_per_row + 1 : half_nnz_per_row;
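    // With the hardcoded fem_nnz_per_row = 27 this gives half_nnz_per_row
    // = 14 and block_size_max = 14, so the loop below considers the odd
    // block sizes 3, 5, ..., 13.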
    Teuchos::Array<TensorReadEntry> reads_per_thread;
    for (size_type bs = block_size_min; bs<=block_size_max; bs+=2) {
      // We don't know the number of warps yet, so we just have to bound
      // sr by the maximum number possible (which is all warps in 1 block)
      const size_type sr =
        device_prop.has_shuffle ? 0 : vec_scalar_size*warp_size*warps_per_block;
#if USE_LDG == 1
      size_type shmem =
        (vec_scalar_size*bs+vec_scalar_size)*tensor_align+sr;
#else
      size_type shmem =
        ((vec_scalar_size+mat_scalar_size)*bs+vec_scalar_size)*tensor_align+sr;
#endif
      shmem = (shmem + sh_granularity-1) & ~(sh_granularity-1);
      if (shmem <= max_shmem_per_block) {
        size_type num_blocks = std::min(shcap / shmem, max_blocks_per_sm);
        size_type tensor_reads = (fem_nnz_per_row+bs-1) / bs;
        size_type num_warp =
          std::min(std::max(std::min(warps_per_sm/num_blocks, warps_per_block),
                            min_warps_per_block),
                   max_warps_per_block);
        while (num_warp > 1 && num_blocks*num_warp % warp_granularity)
          --num_warp;
        TensorReadEntry entry;
        entry.block_size = bs;
        entry.shmem = shmem;
        entry.num_blocks = num_blocks;
        entry.num_warp = num_warp;

        // Prefer at least 3 blocks
        size_type factor = std::min(num_blocks,3u);
        entry.reads = (static_cast<double>(tensor_reads) /
                       static_cast<double>(factor*num_blocks*num_warp));
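        // Estimated tensor reads per thread: the reads needed per FEM row
        // divided by the available parallelism (blocks * warps), with
        // 'factor' additionally penalizing configurations that keep fewer
        // than 3 blocks resident per SM.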
        reads_per_thread.push_back(entry);
      }
    }
    TEUCHOS_TEST_FOR_EXCEPTION(
      reads_per_thread.size() == 0, std::logic_error,
      "Stochastic problem dimension is too large to fit in shared memory");
    size_type idx = 0;
    double min_reads = reads_per_thread[0].reads;
    for (int i=1; i<reads_per_thread.size(); ++i) {
      if (reads_per_thread[i].reads < min_reads) {
        idx = i;
        min_reads = reads_per_thread[i].reads;
      }
    }

    const size_type block_size = reads_per_thread[idx].block_size;
    const size_type shmem = reads_per_thread[idx].shmem;
    const size_type num_blocks = reads_per_thread[idx].num_blocks;
    const size_type num_warp = reads_per_thread[idx].num_warp;

#endif

    // Setup thread blocks and grid
    const dim3 dBlock( threads_per_row , rows_per_warp*num_warp , 1 );
    const dim3 dGrid( row_count, 1, 1 );
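    // One CUDA block per FEM row: blockDim.x threads cooperate on each
    // stochastic row, and blockDim.y = rows_per_warp*num_warp stochastic
    // rows are processed concurrently per block.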

#if 0
    std::cout << "block_size = " << block_size
              << " tensor reads = " << (fem_nnz_per_row+block_size-1)/block_size
              << " regs_per_thread = " << regs_per_thread
              << " num blocks = " << num_blocks
              << " num warps = " << num_warp
              << " num rows = " << tensor_dimension
              << " rows/warp = " << tensor_dimension / (num_warp*rows_per_warp)
              << " avg entries/row = " << avg_tensor_entries_per_row
              << std::endl;
#endif

    // Finally launch our kernel
    //cudaProfilerStart();
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
      ( MultiplyKernel( A, x, y, block_size ) );
    //cudaProfilerStop();
  }

};
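
// A minimal usage sketch of this specialization, assuming A and x have
// already been assembled by the usual Stokhos/Kokkos setup code; the extents
// num_fem_rows and num_fem_cols are placeholders for the deterministic graph
// dimensions.  It simply calls the static apply() defined above:
//
//   typedef Stokhos::CrsProductTensor< double, Kokkos::Cuda >            tensor_type;
//   typedef Stokhos::BlockCrsMatrix< tensor_type, double, Kokkos::Cuda > matrix_type;
//   typedef Kokkos::View< double**, Kokkos::LayoutLeft, Kokkos::Cuda >   vector_type;
//
//   matrix_type A = /* assembled block CRS matrix */;
//   vector_type x( "x", A.block.dimension(), num_fem_cols );
//   vector_type y( "y", A.block.dimension(), num_fem_rows );
//
//   Stokhos::Multiply< matrix_type, vector_type, vector_type >::apply( A, x, y );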

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_CRS_PRODUCT_TENSOR_HPP */