Stokhos_Cuda_LinearSparse3Tensor.hpp
// @HEADER
// *****************************************************************************
// Stokhos Package
//
// Copyright 2009 NTESS and the Stokhos contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP
#define STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_LinearSparse3Tensor.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {

//----------------------------------------------------------------------------

/*
 * This version uses 4 warps per block, one warp per spatial row, and loops
 * through stochastic rows 32 at a time.
 */
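/*
 * As implemented by the kernels below, each spatial row 'row' of the result
 * is (with c0 = block.value(0), c1 = block.value(1), and, in the asymmetric
 * case, c2 = block.value(2); a(i) the matrix entry for stochastic index i of
 * a column, and x(i) the corresponding input-vector entry):
 *
 *   y(0,row) = sum over columns of [ c0*a(0)*x(0) + c1*sum_{i>=1} a(i)*x(i) ]
 *   y(i,row) = sum over columns of [ c1*( a(0)*x(i) + a(i)*x(0) ) ]                 (symmetric, i >= 1)
 *   y(i,row) = sum over columns of [ c1*( a(0)*x(i) + a(i)*x(0) ) + c2*a(i)*x(i) ]  (asymmetric, i >= 1)
 */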
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar,
          int BlockSize >
class Multiply<
  BlockCrsMatrix< LinearSparse3Tensor< TensorScalar, Kokkos::Cuda, BlockSize >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef BlockCrsMatrix< LinearSparse3Tensor< TensorScalar, Kokkos::Cuda, BlockSize >,
                          MatrixScalar, Kokkos::Cuda > matrix_type;

  typedef Kokkos::View< VectorScalar**,
                        Kokkos::LayoutLeft,
                        Kokkos::Cuda > vector_type;

  template <int MAX_COL>
  class ApplyKernelSymmetric {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ApplyKernelSymmetric( const matrix_type & A,
                          const vector_type & x,
                          const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();

      volatile VectorScalar * const sh =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y0 =
        sh + blockDim.x*threadIdx.y;
      volatile VectorScalar * const sh_a0 =
        sh + blockDim.x*blockDim.y + MAX_COL*threadIdx.y;
      volatile VectorScalar * const sh_x0 =
        sh + blockDim.x*blockDim.y + MAX_COL*blockDim.y + MAX_COL*threadIdx.y;
      volatile size_type * const sh_col =
        (size_type*)(sh + blockDim.x*blockDim.y + 2*MAX_COL*blockDim.y) +
        MAX_COL*threadIdx.y;
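
      // (Per-warp shared-memory layout set up above: blockDim.x entries of
      // y0-reduction scratch (sh_y0), then MAX_COL cached a(0) values
      // (sh_a0), MAX_COL cached x(0) values (sh_x0), and MAX_COL cached
      // column indices (sh_col) for the spatial row handled by this warp.)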

      // blockIdx.x == row in the deterministic (finite element) system
      const size_type row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row < m_A.graph.row_map.extent(0)-1) {
        const size_type colBeg = m_A.graph.row_map[ row ];
        const size_type colEnd = m_A.graph.row_map[ row + 1 ];

        // Read tensor entries
        const TensorScalar c0 = m_A.block.value(0);
        const TensorScalar c1 = m_A.block.value(1);

        // Zero y
        VectorScalar y0 = 0.0;

        // Read column offsets for this row
        for ( size_type lcol = threadIdx.x; lcol < colEnd-colBeg;
              lcol += blockDim.x )
          sh_col[lcol] = m_A.graph.entries( lcol+colBeg );

        // Loop over stochastic rows
        for ( size_type stoch_row = threadIdx.x; stoch_row < dim;
              stoch_row += blockDim.x) {

          VectorScalar yi = 0.0;

          // Loop over columns in the discrete (finite element) system.
          // We should probably only loop over a portion at a time to
          // make better use of L2 cache.
          //
          // We could also apply some warps to columns in parallel.
          // This requires inter-warp reduction of y0 and yi.
          for ( size_type col_offset = colBeg; col_offset < colEnd;
                col_offset += 1 ) {

            // Get column index
            const size_type lcol = col_offset-colBeg;
            const size_type col = sh_col[lcol];

            // Read a[i] and x[i] (coalesced)
            const MatrixScalar ai = m_A.values( stoch_row, col_offset );
            const VectorScalar xi = m_x( stoch_row, col );

            // Store a[0] and x[0] to shared memory for this column
            if (stoch_row == 0) {
              sh_a0[lcol] = ai;
              sh_x0[lcol] = xi;
            }

            // Retrieve a[0] and x[0] from shared memory for this column
            const MatrixScalar a0 = sh_a0[lcol];
            const VectorScalar x0 = sh_x0[lcol];

            // Subtract off contribution of first iteration of loop
            if (stoch_row == 0) y0 += (c0-3.0*c1)*a0*x0;
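            // (At stoch_row == 0 the updates below add 2*c1*a0*x0 to yi and
            // c1*a0*x0 to y0, so the (c0-3.0*c1) term above makes the net
            // contribution to y(0,row) equal to c0*a0*x0.)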

            yi += c1*(a0*xi + ai*x0);
            y0 += c1*ai*xi;
          }

          // Write y back to global memory
          m_y( stoch_row, row ) = yi;

          // Sync warps so rows don't get too out-of-sync to make use of
          // locality in spatial matrix and L2 cache
          __syncthreads();
        }

        // Reduction of 'y0' within warp
        sh_y0[ threadIdx.x ] = y0;
        if ( threadIdx.x + 16 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+16];
        if ( threadIdx.x + 8 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 8];
        if ( threadIdx.x + 4 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 4];
        if ( threadIdx.x + 2 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 2];
        if ( threadIdx.x + 1 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 1];
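
        // (This tree reduction stays within a single warp, since the launch
        // in apply() uses blockDim.x == warp_size; it relies on implicit
        // warp-synchronous execution and the volatile qualifier on sh_y0
        // rather than explicit synchronization.)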

        // Store y0 back in global memory
        if ( threadIdx.x == 0 ) m_y( 0, row ) += sh_y0[0];

      }
    }
  };

  template <int MAX_COL>
  class ApplyKernelAsymmetric {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ApplyKernelAsymmetric( const matrix_type & A,
                           const vector_type & x,
                           const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();

      volatile VectorScalar * const sh =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y0 =
        sh + blockDim.x*threadIdx.y;
      volatile VectorScalar * const sh_a0 =
        sh + blockDim.x*blockDim.y + MAX_COL*threadIdx.y;
      volatile VectorScalar * const sh_x0 =
        sh + blockDim.x*blockDim.y + MAX_COL*blockDim.y + MAX_COL*threadIdx.y;
      volatile size_type * const sh_col =
        (size_type*)(sh + blockDim.x*blockDim.y + 2*MAX_COL*blockDim.y) +
        MAX_COL*threadIdx.y;

      // blockIdx.x == row in the deterministic (finite element) system
      const size_type row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row < m_A.graph.row_map.extent(0)-1) {
        const size_type colBeg = m_A.graph.row_map[ row ];
        const size_type colEnd = m_A.graph.row_map[ row + 1 ];

        // Read tensor entries
        const TensorScalar c0 = m_A.block.value(0);
        const TensorScalar c1 = m_A.block.value(1);
        const TensorScalar c2 = m_A.block.value(2);

        // Zero y
        VectorScalar y0 = 0.0;

        // Read column offsets for this row
        for ( size_type lcol = threadIdx.x; lcol < colEnd-colBeg;
              lcol += blockDim.x )
          sh_col[lcol] = m_A.graph.entries( lcol+colBeg );

        // Loop over stochastic rows
        for ( size_type stoch_row = threadIdx.x; stoch_row < dim;
              stoch_row += blockDim.x) {

          VectorScalar yi = 0.0;

          // Loop over columns in the discrete (finite element) system.
          // We should probably only loop over a portion at a time to
          // make better use of L2 cache.
          //
          // We could also apply some warps to columns in parallel.
          // This requires inter-warp reduction of y0 and yi.
          for ( size_type col_offset = colBeg; col_offset < colEnd;
                col_offset += 1 ) {

            // Get column index
            const size_type lcol = col_offset-colBeg;
            const size_type col = sh_col[lcol];

            // Read a[i] and x[i] (coalesced)
            const MatrixScalar ai = m_A.values( stoch_row, col_offset );
            const VectorScalar xi = m_x( stoch_row, col );

            // Store a[0] and x[0] to shared memory for this column
            if (stoch_row == 0) {
              sh_a0[lcol] = ai;
              sh_x0[lcol] = xi;
            }

            // Retrieve a[0] and x[0] from shared memory for this column
            const MatrixScalar a0 = sh_a0[lcol];
            const VectorScalar x0 = sh_x0[lcol];

            // Subtract off contribution of first iteration of loop
            if (stoch_row == 0) y0 += (c0-3.0*c1-c2)*a0*x0;
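            // (At stoch_row == 0 the updates below add (2*c1+c2)*a0*x0 to yi
            // and c1*a0*x0 to y0, so the (c0-3.0*c1-c2) term above makes the
            // net contribution to y(0,row) equal to c0*a0*x0.)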

            yi += c1*(a0*xi + ai*x0) + c2*ai*xi;
            y0 += c1*ai*xi;
          }

          // Write y back to global memory
          m_y( stoch_row, row ) = yi;

          // Sync warps so rows don't get too out-of-sync to make use of
          // locality in spatial matrix and L2 cache
          __syncthreads();
        }

        // Reduction of 'y0' within warp
        sh_y0[ threadIdx.x ] = y0;
        if ( threadIdx.x + 16 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+16];
        if ( threadIdx.x + 8 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 8];
        if ( threadIdx.x + 4 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 4];
        if ( threadIdx.x + 2 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 2];
        if ( threadIdx.x + 1 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 1];

        // Store y0 back in global memory
        if ( threadIdx.x == 0 ) m_y( 0, row ) += sh_y0[0];

      }
    }
  };

  //------------------------------------

  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
    const size_type num_spatial_rows = A.graph.row_map.extent(0) - 1;
    const size_type num_stoch_rows = A.block.dimension();

    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    const size_type num_rows_per_block = 4;
    size_type num_blocks = num_spatial_rows / num_rows_per_block;
    if (num_blocks * num_rows_per_block < num_spatial_rows)
      ++num_blocks;
    const dim3 dBlock( warp_size, num_rows_per_block );
    const dim3 dGrid( num_blocks , 1, 1 );

    const int MAX_COL = 32; // Maximum number of nonzero columns per row
    const size_type shmem =
      sizeof(VectorScalar) * (dBlock.x*dBlock.y + 2*MAX_COL*dBlock.y) +
      sizeof(size_type) * (MAX_COL*dBlock.y);
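    // (This is the per-block total of the kernels' shared-memory layout:
    // dBlock.x*dBlock.y entries of y0-reduction scratch, plus MAX_COL
    // entries each of a0 and x0 per warp, plus MAX_COL cached column
    // indices per warp.)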

#if 0
    std::cout << "Multiply< BlockCrsMatrix< LinearSparse3Tensor ... > >::apply"
              << std::endl
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << "," << dBlock.z << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  num_spatial_rows(" << num_spatial_rows << ")" << std::endl
              << "  num_stoch_rows(" << num_stoch_rows << ")" << std::endl;
#endif

    //cudaProfilerStart();
    if (A.block.symmetric())
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
        ( ApplyKernelSymmetric<MAX_COL>( A, x, y ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
        ( ApplyKernelAsymmetric<MAX_COL>( A, x, y ) );
    //cudaProfilerStop();
  }
};
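
// Illustrative usage sketch (hypothetical type names; the matrix A and the
// views x and y are assumed to be allocated and filled elsewhere).  The
// first view index is the stochastic basis index and the second is the
// spatial index, matching m_x(stoch_row,col) and m_y(stoch_row,row) above:
//
//   typedef LinearSparse3Tensor<double,Kokkos::Cuda,2>       tensor_type;
//   typedef BlockCrsMatrix<tensor_type,double,Kokkos::Cuda>  matrix_type;
//   typedef Kokkos::View<double**,Kokkos::LayoutLeft,
//                        Kokkos::Cuda>                       vector_type;
//
//   Multiply<matrix_type,vector_type,vector_type>::apply( A, x, y );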

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP */