Stokhos_Cuda_LinearSparse3Tensor.hpp
// @HEADER
// ***********************************************************************
//
//                           Stokhos Package
//                 Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
//
// ***********************************************************************
// @HEADER

#ifndef STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP
#define STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_LinearSparse3Tensor.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {

//----------------------------------------------------------------------------

/*
 * This version uses 4 warps per block, one warp per spatial row, and loops
 * through the stochastic rows 32 at a time.
 */
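
/*
 * Illustrative usage sketch (not from the original source): this partial
 * specialization of Stokhos::Multiply is selected when the matrix is a
 * BlockCrsMatrix whose block is a LinearSparse3Tensor and the vectors are
 * rank-2 LayoutLeft Cuda views.  Assuming such a matrix 'A' and multivectors
 * 'x' and 'y' sized (stochastic dimension) x (number of spatial rows), the
 * product y = A*x could be launched directly through the static apply()
 * defined below:
 *
 *   typedef Stokhos::BlockCrsMatrix<
 *     Stokhos::LinearSparse3Tensor<double, Kokkos::Cuda, BlockSize>,
 *     double, Kokkos::Cuda > matrix_type;
 *   typedef Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::Cuda> vector_type;
 *
 *   Stokhos::Multiply<matrix_type, vector_type, vector_type>::apply( A, x, y );
 */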
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar,
          int BlockSize >
class Multiply<
  BlockCrsMatrix< LinearSparse3Tensor< TensorScalar, Kokkos::Cuda, BlockSize >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef BlockCrsMatrix< LinearSparse3Tensor< TensorScalar, Kokkos::Cuda, BlockSize >,
                          MatrixScalar, Kokkos::Cuda > matrix_type;

  typedef Kokkos::View< VectorScalar**,
                        Kokkos::LayoutLeft,
                        Kokkos::Cuda > vector_type;

  template <int MAX_COL>
  class ApplyKernelSymmetric {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ApplyKernelSymmetric( const matrix_type & A,
                          const vector_type & x,
                          const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();

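      // Shared memory layout used below (a descriptive note, not in the
      // original source): per warp (threadIdx.y), 'sh_y0' holds blockDim.x
      // partial sums for the zeroth stochastic coefficient, 'sh_a0' and
      // 'sh_x0' cache up to MAX_COL values of a[0] and x[0] (one per matrix
      // column in this row), and 'sh_col' caches the column indices.  The
      // launch in apply() sizes dynamic shared memory to match this layout.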
      volatile VectorScalar * const sh =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y0 =
        sh + blockDim.x*threadIdx.y;
      volatile VectorScalar * const sh_a0 =
        sh + blockDim.x*blockDim.y + MAX_COL*threadIdx.y;
      volatile VectorScalar * const sh_x0 =
        sh + blockDim.x*blockDim.y + MAX_COL*blockDim.y + MAX_COL*threadIdx.y;
      volatile size_type * const sh_col =
        (size_type*)(sh + blockDim.x*blockDim.y + 2*MAX_COL*blockDim.y) +
        MAX_COL*threadIdx.y;

      // blockIdx.x == row in the deterministic (finite element) system
      const size_type row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row < m_A.graph.row_map.extent(0)-1) {
        const size_type colBeg = m_A.graph.row_map[ row ];
        const size_type colEnd = m_A.graph.row_map[ row + 1 ];

        // Read tensor entries
        const TensorScalar c0 = m_A.block.value(0);
        const TensorScalar c1 = m_A.block.value(1);

        // Zero y
        VectorScalar y0 = 0.0;

        // Read column offsets for this row
        for ( size_type lcol = threadIdx.x; lcol < colEnd-colBeg;
              lcol += blockDim.x )
          sh_col[lcol] = m_A.graph.entries( lcol+colBeg );

        // Loop over stochastic rows
        for ( size_type stoch_row = threadIdx.x; stoch_row < dim;
              stoch_row += blockDim.x) {

          VectorScalar yi = 0.0;

          // Loop over columns in the discrete (finite element) system.
          // We should probably only loop over a portion at a time to
          // make better use of L2 cache.
          //
          // We could also apply some warps to columns in parallel.
          // This requires inter-warp reduction of y0 and yi.
          for ( size_type col_offset = colBeg; col_offset < colEnd;
                col_offset += 1 ) {

            // Get column index
            const size_type lcol = col_offset-colBeg;
            const size_type col = sh_col[lcol];

            // Read a[i] and x[i] (coalesced)
            const MatrixScalar ai = m_A.values( stoch_row, col_offset );
            const VectorScalar xi = m_x( stoch_row, col );

            // Store a[0] and x[0] to shared memory for this column
            if (stoch_row == 0) {
              sh_a0[lcol] = ai;
              sh_x0[lcol] = xi;
            }

            // Retrieve a[0] and x[0] from shared memory for this column
            const MatrixScalar a0 = sh_a0[lcol];
            const VectorScalar x0 = sh_x0[lcol];

            // Correct the stoch_row == 0 term: the updates below would add
            // 3*c1*a0*x0 in total for this entry, while the true coefficient
            // of a0*x0 is c0, so add the difference here.
            if (stoch_row == 0) y0 += (c0-3.0*c1)*a0*x0;

            yi += c1*(a0*xi + ai*x0);
            y0 += c1*ai*xi;
          }

          // Write y back to global memory
          m_y( stoch_row, row ) = yi;

          // Sync warps so rows don't get too far out of sync, to make use of
          // locality in the spatial matrix and the L2 cache
          __syncthreads();
        }

        // Reduction of 'y0' within the warp
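        // Note (added commentary): this tree reduction sums the per-thread
        // y0 partials across the lanes of the warp.  It relies on the
        // warp-synchronous idiom with 'volatile' shared memory rather than
        // explicit synchronization.  The result is accumulated into
        // m_y(0,row), which already holds the yi value written by lane 0
        // for stoch_row == 0 in the loop above.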
        sh_y0[ threadIdx.x ] = y0;
        if ( threadIdx.x + 16 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+16];
        if ( threadIdx.x +  8 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 8];
        if ( threadIdx.x +  4 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 4];
        if ( threadIdx.x +  2 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 2];
        if ( threadIdx.x +  1 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 1];

        // Store y0 back in global memory
        if ( threadIdx.x == 0 ) m_y( 0, row ) += sh_y0[0];

      }
    }
  };

  template <int MAX_COL>
  class ApplyKernelAsymmetric {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ApplyKernelAsymmetric( const matrix_type & A,
                           const vector_type & x,
                           const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();

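      // Descriptive note (not in the original source): shared memory is
      // partitioned exactly as in ApplyKernelSymmetric above (per-warp y0
      // partials, a[0]/x[0] caches, and column indices).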
      volatile VectorScalar * const sh =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y0 =
        sh + blockDim.x*threadIdx.y;
      volatile VectorScalar * const sh_a0 =
        sh + blockDim.x*blockDim.y + MAX_COL*threadIdx.y;
      volatile VectorScalar * const sh_x0 =
        sh + blockDim.x*blockDim.y + MAX_COL*blockDim.y + MAX_COL*threadIdx.y;
      volatile size_type * const sh_col =
        (size_type*)(sh + blockDim.x*blockDim.y + 2*MAX_COL*blockDim.y) +
        MAX_COL*threadIdx.y;

      // blockIdx.x == row in the deterministic (finite element) system
      const size_type row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row < m_A.graph.row_map.extent(0)-1) {
        const size_type colBeg = m_A.graph.row_map[ row ];
        const size_type colEnd = m_A.graph.row_map[ row + 1 ];

        // Read tensor entries
        const TensorScalar c0 = m_A.block.value(0);
        const TensorScalar c1 = m_A.block.value(1);
        const TensorScalar c2 = m_A.block.value(2);

        // Zero y
        VectorScalar y0 = 0.0;

        // Read column offsets for this row
        for ( size_type lcol = threadIdx.x; lcol < colEnd-colBeg;
              lcol += blockDim.x )
          sh_col[lcol] = m_A.graph.entries( lcol+colBeg );

        // Loop over stochastic rows
        for ( size_type stoch_row = threadIdx.x; stoch_row < dim;
              stoch_row += blockDim.x) {

          VectorScalar yi = 0.0;

          // Loop over columns in the discrete (finite element) system.
          // We should probably only loop over a portion at a time to
          // make better use of L2 cache.
          //
          // We could also apply some warps to columns in parallel.
          // This requires inter-warp reduction of y0 and yi.
          for ( size_type col_offset = colBeg; col_offset < colEnd;
                col_offset += 1 ) {

            // Get column index
            const size_type lcol = col_offset-colBeg;
            const size_type col = sh_col[lcol];

            // Read a[i] and x[i] (coalesced)
            const MatrixScalar ai = m_A.values( stoch_row, col_offset );
            const VectorScalar xi = m_x( stoch_row, col );

            // Store a[0] and x[0] to shared memory for this column
            if (stoch_row == 0) {
              sh_a0[lcol] = ai;
              sh_x0[lcol] = xi;
            }

            // Retrieve a[0] and x[0] from shared memory for this column
            const MatrixScalar a0 = sh_a0[lcol];
            const VectorScalar x0 = sh_x0[lcol];

            // Correct the stoch_row == 0 term: the updates below would add
            // (3*c1 + c2)*a0*x0 in total for this entry, while the true
            // coefficient of a0*x0 is c0, so add the difference here.
            if (stoch_row == 0) y0 += (c0-3.0*c1-c2)*a0*x0;

            yi += c1*(a0*xi + ai*x0) + c2*ai*xi;
            y0 += c1*ai*xi;
          }

          // Write y back to global memory
          m_y( stoch_row, row ) = yi;

          // Sync warps so rows don't get too far out of sync, to make use of
          // locality in the spatial matrix and the L2 cache
          __syncthreads();
        }

        // Reduction of 'y0' within the warp
        sh_y0[ threadIdx.x ] = y0;
        if ( threadIdx.x + 16 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+16];
        if ( threadIdx.x +  8 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 8];
        if ( threadIdx.x +  4 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 4];
        if ( threadIdx.x +  2 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 2];
        if ( threadIdx.x +  1 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 1];

        // Store y0 back in global memory
        if ( threadIdx.x == 0 ) m_y( 0, row ) += sh_y0[0];

      }
    }
  };

  //------------------------------------

  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
    const size_type num_spatial_rows = A.graph.row_map.extent(0) - 1;
    const size_type num_stoch_rows = A.block.dimension();

    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    const size_type num_rows_per_block = 4;
    size_type num_blocks = num_spatial_rows / num_rows_per_block;
    if (num_blocks * num_rows_per_block < num_spatial_rows)
      ++num_blocks;
    const dim3 dBlock( warp_size, num_rows_per_block );
    const dim3 dGrid( num_blocks, 1, 1 );

    const int MAX_COL = 32; // Maximum number of nonzero columns per row
    const size_type shmem =
      sizeof(VectorScalar) * (dBlock.x*dBlock.y + 2*MAX_COL*dBlock.y) +
      sizeof(size_type) * (MAX_COL*dBlock.y);
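    // Note (added commentary): the dynamic shared-memory size mirrors the
    // per-warp layout used in the kernels above: dBlock.x*dBlock.y scalars
    // for the y0 reduction buffers, 2*MAX_COL*dBlock.y scalars for the
    // a[0]/x[0] caches, and MAX_COL*dBlock.y column indices.  Rows with more
    // than MAX_COL nonzero columns would overflow these buffers, so the
    // matrix is assumed to satisfy that bound.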

#if 0

    std::cout << "Multiply< BlockCrsMatrix< LinearSparse3Tensor ... > >::apply"
              << std::endl
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << ","
              << dBlock.z << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  num_spatial_rows(" << num_spatial_rows << ")" << std::endl
              << "  num_stoch_rows(" << num_stoch_rows << ")" << std::endl;
#endif

    //cudaProfilerStart();
    if (A.block.symmetric())
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
        ( ApplyKernelSymmetric<MAX_COL>( A, x, y ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
        ( ApplyKernelAsymmetric<MAX_COL>( A, x, y ) );
    //cudaProfilerStop();
  }
};

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP */