Stokhos_Cuda_CooProductTensor.hpp
// @HEADER
// ***********************************************************************
//
// Stokhos Package
// Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
//
// ***********************************************************************
// @HEADER

#ifndef STOKHOS_CUDA_COO_PRODUCT_TENSOR_HPP
#define STOKHOS_CUDA_COO_PRODUCT_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_CooProductTensor.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {

//----------------------------------------------------------------------------

template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar,
          bool Pack>
class Multiply<
  BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef CooProductTensor< TensorScalar, execution_space, Pack > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;

  typedef int rows_type;
  static const rows_type invalid_row = -1;

  class CooKernel {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_entries_per_thread;
    const size_type m_block_size;

    CooKernel( const matrix_type & A,
               const vector_type & x,
               const vector_type & y,
               const size_type entries_per_thread,
               const size_type block_size )
      : m_A(A),
        m_x(x),
        m_y(y),
        m_entries_per_thread(entries_per_thread),
        m_block_size(block_size) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();
      const size_type num_entries = m_A.block.entry_count();

      // Thread dimensions and index
      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // Shared memory
      VectorScalar * const sh_x =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_A = sh_x + m_block_size*dim;
      VectorScalar * const sh_y = sh_A + m_block_size*dim;
      volatile VectorScalar * const vals = sh_y + dim;
      volatile rows_type * const rows =
        reinterpret_cast<volatile rows_type*>(vals + nid);
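
      // Shared-memory layout set up above (sizes are chosen in apply() below):
      //   sh_x : m_block_size*dim entries of x, indexed as sh_x[col + l*m_block_size]
      //   sh_A : m_block_size*dim entries of A, indexed the same way
      //   sh_y : dim accumulators for this FEM row's output block
      //   vals : one partial sum per thread (volatile, for warp-synchronous use)
      //   rows : one tensor row index per thread (volatile, for warp-synchronous use)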

      // Product tensor entries which this warp will iterate:
      const size_type entries_per_warp = blockDim.x * m_entries_per_thread;
      const size_type lBeg = threadIdx.y * entries_per_warp + threadIdx.x;
      const size_type lEnd = ( lBeg + entries_per_warp < num_entries ?
                               lBeg + entries_per_warp : num_entries );

      // blockIdx.x == row in the deterministic (finite element) system
      const size_type femBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type femEnd = m_A.graph.row_map[ blockIdx.x + 1 ];

      // Zero y
      for (size_type l = tid; l < dim; l += nid) {
        sh_y[l] = 0.0;
      }

      // Initialize rows & vals arrays
      rows[tid] = invalid_row;
      vals[tid] = 0.0;

      // Loop over columns in the discrete (finite element) system.
      for (size_type femEntry=femBeg; femEntry<femEnd; femEntry+=m_block_size) {
        const size_type block_size =
          femEntry + m_block_size < femEnd ? m_block_size : femEnd - femEntry;

        // Wait for X and A to be used in the previous iteration
        // before reading new values.
        __syncthreads();

        // Coalesced read blocks of X and A into shared memory
        for (size_type col = 0; col < block_size; ++col) {

          const size_type femColumn = m_A.graph.entries( femEntry + col );
          const VectorScalar * const x = & m_x( 0, femColumn );
          const MatrixScalar * const A = & m_A.values( 0, femEntry + col );

          // Coalesced read by the whole block from global memory:
          for (size_type l = tid; l < dim; l += nid) {
            sh_x[col + l * m_block_size] = x[l];
            sh_A[col + l * m_block_size] = A[l];
          }

        }

        __syncthreads(); // wait for X and A to be read before using them

        // This cuda block is responsible for computing all values of 'y'
        for (size_type l = lBeg; l < lEnd; l += blockDim.x) {

          // Read 'blockDim.x' entries from the tensor (coalesced)
          size_type i, j, k;
          m_A.block.coord(l,i,j,k);
          const TensorScalar v = m_A.block.value( l );
          j *= m_block_size;
          k *= m_block_size;

          // Register storing local accumulation for row i
          VectorScalar y = 0;

          // Check for carry from previous row
          if (threadIdx.x == 0) {
            if (i == rows[tid+31])
              y += vals[tid+31];
            else
              sh_y[rows[tid+31]] += vals[tid+31];
          }

          // Accumulate local row for the set of FEM columns
          for (size_type col = 0; col < block_size; ++col) {
            y += v * ( sh_A[col+j] * sh_x[col+k] + sh_A[col+k] * sh_x[col+j] );
          }

          // Store row and value into shared arrays
          rows[tid] = i;
          vals[tid] = y;

          // Reduce 'y' within 'blockDim.x' to the right for threads
          // on the same row
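          // The five shifted additions below form a warp-synchronous segmented
          // inclusive scan keyed on the row index 'i': afterwards each thread
          // holds the sum of vals from every lower lane of its warp that works
          // on the same row (this relies on the volatile rows/vals arrays and
          // lockstep execution within a warp).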
          if (threadIdx.x >= 1 && i == rows[tid- 1]) vals[tid] += vals[tid- 1];
          if (threadIdx.x >= 2 && i == rows[tid- 2]) vals[tid] += vals[tid- 2];
          if (threadIdx.x >= 4 && i == rows[tid- 4]) vals[tid] += vals[tid- 4];
          if (threadIdx.x >= 8 && i == rows[tid- 8]) vals[tid] += vals[tid- 8];
          if (threadIdx.x >= 16 && i == rows[tid-16]) vals[tid] += vals[tid-16];

          // Add local accumulation of y into sh_y for threads on
          // distinct rows. We don't store thread 31 and instead carry it
          // to the next iteration to eliminate race conditions between warps
          if (threadIdx.x < 31 && i != rows[tid + 1])
            sh_y[i] += vals[tid];

        }

        // At this point we have blockDim.y values that need to be
        // reduced and stored. Move these row/value pairs to the beginning
        __syncthreads();
        if (threadIdx.x == 31) {
          rows[threadIdx.y] = rows[tid];
          vals[threadIdx.y] = vals[tid];
        }
        __syncthreads();

        // Reduce these values to the right using the first warp
        // This assumes blockDim.x >= blockDim.y
        if (threadIdx.y == 0 && threadIdx.x < blockDim.y) {
          const size_type i = rows[tid];
          if (threadIdx.x >= 1 && i == rows[tid- 1]) vals[tid] += vals[tid- 1];
          if (threadIdx.x >= 2 && i == rows[tid- 2]) vals[tid] += vals[tid- 2];
          if (threadIdx.x >= 4 && i == rows[tid- 4]) vals[tid] += vals[tid- 4];
          if (threadIdx.x >= 8 && i == rows[tid- 8]) vals[tid] += vals[tid- 8];
          if (threadIdx.x >= 16 && i == rows[tid-16]) vals[tid] += vals[tid-16];

          if ((threadIdx.x == blockDim.y-1) ||
              (threadIdx.x < blockDim.y-1 && i != rows[tid+1]))
            sh_y[i] += vals[tid];
        }

        // Reset rows and vals to prohibit carry across FEM columns
        rows[tid] = invalid_row;
        vals[tid] = 0.0;

      }

      // Wait for all contributions of y to be completed
      __syncthreads();

      // Store result back in global memory
      for (size_type l = tid; l < dim; l += nid) {
        m_y( l, blockIdx.x ) = sh_y[ l ];
      }
    }
  };

  //------------------------------------

  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
    const size_type fem_rows = A.graph.row_map.extent(0) - 1;
    const size_type stoch_rows = A.block.dimension();
    const size_type stoch_entries = A.block.entry_count();
    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;

#ifdef STOKHOS_DEBUG
    const size_type num_warp_max = 16; // Use fewer warps in debug mode to prevent
                                       // launch failures
#else
    const size_type num_warp_max = 20;
#endif
    const size_type num_warp =
      std::min( num_warp_max, stoch_entries / warp_size );
    const dim3 dBlock( warp_size , num_warp, 1 );
    const dim3 dGrid( fem_rows, 1, 1 );
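    // Launch geometry: dGrid assigns one CUDA block per FEM row, and dBlock
    // gives each block num_warp warps of warp_size threads; in the kernel each
    // warp walks its own contiguous slice of the tensor entries.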

    const size_type num_thread = dBlock.x*dBlock.y;
    const size_type entries_per_thread =
      (stoch_entries + num_thread-1) / num_thread;

    // Use at most half of shared memory to get 2 blocks per SMP
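    // Solve for the largest FEM block size 'bs' such that 2*bs x/A vectors plus
    // one y vector (each of length stoch_rows) and the per-thread rows/vals
    // buffers fit within that budget.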
    const size_type size_rows = sizeof(rows_type) * num_thread;
    const size_type size_vals = sizeof(VectorScalar) * num_thread;
    const size_type shcap =
      Kokkos::Cuda().impl_internal_space_instance()->m_maxShmemPerBlock / 2;
    size_type bs =
      ((shcap-size_rows-size_vals) / (sizeof(VectorScalar)*stoch_rows) - 1) / 2;
    if (bs % 2 == 0) --bs; // Make block-size odd to reduce bank conflicts
    const size_type block_size_max = 31;
    const size_type block_size = std::min(bs, block_size_max);
    const size_type shmem =
      sizeof(VectorScalar) * ((2*block_size+1) * stoch_rows) + // A, x, y
      size_vals + size_rows;

#if 0
    //std::cout << std::endl << A.block << std::endl;
    const size_type fem_nnz = A.values.extent(1);
    std::cout << "Multiply< BlockCrsMatrix< CooProductTensor ... > >::apply"
              << std::endl
              << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << " block_size(" << block_size << ")" << std::endl
              << " shmem(" << shmem/1024 << " kB)" << std::endl
              << " fem_rows(" << fem_rows << ")" << std::endl
              << " fem_nnz(" << fem_nnz << ")" << std::endl
              << " stoch_rows(" << stoch_rows << ")" << std::endl
              << " stoch_nnz(" << stoch_entries << ")" << std::endl
              << " threads_per_block(" << num_thread << ")" << std::endl
              << " entries_per_thread(" << entries_per_thread << ")" << std::endl
              ;
#endif

    //cudaProfilerStart();
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
      ( CooKernel( A, x, y, entries_per_thread, block_size ) );
    //cudaProfilerStop();
  }
};

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_COO_PRODUCT_TENSOR_HPP */
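
A minimal usage sketch, for orientation only: it assumes the BlockCrsMatrix, its CooProductTensor block, and the input/output views have already been assembled elsewhere (their builders live in the headers included above and are not shown here). The helper name coo_block_multiply is illustrative, and it simply forwards to the static apply() defined in this file.

#include "Stokhos_Cuda_CooProductTensor.hpp"

// Illustrative wrapper (hypothetical name): forwards an assembled block
// matrix and LayoutLeft device views to the Multiply specialization above.
template <typename TensorScalar, typename MatrixScalar,
          typename VectorScalar, bool Pack>
void coo_block_multiply(
  const Stokhos::BlockCrsMatrix<
          Stokhos::CooProductTensor<TensorScalar, Kokkos::Cuda, Pack>,
          MatrixScalar, Kokkos::Cuda >& A,
  const Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>& x,
  const Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>& y )
{
  typedef Stokhos::BlockCrsMatrix<
    Stokhos::CooProductTensor<TensorScalar, Kokkos::Cuda, Pack>,
    MatrixScalar, Kokkos::Cuda > matrix_type;
  typedef Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>
    vector_type;

  // x and y hold the stochastic index in dimension 0 and the FEM index in
  // dimension 1, matching m_x(l, femColumn) and m_y(l, blockIdx.x) above.
  Stokhos::Multiply<matrix_type, vector_type, vector_type>::apply(A, x, y);
}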