Stokhos_Cuda_CooProductTensor.hpp
// @HEADER
// *****************************************************************************
// Stokhos Package
//
// Copyright 2009 NTESS and the Stokhos contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef STOKHOS_CUDA_COO_PRODUCT_TENSOR_HPP
#define STOKHOS_CUDA_COO_PRODUCT_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_CooProductTensor.hpp"

#include "cuda_profiler_api.h"
namespace Stokhos {

//----------------------------------------------------------------------------

template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar,
          bool Pack >
class Multiply<
  BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef CooProductTensor< TensorScalar, execution_space, Pack > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;

  typedef int rows_type;
  static const rows_type invalid_row = -1;

  class CooKernel {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_entries_per_thread;
    const size_type m_block_size;

    CooKernel( const matrix_type & A,
               const vector_type & x,
               const vector_type & y,
               const size_type entries_per_thread,
               const size_type block_size )
      : m_A(A),
        m_x(x),
        m_y(y),
        m_entries_per_thread(entries_per_thread),
        m_block_size(block_size) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();
      const size_type num_entries = m_A.block.entry_count();

      // Thread dimensions and index
      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // Shared memory
      VectorScalar * const sh_x =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_A = sh_x + m_block_size*dim;
      VectorScalar * const sh_y = sh_A + m_block_size*dim;
      volatile VectorScalar * const vals = sh_y + dim;
      volatile rows_type * const rows =
        reinterpret_cast<volatile rows_type*>(vals + nid);
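      // Shared-memory layout, in the order of the pointers above (a
      // descriptive summary of the partitioning done here, not an
      // additional allocation):
      //   sh_x : m_block_size*dim  staged input-vector blocks
      //   sh_A : m_block_size*dim  staged matrix-value blocks
      //   sh_y : dim               accumulated output block for this FEM row
      //   vals : nid               per-thread partial sums
      //   rows : nid               tensor row currently owned by each thread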

      // Product tensor entries which this warp will iterate:
      const size_type entries_per_warp = blockDim.x * m_entries_per_thread;
      const size_type lBeg = threadIdx.y * entries_per_warp + threadIdx.x;
      const size_type lEnd = ( lBeg + entries_per_warp < num_entries ?
                               lBeg + entries_per_warp : num_entries );

      // blockIdx.x == row in the deterministic (finite element) system
      const size_type femBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type femEnd = m_A.graph.row_map[ blockIdx.x + 1 ];

      // Zero y
      for (size_type l = tid; l < dim; l += nid) {
        sh_y[l] = 0.0;
      }

      // Initialize rows & vals arrays
      rows[tid] = invalid_row;
      vals[tid] = 0.0;

      // Loop over columns in the discrete (finite element) system.
      for (size_type femEntry=femBeg; femEntry<femEnd; femEntry+=m_block_size) {
        const size_type block_size =
          femEntry + m_block_size < femEnd ? m_block_size : femEnd - femEntry;

        // Wait for X and A to be used in the previous iteration
        // before reading new values.
        __syncthreads();

        // Coalesced read blocks of X and A into shared memory
        for (size_type col = 0; col < block_size; ++col) {

          const size_type femColumn = m_A.graph.entries( femEntry + col );
          const VectorScalar * const x = & m_x( 0, femColumn );
          const MatrixScalar * const A = & m_A.values( 0, femEntry + col );

          // Coalesced read by the whole block from global memory:
          for (size_type l = tid; l < dim; l += nid) {
            sh_x[col + l * m_block_size] = x[l];
            sh_A[col + l * m_block_size] = A[l];
          }

        }

        __syncthreads(); // wait for X and A to be read before using them

        // This cuda block is responsible for computing all values of 'y'
        for (size_type l = lBeg; l < lEnd; l += blockDim.x) {

          // Read 'blockDim.x' entries from the tensor (coalesced)
          size_type i, j, k;
          m_A.block.coord(l,i,j,k);
          const TensorScalar v = m_A.block.value( l );
          j *= m_block_size;
          k *= m_block_size;

          // Register storing local accumulation for row i
          VectorScalar y = 0;

          // Check for carry from previous row
          if (threadIdx.x == 0) {
            if (i == rows[tid+31])
              y += vals[tid+31];
            else
              sh_y[rows[tid+31]] += vals[tid+31];
          }

          // Accumulate local row for the set of FEM columns
          for (size_type col = 0; col < block_size; ++col) {
            y += v * ( sh_A[col+j] * sh_x[col+k] + sh_A[col+k] * sh_x[col+j] );
          }

          // Store row and value into shared arrays
          rows[tid] = i;
          vals[tid] = y;

          // Reduce 'y' within 'blockDim.x' to the right for threads
          // on the same row
          if (threadIdx.x >=  1 && i == rows[tid- 1]) vals[tid] += vals[tid- 1];
          if (threadIdx.x >=  2 && i == rows[tid- 2]) vals[tid] += vals[tid- 2];
          if (threadIdx.x >=  4 && i == rows[tid- 4]) vals[tid] += vals[tid- 4];
          if (threadIdx.x >=  8 && i == rows[tid- 8]) vals[tid] += vals[tid- 8];
          if (threadIdx.x >= 16 && i == rows[tid-16]) vals[tid] += vals[tid-16];

          // Add local accumulation of y into sh_y for threads on
          // distinct rows. We don't store thread 31 and instead carry it
          // to the next iteration to eliminate race conditions between warps
          if (threadIdx.x < 31 && i != rows[tid + 1])
            sh_y[i] += vals[tid];

        }

        // At this point we have blockDim.y values that need to be
        // reduced and stored. Move these row/value pairs to the beginning
        __syncthreads();
        if (threadIdx.x == 31) {
          rows[threadIdx.y] = rows[tid];
          vals[threadIdx.y] = vals[tid];
        }
        __syncthreads();

        // Reduce these values to the right using the first warp
        // This assumes blockDim.x >= blockDim.y
        if (threadIdx.y == 0 && threadIdx.x < blockDim.y) {
          const size_type i = rows[tid];
          if (threadIdx.x >=  1 && i == rows[tid- 1]) vals[tid] += vals[tid- 1];
          if (threadIdx.x >=  2 && i == rows[tid- 2]) vals[tid] += vals[tid- 2];
          if (threadIdx.x >=  4 && i == rows[tid- 4]) vals[tid] += vals[tid- 4];
          if (threadIdx.x >=  8 && i == rows[tid- 8]) vals[tid] += vals[tid- 8];
          if (threadIdx.x >= 16 && i == rows[tid-16]) vals[tid] += vals[tid-16];

          if ((threadIdx.x == blockDim.y-1) ||
              (threadIdx.x < blockDim.y-1 && i != rows[tid+1]))
            sh_y[i] += vals[tid];
        }

        // Reset rows and vals to prohibit carry across FEM columns
        rows[tid] = invalid_row;
        vals[tid] = 0.0;

      }

      // Wait for all contributions of y to be completed
      __syncthreads();

      // Store result back in global memory
      for (size_type l = tid; l < dim; l += nid) {
        m_y( l, blockIdx.x ) = sh_y[ l ];
      }
    }
  };

  //------------------------------------

  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
    const size_type fem_rows = A.graph.row_map.extent(0) - 1;
    const size_type stoch_rows = A.block.dimension();
    const size_type stoch_entries = A.block.entry_count();
    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;

#ifdef STOKHOS_DEBUG
    const size_type num_warp_max = 16; // Use fewer warps in debug mode to
                                       // prevent launch failures
#else
    const size_type num_warp_max = 20;
#endif
    const size_type num_warp =
      std::min( num_warp_max, stoch_entries / warp_size );
    const dim3 dBlock( warp_size, num_warp, 1 );
    const dim3 dGrid( fem_rows, 1, 1 );

    const size_type num_thread = dBlock.x*dBlock.y;
    const size_type entries_per_thread =
      (stoch_entries + num_thread-1) / num_thread;
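    // Launch geometry implied by the choices above: one CUDA block per FEM
    // row (dGrid), warp_size x num_warp threads per block (dBlock), and each
    // thread responsible for roughly entries_per_thread tensor entries for
    // every block of FEM columns.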

    // Use at most half of shared memory to get 2 blocks per SMP
    const size_type size_rows = sizeof(rows_type) * num_thread;
    const size_type size_vals = sizeof(VectorScalar) * num_thread;
    const size_type shcap =
      Kokkos::Cuda().impl_internal_space_instance()->m_deviceProp.sharedMemPerBlock / 2;
    size_type bs =
      ((shcap-size_rows-size_vals) / (sizeof(VectorScalar)*stoch_rows) - 1) / 2;
    if (bs % 2 == 0) --bs; // Make block-size odd to reduce bank conflicts
    const size_type block_size_max = 31;
    const size_type block_size = std::min(bs, block_size_max);
    const size_type shmem =
      sizeof(VectorScalar) * ((2*block_size+1) * stoch_rows) + // A, x, y
      size_vals + size_rows;
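    // The shmem total mirrors the kernel's shared-memory partitioning:
    // block_size*stoch_rows values each for the staged x and A blocks,
    // stoch_rows for the accumulated y block, plus the per-thread vals
    // and rows arrays.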

#if 0
    //std::cout << std::endl << A.block << std::endl;
    const size_type fem_nnz = A.values.extent(1);
    std::cout << "Multiply< BlockCrsMatrix< CooProductTensor ... > >::apply"
              << std::endl
              << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << " block_size(" << block_size << ")" << std::endl
              << " shmem(" << shmem/1024 << " kB)" << std::endl
              << " fem_rows(" << fem_rows << ")" << std::endl
              << " fem_nnz(" << fem_nnz << ")" << std::endl
              << " stoch_rows(" << stoch_rows << ")" << std::endl
              << " stoch_nnz(" << stoch_entries << ")" << std::endl
              << " threads_per_block(" << num_thread << ")" << std::endl
              << " entries_per_thread(" << entries_per_thread << ")" << std::endl
              ;
#endif

    //cudaProfilerStart();
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
      ( CooKernel( A, x, y, entries_per_thread, block_size ) );
    //cudaProfilerStop();
  }
};

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

} // namespace Stokhos

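//----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the original header): the
// specialization above is invoked through its static apply() entry point.
// The aliases scalar_t / tensor_t / matrix_t / vector_t are names assumed
// for this example; assembling A, x, and y is application code and is not
// shown here.
//
//   typedef double scalar_t;
//   typedef Stokhos::CooProductTensor< scalar_t, Kokkos::Cuda, true > tensor_t;
//   typedef Stokhos::BlockCrsMatrix< tensor_t, scalar_t, Kokkos::Cuda > matrix_t;
//   typedef Kokkos::View< scalar_t**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_t;
//
//   // A: block CRS matrix whose block is the COO product tensor;
//   // x, y: (stochastic dim) x (FEM rows) multivectors on the device.
//   matrix_t A;
//   vector_t x, y;
//
//   Stokhos::Multiply< matrix_t, vector_t, vector_t >::apply( A, x, y );
//----------------------------------------------------------------------------
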
#endif /* #ifndef STOKHOS_CUDA_COO_PRODUCT_TENSOR_HPP */