Stokhos_Cuda_TiledCrsProductTensor.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP
11 #define STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP
12 
13 #include <iostream>
14 
15 #include "Kokkos_Core.hpp"
16 
17 #include "Stokhos_Multiply.hpp"
18 #include "Stokhos_BlockCrsMatrix.hpp"
19 #include "Stokhos_TiledCrsProductTensor.hpp"
20 
21 #include "cuda_profiler_api.h"
22 
23 namespace Stokhos {
24 
25 #if 1
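// Note: three alternative CUDA kernel variants of this Multiply specialization
// follow, selected by the preprocessor; with "#if 1" only the first branch is
// compiled, while the "#elif 0" and "#else" branches are retained as
// experimental alternatives.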
26 
27 //----------------------------------------------------------------------------
28 
29 template< typename TensorScalar ,
30  typename MatrixScalar ,
31  typename VectorScalar >
32 class Multiply<
33  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
34  MatrixScalar, Kokkos::Cuda >,
35  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
36  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
37 {
38 public:
39 
40  typedef Kokkos::Cuda execution_space ;
41  typedef execution_space::size_type size_type ;
42 
43  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
44  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
45  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;
46 
47  class ProductTensorLoop {
48  public:
49 
50  const matrix_type m_A;
51  const vector_type m_x;
52  const vector_type m_y;
53  const size_type m_block_size;
54 
55  ProductTensorLoop( const matrix_type & A,
56  const vector_type & x,
57  const vector_type & y,
58  const size_type block_size )
59  : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}
60 
61  __device__
62  void operator()(void) const
63  {
64  const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;
65 
66  // Number of bases in the stochastic system:
67  const size_type dim = m_A.block.dimension();
68 
69  // Number of Cijk tiles
70  const size_type n_tile = m_A.block.num_tiles();
71  const size_type tile_size = m_A.block.tile_size();
72  const size_type tile_dim = n_tile == 1 ? dim : tile_size;
73  //const size_type tile_dim = tile_size;
74 
75  VectorScalar * const sh_x_k =
76  kokkos_impl_cuda_shared_memory<VectorScalar>();
77  VectorScalar * const sh_x_j =
78  n_tile == 1 ? sh_x_k : sh_x_k + m_block_size*tile_dim;
79  VectorScalar * const sh_A_k =
80  sh_x_j + m_block_size*tile_dim;
81  VectorScalar * const sh_A_j =
82  n_tile == 1 ? sh_A_k : sh_A_k + m_block_size*tile_dim;
83  VectorScalar * const sh_y = sh_A_j + m_block_size*tile_dim;
84  volatile VectorScalar * const sh_t = sh_y + tile_dim;
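 // Shared-memory layout for this block (all VectorScalar entries): four
 // column-interleaved panels of m_block_size*tile_dim values for x_k, x_j,
 // A_k and A_j (when n_tile == 1 the k and j panels alias, halving the
 // footprint), followed by tile_dim values for the per-tile partial result
 // sh_y and blockDim.x*blockDim.y values for the reduction buffer sh_t.
 // This matches the shmem size computed in apply() below:
 //   sizeof(VectorScalar) * ((shmem_factor*block_size+1)*tensor_align
 //                           + dBlock.x*dBlock.y).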
85 
86  const size_type nid = blockDim.x * blockDim.y;
87  const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
88 
89  // blockIdx.x == row in the deterministic (finite element) system
90  const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
91  const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
92  size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
93  const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
94  if (remBlock > 0) ++numBlock;
95 
96  // Zero y
97  for (size_type i=tid; i<dim; i+=nid) {
98  m_y(i,blockIdx.x) = 0.0;
99  }
100 
101  // Loop over Cijk tiles
102  for (size_type tile = 0; tile<n_tile; ++tile) {
103 
104  const size_type i_offset = m_A.block.offset(tile, 0);
105  const size_type j_offset = m_A.block.offset(tile, 1);
106  const size_type k_offset = m_A.block.offset(tile, 2);
107  const size_type i_range = m_A.block.range(tile, 0);
108  const size_type j_range = m_A.block.range(tile, 1);
109  const size_type k_range = m_A.block.range(tile, 2);
110  const size_type n_row = m_A.block.num_rows(tile);
111 
112  // Zero y
113  for (size_type i=tid; i<i_range; i+=nid) {
114  sh_y[i] = 0.0;
115  }
116 
117  // Loop over finite element column blocks.
118  size_type iBlockEntry = iBlockEntryBeg;
119  for (size_type block=0; block<numBlock;
120  ++block, iBlockEntry+=m_block_size) {
121 
122  const size_type block_size =
123  (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;
124 
125  // Wait for X and A to be used in the previous iteration
126  // before reading new values.
127  __syncthreads();
128 
129  // Coalesced read blocks of X and A into shared memory
130  for (size_type col=0; col<block_size; ++col) {
131 
132  const size_type iBlockColumn =
133  m_A.graph.entries( iBlockEntry + col );
134  const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
135  const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
136  const MatrixScalar * const A_k = & m_A.values( k_offset , iBlockEntry + col );
137  const MatrixScalar * const A_j = & m_A.values( j_offset , iBlockEntry + col );
138 
139  for (size_type j=tid; j<j_range; j+=nid) {
140  sh_x_j[col+j*m_block_size] = x_j[j];
141  sh_A_j[col+j*m_block_size] = A_j[j];
142  }
143  if (n_tile > 1) {
144  for (size_type k=tid; k<k_range; k+=nid) {
145  sh_x_k[col+k*m_block_size] = x_k[k];
146  sh_A_k[col+k*m_block_size] = A_k[k];
147  }
148  }
149 
150  }
151 
152  __syncthreads(); // wait for X and A to be read
153 
154  // Loop over stochastic rows in this tile
155  for (size_type i=threadIdx.y; i<i_range; i+=blockDim.y) {
156  VectorScalar s = 0;
157 
158  // Product tensor entries which this warp will iterate:
159  const size_type lBeg = m_A.block.entry_begin(tile, i);
160  const size_type lEnd = m_A.block.entry_end(tile, i);
161 
162  // Loop through sparse tensor contributions with coalesced reads.
163  for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
164 
165  const size_type kj = m_A.block.coord( l );
166  const TensorScalar v = m_A.block.value( l );
167  const size_type j = ( kj & 0x0ffff ) * m_block_size ;
168  const size_type k = ( kj >> 16 ) * m_block_size ;
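 // The packed coordinate stores the local column indices of this tensor
 // entry in one word: low 16 bits = j, high 16 bits = k.  For example
 // (illustrative values only), kj = 0x00070002 decodes to j-index 2 and
 // k-index 7; each index is then scaled by m_block_size because the staged
 // panels are column-interleaved as sh_*[col + index*m_block_size].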
169 
170  for ( size_type col = 0; col < block_size; ++col ) {
171  s += v * ( sh_A_j[col+j] * sh_x_k[col+k] + sh_A_k[col+k] * sh_x_j[col+j] );
172  }
173 
174  }
175 
176  // Reduction of 'y' within 'CudaTraits::WarpSize'
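 // This tree reduction assumes blockDim.x == WarpSize (32), so each
 // stochastic row i is summed across the lanes of a single warp; sh_t is
 // declared volatile so the warp-synchronous partial sums remain visible
 // between steps without a __syncthreads().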
177  sh_t[tid] = s;
178  if ( threadIdx.x + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
179  if ( threadIdx.x + 8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
180  if ( threadIdx.x + 4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
181  if ( threadIdx.x + 2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
182  if ( threadIdx.x + 1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
183  if ( threadIdx.x == 0 ) sh_y[i] += sh_t[tid];
184 
185  }
186 
187  }
188 
189  // Wait for all threads to complete the tile
190  __syncthreads();
191 
192  // Store partial sum for this tile back in global memory
193  for (size_type i=tid; i<i_range; i+=nid) {
194  m_y( i+i_offset , blockIdx.x ) += sh_y[i];
195  }
196  }
197 
198  }
199  };
200 
201  //------------------------------------
202 
203  static void apply( const matrix_type & A ,
204  const vector_type & x ,
205  const vector_type & y )
206  {
207  const size_type row_count = A.graph.row_map.extent(0) - 1;
208  const size_type tensor_dimension = A.block.dimension();
209  const size_type tile_size = A.block.tile_size();
210  const size_type num_tiles = A.block.num_tiles();
211  //const size_type tile_dim = std::min(tensor_dimension, tile_size);
212  const size_type tile_dim = tile_size;
213 
214 #ifdef STOKHOS_DEBUG
215  const size_type nWarp = 12; // Use fewer warps in debug mode to prevent
216  // launch failures
217 #else
218  const size_type nWarp = 16;
219 #endif
220  const dim3 dBlock( Kokkos::Impl::CudaTraits::WarpSize , nWarp , 1 );
221  const dim3 dGrid( row_count , 1 , 1 );
222 
223  const size_type shmem_factor = num_tiles == 1 ? 2 : 4;
224  const size_type tensor_align = num_tiles == 1 ? tensor_dimension : tile_dim;
225  const size_type shcap =
226  Kokkos::Cuda().impl_internal_space_instance()->m_deviceProp.sharedMemPerBlock / 2;
227  size_type bs = ((shcap / sizeof(VectorScalar) - dBlock.x*dBlock.y) / tensor_align - 1) / shmem_factor;
228  if (bs % 2 == 0)
229  --bs;
230  const size_type block_size_max = 31;
231  const size_type block_size = std::min(bs, block_size_max);
232  // const int block_size = 9;
233  const size_type shmem =
234  sizeof(VectorScalar) * ((shmem_factor*block_size+1) * tensor_align + dBlock.x*dBlock.y);
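 // Worked example (illustrative values only): with double scalars (8 bytes),
 // a 48 kB shared-memory limit (shcap = 24576 bytes), nWarp = 16 (non-debug)
 // so dBlock.x*dBlock.y = 512, num_tiles > 1 (shmem_factor = 4) and
 // tile_dim = 128:
 //   bs = ((24576/8 - 512)/128 - 1)/4 = (20 - 1)/4 = 4  ->  3 (forced odd)
 //   block_size = min(3, 31) = 3
 //   shmem = 8 * ((4*3+1)*128 + 512) = 17408 bytes (~17 kB), within shcap.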
235 
236  /*
237  //const size_type shcap = Kokkos::Impl::CudaTraits::SharedMemoryCapacity;
238  const size_type block_size = 9;
239  size_type shmem;
240  if (num_tiles > 1)
241  shmem = sizeof(VectorScalar) * ((4*block_size+1)*tile_dim + dBlock.x*dBlock.y);
242  else
243  shmem = sizeof(VectorScalar) * ((2*block_size+1)*tensor_dimension + dBlock.x*dBlock.y);
244  */
245 
246 #if 0
247 
248  std::cout << "Multiply< BlockCrsMatrix< TiledCrsProductTensor ... > >::apply"
249  << std::endl
250  << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
251  << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
252  << " block_size(" << block_size << ")" << std::endl
253  << " shmem(" << shmem/1024 << " kB)" << std::endl
254  << " row_count(" << row_count << ")" << std::endl
255  << " tensor_dimension(" << tensor_dimension << ")" << std::endl
256  << " tile_size(" << tile_size << ")" << std::endl
257  << " num_tiles(" << num_tiles << ")" << std::endl
258  ;
259 #endif
260  //cudaProfilerStart();
261  Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
262  ( ProductTensorLoop( A , x , y, block_size ) );
263  //cudaProfilerStop();
264  }
265 };
266 
267 #elif 0
268 
269 //----------------------------------------------------------------------------
270 
271 template< typename TensorScalar ,
272  typename MatrixScalar ,
273  typename VectorScalar >
274 class Multiply<
275  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
276  MatrixScalar, Kokkos::Cuda >,
277  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
278  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
279 {
280 public:
281 
282  typedef Kokkos::Cuda execution_space ;
283  typedef execution_space::size_type size_type ;
284 
285  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
286  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
287  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;
288 
289  class ProductTensorLoop {
290  public:
291 
292  const matrix_type m_A;
293  const vector_type m_x;
294  const vector_type m_y;
295  const size_type m_tile;
296 
297  ProductTensorLoop( const matrix_type & A,
298  const vector_type & x,
299  const vector_type & y,
300  const size_type tile )
301  : m_A( A ), m_x( x ), m_y( y ), m_tile(tile) {}
302 
303  __device__
304  void operator()(void) const
305  {
306  const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;
307 
308  // Number of bases in the stochastic system:
309  const size_type dim = m_A.block.dimension();
310  const size_type tile_size = m_A.block.tile_size();
311 
312  VectorScalar * const sh_x_k =
313  kokkos_impl_cuda_shared_memory<VectorScalar>();
314  VectorScalar * const sh_x_j = sh_x_k + tile_size;
315  VectorScalar * const sh_A_k = sh_x_j + tile_size;
316  VectorScalar * const sh_A_j = sh_A_k + tile_size;
317  VectorScalar * const sh_y = sh_A_j + tile_size;
318  volatile VectorScalar * const sh_t = sh_y + tile_size;
319 
320  const size_type nid = blockDim.x * blockDim.y;
321  const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
322 
323  // Divide warps into 4 groups for reading x_j, x_k, A_j, and A_k
324  // const size_type warps_per_group = blockDim.y / 4;
325  // const size_type group_lane = threadIdx.y % warps_per_group;
326  // const size_type gid = threadIdx.x + blockDim.x * group_lane;
327  // const size_type ngid = warps_per_group * blockDim.x;
328  // const size_type group = threadIdx.y / warps_per_group;
329 
330  const size_type i_offset = m_A.block.offset(m_tile, 0);
331  const size_type j_offset = m_A.block.offset(m_tile, 1);
332  const size_type k_offset = m_A.block.offset(m_tile, 2);
333  const size_type i_range = m_A.block.range(m_tile, 0);
334  const size_type j_range = m_A.block.range(m_tile, 1);
335  const size_type k_range = m_A.block.range(m_tile, 2);
336  const size_type n_row = m_A.block.num_rows(m_tile);
337 
338  // blockIdx.x == row in the deterministic (finite element) system
339  const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
340  const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
341 
342  // Zero y
343  for (size_type i=tid; i<i_range; i+=nid) {
344  sh_y[i] = 0.0;
345  }
346 
347  const size_type * __restrict cijk_j = &m_A.block.coord(0,0);
348  const size_type * __restrict cijk_k = &m_A.block.coord(0,1);
349  const TensorScalar * __restrict cijk_v = &m_A.block.value(0);
350 
351  // Loop over finite element column blocks.
352  for (size_type iBlockEntry = iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
353  ++iBlockEntry) {
354 
355  __syncthreads(); // wait for threads from previous iteration to finish
356 
357  // Coalesced read blocks of X and A into shared memory
358  const size_type iBlockColumn = m_A.graph.entries( iBlockEntry );
359  const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
360  const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
361  const MatrixScalar * const A_k = & m_A.values( k_offset , iBlockEntry );
362  const MatrixScalar * const A_j = & m_A.values( j_offset , iBlockEntry );
363 
364  for (size_type j=tid; j<j_range; j+=nid) {
365  sh_x_j[j] = x_j[j];
366  sh_A_j[j] = A_j[j];
367  }
368  for (size_type k=tid; k<k_range; k+=nid) {
369  sh_x_k[k] = x_k[k];
370  sh_A_k[k] = A_k[k];
371  }
372  // if (group == 0)
373  // for (size_type j=gid; j<j_range; j+=ngid) sh_x_j[j] = x_j[j];
374  // if (group == 1)
375  // for (size_type j=gid; j<j_range; j+=ngid) sh_A_j[j] = A_j[j];
376  // if (group == 2)
377  // for (size_type k=gid; k<k_range; k+=ngid) sh_x_k[k] = x_k[k];
378  // if (group == 3)
379  // for (size_type k=gid; k<k_range; k+=ngid) sh_A_k[k] = A_k[k];
380 
381  __syncthreads(); // wait for X and A to be read
382 
383  // Loop over stochastic rows in this tile
384  for (size_type i=threadIdx.y; i<i_range; i+=blockDim.y) {
385  VectorScalar s = 0;
386 
387  // Product tensor entries which this warp will iterate:
388  const size_type lBeg = m_A.block.entry_begin(m_tile, i);
389  const size_type lEnd = m_A.block.entry_end(m_tile, i);
390 
391  // Loop through sparse tensor contributions with coalesced reads.
392  for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
393 
394  // Read entries from the tensor
395  // const int j = m_A.block.coord(l,0);
396  // const int k = m_A.block.coord(l,1);
397  // const MatrixScalar v = m_A.block.value(l);
398  const size_type j = cijk_j[l];
399  const size_type k = cijk_k[l];
400  const TensorScalar v = cijk_v[l];
401 
402  s += v * ( sh_A_j[j] * sh_x_k[k] + sh_A_k[k] * sh_x_j[j] );
403 
404  }
405 
406  // Reduction of 'y' within 'CudaTraits::WarpSize'
407  sh_t[tid] = s;
408  if ( threadIdx.x + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
409  if ( threadIdx.x + 8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
410  if ( threadIdx.x + 4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
411  if ( threadIdx.x + 2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
412  if ( threadIdx.x + 1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
413  if ( threadIdx.x == 0 ) sh_y[i] += sh_t[tid];
414 
415  }
416 
417  }
418 
419  // Wait for all threads to complete the tile
420  __syncthreads();
421 
422  // Store partial sum for this tile back in global memory
423  for (size_type i=tid; i<i_range; i+=nid) {
424  m_y( i+i_offset , blockIdx.x ) += sh_y[i];
425  }
426 
427  }
428  };
429 
430  class Zero {
431  public:
432  typedef typename vector_type::value_type value_type;
433  typedef typename vector_type::execution_space execution_space;
434  typedef typename execution_space::size_type size_type;
435 
436  Zero( const vector_type & x ) : m_x( x ), m_d(x.extent(0)) {}
437 
438  KOKKOS_INLINE_FUNCTION
439  void operator()( const size_type j ) const {
440  for (size_type i=0; i<m_d; ++i)
441  m_x(i,j) = 0.0;
442  }
443 
444  private:
445  const vector_type m_x;
446  const size_type m_d;
447 
448  };
449 
450  //------------------------------------
451 
452  static void apply( const matrix_type & A ,
453  const vector_type & x ,
454  const vector_type & y )
455  {
456  const size_type row_count = A.graph.row_map.extent(0) - 1;
457  const size_type tensor_dimension = A.block.dimension();
458  const size_type tile_size = A.block.tile_size();
459  const size_type num_tiles = A.block.num_tiles();
460 
461  const size_type nWarp = 16;
462  const dim3 dBlock( Kokkos::Impl::CudaTraits::WarpSize , nWarp , 1 );
463  const dim3 dGrid( row_count , 1 , 1 );
464 
465  const size_type shmem =
466  sizeof(VectorScalar) * (5*tile_size + dBlock.x*dBlock.y);
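 // Example (illustrative values only): with double scalars, tile_size = 128
 // and 32*16 = 512 threads per block, shmem = 8*(5*128 + 512) = 9216 bytes
 // (~9 kB) per launch.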
467 
468 #if 1
469 
470  std::cout << "Multiply< BlockCrsMatrix< TiledCrsProductTensor ... > >::apply"
471  << std::endl
472  << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
473  << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
474  << " shmem(" << shmem/1024 << " kB)" << std::endl
475  << " row_count(" << row_count << ")" << std::endl
476  << " tensor_dimension(" << tensor_dimension << ")" << std::endl
477  << " tile_size(" << tile_size << ")" << std::endl
478  << " num_tiles(" << num_tiles << ")" << std::endl
479  ;
480 #endif
481  //cudaProfilerStart();
482  // Zero y
483  Kokkos::parallel_for( row_count , Zero(y) );
484 
485  // Loop over Cijk tiles
486  for (size_type tile = 0; tile<num_tiles; ++tile) {
487  Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
488  ( ProductTensorLoop( A , x , y, tile ) );
489  }
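 // Design note: unlike the first variant, which loops over tiles inside the
 // kernel, the loop above launches one kernel per Cijk tile and accumulates
 // into y across launches; y is therefore zeroed once up front by the Zero
 // functor rather than inside the kernel.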
490  //cudaProfilerStop();
491  }
492 };
493 
494 #else
495 
496 //----------------------------------------------------------------------------
497 
498 // #define MAX_TENSOR_OFFSETS 2000
499 // __constant__ Kokkos::Cuda::size_type tensor_row_begin[MAX_TENSOR_OFFSETS];
500 
501 template< typename TensorScalar ,
502  typename MatrixScalar ,
503  typename VectorScalar >
504 class Multiply<
505  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
506  MatrixScalar, Kokkos::Cuda >,
507  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
508  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
509 {
510 public:
511 
512  typedef Kokkos::Cuda execution_space ;
513  typedef execution_space::size_type size_type ;
514 
515  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
516  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
517  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;
518 
519  class ProductTensorLoop {
520  public:
521 
522  const matrix_type m_A;
523  const vector_type m_x;
524  const vector_type m_y;
525 
526  ProductTensorLoop( const matrix_type & A,
527  const vector_type & x,
528  const vector_type & y )
529  : m_A( A ), m_x( x ), m_y( y ) {}
530 
531  __device__
532  void operator()(void) const
533  {
534  const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;
535 
536  // Number of bases in the stochastic system:
537  const size_type dim = m_A.block.dimension();
538  const size_type tile_size = m_A.block.tile_size();
539  const size_type max_num_rows = m_A.block.max_num_rows();
540 
541  const size_type nid = blockDim.x * blockDim.y * blockDim.z;
542  const size_type tid =
543  threadIdx.x + blockDim.x * ( threadIdx.y + blockDim.y * threadIdx.z );
544  const size_type thread_lane = threadIdx.x + blockDim.x * threadIdx.y;
545 
546  // blockDim.x == FEM column block-size
547  VectorScalar * const sh_x_k =
548  kokkos_impl_cuda_shared_memory<VectorScalar>();
549  //VectorScalar * const sh_x_j = sh_x_k + blockDim.x*tile_size;
550  VectorScalar * const sh_x_j = sh_x_k;
551  VectorScalar * const sh_A_k = sh_x_j + blockDim.x*tile_size;
552  //VectorScalar * const sh_A_j = sh_A_k + blockDim.x*tile_size;
553  VectorScalar * const sh_A_j = sh_A_k;
554  VectorScalar * const sh_y = sh_A_j + blockDim.x*tile_size;
555  volatile VectorScalar * const sh_t = sh_y + tile_size;
556  //volatile VectorScalar * const sh_v = sh_t + nid;
557  size_type * const sh_j = (size_type*) (sh_t + nid);
558  size_type * const sh_k = (size_type*) (sh_j + nid);
559 
560  // blockIdx.x == row in the deterministic (finite element) system
561  // These are not coalesced, but there is nothing we can do about it
562  // since we only do one row per block
563  const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
564  const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
565  size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / blockDim.x;
566  const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % blockDim.x;
567  if (remBlock > 0) ++numBlock;
568 
569  // Zero y
570  for (size_type i=tid; i<dim; i+=nid) {
571  m_y(i,blockIdx.x) = 0.0;
572  }
573 
574  // Number of Cijk tiles
575  const size_type n_tile = m_A.block.num_tiles();
576 
577  // Loop over Cijk tiles
578  for (size_type tile = 0; tile<n_tile; ++tile) {
579 
580  // These are not coalesced
581  const size_type i_offset = m_A.block.offset(tile, 0);
582  const size_type j_offset = m_A.block.offset(tile, 1);
583  const size_type k_offset = m_A.block.offset(tile, 2);
584  const size_type i_range = m_A.block.range(tile, 0);
585  const size_type j_range = m_A.block.range(tile, 1);
586  const size_type k_range = m_A.block.range(tile, 2);
587  const size_type n_row = m_A.block.num_rows(tile);
588 
589  // Zero y
590  for (size_type i=tid; i<i_range; i+=nid) {
591  sh_y[i] = 0.0;
592  }
593 
594  // Loop over finite element column blocks.
595  // threadIdx.x == FEM column within current block
596  size_type iBlockEntry = iBlockEntryBeg;
597  for (size_type block=0; block<numBlock;
598  ++block, iBlockEntry+=blockDim.x) {
599 
600  const size_type block_size =
601  (block == numBlock-1 && remBlock > 0) ? remBlock : blockDim.x;
602 
603  // Coalesced read blocks of X and A into shared memory
604  for (size_type col=0; col<block_size; ++col) {
605 
606  // This is not a coalesced read
607  const size_type iBlockColumn =
608  m_A.graph.entries( iBlockEntry + col );
609 
610  const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
611  const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
612  const MatrixScalar * const A_k =
613  & m_A.values( k_offset , iBlockEntry + col );
614  const MatrixScalar * const A_j =
615  & m_A.values( j_offset , iBlockEntry + col );
616  for (size_type j=tid; j<j_range; j+=nid) {
617  sh_x_j[col+j*blockDim.x] = x_j[j];
618  sh_A_j[col+j*blockDim.x] = A_j[j];
619  }
620  // for (size_type k=tid; k<k_range; k+=nid) {
621  // sh_x_k[col+k*blockDim.x] = x_k[k];
622  // sh_A_k[col+k*blockDim.x] = A_k[k];
623  // }
624 
625  }
626 
627  __syncthreads(); // wait for X and A to be read
628 
629  // Loop over stochastic rows in this tile
630  for (size_type i=threadIdx.z; i<i_range; i+=blockDim.z) {
631  VectorScalar s = 0;
632 
633  // Product tensor entries which this warp will iterate:
634  // These are not coalesced
635  const size_type lBeg = m_A.block.entry_begin(tile, i);
636  const size_type lEnd = m_A.block.entry_end(tile, i);
637  // const size_type lBeg = tensor_row_begin[tile*(max_num_rows+1)+i];
638  // const size_type lEnd = tensor_row_begin[tile*(max_num_rows+1)+i+1];
639 
640  // Loop through sparse tensor contributions with coalesced reads.
641  for (size_type l=lBeg; l<lEnd; l+=WarpSize) {
642  //for (size_type l=lBeg+threadIdx.y; l<lEnd; l+=blockDim.y) {
643 
644  // Read entries from the tensor
645  if (l+thread_lane < lEnd) {
646  sh_j[tid] = m_A.block.coord(l+thread_lane,0);
647  sh_k[tid] = m_A.block.coord(l+thread_lane,1);
648  sh_t[tid] = m_A.block.value(l+thread_lane);
649  }
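 // Staging step: each warp (one per threadIdx.z slice, since
 // blockDim.x*blockDim.y == WarpSize) copies up to WarpSize packed
 // (j, k, value) tensor entries into its own 32-entry window of
 // sh_j/sh_k/sh_t with coalesced reads; the loop below then replays those
 // entries so every FEM column lane (threadIdx.x) reuses them from shared
 // memory.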
650 
651  const int mm = (l+WarpSize<lEnd) ? WarpSize : lEnd-l;
652  for (size_type m=threadIdx.y; m<mm; m+=blockDim.y) {
653 
654  if (threadIdx.x < block_size) {
655  // const int j = m_A.block.coord(l+m,0);
656  // const int k = m_A.block.coord(l+m,1);
657  // const MatrixScalar v = m_A.block.value(l+m);
658  const size_type j = sh_j[m+WarpSize*threadIdx.z];
659  const size_type k = sh_k[m+WarpSize*threadIdx.z];
660  const TensorScalar v = sh_t[m+WarpSize*threadIdx.z];
661  const size_type jj = threadIdx.x + j*blockDim.x;
662  const size_type kk = threadIdx.x + k*blockDim.x;
663  s += v * ( sh_A_j[jj] * sh_x_k[kk] + sh_A_k[kk] * sh_x_j[jj] );
664  }
665 
666  }
667 
668  }
669 
670  // Reduction of 'y' within 'CudaTraits::WarpSize'
671  sh_t[tid] = s;
672  if ( thread_lane + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
673  if ( thread_lane + 8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
674  if ( thread_lane + 4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
675  if ( thread_lane + 2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
676  if ( thread_lane + 1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
677  if ( thread_lane == 0 ) sh_y[i] += sh_t[tid];
678 
679  }
680 
681  }
682 
683  // Wait for all threads to complete the tile
684  __syncthreads();
685 
686  // Store partial sum for this tile back in global memory
687  for (size_type i=tid; i<i_range; i+=nid) {
688  m_y( i+i_offset , blockIdx.x ) += sh_y[i];
689  }
690  }
691 
692  }
693  };
694 
695  //------------------------------------
696 
697  static void apply( const matrix_type & A ,
698  const vector_type & x ,
699  const vector_type & y )
700  {
701  const size_type row_count = A.graph.row_map.extent(0) - 1;
702  const size_type tensor_dimension = A.block.dimension();
703  const size_type tile_size = A.block.tile_size();
704  const size_type num_tiles = A.block.num_tiles();
705  const size_type max_num_rows = A.block.max_num_rows();
706 
707  const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
708  const size_type block_size = 8;
709  const size_type nWarp = 16;
710  const dim3 dBlock( block_size, warp_size / block_size, nWarp );
711  const dim3 dGrid( row_count , 1 , 1 );
712 
713  const size_type shmem =
714  sizeof(VectorScalar) * ( (2*block_size+1)*tile_size +
715  dBlock.x*dBlock.y*dBlock.z ) +
716  sizeof(size_type) * 2 * dBlock.x*dBlock.y*dBlock.z;
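 // Example (illustrative values only): with double scalars (8 bytes),
 // 4-byte size_type, block_size = 8, tile_size = 128 and
 // dBlock = (8,4,16) = 512 threads:
 //   shmem = 8*((2*8+1)*128 + 512) + 4*2*512 = 21504 + 4096 = 25600 bytes
 //   (~25 kB).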
717 
718  const size_type cmem = sizeof(size_type)*(max_num_rows+1)*num_tiles;
719 
720 #if 1
721 
722  std::cout << "Multiply< BlockCrsMatrix< TiledCrsProductTensor ... > >::apply"
723  << std::endl
724  << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
725  << " block(" << dBlock.x << "," << dBlock.y << "," << dBlock.z
726  << ")" << std::endl
727  << " block_size(" << block_size << ")" << std::endl
728  << " shmem(" << shmem/1024 << " kB)" << std::endl
729  << " row_count(" << row_count << ")" << std::endl
730  << " tensor_dimension(" << tensor_dimension << ")" << std::endl
731  << " tile_size(" << tile_size << ")" << std::endl
732  << " num_tiles(" << num_tiles << ")" << std::endl
733  << " cmem(" << cmem/1024 << " kB)" << std::endl
734  ;
735 #endif
736 
737  // Copy tensor row offsets to constant device memory
738  // cudaMemcpyToSymbol(tensor_row_begin, A.block.row_map_ptr(), cmem, 0,
739  // cudaMemcpyDeviceToDevice);
740 
741  //cudaProfilerStart();
742  Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
743  ( ProductTensorLoop( A , x , y ) );
744  //cudaProfilerStop();
745  }
746 };
747 
748 #endif
749 
750 //----------------------------------------------------------------------------
751 //----------------------------------------------------------------------------
752 
753 } // namespace Stokhos
754 
755 #endif /* #ifndef STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP */
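
For orientation, the following is a minimal sketch (not part of the header above) of how the compiled specialization is reached: given an already-assembled block CRS matrix and Kokkos multivectors of the types named in this file, the static apply() launches the tiled CUDA kernel. The wrapper name tiled_crs_multiply and the idea of exposing the call through a free function are assumptions for illustration only; only the template arguments and the static apply() interface come from the header itself.

// Hypothetical wrapper (illustration only): dispatches to the static apply()
// of the Multiply specialization defined in Stokhos_Cuda_TiledCrsProductTensor.hpp.
#include "Stokhos_Cuda_TiledCrsProductTensor.hpp"

template <typename TensorScalar, typename MatrixScalar, typename VectorScalar>
void tiled_crs_multiply(
  const Stokhos::BlockCrsMatrix<
          Stokhos::TiledCrsProductTensor<TensorScalar, Kokkos::Cuda>,
          MatrixScalar, Kokkos::Cuda >& A,
  const Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>& x,
  const Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>& y)
{
  typedef Stokhos::Multiply<
    Stokhos::BlockCrsMatrix<
      Stokhos::TiledCrsProductTensor<TensorScalar, Kokkos::Cuda>,
      MatrixScalar, Kokkos::Cuda >,
    Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
    Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> > multiply_type;

  // Launches one CUDA block per finite-element row; x and y store one
  // stochastic block per finite-element row in their second index.
  multiply_type::apply(A, x, y);
}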