Stokhos_Cuda_TiledCrsProductTensor.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #ifndef STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP
43 #define STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP
44 
45 #include <iostream>
46 
47 #include "Kokkos_Core.hpp"
48 
49 #include "Stokhos_Multiply.hpp"
50 #include "Stokhos_BlockCrsMatrix.hpp"
51 #include "Stokhos_TiledCrsProductTensor.hpp"
52 
53 #include "cuda_profiler_api.h"
54 
55 namespace Stokhos {
56 
57 #if 1
58 
59 //----------------------------------------------------------------------------
60 
61 template< typename TensorScalar ,
62  typename MatrixScalar ,
63  typename VectorScalar >
64 class Multiply<
65  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
66  MatrixScalar, Kokkos::Cuda >,
67  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
68  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
69 {
70 public:
71 
72  typedef Kokkos::Cuda execution_space ;
73  typedef execution_space::size_type size_type ;
74 
75  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
76  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
77  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;
78 
79  class ProductTensorLoop {
80  public:
81 
82  const matrix_type m_A;
83  const vector_type m_x;
84  const vector_type m_y;
85  const size_type m_block_size;
86 
87  ProductTensorLoop( const matrix_type & A,
88  const vector_type & x,
89  const vector_type & y,
90  const size_type block_size )
91  : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}
92 
93  __device__
94  void operator()(void) const
95  {
96  const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;
97 
98  // Number of bases in the stochastic system:
99  const size_type dim = m_A.block.dimension();
100 
101  // Number of Cijk tiles
102  const size_type n_tile = m_A.block.num_tiles();
103  const size_type tile_size = m_A.block.tile_size();
104  const size_type tile_dim = n_tile == 1 ? dim : tile_size;
105  //const size_type tile_dim = tile_size;
106 
107  VectorScalar * const sh_x_k =
108  kokkos_impl_cuda_shared_memory<VectorScalar>();
109  VectorScalar * const sh_x_j =
110  n_tile == 1 ? sh_x_k : sh_x_k + m_block_size*tile_dim;
111  VectorScalar * const sh_A_k =
112  sh_x_j + m_block_size*tile_dim;
113  VectorScalar * const sh_A_j =
114  n_tile == 1 ? sh_A_k : sh_A_k + m_block_size*tile_dim;
115  VectorScalar * const sh_y = sh_A_j + m_block_size*tile_dim;
116  volatile VectorScalar * const sh_t = sh_y + tile_dim;
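  // Shared-memory layout (an annotation of the pointer arithmetic above): the
  // j- and k-range blocks of x and A each hold m_block_size interleaved columns
  // of length tile_dim; when there is only one tile the k arrays alias the j
  // arrays, since j and k then span the full tensor dimension.  sh_y holds the
  // tile's partial result (tile_dim entries) and sh_t one scratch slot per
  // thread for the warp reductions below, so total usage is
  //   sizeof(VectorScalar) * ( (n_tile==1 ? 2 : 4)*m_block_size*tile_dim
  //                            + tile_dim + blockDim.x*blockDim.y ),
  // which matches the shmem value computed in apply().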
117 
118  const size_type nid = blockDim.x * blockDim.y;
119  const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
120 
121  // blockIdx.x == row in the deterministic (finite element) system
122  const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
123  const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
124  size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
125  const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
126  if (remBlock > 0) ++numBlock;
127 
128  // Zero y
129  for (size_type i=tid; i<dim; i+=nid) {
130  m_y(i,blockIdx.x) = 0.0;
131  }
132 
133  // Loop over Cijk tiles
134  for (size_type tile = 0; tile<n_tile; ++tile) {
135 
136  const size_type i_offset = m_A.block.offset(tile, 0);
137  const size_type j_offset = m_A.block.offset(tile, 1);
138  const size_type k_offset = m_A.block.offset(tile, 2);
139  const size_type i_range = m_A.block.range(tile, 0);
140  const size_type j_range = m_A.block.range(tile, 1);
141  const size_type k_range = m_A.block.range(tile, 2);
142  const size_type n_row = m_A.block.num_rows(tile);
143 
144  // Zero y
145  for (size_type i=tid; i<i_range; i+=nid) {
146  sh_y[i] = 0.0;
147  }
148 
149  // Loop over finite element column blocks.
150  size_type iBlockEntry = iBlockEntryBeg;
151  for (size_type block=0; block<numBlock;
152  ++block, iBlockEntry+=m_block_size) {
153 
154  const size_type block_size =
155  (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;
156 
157  // Wait for X and A to be used in the previous iteration
158  // before reading new values.
159  __syncthreads();
160 
161  // Coalesced read blocks of X and A into shared memory
162  for (size_type col=0; col<block_size; ++col) {
163 
164  const size_type iBlockColumn =
165  m_A.graph.entries( iBlockEntry + col );
166  const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
167  const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
168  const MatrixScalar * const A_k = & m_A.values( k_offset , iBlockEntry + col );
169  const MatrixScalar * const A_j = & m_A.values( j_offset , iBlockEntry + col );
170 
171  for (size_type j=tid; j<j_range; j+=nid) {
172  sh_x_j[col+j*m_block_size] = x_j[j];
173  sh_A_j[col+j*m_block_size] = A_j[j];
174  }
175  if (n_tile > 1) {
176  for (size_type k=tid; k<k_range; k+=nid) {
177  sh_x_k[col+k*m_block_size] = x_k[k];
178  sh_A_k[col+k*m_block_size] = A_k[k];
179  }
180  }
181 
182  }
183 
184  __syncthreads(); // wait for X and A to be read
185 
186  // Loop over stochastic rows in this tile
187  for (size_type i=threadIdx.y; i<i_range; i+=blockDim.y) {
188  VectorScalar s = 0;
189 
190  // Product tensor entries which this warp will iterate:
191  const size_type lBeg = m_A.block.entry_begin(tile, i);
192  const size_type lEnd = m_A.block.entry_end(tile, i);
193 
194  // Loop through sparse tensor contributions with coalesced reads.
195  for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
196 
197  const size_type kj = m_A.block.coord( l );
198  const TensorScalar v = m_A.block.value( l );
199  const size_type j = ( kj & 0x0ffff ) * m_block_size ;
200  const size_type k = ( kj >> 16 ) * m_block_size ;
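  // The tensor stores each tile-local (j,k) pair packed into a single integer:
  // the low 16 bits hold j and the upper bits hold k (hence the 0xffff mask and
  // the 16-bit shift), and both are scaled by m_block_size so they index
  // directly into the interleaved shared-memory blocks.  For example,
  // kj = 0x00050003 unpacks to j = 3*m_block_size and k = 5*m_block_size.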
201 
202  for ( size_type col = 0; col < block_size; ++col ) {
203  s += v * ( sh_A_j[col+j] * sh_x_k[col+k] + sh_A_k[col+k] * sh_x_j[col+j] );
204  }
205 
206  }
207 
208  // Reduction of 'y' within 'CudaTraits::WarpSize'
209  sh_t[tid] = s;
210  if ( threadIdx.x + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
211  if ( threadIdx.x + 8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
212  if ( threadIdx.x + 4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
213  if ( threadIdx.x + 2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
214  if ( threadIdx.x + 1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
215  if ( threadIdx.x == 0 ) sh_y[i] += sh_t[tid];
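  // This is a warp-synchronous tree reduction: each guard keeps only the lanes
  // whose partner (16, 8, 4, 2, or 1 slots away) still lies inside the warp's
  // segment of sh_t, the buffer is volatile so intermediate sums are re-read
  // from shared memory rather than cached in registers, and lane 0 finally
  // folds the warp's total for stochastic row i into sh_y[i].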
216 
217  }
218 
219  }
220 
221  // Wait for all threads to complete the tile
222  __syncthreads();
223 
224  // Store partial sum for this tile back in global memory
225  for (size_type i=tid; i<i_range; i+=nid) {
226  m_y( i+i_offset , blockIdx.x ) += sh_y[i];
227  }
228  }
229 
230  }
231  };
232 
233  //------------------------------------
234 
235  static void apply( const matrix_type & A ,
236  const vector_type & x ,
237  const vector_type & y )
238  {
239  const size_type row_count = A.graph.row_map.extent(0) - 1;
240  const size_type tensor_dimension = A.block.dimension();
241  const size_type tile_size = A.block.tile_size();
242  const size_type num_tiles = A.block.num_tiles();
243  //const size_type tile_dim = std::min(tensor_dimension, tile_size);
244  const size_type tile_dim = tile_size;
245 
246 #ifdef STOKHOS_DEBUG
247  const size_type nWarp = 12; // Use fewer warps in debug mode to prevent
248  // launch failures
249 #else
250  const size_type nWarp = 16;
251 #endif
252  const dim3 dBlock( Kokkos::Impl::CudaTraits::WarpSize , nWarp , 1 );
253  const dim3 dGrid( row_count , 1 , 1 );
254 
255  const size_type shmem_factor = num_tiles == 1 ? 2 : 4;
256  const size_type tensor_align = num_tiles == 1 ? tensor_dimension : tile_dim;
257  const size_type shcap =
258  Kokkos::Cuda().impl_internal_space_instance()->m_maxShmemPerBlock / 2;
259  size_type bs = ((shcap / sizeof(VectorScalar) - dBlock.x*dBlock.y) / tensor_align - 1) / shmem_factor;
260  if (bs % 2 == 0)
261  --bs;
262  const size_type block_size_max = 31;
263  const size_type block_size = std::min(bs, block_size_max);
264  // const int block_size = 9;
265  const size_type shmem =
266  sizeof(VectorScalar) * ((shmem_factor*block_size+1) * tensor_align + dBlock.x*dBlock.y);
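  // The launch parameters above pick the largest block size bs whose shared
  // memory footprint,
  //   sizeof(VectorScalar) * ( (shmem_factor*bs + 1)*tensor_align
  //                            + dBlock.x*dBlock.y ),
  // fits within half of the device's per-block shared-memory capacity, then
  // round it down to an odd value (presumably to stagger the interleaved
  // columns across shared-memory banks) and cap it at block_size_max = 31.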
267 
268  /*
269  //const size_type shcap = Kokkos::Impl::CudaTraits::SharedMemoryCapacity;
270  const size_type block_size = 9;
271  size_type shmem;
272  if (num_tiles > 1)
273  shmem = sizeof(VectorScalar) * ((4*block_size+1)*tile_dim + dBlock.x*dBlock.y);
274  else
275  shmem = sizeof(VectorScalar) * ((2*block_size+1)*tensor_dimension + dBlock.x*dBlock.y);
276  */
277 
278 #if 0
279 
280  std::cout << "Multiply< BlockCrsMatrix< TiledCrsProductTensor ... > >::apply"
281  << std::endl
282  << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
283  << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
284  << " block_size(" << block_size << ")" << std::endl
285  << " shmem(" << shmem/1024 << " kB)" << std::endl
286  << " row_count(" << row_count << ")" << std::endl
287  << " tensor_dimension(" << tensor_dimension << ")" << std::endl
288  << " tile_size(" << tile_size << ")" << std::endl
289  << " num_tiles(" << num_tiles << ")" << std::endl
290  ;
291 #endif
292  //cudaProfilerStart();
293  Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
294  ( ProductTensorLoop( A , x , y, block_size ) );
295  //cudaProfilerStop();
296  }
297 };
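// A minimal usage sketch (kept disabled): it assumes a block CRS matrix A and
// Kokkos views x, y of the types handled by the specialization above have
// already been assembled elsewhere; example_tiled_multiply is an illustrative
// name, not part of Stokhos.
#if 0
template< typename TensorScalar, typename MatrixScalar, typename VectorScalar >
void example_tiled_multiply(
  const BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                        MatrixScalar, Kokkos::Cuda >& A,
  const Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda >& x,
  const Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda >& y )
{
  typedef BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                          MatrixScalar, Kokkos::Cuda > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;

  // Dispatch to the specialization above: one CUDA block per deterministic
  // (finite element) row, each computing the full stochastic block y(:,row).
  Multiply< matrix_type, vector_type, vector_type >::apply( A, x, y );
}
#endif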
298 
299 #elif 0
300 
301 //----------------------------------------------------------------------------
302 
303 template< typename TensorScalar ,
304  typename MatrixScalar ,
305  typename VectorScalar >
306 class Multiply<
307  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
308  MatrixScalar, Kokkos::Cuda >,
309  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
310  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
311 {
312 public:
313 
314  typedef Kokkos::Cuda execution_space ;
315  typedef execution_space::size_type size_type ;
316 
317  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
318  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
319  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;
320 
321  class ProductTensorLoop {
322  public:
323 
324  const matrix_type m_A;
325  const vector_type m_x;
326  const vector_type m_y;
327  const size_type m_tile;
328 
329  ProductTensorLoop( const matrix_type & A,
330  const vector_type & x,
331  const vector_type & y,
332  const size_type tile )
333  : m_A( A ), m_x( x ), m_y( y ), m_tile(tile) {}
334 
335  __device__
336  void operator()(void) const
337  {
338  const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;
339 
340  // Number of bases in the stochastic system:
341  const size_type dim = m_A.block.dimension();
342  const size_type tile_size = m_A.block.tile_size();
343 
344  VectorScalar * const sh_x_k =
345  kokkos_impl_cuda_shared_memory<VectorScalar>();
346  VectorScalar * const sh_x_j = sh_x_k + tile_size;
347  VectorScalar * const sh_A_k = sh_x_j + tile_size;
348  VectorScalar * const sh_A_j = sh_A_k + tile_size;
349  VectorScalar * const sh_y = sh_A_j + tile_size;
350  volatile VectorScalar * const sh_t = sh_y + tile_size;
351 
352  const size_type nid = blockDim.x * blockDim.y;
353  const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
354 
355  // Divide warps into 4 groups for reading x_j, x_k, A_j, and A_k
356  // const size_type warps_per_group = blockDim.y / 4;
357  // const size_type group_lane = threadIdx.y % warps_per_group;
358  // const size_type gid = threadIdx.x + blockDim.x * group_lane;
359  // const size_type ngid = warps_per_group * blockDim.x;
360  // const size_type group = threadIdx.y / warps_per_group;
361 
362  const size_type i_offset = m_A.block.offset(m_tile, 0);
363  const size_type j_offset = m_A.block.offset(m_tile, 1);
364  const size_type k_offset = m_A.block.offset(m_tile, 2);
365  const size_type i_range = m_A.block.range(m_tile, 0);
366  const size_type j_range = m_A.block.range(m_tile, 1);
367  const size_type k_range = m_A.block.range(m_tile, 2);
368  const size_type n_row = m_A.block.num_rows(m_tile);
369 
370  // blockIdx.x == row in the deterministic (finite element) system
371  const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
372  const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
373 
374  // Zero y
375  for (size_type i=tid; i<i_range; i+=nid) {
376  sh_y[i] = 0.0;
377  }
378 
379  const size_type * __restrict cijk_j = &m_A.block.coord(0,0);
380  const size_type * __restrict cijk_k = &m_A.block.coord(0,1);
381  const TensorScalar * __restrict cijk_v = &m_A.block.value(0);
382 
383  // Loop over finite element column blocks.
384  for (size_type iBlockEntry = iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
385  ++iBlockEntry) {
386 
387  __syncthreads(); // wait for threads from previous iteration to finish
388 
389  // Coalesced read blocks of X and A into shared memory
390  const size_type iBlockColumn = m_A.graph.entries( iBlockEntry );
391  const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
392  const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
393  const MatrixScalar * const A_k = & m_A.values( k_offset , iBlockEntry );
394  const MatrixScalar * const A_j = & m_A.values( j_offset , iBlockEntry );
395 
396  for (size_type j=tid; j<j_range; j+=nid) {
397  sh_x_j[j] = x_j[j];
398  sh_A_j[j] = A_j[j];
399  }
400  for (size_type k=tid; k<k_range; k+=nid) {
401  sh_x_k[k] = x_k[k];
402  sh_A_k[k] = A_k[k];
403  }
404  // if (group == 0)
405  // for (size_type j=gid; j<j_range; j+=ngid) sh_x_j[j] = x_j[j];
406  // if (group == 1)
407  // for (size_type j=gid; j<j_range; j+=ngid) sh_A_j[j] = A_j[j];
408  // if (group == 2)
409  // for (size_type k=gid; k<k_range; k+=ngid) sh_x_k[k] = x_k[k];
410  // if (group == 3)
411  // for (size_type k=gid; k<k_range; k+=ngid) sh_A_k[k] = A_k[k];
412 
413  __syncthreads(); // wait for X and A to be read
414 
415  // Loop over stochastic rows in this tile
416  for (size_type i=threadIdx.y; i<i_range; i+=blockDim.y) {
417  VectorScalar s = 0;
418 
419  // Product tensor entries which this warp will iterate:
420  const size_type lBeg = m_A.block.entry_begin(m_tile, i);
421  const size_type lEnd = m_A.block.entry_end(m_tile, i);
422 
423  // Loop through sparse tensor contributions with coalesced reads.
424  for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
425 
426  // Read entries from the tensor
427  // const int j = m_A.block.coord(l,0);
428  // const int k = m_A.block.coord(l,1);
429  // const MatrixScalar v = m_A.block.value(l);
430  const size_type j = cijk_j[l];
431  const size_type k = cijk_k[l];
432  const TensorScalar v = cijk_v[l];
433 
434  s += v * ( sh_A_j[j] * sh_x_k[k] + sh_A_k[k] * sh_x_j[j] );
435 
436  }
437 
438  // Reduction of 'y' within 'CudaTraits::WarpSize'
439  sh_t[tid] = s;
440  if ( threadIdx.x + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
441  if ( threadIdx.x + 8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
442  if ( threadIdx.x + 4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
443  if ( threadIdx.x + 2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
444  if ( threadIdx.x + 1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
445  if ( threadIdx.x == 0 ) sh_y[i] += sh_t[tid];
446 
447  }
448 
449  }
450 
451  // Wait for all threads to complete the tile
452  __syncthreads();
453 
454  // Store partial sum for this tile back in global memory
455  for (size_type i=tid; i<i_range; i+=nid) {
456  m_y( i+i_offset , blockIdx.x ) += sh_y[i];
457  }
458 
459  }
460  };
461 
462  class Zero {
463  public:
464  typedef typename vector_type::value_type value_type;
465  typedef typename vector_type::execution_space execution_space;
466  typedef typename execution_space::size_type size_type;
467 
468  Zero( const vector_type & x ) : m_x( x ), m_d(x.extent(0)) {}
469 
470  KOKKOS_INLINE_FUNCTION
471  void operator()( const size_type j ) const {
472  for (size_type i=0; i<m_d; ++i)
473  m_x(i,j) = 0.0;
474  }
475 
476  private:
477  const vector_type m_x;
478  const size_type m_d;
479 
480  };
481 
482  //------------------------------------
483 
484  static void apply( const matrix_type & A ,
485  const vector_type & x ,
486  const vector_type & y )
487  {
488  const size_type row_count = A.graph.row_map.extent(0) - 1;
489  const size_type tensor_dimension = A.block.dimension();
490  const size_type tile_size = A.block.tile_size();
491  const size_type num_tiles = A.block.num_tiles();
492 
493  const size_type nWarp = 16;
494  const dim3 dBlock( Kokkos::Impl::CudaTraits::WarpSize , nWarp , 1 );
495  const dim3 dGrid( row_count , 1 , 1 );
496 
497  const size_type shmem =
498  sizeof(VectorScalar) * (5*tile_size + dBlock.x*dBlock.y);
499 
500 #if 1
501 
502  std::cout << "Multiply< BlockCrsMatrix< TiledCrsProductTensor ... > >::apply"
503  << std::endl
504  << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
505  << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
506  << " shmem(" << shmem/1024 << " kB)" << std::endl
507  << " row_count(" << row_count << ")" << std::endl
508  << " tensor_dimension(" << tensor_dimension << ")" << std::endl
509  << " tile_size(" << tile_size << ")" << std::endl
510  << " num_tiles(" << num_tiles << ")" << std::endl
511  ;
512 #endif
513  //cudaProfilerStart();
514  // Zero y
515  Kokkos::parallel_for( row_count , Zero(y) );
516 
517  // Loop over Cijk tiles
518  for (size_type tile = 0; tile<num_tiles; ++tile) {
519  Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
520  ( ProductTensorLoop( A , x , y, tile ) );
521  }
522  //cudaProfilerStop();
523  }
524 };
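// Note on this (disabled) variant: instead of looping over Cijk tiles inside a
// single kernel, it first zeroes y with a Kokkos::parallel_for over the rows
// and then launches one kernel per tile.  That shrinks the per-launch shared
// memory footprint to a single tile's worth of x, A, and y data, at the cost
// of num_tiles kernel launches per multiply.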
525 
526 #else
527 
528 //----------------------------------------------------------------------------
529 
530 // #define MAX_TENSOR_OFFSETS 2000
531 // __constant__ Kokkos::Cuda::size_type tensor_row_begin[MAX_TENSOR_OFFSETS];
532 
533 template< typename TensorScalar ,
534  typename MatrixScalar ,
535  typename VectorScalar >
536 class Multiply<
537  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
538  MatrixScalar, Kokkos::Cuda >,
539  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
540  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
541 {
542 public:
543 
544  typedef Kokkos::Cuda execution_space ;
545  typedef execution_space::size_type size_type ;
546 
547  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
548  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
549  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;
550 
551  class ProductTensorLoop {
552  public:
553 
554  const matrix_type m_A;
555  const vector_type m_x;
556  const vector_type m_y;
557 
558  ProductTensorLoop( const matrix_type & A,
559  const vector_type & x,
560  const vector_type & y )
561  : m_A( A ), m_x( x ), m_y( y ) {}
562 
563  __device__
564  void operator()(void) const
565  {
566  const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;
567 
568  // Number of bases in the stochastic system:
569  const size_type dim = m_A.block.dimension();
570  const size_type tile_size = m_A.block.tile_size();
571  const size_type max_num_rows = m_A.block.max_num_rows();
572 
573  const size_type nid = blockDim.x * blockDim.y * blockDim.z;
574  const size_type tid =
575  threadIdx.x + blockDim.x * ( threadIdx.y + blockDim.y * threadIdx.z );
576  const size_type thread_lane = threadIdx.x + blockDim.x * threadIdx.y;
577 
578  // blockDim.x == FEM column block-size
579  VectorScalar * const sh_x_k =
580  kokkos_impl_cuda_shared_memory<VectorScalar>();
581  //VectorScalar * const sh_x_j = sh_x_k + blockDim.x*tile_size;
582  VectorScalar * const sh_x_j = sh_x_k;
583  VectorScalar * const sh_A_k = sh_x_j + blockDim.x*tile_size;
584  //VectorScalar * const sh_A_j = sh_A_k + blockDim.x*tile_size;
585  VectorScalar * const sh_A_j = sh_A_k;
586  VectorScalar * const sh_y = sh_A_j + blockDim.x*tile_size;
587  volatile VectorScalar * const sh_t = sh_y + tile_size;
588  //volatile VectorScalar * const sh_v = sh_t + nid;
589  size_type * const sh_j = (size_type*) (sh_t + nid);
590  size_type * const sh_k = (size_type*) (sh_j + nid);
591 
592  // blockIdx.x == row in the deterministic (finite element) system
593  // These are not coalesced, but there is nothing we can do about it
594  // since we only do one row per block
595  const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
596  const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
597  size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / blockDim.x;
598  const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % blockDim.x;
599  if (remBlock > 0) ++numBlock;
600 
601  // Zero y
602  for (size_type i=tid; i<dim; i+=nid) {
603  m_y(i,blockIdx.x) = 0.0;
604  }
605 
606  // Number of Cijk tiles
607  const size_type n_tile = m_A.block.num_tiles();
608 
609  // Loop over Cijk tiles
610  for (size_type tile = 0; tile<n_tile; ++tile) {
611 
612  // These are not coalesced
613  const size_type i_offset = m_A.block.offset(tile, 0);
614  const size_type j_offset = m_A.block.offset(tile, 1);
615  const size_type k_offset = m_A.block.offset(tile, 2);
616  const size_type i_range = m_A.block.range(tile, 0);
617  const size_type j_range = m_A.block.range(tile, 1);
618  const size_type k_range = m_A.block.range(tile, 2);
619  const size_type n_row = m_A.block.num_rows(tile);
620 
621  // Zero y
622  for (size_type i=tid; i<i_range; i+=nid) {
623  sh_y[i] = 0.0;
624  }
625 
626  // Loop over finite element column blocks.
627  // threadIdx.x == FEM column within current block
628  size_type iBlockEntry = iBlockEntryBeg;
629  for (size_type block=0; block<numBlock;
630  ++block, iBlockEntry+=blockDim.x) {
631 
632  const size_type block_size =
633  (block == numBlock-1 && remBlock > 0) ? remBlock : blockDim.x;
634 
635  // Coalesced read blocks of X and A into shared memory
636  for (size_type col=0; col<block_size; ++col) {
637 
638  // This is not a coalesced read
639  const size_type iBlockColumn =
640  m_A.graph.entries( iBlockEntry + col );
641 
642  const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
643  const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
644  const MatrixScalar * const A_k =
645  & m_A.values( k_offset , iBlockEntry + col );
646  const MatrixScalar * const A_j =
647  & m_A.values( j_offset , iBlockEntry + col );
648  for (size_type j=tid; j<j_range; j+=nid) {
649  sh_x_j[col+j*blockDim.x] = x_j[j];
650  sh_A_j[col+j*blockDim.x] = A_j[j];
651  }
652  // for (size_type k=tid; k<k_range; k+=nid) {
653  // sh_x_k[col+k*blockDim.x] = x_k[k];
654  // sh_A_k[col+k*blockDim.x] = A_k[k];
655  // }
656 
657  }
658 
659  __syncthreads(); // wait for X and A to be read
660 
661  // Loop over stochastic rows in this tile
662  for (size_type i=threadIdx.z; i<i_range; i+=blockDim.z) {
663  VectorScalar s = 0;
664 
665  // Product tensor entries which this warp will iterate:
666  // These are not coalesced
667  const size_type lBeg = m_A.block.entry_begin(tile, i);
668  const size_type lEnd = m_A.block.entry_end(tile, i);
669  // const size_type lBeg = tensor_row_begin[tile*(max_num_rows+1)+i];
670  // const size_type lEnd = tensor_row_begin[tile*(max_num_rows+1)+i+1];
671 
672  // Loop through sparse tensor contributions with coalesced reads.
673  for (size_type l=lBeg; l<lEnd; l+=WarpSize) {
674  //for (size_type l=lBeg+threadIdx.y; l<lEnd; l+=blockDim.y) {
675 
676  // Read entries from the tensor
677  if (l+thread_lane < lEnd) {
678  sh_j[tid] = m_A.block.coord(l+thread_lane,0);
679  sh_k[tid] = m_A.block.coord(l+thread_lane,1);
680  sh_t[tid] = m_A.block.value(l+thread_lane);
681  }
682 
683  const int mm = (l+WarpSize<lEnd) ? WarpSize : lEnd-l;
684  for (size_type m=threadIdx.y; m<mm; m+=blockDim.y) {
685 
686  if (threadIdx.x < block_size) {
687  // const int j = m_A.block.coord(l+m,0);
688  // const int k = m_A.block.coord(l+m,1);
689  // const MatrixScalar v = m_A.block.value(l+m);
690  const size_type j = sh_j[m+WarpSize*threadIdx.z];
691  const size_type k = sh_k[m+WarpSize*threadIdx.z];
692  const TensorScalar v = sh_t[m+WarpSize*threadIdx.z];
693  const size_type jj = threadIdx.x + j*blockDim.x;
694  const size_type kk = threadIdx.x + k*blockDim.x;
695  s += v * ( sh_A_j[jj] * sh_x_k[kk] + sh_A_k[kk] * sh_x_j[jj] );
696  }
697 
698  }
699 
700  }
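  // Staging strategy of this variant: each warp cooperatively copies up to
  // WarpSize tensor entries (j, k, value) into its own segment of
  // sh_j/sh_k/sh_t, and the FEM-column lanes (threadIdx.x) then replay those
  // same entries against their own columns of A and x, amortizing each tensor
  // read over blockDim.x matrix columns.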
701 
702  // Reduction of 'y' within 'CudaTraits::WarpSize'
703  sh_t[tid] = s;
704  if ( thread_lane + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
705  if ( thread_lane + 8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
706  if ( thread_lane + 4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
707  if ( thread_lane + 2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
708  if ( thread_lane + 1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
709  if ( thread_lane == 0 ) sh_y[i] += sh_t[tid];
710 
711  }
712 
713  }
714 
715  // Wait for all threads to complete the tile
716  __syncthreads();
717 
718  // Store partial sum for this tile back in global memory
719  for (size_type i=tid; i<i_range; i+=nid) {
720  m_y( i+i_offset , blockIdx.x ) += sh_y[i];
721  }
722  }
723 
724  }
725  };
726 
727  //------------------------------------
728 
729  static void apply( const matrix_type & A ,
730  const vector_type & x ,
731  const vector_type & y )
732  {
733  const size_type row_count = A.graph.row_map.extent(0) - 1;
734  const size_type tensor_dimension = A.block.dimension();
735  const size_type tile_size = A.block.tile_size();
736  const size_type num_tiles = A.block.num_tiles();
737  const size_type max_num_rows = A.block.max_num_rows();
738 
739  const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
740  const size_type block_size = 8;
741  const size_type nWarp = 16;
742  const dim3 dBlock( block_size, warp_size / block_size, nWarp );
743  const dim3 dGrid( row_count , 1 , 1 );
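  // Thread-block shape for this variant: threadIdx.x indexes the FEM column
  // within a block of block_size columns, threadIdx.y the remaining lanes of
  // each warp (warp_size / block_size of them), and threadIdx.z one of nWarp
  // warps, which stride over the stochastic rows of the current tile.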
744 
745  const size_type shmem =
746  sizeof(VectorScalar) * ( (2*block_size+1)*tile_size +
747  dBlock.x*dBlock.y*dBlock.z ) +
748  sizeof(size_type) * 2 * dBlock.x*dBlock.y*dBlock.z;
749 
750  const size_type cmem = sizeof(size_type)*(max_num_rows+1)*num_tiles;
751 
752 #if 1
753 
754  std::cout << "Multiply< BlockCrsMatrix< TiledCrsProductTensor ... > >::apply"
755  << std::endl
756  << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
757  << " block(" << dBlock.x << "," << dBlock.y << "," << dBlock.z
758  << ")" << std::endl
759  << " block_size(" << block_size << ")" << std::endl
760  << " shmem(" << shmem/1024 << " kB)" << std::endl
761  << " row_count(" << row_count << ")" << std::endl
762  << " tensor_dimension(" << tensor_dimension << ")" << std::endl
763  << " tile_size(" << tile_size << ")" << std::endl
764  << " num_tiles(" << num_tiles << ")" << std::endl
765  << " cmem(" << cmem/1024 << " kB)" << std::endl
766  ;
767 #endif
768 
769  // Copy tensor row offsets to constant device memory
770  // cudaMemcpyToSymbol(tensor_row_begin, A.block.row_map_ptr(), cmem, 0,
771  // cudaMemcpyDeviceToDevice);
772 
773  //cudaProfilerStart();
774  Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
775  ( ProductTensorLoop( A , x , y ) );
776  //cudaProfilerStop();
777  }
778 };
779 
780 #endif
781 
782 //----------------------------------------------------------------------------
783 //----------------------------------------------------------------------------
784 
785 } // namespace Stokhos
786 
787 #endif /* #ifndef STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP */