Stokhos_Cuda_SimpleTiledCrsProductTensor.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #ifndef STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
43 #define STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
44 
45 #include <iostream>
46 
47 #include "Kokkos_Core.hpp"
48 
49 #include "Stokhos_Multiply.hpp"
50 #include "Stokhos_BlockCrsMatrix.hpp"
51 #include "Stokhos_SimpleTiledCrsProductTensor.hpp"
52 
54 
55 #include "cuda_profiler_api.h"
56 
57 namespace Stokhos {
58 
59 //----------------------------------------------------------------------------
60 
61 #if 0
62 
63 // This version is slow because it requires more shared memory and has
64 // much more frequent reductions
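// (It stages five shared-memory panels, x and A for both the j- and
// k-tiles plus the sh_y accumulator, and performs a shuffle reduction
// into sh_y after every k-tile, whereas the version below keeps the
// running sum in a register and reduces only once per stochastic row.)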
65 template< typename TensorScalar ,
66  typename MatrixScalar ,
67  typename VectorScalar >
68 class Multiply<
69  BlockCrsMatrix< SimpleTiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
70  MatrixScalar, Kokkos::Cuda >,
71  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
72  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
73 {
74 public:
75 
76  typedef Kokkos::Cuda execution_space ;
77  typedef execution_space::size_type size_type ;
78 
79  typedef SimpleTiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
80  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
81  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;
82 
83  class ProductTensorLoop {
84  public:
85 
86  const matrix_type m_A;
87  const vector_type m_x;
88  const vector_type m_y;
89  const size_type m_block_size ;
90 
91  ProductTensorLoop( const matrix_type & A,
92  const vector_type & x,
93  const vector_type & y,
94  const size_type block_size )
95  : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}
96 
97  __device__
98  void operator()(void) const
99  {
100  // Number of bases in the stochastic system:
101  const size_type dim = m_A.block.dimension();
102  const size_type max_tile_size = m_A.block.max_jk_tile_size();
103 
104  volatile VectorScalar * const sh_x_k =
105  kokkos_impl_cuda_shared_memory<VectorScalar>();
106  volatile VectorScalar * const sh_x_j = sh_x_k+m_block_size*max_tile_size;
107  volatile VectorScalar * const sh_A_k = sh_x_j+m_block_size*max_tile_size;
108  volatile VectorScalar * const sh_A_j = sh_A_k+m_block_size*max_tile_size;
109  volatile VectorScalar * const sh_y = sh_A_j+m_block_size*max_tile_size;
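 // Partition of the dynamically allocated shared memory: four panels of
 // m_block_size*max_tile_size scalars (x and A staged for the k- and
 // j-tiles) followed by an i-tile-sized accumulator sh_y; apply() sizes
 // the kernel's shmem argument from the same formula.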
110 
111  const size_type nid = blockDim.x * blockDim.y;
112  const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
113 
114  // blockIdx.x == row in the deterministic (finite element) system
115  const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
116  const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
117  size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
118  const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
119  if (remBlock > 0) ++numBlock;
120 
121  // Loop over i tiles
122  const size_type n_i_tile = m_A.block.num_i_tiles();
123  for (size_type i_tile = 0; i_tile<n_i_tile; ++i_tile) {
124  const size_type i_begin = m_A.block.i_begin(i_tile);
125  const size_type i_size = m_A.block.i_size(i_tile);
126 
127  // Zero y
128  for (size_type i=tid; i<i_size; i+=nid) {
129  sh_y[i] = 0.0;
130  }
131 
132  // Loop over finite element column blocks.
133  size_type iBlockEntry = iBlockEntryBeg;
134  for (size_type block=0; block<numBlock; ++block,
135  iBlockEntry+=m_block_size) {
136  const size_type block_size =
137  (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;
138 
139  // Loop over j tiles
140  const size_type n_j_tile = m_A.block.num_j_tiles(i_tile);
141  for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
142  const size_type j_begin = m_A.block.j_begin(i_tile, j_tile);
143  const size_type j_size = m_A.block.j_size(i_tile, j_tile);
144 
145  // Wait for X and A to be used in the previous iteration
146  // before reading new values.
147  __syncthreads();
148 
149  // Coalesced read j-blocks of X and A into shared memory
150  for (size_type col=0; col<block_size; ++col) {
151  const size_type iBlockColumn =
152  m_A.graph.entries(iBlockEntry + col);
153  const VectorScalar * const x_j =
154  &m_x(j_begin, iBlockColumn);
155  const MatrixScalar * const A_j =
156  &m_A.values(j_begin, iBlockEntry + col);
157  for (size_type j=tid; j<j_size; j+=nid) {
158  sh_x_j[col+j*m_block_size] = x_j[j];
159  sh_A_j[col+j*m_block_size] = A_j[j];
160  }
161  }
162 
163  // Loop over k tiles
164  const size_type n_k_tile = m_A.block.num_k_tiles(i_tile, j_tile);
165  for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
166  const size_type k_begin =
167  m_A.block.k_begin(i_tile, j_tile, k_tile);
168  const size_type k_size =
169  m_A.block.k_size(i_tile, j_tile, k_tile);
170 
171  // Wait for X and A to be used in the previous iteration
172  // before reading new values.
173  __syncthreads();
174 
175  // Coalesced read k-blocks of X and A into shared memory
176  for (size_type col=0; col<block_size; ++col) {
177  const size_type iBlockColumn =
178  m_A.graph.entries(iBlockEntry + col);
179  const VectorScalar * const x_k =
180  &m_x(k_begin, iBlockColumn);
181  const MatrixScalar * const A_k =
182  &m_A.values(k_begin, iBlockEntry + col);
183  for (size_type k=tid; k<k_size; k+=nid) {
184  sh_x_k[col+k*m_block_size] = x_k[k];
185  sh_A_k[col+k*m_block_size] = A_k[k];
186  }
187  }
188 
189  __syncthreads(); // wait for X and A to be read
190 
191  // Loop over stochastic rows in this tile
192  for (size_type i=threadIdx.y; i<i_size; i+=blockDim.y) {
193  VectorScalar s = 0;
194 
195  // Product tensor entries which this warp will iterate:
196  const size_type lBeg =
197  m_A.block.entry_begin(i_tile, j_tile, k_tile, i);
198  const size_type lEnd =
199  m_A.block.entry_end(i_tile, j_tile, k_tile, i);
200 
201  // Loop through sparse tensor contributions with
202  // coalesced reads.
203  for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
204  const size_type kj = m_A.block.coord( l );
205  const TensorScalar v = m_A.block.value( l );
206  const size_type j = ( kj & 0x0ffff ) * m_block_size ;
207  const size_type k = ( kj >> 16 ) * m_block_size ;
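 // Each tensor entry packs its tile-local (j,k) coordinates into one
 // 32-bit word: low 16 bits = j, high 16 bits = k. Both are pre-scaled
 // by m_block_size because the shared panels interleave the block_size
 // column entries for each j (or k) index.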
208 
209  for ( size_type col = 0; col < block_size; ++col ) {
210  s += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
211  sh_A_k[col+k] * sh_x_j[col+j] );
212  }
213  }
214 
215  // Reduction of 'y' within 'blockDim.x'
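 // (a shuffle-based tree reduction across the blockDim.x lanes; it
 // assumes blockDim.x is a power of two no larger than the warp size,
 // and leaves the row sum in lane 0)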
216  if (blockDim.x >= 2) s += shfl_down(s, 1, blockDim.x);
217  if (blockDim.x >= 4) s += shfl_down(s, 2, blockDim.x);
218  if (blockDim.x >= 8) s += shfl_down(s, 4, blockDim.x);
219  if (blockDim.x >= 16) s += shfl_down(s, 8, blockDim.x);
220  if (blockDim.x >= 32) s += shfl_down(s, 16, blockDim.x);
221  if ( threadIdx.x == 0 ) sh_y[i] += s;
222 
223  } // i-loop
224 
225  } // k-tile loop
226 
227  } // j-tile loop
228 
229  } // block column loop
230 
231  // Wait for all threads to complete the i-tile
232  __syncthreads();
233 
234  // Store sum for this tile back in global memory
235  for (size_type i=tid; i<i_size; i+=nid) {
236  m_y( i+i_begin , blockIdx.x ) = sh_y[i];
237  }
238 
239  } // i-tile loop
240 
241  } // operator()
242 
243  }; // ProductTensorLoop
244 
245  //------------------------------------
246 
247  static void apply( const matrix_type & A ,
248  const vector_type & x ,
249  const vector_type & y )
250  {
251  const size_type row_count = A.graph.row_map.extent(0) - 1;
252  const size_type tensor_dimension = A.block.dimension();
253  const size_type i_tile_size = A.block.max_i_tile_size();
254  const size_type jk_tile_size = A.block.max_jk_tile_size();
255 
256 #ifdef STOKHOS_DEBUG
257  const size_type nWarp = 4; // Use fewer warps in debug mode to prevent
258  // launch failures
259 #else
260  const size_type nWarp = 16;
261 #endif
262  const size_type threads_per_row = 16;
263  const size_type rows_per_warp =
264  Kokkos::Impl::CudaTraits::WarpSize / threads_per_row;
265  const dim3 dBlock( threads_per_row , rows_per_warp*nWarp , 1 );
266  const dim3 dGrid( row_count , 1 , 1 );
267 
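 // Choose the column block size from a budget of half the shared-memory
 // capacity: sizeof(VectorScalar)*(4*jk_tile_size*bs + i_tile_size) must
 // fit in shcap. bs is forced odd (presumably to avoid shared-memory bank
 // conflicts in the column-interleaved panels) and capped at 31, the
 // largest odd value below the warp size.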
268  const size_type shcap =
269  Kokkos::Impl::CudaTraits::SharedMemoryCapacity / 2;
270  size_type bs =
271  (shcap / sizeof(VectorScalar) - i_tile_size) / (4*jk_tile_size);
272  if (bs % 2 == 0) --bs;
273  const size_type block_size_max = 31;
274  const size_type block_size = std::min(bs, block_size_max);
275  // const int block_size = 9;
276  const size_type shmem =
277  sizeof(VectorScalar) * (4*jk_tile_size*block_size + i_tile_size);
278 
279  /*
280  ProductTensorLoop kernel( A , x , y, block_size )
281  int res;
282  cuFuncGetAttribute(&res, CU_FUNC_ATTRIBUTE_NUM_REGS, &kernel.operator());
283  */
284 
285 #if 0
286 
287  std::cout << "Multiply< BlockCrsMatrix< SimpleTiledCrsProductTensor ... > >::apply"
288  << std::endl
289  << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
290  << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
291  << " block_size(" << block_size << ")" << std::endl
292  << " shmem(" << shmem/1024 << " kB)" << std::endl
293  << " row_count(" << row_count << ")" << std::endl
294  << " tensor_dimension(" << tensor_dimension << ")" << std::endl
295  << " jk_tile_size(" << jk_tile_size << ")" << std::endl
296  << " num_i_tiles(" << A.block.num_i_tiles() << ")" << std::endl
297  ;
298 #endif
299  //cudaProfilerStart();
300  Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
301  ( ProductTensorLoop( A , x , y, block_size ) );
302  //cudaProfilerStop();
303  }
304 };
305 
306 #else
307 
308 // This is currently the fastest version, but doesn't have fully coalesced
309 // reads of the sparse tensor, nor coalesced writes of y
310 template< typename TensorScalar ,
311  typename MatrixScalar ,
312  typename VectorScalar >
313 class Multiply<
314  BlockCrsMatrix< SimpleTiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
315  MatrixScalar, Kokkos::Cuda >,
316  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
317  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
318 {
319 public:
320 
321  typedef Kokkos::Cuda execution_space ;
322  typedef execution_space::size_type size_type ;
323 
324  typedef SimpleTiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
325  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
326  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;
327 
328  class ProductTensorLoop {
329  public:
330 
331  const matrix_type m_A;
332  const vector_type m_x;
333  const vector_type m_y;
334  const size_type m_block_size ;
335 
336  ProductTensorLoop( const matrix_type & A,
337  const vector_type & x,
338  const vector_type & y,
339  const size_type block_size )
340  : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}
341 
342  __device__
343  void operator()(void) const
344  {
345  // Number of bases in the stochastic system:
346  const size_type dim = m_A.block.dimension();
347  const size_type max_tile_size = m_A.block.max_jk_tile_size();
348 
349  const size_type nid = blockDim.x * blockDim.y;
350  const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
351  // const size_type nid2 = nid / 2;
352  // const bool lower = tid < nid2;
353  // const size_type tid2 = lower ? tid : tid - nid2;
354 
355  volatile VectorScalar * const sh_x_k =
356  kokkos_impl_cuda_shared_memory<VectorScalar>();
357  volatile VectorScalar * const sh_x_j = sh_x_k+m_block_size*max_tile_size;
358  volatile VectorScalar * const sh_A_k = sh_x_j+m_block_size*max_tile_size;
359  volatile VectorScalar * const sh_A_j = sh_A_k+m_block_size*max_tile_size;
360  //volatile size_type * const sh_c = (volatile size_type*) (sh_A_j + m_block_size*max_tile_size);
361  __shared__ volatile size_type sh_c[32];
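 // sh_c caches the block-column indices of the current column block in
 // static shared memory; 32 entries suffice because apply() caps
 // block_size at 31.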
362 
363  // blockIdx.y == row in the deterministic (finite element) system
364  const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.y ];
365  const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.y + 1 ];
366  size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
367  const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
368  if (remBlock > 0) ++numBlock;
369 
370  // blockIdx.x == i_tile
371  const size_type i_tile = blockIdx.x;
372  const size_type i_begin = m_A.block.i_begin(i_tile);
373  const size_type i_size = m_A.block.i_size(i_tile);
374  const size_type i1 = threadIdx.y;
375  //const size_type i2 = threadIdx.y + blockDim.y;
376  if (i1 >= i_size) return;
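 // One threadIdx.y lane per stochastic row of this i-tile: i1 is the
 // tile-local row this thread accumulates, and threads beyond the tile's
 // row count exit immediately.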
377 
378  VectorScalar s1 = 0;
379  //VectorScalar s2 = 0;
380 
381  // Loop over finite element column blocks.
382  size_type iBlockEntry = iBlockEntryBeg;
383  for (size_type block=0; block<numBlock; ++block,
384  iBlockEntry+=m_block_size) {
385  const size_type block_size =
386  (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;
387 
388  __syncthreads();
389  // for (size_type col=tid; col<block_size; col+=nid)
390  // sh_c[col] = m_A.graph.entries(iBlockEntry + col);
391  if (tid == 0) {
392  for (size_type col=0; col<block_size; ++col)
393  sh_c[col] = m_A.graph.entries(iBlockEntry + col);
394  }
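 // Only thread 0 fills sh_c; the __syncthreads() inside the j-tile loop
 // below publishes the cached indices to the rest of the block before
 // they are read.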
395 
396  // Loop over j tiles
397  const size_type n_j_tile = m_A.block.num_j_tiles(i_tile);
398  for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
399  const size_type j_begin = m_A.block.j_begin(i_tile, j_tile);
400  const size_type j_size = m_A.block.j_size(i_tile, j_tile);
401 
402  // Wait for X and A to be used in the previous iteration
403  // before reading new values.
404  __syncthreads();
405 
406  // Coalesced read j-blocks of X and A into shared memory
407  for (size_type col=0; col<block_size; ++col) {
408  const VectorScalar * const x_j = &m_x(j_begin, sh_c[col]);
409  const MatrixScalar * const A_j = &m_A.values(j_begin, iBlockEntry + col);
410  for (size_type j=tid; j<j_size; j+=nid) {
411  sh_x_j[col+j*m_block_size] = x_j[j];
412  sh_A_j[col+j*m_block_size] = A_j[j];
413  }
414  // if (lower) {
415  // for (size_type j=tid2; j<j_size; j+=nid2)
416  // sh_x_j[col+j*m_block_size] = x_j[j];
417  // }
418  // else {
419  // for (size_type j=tid2; j<j_size; j+=nid2)
420  // sh_A_j[col+j*m_block_size] = A_j[j];
421  // }
422  }
423 
424  // Loop over k tiles
425  const size_type n_k_tile = m_A.block.num_k_tiles(i_tile, j_tile);
426  for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
427  const size_type k_begin =
428  m_A.block.k_begin(i_tile, j_tile, k_tile);
429  const size_type k_size =
430  m_A.block.k_size(i_tile, j_tile, k_tile);
431 
432  // Wait for X and A to be used in the previous iteration
433  // before reading new values.
434  __syncthreads();
435 
436  // Coalesced read k-blocks of X and A into shared memory
437  for (size_type col=0; col<block_size; ++col) {
438  const VectorScalar * const x_k =
439  &m_x(k_begin, sh_c[col]);
440  const MatrixScalar * const A_k =
441  &m_A.values(k_begin, iBlockEntry + col);
442  for (size_type k=tid; k<k_size; k+=nid) {
443  sh_x_k[col+k*m_block_size] = x_k[k];
444  sh_A_k[col+k*m_block_size] = A_k[k];
445  }
446  // if (lower) {
447  // for (size_type k=tid2; k<k_size; k+=nid2)
448  // sh_x_k[col+k*m_block_size] = x_k[k];
449  // }
450  // else {
451  // for (size_type k=tid2; k<k_size; k+=nid2)
452  // sh_A_k[col+k*m_block_size] = A_k[k];
453  // }
454  }
455 
456  __syncthreads(); // wait for X and A to be read
457 
458  // Product tensor entries which this warp will iterate:
459  size_type lBeg =
460  m_A.block.entry_begin(i_tile, j_tile, k_tile, i1);
461  size_type lEnd =
462  m_A.block.entry_end(i_tile, j_tile, k_tile, i1);
463 
464  // Loop through sparse tensor contributions with
465  // coalesced reads (these aren't fully coalesced unless
466  // blockDim.x >= 16 for double precision. We need to reorder
467  // the tensor entries for fewer threads per row).
468  for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
469  const size_type kj = m_A.block.coord( l );
470  const TensorScalar v = m_A.block.value( l );
471  const size_type j = ( kj & 0x0ffff ) * m_block_size ;
472  const size_type k = ( kj >> 16 ) * m_block_size ;
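 // (same 16-bit (j,k) packing and m_block_size scaling as in the
 // kernel variant above)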
473 
474  for ( size_type col = 0; col < block_size; ++col ) {
475  s1 += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
476  sh_A_k[col+k] * sh_x_j[col+j] );
477  }
478  }
479 
480  // if (i2 < i_size) {
481  // // Product tensor entries which this warp will iterate:
482  // size_type lBeg =
483  // m_A.block.entry_begin(i_tile, j_tile, k_tile, i2);
484  // size_type lEnd =
485  // m_A.block.entry_end(i_tile, j_tile, k_tile, i2);
486 
487  // // Loop through sparse tensor contributions with
488  // // coalesced reads.
489  // for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
490  // const size_type kj = m_A.block.coord( l );
491  // const TensorScalar v = m_A.block.value( l );
492  // const size_type j = ( kj & 0x0ffff ) * m_block_size ;
493  // const size_type k = ( kj >> 16 ) * m_block_size ;
494 
495  // for ( size_type col = 0; col < block_size; ++col ) {
496  // s2 += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
497  // sh_A_k[col+k] * sh_x_j[col+j] );
498  // }
499  // }
500  // }
501 
502  } // k-tile loop
503 
504  } // j-tile loop
505 
506  } // block column loop
507 
508  // Wait for all threads to complete the i-tile
509  //__syncthreads();
510 
511  // Reduction of 'y' within 'blockDim.x'
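 // (performed once per row, only after all column blocks and tiles; lane 0
 // then writes y directly, so these per-thread stores are the non-coalesced
 // writes of y noted in the comment above this specialization)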
512  if (blockDim.x >= 2) s1 += shfl_down(s1, 1, blockDim.x);
513  if (blockDim.x >= 4) s1 += shfl_down(s1, 2, blockDim.x);
514  if (blockDim.x >= 8) s1 += shfl_down(s1, 4, blockDim.x);
515  if (blockDim.x >= 16) s1 += shfl_down(s1, 8, blockDim.x);
516  if (blockDim.x >= 32) s1 += shfl_down(s1, 16, blockDim.x);
517  if ( threadIdx.x == 0 ) m_y( i1+i_begin , blockIdx.y ) = s1;
518 
519  // if (i2 < i_size) {
520  // if (blockDim.x >= 2) s2 += shfl_down(s2, 1, blockDim.x);
521  // if (blockDim.x >= 4) s2 += shfl_down(s2, 2, blockDim.x);
522  // if (blockDim.x >= 8) s2 += shfl_down(s2, 4, blockDim.x);
523  // if (blockDim.x >= 16) s2 += shfl_down(s2, 8, blockDim.x);
524  // if (blockDim.x >= 32) s2 += shfl_down(s2, 16, blockDim.x);
525  // if ( threadIdx.x == 0 ) m_y( i2+i_begin , blockIdx.y ) = s2;
526  // }
527 
528  } // operator()
529 
530  }; // ProductTensorLoop
531 
532  //------------------------------------
533 
534  static void apply( const matrix_type & A ,
535  const vector_type & x ,
536  const vector_type & y )
537  {
538  const size_type row_count = A.graph.row_map.extent(0) - 1;
539  const size_type tensor_dimension = A.block.dimension();
540  const size_type tile_size = A.block.max_jk_tile_size();
541  const size_type n_i_tile = A.block.num_i_tiles();
542 
543 #ifdef STOKHOS_DEBUG
544  const size_type nWarp = 2; // Use fewer warps in debug mode to prevent
545  // launch failures
546 #else
547  const size_type nWarp = 16;
548 #endif
549  const size_type threads_per_row = 4;
550  const size_type rows_per_warp =
551  Kokkos::Impl::CudaTraits::WarpSize / threads_per_row;
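 // With threads_per_row = 4, each warp covers 8 stochastic rows, but the
 // tensor reads in the kernel are not fully coalesced (see the comment
 // above the l-loop in operator()).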
552  const dim3 dBlock( threads_per_row , rows_per_warp*nWarp , 1 );
553  const dim3 dGrid( n_i_tile , row_count , 1 );
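 // blockIdx.x indexes the i-tile and blockIdx.y the finite-element row,
 // matching the kernel's use of blockIdx above.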
554 
555  const size_type shcap =
556  Kokkos::Impl::CudaTraits::SharedMemoryCapacity / 2;
557  size_type bs = ((shcap / sizeof(VectorScalar)) / tile_size) / 4;
558  if (bs % 2 == 0) --bs;
559  const size_type block_size_max = 31;
560  const size_type block_size = std::min(bs, block_size_max);
561  // const int block_size = 9;
562  const size_type shmem = sizeof(VectorScalar)*(4*block_size*tile_size); // + sizeof(size_type)*block_size_max;
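 // Dynamic shared memory holds only the four block_size*tile_size panels;
 // sh_c is statically allocated inside the kernel and is not counted here.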
563 
564  // Try sorting rows based on number of non-zeros, split threads between
565  // A, x reads, do the diagonals properly
566  // Splitting threads between A & x reads doesn't seem to improve things
567 
568  /*
569  ProductTensorLoop kernel( A , x , y, block_size )
570  int res;
571  cuFuncGetAttribute(&res, CU_FUNC_ATTRIBUTE_NUM_REGS, &kernel.operator());
572  */
573 
574 #if 0
575 
576  std::cout << "Multiply< BlockCrsMatrix< SimpleTiledCrsProductTensor ... > >::apply"
577  << std::endl
578  << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
579  << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
580  << " block_size(" << block_size << ")" << std::endl
581  << " shmem(" << shmem/1024 << " kB)" << std::endl
582  << " row_count(" << row_count << ")" << std::endl
583  << " tensor_dimension(" << tensor_dimension << ")" << std::endl
584  << " tile_size(" << tile_size << ")" << std::endl
585  << " num_i_tiles(" << n_i_tile << ")" << std::endl
586  ;
587 #endif
588  //cudaProfilerStart();
589  Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
590  ( ProductTensorLoop( A , x , y, block_size ) );
591  //cudaProfilerStop();
592  }
593 };
594 
595 #endif
596 
597 //----------------------------------------------------------------------------
598 //----------------------------------------------------------------------------
599 
600 } // namespace Stokhos
601 
602 #endif /* #ifndef STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP */