Stokhos_Cuda_SimpleTiledCrsProductTensor.hpp
// @HEADER
// *****************************************************************************
// Stokhos Package
//
// Copyright 2009 NTESS and the Stokhos contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
#define STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_SimpleTiledCrsProductTensor.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {

//----------------------------------------------------------------------------

#if 0

// This version is slow because it requires more shared memory and has
// much more frequent reductions
template< typename TensorScalar ,
          typename MatrixScalar ,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< SimpleTiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space ;
  typedef execution_space::size_type size_type ;

  typedef SimpleTiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;

  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_block_size ;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y,
                       const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();
      const size_type max_tile_size = m_A.block.max_jk_tile_size();

      volatile VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_x_j = sh_x_k+m_block_size*max_tile_size;
      volatile VectorScalar * const sh_A_k = sh_x_j+m_block_size*max_tile_size;
      volatile VectorScalar * const sh_A_j = sh_A_k+m_block_size*max_tile_size;
      volatile VectorScalar * const sh_y = sh_A_j+m_block_size*max_tile_size;
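      // Shared-memory layout (sized and passed in by apply() below): four
      // interleaved blocks of m_block_size*max_tile_size entries holding the
      // j- and k-slices of x and A for up to m_block_size matrix columns,
      // followed by one block of at least i_size entries accumulating the
      // tile's contribution to y.  Entry (col, j) lives at
      // sh_*[col + j*m_block_size].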

      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // blockIdx.x == row in the deterministic (finite element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
      if (remBlock > 0) ++numBlock;
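      // Example: a finite element row with 10 block entries and m_block_size
      // == 3 gives numBlock == 4 column passes (3 + 3 + 3 + 1) with
      // remBlock == 1 entries handled in the final, partial pass.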

      // Loop over i tiles
      const size_type n_i_tile = m_A.block.num_i_tiles();
      for (size_type i_tile = 0; i_tile<n_i_tile; ++i_tile) {
        const size_type i_begin = m_A.block.i_begin(i_tile);
        const size_type i_size = m_A.block.i_size(i_tile);

        // Zero y
        for (size_type i=tid; i<i_size; i+=nid) {
          sh_y[i] = 0.0;
        }

        // Loop over finite element column blocks.
        size_type iBlockEntry = iBlockEntryBeg;
        for (size_type block=0; block<numBlock; ++block,
               iBlockEntry+=m_block_size) {
          const size_type block_size =
            (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;

          // Loop over j tiles
          const size_type n_j_tile = m_A.block.num_j_tiles(i_tile);
          for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
            const size_type j_begin = m_A.block.j_begin(i_tile, j_tile);
            const size_type j_size = m_A.block.j_size(i_tile, j_tile);

            // Wait for X and A to be used in the previous iteration
            // before reading new values.
            __syncthreads();

            // Coalesced read of j-blocks of X and A into shared memory
            for (size_type col=0; col<block_size; ++col) {
              const size_type iBlockColumn =
                m_A.graph.entries(iBlockEntry + col);
              const VectorScalar * const x_j =
                &m_x(j_begin, iBlockColumn);
              const MatrixScalar * const A_j =
                &m_A.values(j_begin, iBlockEntry + col);
              for (size_type j=tid; j<j_size; j+=nid) {
                sh_x_j[col+j*m_block_size] = x_j[j];
                sh_A_j[col+j*m_block_size] = A_j[j];
              }
            }
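            // The interleaved layout sh_x_j[col + j*m_block_size] keeps the
            // global loads coalesced (consecutive threads read consecutive j)
            // while grouping all columns of a given basis index j together for
            // the inner product below.  apply() forces an odd block size,
            // presumably to reduce shared-memory bank conflicts for this stride.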

            // Loop over k tiles
            const size_type n_k_tile = m_A.block.num_k_tiles(i_tile, j_tile);
            for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
              const size_type k_begin =
                m_A.block.k_begin(i_tile, j_tile, k_tile);
              const size_type k_size =
                m_A.block.k_size(i_tile, j_tile, k_tile);

              // Wait for X and A to be used in the previous iteration
              // before reading new values.
              __syncthreads();

              // Coalesced read of k-blocks of X and A into shared memory
              for (size_type col=0; col<block_size; ++col) {
                const size_type iBlockColumn =
                  m_A.graph.entries(iBlockEntry + col);
                const VectorScalar * const x_k =
                  &m_x(k_begin, iBlockColumn);
                const MatrixScalar * const A_k =
                  &m_A.values(k_begin, iBlockEntry + col);
                for (size_type k=tid; k<k_size; k+=nid) {
                  sh_x_k[col+k*m_block_size] = x_k[k];
                  sh_A_k[col+k*m_block_size] = A_k[k];
                }
              }

              __syncthreads(); // wait for X and A to be read

              // Loop over stochastic rows in this tile
              for (size_type i=threadIdx.y; i<i_size; i+=blockDim.y) {
                VectorScalar s = 0;

                // Product tensor entries which this warp will iterate:
                const size_type lBeg =
                  m_A.block.entry_begin(i_tile, j_tile, k_tile, i);
                const size_type lEnd =
                  m_A.block.entry_end(i_tile, j_tile, k_tile, i);

                // Loop through sparse tensor contributions with
                // coalesced reads.
                for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
                  const size_type kj = m_A.block.coord( l );
                  const TensorScalar v = m_A.block.value( l );
                  const size_type j = ( kj & 0x0ffff ) * m_block_size ;
                  const size_type k = ( kj >> 16 ) * m_block_size ;

                  for ( size_type col = 0; col < block_size; ++col ) {
                    s += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
                               sh_A_k[col+k] * sh_x_j[col+j] );
                  }
                }

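                // Warp-level tree reduction over the blockDim.x lanes sharing
                // this stochastic row: after the shfl_down cascade below,
                // lane 0 holds the row's partial sum and adds it into the
                // shared y block.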
                // Reduction of 'y' within 'blockDim.x'
                if (blockDim.x >= 2) s += shfl_down(s, 1, blockDim.x);
                if (blockDim.x >= 4) s += shfl_down(s, 2, blockDim.x);
                if (blockDim.x >= 8) s += shfl_down(s, 4, blockDim.x);
                if (blockDim.x >= 16) s += shfl_down(s, 8, blockDim.x);
                if (blockDim.x >= 32) s += shfl_down(s, 16, blockDim.x);
                if ( threadIdx.x == 0 ) sh_y[i] += s;

              } // i-loop

            } // k-tile loop

          } // j-tile loop

        } // block column loop

        // Wait for all threads to complete the i-tile
        __syncthreads();

        // Store sum for this tile back in global memory
        for (size_type i=tid; i<i_size; i+=nid) {
          m_y( i+i_begin , blockIdx.x ) = sh_y[i];
        }

      } // i-tile loop

    } // operator()

  }; // ProductTensorLoop

  //------------------------------------

  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type i_tile_size = A.block.max_i_tile_size();
    const size_type jk_tile_size = A.block.max_jk_tile_size();

#ifdef STOKHOS_DEBUG
    const size_type nWarp = 4; // Use fewer warps in debug mode to prevent
                               // launch failures
#else
    const size_type nWarp = 16;
#endif
    const size_type threads_per_row = 16;
    const size_type rows_per_warp =
      Kokkos::Impl::CudaTraits::WarpSize / threads_per_row;
    const dim3 dBlock( threads_per_row , rows_per_warp*nWarp , 1 );
    const dim3 dGrid( row_count , 1 , 1 );
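    // One CUDA block per finite element row.  Within a block, threadIdx.x is
    // the lane index within a stochastic row (threads_per_row == 16, so each
    // warp covers rows_per_warp == 32/16 == 2 rows) and threadIdx.y selects
    // the stochastic row, giving blockDim.y == rows_per_warp*nWarp rows
    // processed concurrently per block.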

    const size_type shcap =
      Kokkos::Cuda().impl_internal_space_instance()->m_deviceProp.sharedMemPerBlock / 2;
    size_type bs =
      (shcap / sizeof(VectorScalar) - i_tile_size) / (4*jk_tile_size);
    if (bs % 2 == 0) --bs;
    const size_type block_size_max = 31;
    const size_type block_size = std::min(bs, block_size_max);
    // const int block_size = 9;
    const size_type shmem =
      sizeof(VectorScalar) * (4*jk_tile_size*block_size + i_tile_size);
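    // Sizing example (hypothetical numbers; the real values are device- and
    // problem-dependent): with 48 kB of shared memory per block, shcap is
    // 24576 bytes; for double-precision VectorScalar and i_tile_size ==
    // jk_tile_size == 128 this gives bs == (3072 - 128)/512 == 5 (already
    // odd), so block_size == 5 and shmem == 8*(4*128*5 + 128) == 21504 bytes.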

    /*
    ProductTensorLoop kernel( A , x , y, block_size )
    int res;
    cuFuncGetAttribute(&res, CU_FUNC_ATTRIBUTE_NUM_REGS, &kernel.operator());
    */

#if 0

    std::cout << "Multiply< BlockCrsMatrix< SimpleTiledCrsProductTensor ... > >::apply"
              << std::endl
              << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << " block_size(" << block_size << ")" << std::endl
              << " shmem(" << shmem/1024 << " kB)" << std::endl
              << " row_count(" << row_count << ")" << std::endl
              << " tensor_dimension(" << tensor_dimension << ")" << std::endl
              << " i_tile_size(" << i_tile_size << ")" << std::endl
              << " jk_tile_size(" << jk_tile_size << ")" << std::endl
              ;
#endif
    //cudaProfilerStart();
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
      ( ProductTensorLoop( A , x , y, block_size ) );
    //cudaProfilerStop();
  }
};

#else

// This is currently the fastest version, but doesn't have fully coalesced
// reads of the sparse tensor, nor coalesced writes of y
template< typename TensorScalar ,
          typename MatrixScalar ,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< SimpleTiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space ;
  typedef execution_space::size_type size_type ;

  typedef SimpleTiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;

  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_block_size ;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y,
                       const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();
      const size_type max_tile_size = m_A.block.max_jk_tile_size();

      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
      // const size_type nid2 = nid / 2;
      // const bool lower = tid < nid2;
      // const size_type tid2 = lower ? tid : tid - nid2;

      volatile VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_x_j = sh_x_k+m_block_size*max_tile_size;
      volatile VectorScalar * const sh_A_k = sh_x_j+m_block_size*max_tile_size;
      volatile VectorScalar * const sh_A_j = sh_A_k+m_block_size*max_tile_size;
      //volatile size_type * const sh_c = (volatile size_type*) (sh_A_j + m_block_size*max_tile_size);
      __shared__ volatile size_type sh_c[32];
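      // Unlike the version above, no shared block is reserved for y here;
      // each thread keeps its partial sum in a register (s1 below).  The
      // statically sized sh_c caches the block-column indices of the current
      // group of matrix entries; 32 slots suffice because apply() caps the
      // column block size at 31.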

      // blockIdx.y == row in the deterministic (finite element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.y ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.y + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
      if (remBlock > 0) ++numBlock;

      // blockIdx.x == i_tile
      const size_type i_tile = blockIdx.x;
      const size_type i_begin = m_A.block.i_begin(i_tile);
      const size_type i_size = m_A.block.i_size(i_tile);
      const size_type i1 = threadIdx.y;
      //const size_type i2 = threadIdx.y + blockDim.y;
      if (i1 >= i_size) return;
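      // Here the i-tile is mapped onto the grid (one block per (i_tile, FE
      // row) pair) rather than looped over inside the kernel, and each thread
      // owns exactly one stochastic row i1 of the tile, accumulating its
      // contribution across all column blocks and j/k tiles before a single
      // reduction and write at the end.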

      VectorScalar s1 = 0;
      //VectorScalar s2 = 0;

      // Loop over finite element column blocks.
      size_type iBlockEntry = iBlockEntryBeg;
      for (size_type block=0; block<numBlock; ++block,
             iBlockEntry+=m_block_size) {
        const size_type block_size =
          (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;

        __syncthreads();
        // for (size_type col=tid; col<block_size; col+=nid)
        //   sh_c[col] = m_A.graph.entries(iBlockEntry + col);
        if (tid == 0) {
          for (size_type col=0; col<block_size; ++col)
            sh_c[col] = m_A.graph.entries(iBlockEntry + col);
        }

        // Loop over j tiles
        const size_type n_j_tile = m_A.block.num_j_tiles(i_tile);
        for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
          const size_type j_begin = m_A.block.j_begin(i_tile, j_tile);
          const size_type j_size = m_A.block.j_size(i_tile, j_tile);

          // Wait for X and A to be used in the previous iteration
          // before reading new values.
          __syncthreads();

          // Coalesced read of j-blocks of X and A into shared memory
          for (size_type col=0; col<block_size; ++col) {
            const VectorScalar * const x_j = &m_x(j_begin, sh_c[col]);
            const MatrixScalar * const A_j = &m_A.values(j_begin, iBlockEntry + col);
            for (size_type j=tid; j<j_size; j+=nid) {
              sh_x_j[col+j*m_block_size] = x_j[j];
              sh_A_j[col+j*m_block_size] = A_j[j];
            }
            // if (lower) {
            //   for (size_type j=tid2; j<j_size; j+=nid2)
            //     sh_x_j[col+j*m_block_size] = x_j[j];
            // }
            // else {
            //   for (size_type j=tid2; j<j_size; j+=nid2)
            //     sh_A_j[col+j*m_block_size] = A_j[j];
            // }
          }

          // Loop over k tiles
          const size_type n_k_tile = m_A.block.num_k_tiles(i_tile, j_tile);
          for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
            const size_type k_begin =
              m_A.block.k_begin(i_tile, j_tile, k_tile);
            const size_type k_size =
              m_A.block.k_size(i_tile, j_tile, k_tile);

            // Wait for X and A to be used in the previous iteration
            // before reading new values.
            __syncthreads();

            // Coalesced read of k-blocks of X and A into shared memory
            for (size_type col=0; col<block_size; ++col) {
              const VectorScalar * const x_k =
                &m_x(k_begin, sh_c[col]);
              const MatrixScalar * const A_k =
                &m_A.values(k_begin, iBlockEntry + col);
              for (size_type k=tid; k<k_size; k+=nid) {
                sh_x_k[col+k*m_block_size] = x_k[k];
                sh_A_k[col+k*m_block_size] = A_k[k];
              }
              // if (lower) {
              //   for (size_type k=tid2; k<k_size; k+=nid2)
              //     sh_x_k[col+k*m_block_size] = x_k[k];
              // }
              // else {
              //   for (size_type k=tid2; k<k_size; k+=nid2)
              //     sh_A_k[col+k*m_block_size] = A_k[k];
              // }
            }

            __syncthreads(); // wait for X and A to be read

            // Product tensor entries which this warp will iterate:
            size_type lBeg =
              m_A.block.entry_begin(i_tile, j_tile, k_tile, i1);
            size_type lEnd =
              m_A.block.entry_end(i_tile, j_tile, k_tile, i1);

            // Loop through sparse tensor contributions with coalesced reads
            // (these are not fully coalesced unless blockDim.x >= 16 for
            // double precision; the tensor entries would need to be reordered
            // to do better with fewer threads per row).
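            // Each tensor entry packs its (j,k) pair into one word: j in the
            // low 16 bits, k in the high 16 bits.  Scaling by m_block_size
            // converts them into offsets into the interleaved shared arrays,
            // and the two-term update below exploits the j/k symmetry of the
            // product tensor.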
            for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
              const size_type kj = m_A.block.coord( l );
              const TensorScalar v = m_A.block.value( l );
              const size_type j = ( kj & 0x0ffff ) * m_block_size ;
              const size_type k = ( kj >> 16 ) * m_block_size ;

              for ( size_type col = 0; col < block_size; ++col ) {
                s1 += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
                            sh_A_k[col+k] * sh_x_j[col+j] );
              }
            }

            // if (i2 < i_size) {
            //   // Product tensor entries which this warp will iterate:
            //   size_type lBeg =
            //     m_A.block.entry_begin(i_tile, j_tile, k_tile, i2);
            //   size_type lEnd =
            //     m_A.block.entry_end(i_tile, j_tile, k_tile, i2);

            //   // Loop through sparse tensor contributions with
            //   // coalesced reads.
            //   for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
            //     const size_type kj = m_A.block.coord( l );
            //     const TensorScalar v = m_A.block.value( l );
            //     const size_type j = ( kj & 0x0ffff ) * m_block_size ;
            //     const size_type k = ( kj >> 16 ) * m_block_size ;

            //     for ( size_type col = 0; col < block_size; ++col ) {
            //       s2 += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
            //                   sh_A_k[col+k] * sh_x_j[col+j] );
            //     }
            //   }
            // }

          } // k-tile loop

        } // j-tile loop

      } // block column loop

      // Wait for all threads to complete the i-tile
      //__syncthreads();

      // Reduction of 'y' within 'blockDim.x'
      if (blockDim.x >= 2) s1 += shfl_down(s1, 1, blockDim.x);
      if (blockDim.x >= 4) s1 += shfl_down(s1, 2, blockDim.x);
      if (blockDim.x >= 8) s1 += shfl_down(s1, 4, blockDim.x);
      if (blockDim.x >= 16) s1 += shfl_down(s1, 8, blockDim.x);
      if (blockDim.x >= 32) s1 += shfl_down(s1, 16, blockDim.x);
      if ( threadIdx.x == 0 ) m_y( i1+i_begin , blockIdx.y ) = s1;

      // if (i2 < i_size) {
      //   if (blockDim.x >= 2) s2 += shfl_down(s2, 1, blockDim.x);
      //   if (blockDim.x >= 4) s2 += shfl_down(s2, 2, blockDim.x);
      //   if (blockDim.x >= 8) s2 += shfl_down(s2, 4, blockDim.x);
      //   if (blockDim.x >= 16) s2 += shfl_down(s2, 8, blockDim.x);
      //   if (blockDim.x >= 32) s2 += shfl_down(s2, 16, blockDim.x);
      //   if ( threadIdx.x == 0 ) m_y( i2+i_begin , blockIdx.y ) = s2;
      // }

    } // operator()

  }; // ProductTensorLoop

  //------------------------------------

  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tile_size = A.block.max_jk_tile_size();
    const size_type n_i_tile = A.block.num_i_tiles();

#ifdef STOKHOS_DEBUG
    const size_type nWarp = 2; // Use fewer warps in debug mode to prevent
                               // launch failures
#else
    const size_type nWarp = 16;
#endif
    const size_type threads_per_row = 4;
    const size_type rows_per_warp =
      Kokkos::Impl::CudaTraits::WarpSize / threads_per_row;
    const dim3 dBlock( threads_per_row , rows_per_warp*nWarp , 1 );
    const dim3 dGrid( n_i_tile , row_count , 1 );
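    // One CUDA block per (i-tile, finite element row) pair.  With
    // threads_per_row == 4 and a warp size of 32, rows_per_warp == 8, so each
    // block covers blockDim.y == 8*nWarp stochastic rows of the i-tile; rows
    // beyond i_size simply exit at the top of the kernel.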

    const size_type shcap =
      Kokkos::Cuda().impl_internal_space_instance()->m_deviceProp.sharedMemPerBlock / 2;
    size_type bs = ((shcap / sizeof(VectorScalar)) / tile_size) / 4;
    if (bs % 2 == 0) --bs;
    const size_type block_size_max = 31;
    const size_type block_size = std::min(bs, block_size_max);
    // const int block_size = 9;
    const size_type shmem =
      sizeof(VectorScalar)*(4*block_size*tile_size); // + sizeof(size_type)*block_size_max;
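    // Sizing example (hypothetical numbers; actual values depend on the
    // device and the tensor tiling): with 48 kB of shared memory per block,
    // shcap is 24576 bytes; for double-precision VectorScalar and tile_size
    // == 128, bs == ((24576/8)/128)/4 == 6, rounded down to the odd value 5,
    // so block_size == 5 and shmem == 8*(4*5*128) == 20480 bytes.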

    // Ideas to try: sort rows based on the number of non-zeros, split threads
    // between the A and x reads, and handle the diagonals properly.
    // (Splitting threads between the A and x reads did not seem to improve
    // things.)

    /*
    ProductTensorLoop kernel( A , x , y, block_size )
    int res;
    cuFuncGetAttribute(&res, CU_FUNC_ATTRIBUTE_NUM_REGS, &kernel.operator());
    */

#if 0

    std::cout << "Multiply< BlockCrsMatrix< SimpleTiledCrsProductTensor ... > >::apply"
              << std::endl
              << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << " block_size(" << block_size << ")" << std::endl
              << " shmem(" << shmem/1024 << " kB)" << std::endl
              << " row_count(" << row_count << ")" << std::endl
              << " tensor_dimension(" << tensor_dimension << ")" << std::endl
              << " tile_size(" << tile_size << ")" << std::endl
              << " num_i_tiles(" << n_i_tile << ")" << std::endl
              ;
#endif
    //cudaProfilerStart();
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
      ( ProductTensorLoop( A , x , y, block_size ) );
    //cudaProfilerStop();
  }
};

#endif

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

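// Usage sketch (illustrative; the concrete scalar types below are assumptions
// for the example, not mandated by this header):
//
//   typedef Stokhos::SimpleTiledCrsProductTensor<double, Kokkos::Cuda> tensor_type;
//   typedef Stokhos::BlockCrsMatrix<tensor_type, double, Kokkos::Cuda> matrix_type;
//   typedef Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::Cuda>   vector_type;
//
//   // ... given an assembled matrix_type A and vectors x, y ...
//   Stokhos::Multiply<matrix_type, vector_type, vector_type>::apply( A, x, y );
//
// In practice this specialization is reached through the generic multiply
// entry points declared in Stokhos_Multiply.hpp rather than called directly.
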
} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP */