Stokhos_Cuda_CrsProductTensor.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #ifndef STOKHOS_CUDA_CRS_PRODUCT_TENSOR_HPP
43 #define STOKHOS_CUDA_CRS_PRODUCT_TENSOR_HPP
44 
45 #include <iostream>
46 
47 #include "Kokkos_Core.hpp"
48 
49 #include "Stokhos_Multiply.hpp"
50 #include "Stokhos_BlockCrsMatrix.hpp"
51 #include "Stokhos_CrsProductTensor.hpp"
52 
53 #include "Stokhos_Cuda_DeviceProp.hpp"
54 #include "Stokhos_Cuda_WarpShuffle.hpp"
55 
56 #include "Teuchos_TestForException.hpp"
57 
58 #include "cuda_profiler_api.h"
59 
60 namespace Stokhos {
61 
62 //----------------------------------------------------------------------------
63 
64 // Matrix-vector product specialization for CrsProductTensor layout
65 // To do:
66 // * Incorporate texture/read-only cache
67 // * Change tensor layout to allow coalesced reads with smaller
68 // threads/row (will probably only help smaller problem sizes)
69 // * Get average FEM entries/row from FEM graph (hardcoded to 27)
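//
// Parallelization scheme used below: one CUDA block is launched per row
// of the deterministic (finite element) graph (blockIdx.x); within a
// block, threadIdx.y strides over the rows of the stochastic (tensor)
// block and threadIdx.x strides over the sparse tensor entries of each
// stochastic row, while blocks of x and A columns are staged through
// shared memory BlockSize columns at a time.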
70 template< typename TensorScalar,
71  typename MatrixScalar,
72  typename VectorScalar >
73 class Multiply<
74  BlockCrsMatrix< CrsProductTensor< TensorScalar, Kokkos::Cuda >,
75  MatrixScalar, Kokkos::Cuda >,
76  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
77  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
78 {
79 public:
80 
81  typedef Kokkos::Cuda execution_space;
82  typedef execution_space::size_type size_type;
83 
84  typedef CrsProductTensor< TensorScalar, execution_space > tensor_type;
85  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
86  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;
87 
88 #define USE_LDG 0
89 
90 #if USE_LDG == 0
91 
92  // The multiply kernel
93  class MultiplyKernel {
94  public:
95 
96  const matrix_type m_A;
97  const vector_type m_x;
98  const vector_type m_y;
99  const size_type BlockSize;
100 
101  MultiplyKernel( const matrix_type & A,
102  const vector_type & x,
103  const vector_type & y,
104  const size_type block_size )
105  : m_A( A ), m_x( x ), m_y( y ), BlockSize(block_size) {}
106 
107  __device__
108  void operator()(void) const
109  {
110  // Number of bases in the stochastic system:
111  const size_type dim = m_A.block.dimension();
112 
113  // Get shared memory for loading x, A, and y
114  volatile VectorScalar * const sh_x =
115  kokkos_impl_cuda_shared_memory<VectorScalar>();
116  volatile MatrixScalar * const sh_A = sh_x + BlockSize*dim;
117  volatile VectorScalar * const sh_y = sh_A + BlockSize*dim;
118 #if !HAVE_CUDA_SHUFFLE
119  volatile VectorScalar * const sh_t = sh_y + dim;
120 #endif
121 
122  const size_type nid = blockDim.x * blockDim.y;
123  const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
124 
125  // Mask used for shuffle/warp-sync operations
126  const int mask = blockDim.x == 32 ? 0xffffffff :
127  ((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;
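 // Example: with blockDim.x == 16, 32/blockDim.x == 2 and threadIdx.y%2
 // selects the lower or upper half of the warp, giving a mask of
 // 0x0000ffff or 0xffff0000; with blockDim.x == 32 the full-warp mask
 // 0xffffffff is used.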
128 
129  // Zero y
130  for ( size_type i = tid; i < dim; i += nid ) {
131  sh_y[i] = 0.0;
132  }
133 
134  // Loop over columns in the discrete (finite element) system.
135  // blockIdx.x == row in the deterministic (finite element) system
136  const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
137  const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
138  for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
139  iBlockEntry += BlockSize) {
140  const size_type block_size =
141  (iBlockEntryEnd-iBlockEntry < BlockSize) ?
142  iBlockEntryEnd-iBlockEntry : BlockSize;
143 
144  // Wait for X and A to be used in the previous iteration
145  // before reading new values.
146  __syncthreads();
147 
148 // Coalesced read of blocks of X and A into shared memory
149  for ( size_type col = 0; col < block_size; ++col ) {
150 
151  const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
152  const VectorScalar * const x = & m_x( 0, iBlockColumn );
153  const MatrixScalar * const A = & m_A.values( 0, iBlockEntry + col );
154 
155  // Coalesced read by the whole block from global memory:
156  for ( size_type i = tid; i < dim; i += nid ) {
157  sh_x[col + i * BlockSize] = x[i]; // m_x( i, iBlockColumn );
158  sh_A[col + i * BlockSize] = A[i]; // m_A.values( i, iBlockEntry + col );
159  }
160 
161  }
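 // Note on the layout: element i of column 'col' is stored at
 // sh_x[col + i*BlockSize], so consecutive threads (consecutive i) read
 // consecutive global addresses of x and A, and the tensor loop below
 // can sweep the BlockSize columns of a fixed stochastic index with
 // unit stride in shared memory.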
162 
163  __syncthreads(); // wait for X and A to be read before using them
164 
165  // This cuda block is responsible for computing all values of 'y'
166  for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
167  VectorScalar y = 0;
168 
169  // Product tensor entries which this warp will iterate:
170  const size_type lBeg = m_A.block.entry_begin( i );
171  const size_type lEnd = m_A.block.entry_end( i );
172 
173  // Loop through sparse tensor contributions with coalesced reads.
174  for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {
175 
176  // Read 'blockDim.x' entries from the tensor (coalesced)
177  const size_type kj = m_A.block.coord( l );
178  const TensorScalar v = m_A.block.value( l );
179  const size_type j = ( kj & 0x0ffff ) * BlockSize ;
180  const size_type k = ( kj >> 16 ) * BlockSize ;
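 // The two 16-bit stochastic indices are packed into one 32-bit word
 // (low half = j, high half = k); e.g. kj = 0x00090005 unpacks to
 // j = 5*BlockSize and k = 9*BlockSize, pre-scaled to index the
 // interleaved shared-memory arrays above.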
181 
182  for ( size_type col = 0; col < block_size; ++col ) {
183  y += v * ( sh_A[col+j] * sh_x[col+k] +
184  sh_A[col+k] * sh_x[col+j] );
185  }
186 
187  }
188 
189  // Reduction of 'y' within 'blockDim.x'
190 #if HAVE_CUDA_SHUFFLE
191  if (blockDim.x >= 2) y += shfl_down(y, 1, blockDim.x, mask);
192  if (blockDim.x >= 4) y += shfl_down(y, 2, blockDim.x, mask);
193  if (blockDim.x >= 8) y += shfl_down(y, 4, blockDim.x, mask);
194  if (blockDim.x >= 16) y += shfl_down(y, 8, blockDim.x, mask);
195  if (blockDim.x >= 32) y += shfl_down(y, 16, blockDim.x, mask);
196  if ( threadIdx.x == 0 ) sh_y[i] += y;
197 #else
198  sh_t[ tid ] = y;
199  if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
200  sync_warp(mask);
201  if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
202  sync_warp(mask);
203  if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
204  sync_warp(mask);
205  if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
206  sync_warp(mask);
207  if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
208  sync_warp(mask);
209  if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
210 #endif
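 // Either branch above is a binary-tree reduction over the blockDim.x
 // lanes assigned to stochastic row 'i'; lane 0 (threadIdx.x == 0) ends
 // up with the complete sum and accumulates it into sh_y[i].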
211 
212  }
213 
214  }
215 
216  // Wait for all contributions of y to be completed
217  __syncthreads();
218 
219  // Store result back in global memory
220  for ( size_type i = tid; i < dim; i += nid ) {
221  m_y( i, blockIdx.x ) = sh_y[ i ];
222  }
223  }
224  };
225 
226 #elif USE_LDG == 1
227 
228  // The multiply kernel -- using read-only data cache for A
229 // Currently slower than the kernel above
230  class MultiplyKernel {
231  public:
232 
233  const matrix_type m_A;
234  const vector_type m_x;
235  const vector_type m_y;
236  const size_type BlockSize;
237 
238  MultiplyKernel( const matrix_type & A,
239  const vector_type & x,
240  const vector_type & y,
241  const size_type block_size )
242  : m_A( A ), m_x( x ), m_y( y ), BlockSize(block_size) {}
243 
244  __device__
245  void operator()(void) const
246  {
247  // Number of bases in the stochastic system:
248  const size_type dim = m_A.block.dimension();
249 
250  volatile VectorScalar * const sh_x =
251  kokkos_impl_cuda_shared_memory<VectorScalar>();
252  volatile VectorScalar * const sh_y = sh_x + BlockSize*dim;
253 #if !HAVE_CUDA_SHUFFLE
254  volatile VectorScalar * const sh_t = sh_y + dim;
255 #endif
256 
257  const size_type nid = blockDim.x * blockDim.y;
258  const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
259 
260  // Mask used for shuffle/warp-sync operations
261  const int mask = blockDim.x == 32 ? 0xffffffff :
262  ((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;
263 
264  // Zero y
265  for ( size_type i = tid; i < dim; i += nid ) {
266  sh_y[i] = 0.0;
267  }
268 
269  // Loop over columns in the discrete (finite element) system.
270  // blockIdx.x == row in the deterministic (finite element) system
271  const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
272  const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
273  for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
274  iBlockEntry += BlockSize) {
275  const size_type block_size =
276  (iBlockEntryEnd-iBlockEntry < BlockSize) ?
277  iBlockEntryEnd-iBlockEntry : BlockSize;
278 
279  // Wait for X and A to be used in the previous iteration
280  // before reading new values.
281  __syncthreads();
282 
283 // Coalesced read of blocks of X into shared memory
284  for ( size_type col = 0; col < block_size; ++col ) {
285 
286  const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
287  const VectorScalar * const x = & m_x( 0, iBlockColumn );
288 
289  // Coalesced read by the whole block from global memory:
290  for ( size_type i = tid; i < dim; i += nid ) {
291  sh_x[col + i * BlockSize] = x[i]; // m_x( i, iBlockColumn );
292  }
293 
294  }
295 
296  __syncthreads(); // wait for X to be read before using it
297 
298  // This cuda block is responsible for computing all values of 'y'
299  for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
300  VectorScalar y = 0;
301 
302  // Product tensor entries which this warp will iterate:
303  const size_type lBeg = m_A.block.entry_begin( i );
304  const size_type lEnd = m_A.block.entry_end( i );
305 
306  // Loop through sparse tensor contributions with coalesced reads.
307  for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {
308 
309  // Read 'blockDim.x' entries from the tensor (coalesced)
310  const size_type kj = m_A.block.coord( l );
311  const TensorScalar v = m_A.block.value( l );
312  const size_type j = ( kj & 0x0ffff ) ;
313  const size_type k = ( kj >> 16 ) ;
314 
315  for ( size_type col = 0; col < block_size; ++col ) {
316  const size_type bCol = iBlockEntry + col;
317 #if (__CUDA_ARCH__ >= 350)
318  y += v * ( __ldg(&m_A.values(j,bCol)) * sh_x[col+k*BlockSize] +
319  __ldg(&m_A.values(k,bCol)) * sh_x[col+j*BlockSize] );
320 #else
321  y += v * ( m_A.values(j,bCol) * sh_x[col+k*BlockSize] +
322  m_A.values(k,bCol) * sh_x[col+j*BlockSize] );
323 #endif
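 // __ldg routes the load through the read-only (texture) data cache,
 // available on compute capability 3.5 and newer; older devices fall
 // back to a plain global load.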
324  }
325 
326  }
327 
328  // Reduction of 'y' within 'blockDim.x'
329 #if HAVE_CUDA_SHUFFLE
330  if (blockDim.x >= 2) y += shfl_down(y, 1, blockDim.x, mask);
331  if (blockDim.x >= 4) y += shfl_down(y, 2, blockDim.x, mask);
332  if (blockDim.x >= 8) y += shfl_down(y, 4, blockDim.x, mask);
333  if (blockDim.x >= 16) y += shfl_down(y, 8, blockDim.x, mask);
334  if (blockDim.x >= 32) y += shfl_down(y, 16, blockDim.x, mask);
335  if ( threadIdx.x == 0 ) sh_y[i] += y;
336 #else
337  sh_t[ tid ] = y;
338  if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
339  sync_warp(mask);
340  if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
341  sync_warp(mask);
342  if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
343  sync_warp(mask);
344  if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
345  sync_warp(mask);
346  if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
347  sync_warp(mask);
348  if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
349 #endif
350 
351  }
352 
353  }
354 
355  // Wait for all contributions of y to be completed
356  __syncthreads();
357 
358  // Store result back in global memory
359  for ( size_type i = tid; i < dim; i += nid ) {
360  m_y( i, blockIdx.x ) = sh_y[ i ];
361  }
362  }
363  };
364 
365 #endif
366 
367  //------------------------------------
368 
369  struct TensorReadEntry {
370  size_type block_size, shmem, num_blocks, num_warp;
371  double reads;
372  };
373 
374  static void apply( const matrix_type & A,
375  const vector_type & x,
376  const vector_type & y )
377  {
378  const size_type row_count = A.graph.row_map.extent(0) - 1;
379  const size_type tensor_dimension = A.block.dimension();
380  const size_type tensor_align = tensor_dimension;
381  const size_type avg_tensor_entries_per_row = A.block.avg_entries_per_row();
382 
383  // Should compute this from FEM graph
384  const size_type fem_nnz_per_row = 27;
385 
386  // Get device properties we need for whatever device is currently selected
387  DeviceProp device_prop;
388  const size_type shcap = device_prop.shared_memory_capacity;
389  const size_type sh_granularity = device_prop.shared_memory_granularity;
390  const size_type max_shmem_per_block = device_prop.max_shmem_per_block;
391  const size_type max_blocks_per_sm = device_prop.max_blocks_per_sm;
392  const size_type warp_size = device_prop.warp_size;
393  const size_type warp_granularity = device_prop.warp_granularity;
394  const size_type max_warps_per_block =
395  std::min(device_prop.max_threads_per_block / warp_size,
396  device_prop.max_warps_per_sm);
397  const size_type min_warps_per_block = 1;
398  const size_type max_regs_per_sm = device_prop.max_regs_per_sm;
399  const size_type max_regs_per_block = device_prop.max_regs_per_block;
400  const size_type reg_bank_size = device_prop.reg_bank_size;
401 
402  // Compute number of warps we can fit on each SM based on register limits
403  // Use Cuda introspection to determine number of registers per thread
404  //const size_type regs_per_thread = 46;
405  const size_type regs_per_thread =
406  device_prop.get_kernel_registers(
407  Kokkos::Impl::cuda_parallel_launch_local_memory<MultiplyKernel>);
408  const size_type regs_per_warp =
409  (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
410  const size_type warps_per_sm =
411  (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
412  const size_type warps_per_block =
413  (max_regs_per_block/regs_per_warp) & ~(warp_granularity-1);
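 // For example, with the 46 registers/thread noted above, warp_size = 32
 // and reg_bank_size = 256: regs_per_warp = (46*32 + 255) & ~255 = 1536,
 // so an SM with 65536 registers (an assumed figure) yields 65536/1536 =
 // 42 warps before being masked down by ~(warp_granularity-1).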
414 
415  // Compute number of threads per stochastic row based on number of
416  // nonzero entries per row.
417  // For double, 16 threads/row is still coalesced, but not for float.
418  // We should reorder the tensor values for a given vector width to
419  // maintain coalesced reads. This would help smaller problems too by
420  // allowing fewer threads/row.
421  const size_type threads_per_row =
422  avg_tensor_entries_per_row >= 88 ? 32 : 16;
423  const size_type rows_per_warp = warp_size / threads_per_row;
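 // E.g. with warp_size = 32 and avg_tensor_entries_per_row < 88 this
 // gives threads_per_row = 16 and rows_per_warp = 2, i.e. each warp
 // processes two stochastic rows at a time.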
424 
425  const size_type vec_scalar_size = sizeof(VectorScalar);
426 #if USE_LDG == 0
427  const size_type mat_scalar_size = sizeof(MatrixScalar);
428 #endif
429 
430 #define USE_FIXED_BLOCKSIZE 0
431 
432 #if USE_FIXED_BLOCKSIZE
433 
434  const size_type num_blocks = 3;
435  size_type nw = warps_per_sm / num_blocks;
436  while (nw > 1 && num_blocks*nw % warp_granularity) --nw;
437  const size_type num_warp = nw;
438  const size_type sh_per_block = shcap / num_blocks;
439  const size_type sr =
440  device_prop.has_shuffle ? 0 : vec_scalar_size*warp_size*num_warp;
441 #if USE_LDG == 1
442  size_type bs = ((sh_per_block - sr) / tensor_align - vec_scalar_size) /
443  vec_scalar_size;
444 #else
445  size_type bs = ((sh_per_block - sr) / tensor_align - vec_scalar_size) /
446  (vec_scalar_size+mat_scalar_size);
447 #endif
448  if (bs % 2 == 0) --bs;
449  const size_type block_size_max = 31;
450  const size_type block_size = std::min(bs, block_size_max);
451  //const size_type block_size = 7;
452 #if USE_LDG == 1
453  const size_type shmem =
454  ( (vec_scalar_size*block_size+vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
455 #else
456  const size_type shmem =
457  ( ((vec_scalar_size+mat_scalar_size)*block_size+vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
458 #endif
459 
460 #else
461 
462  // We want to maximize the number of blocks per SM (to maximize throughput)
463  // as well as the block_size (to minimize tensor reads), subject to
464 // shared memory and register constraints. Here we try to do this by computing
465  // the number of tensor reads per block per thread for each choice of
466  // block_size, and then choose the configuration with the smallest value.
467  // This isn't perfect, but seems to generally work OK. It could be
468  // improved with a better model of:
469  // * Number of blocks versus warps per block (to minimize synchronization)
470  // * Thread efficiency for small numbers of rows per thread
471  const size_type block_size_min = 3;
472  const size_type half_nnz_per_row = fem_nnz_per_row / 2 + 1;
473  const size_type block_size_max =
474  half_nnz_per_row % 2 ? half_nnz_per_row + 1 : half_nnz_per_row;
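 // E.g. for the hard-coded fem_nnz_per_row = 27: half_nnz_per_row = 14
 // and block_size_max = 14, so the candidate (odd) block sizes examined
 // below are 3, 5, 7, 9, 11 and 13.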
475  Teuchos::Array<TensorReadEntry> reads_per_thread;
476  for (size_type bs = block_size_min; bs<=block_size_max; bs+=2) {
477  // We don't know the number of warps yet, so we just have to bound
478  // sr by the maximum number possible (which is all warps in 1 block)
479  const size_type sr =
480  device_prop.has_shuffle ? 0 : vec_scalar_size*warp_size*warps_per_block;
481 #if USE_LDG == 1
482  size_type shmem =
483  (vec_scalar_size*bs+vec_scalar_size)*tensor_align+sr;
484 #else
485  size_type shmem =
486  ((vec_scalar_size+mat_scalar_size)*bs+vec_scalar_size)*tensor_align+sr;
487 #endif
488  shmem = (shmem + sh_granularity-1) & ~(sh_granularity-1);
489  if (shmem <= max_shmem_per_block) {
490  size_type num_blocks = std::min(shcap / shmem, max_blocks_per_sm);
491  size_type tensor_reads = (fem_nnz_per_row+bs-1) / bs;
492  size_type num_warp =
493  std::min(std::max(std::min(warps_per_sm/num_blocks, warps_per_block),
494  min_warps_per_block),
495  max_warps_per_block);
496  while (num_warp > 1 && num_blocks*num_warp % warp_granularity)
497  --num_warp;
498  TensorReadEntry entry;
499  entry.block_size = bs;
500  entry.shmem = shmem;
501  entry.num_blocks = num_blocks;
502  entry.num_warp = num_warp;
503 
504  // Prefer at least 3 blocks
505  size_type factor = std::min(num_blocks,3u);
506  entry.reads = (static_cast<double>(tensor_reads) /
507  static_cast<double>(factor*num_blocks*num_warp));
508  reads_per_thread.push_back(entry);
509  }
510  }
511  TEUCHOS_TEST_FOR_EXCEPTION(
512  reads_per_thread.size() == 0, std::logic_error,
513  "Stochastic problem dimension is too large to fit in shared memory");
514  size_type idx = 0;
515  double min_reads = reads_per_thread[0].reads;
516  for (int i=1; i<reads_per_thread.size(); ++i) {
517  if (reads_per_thread[i].reads < min_reads) {
518  idx = i;
519  min_reads = reads_per_thread[i].reads;
520  }
521  }
522 
523  const size_type block_size = reads_per_thread[idx].block_size;
524  const size_type shmem = reads_per_thread[idx].shmem;
525  const size_type num_blocks = reads_per_thread[idx].num_blocks;
526  const size_type num_warp = reads_per_thread[idx].num_warp;
527 
528 #endif
529 
530  // Setup thread blocks and grid
531  const dim3 dBlock( threads_per_row , rows_per_warp*num_warp , 1 );
532  const dim3 dGrid( row_count, 1, 1 );
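 // E.g. with threads_per_row = 16, rows_per_warp = 2 and num_warp = 4,
 // dBlock is (16, 8, 1): 128 threads (4 warps) per block, with one block
 // per finite element row.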
533 
534 #if 0
535  std::cout << "block_size = " << block_size
536  << " tensor reads = " << (fem_nnz_per_row+block_size-1)/block_size
537  << " regs_per_thread = " << regs_per_thread
538  << " num blocks = " << num_blocks
539  << " num warps = " << num_warp
540  << " num rows = " << tensor_dimension
541  << " rows/warp = " << tensor_dimension / (num_warp*rows_per_warp)
542  << " avg entries/row = " << avg_tensor_entries_per_row
543  << std::endl;
544 #endif
545 
546  // Finally launch our kernel
547  //cudaProfilerStart();
548  Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
549  ( MultiplyKernel( A, x, y, block_size ) );
550  //cudaProfilerStop();
551  }
552 
553 };
554 
555 //----------------------------------------------------------------------------
556 //----------------------------------------------------------------------------
557 
558 } // namespace Stokhos
559 
560 #endif /* #ifndef STOKHOS_CUDA_CRS_PRODUCT_TENSOR_HPP */
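For reference, the following standalone sketch mirrors the shared-memory sizing that apply() performs for the USE_LDG == 0 kernel. It is an illustration only, not part of the header above; the helper name shmem_bytes, the stochastic dimension of 84, the 48 KB shared-memory limit and the 128-byte granularity are assumed example values.

#include <cstdio>
#include <cstddef>

// Dynamic shared memory requested per CUDA block by the USE_LDG == 0 kernel:
// BlockSize*dim VectorScalars for x, BlockSize*dim MatrixScalars for A,
// dim VectorScalars for y, plus (when warp shuffle is unavailable) one
// scalar per thread of reduction scratch, rounded up to the allocation
// granularity (assumed to be a power of two).
static std::size_t shmem_bytes(std::size_t dim, std::size_t block_size,
                               std::size_t vec_scalar_size,
                               std::size_t mat_scalar_size,
                               std::size_t reduction_scratch,
                               std::size_t granularity)
{
  std::size_t s =
    ((vec_scalar_size + mat_scalar_size) * block_size + vec_scalar_size) * dim
    + reduction_scratch;
  return (s + granularity - 1) & ~(granularity - 1);
}

int main()
{
  const std::size_t dim = 84;              // stochastic basis size (example)
  const std::size_t max_shmem = 48 * 1024; // shared memory per block (example)
  const std::size_t gran = 128;            // allocation granularity (example)
  for (std::size_t bs = 3; bs <= 13; bs += 2) {
    const std::size_t s =
      shmem_bytes(dim, bs, sizeof(double), sizeof(double), 0, gran);
    std::printf("block_size %2zu -> %6zu bytes%s\n",
                bs, s, s <= max_shmem ? "" : "  (exceeds limit)");
  }
  return 0;
}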