42 #ifndef STOKHOS_CUDA_COO_PRODUCT_TENSOR_HPP
43 #define STOKHOS_CUDA_COO_PRODUCT_TENSOR_HPP
47 #include "Kokkos_Core.hpp"
53 #include "cuda_profiler_api.h"
59 template<
typename TensorScalar,
60 typename MatrixScalar,
61 typename VectorScalar,
65 MatrixScalar, Kokkos::Cuda >,
66 Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
67 Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
76 typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda >
vector_type;
98 m_entries_per_thread(entries_per_thread),
99 m_block_size(block_size) {}
105 const size_type dim = m_A.block.dimension();
106 const size_type num_entries = m_A.block.entry_count();
109 const size_type nid = blockDim.x * blockDim.y;
110 const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
113 VectorScalar *
const sh_x =
114 kokkos_impl_cuda_shared_memory<VectorScalar>();
115 VectorScalar *
const sh_A = sh_x + m_block_size*dim;
116 VectorScalar *
const sh_y = sh_A + m_block_size*dim;
117 volatile VectorScalar *
const vals = sh_y + dim;
119 reinterpret_cast<volatile rows_type*
>(vals + nid);
122 const size_type entries_per_warp = blockDim.x * m_entries_per_thread;
123 const size_type lBeg = threadIdx.y * entries_per_warp + threadIdx.x;
124 const size_type lEnd = ( lBeg + entries_per_warp < num_entries ?
125 lBeg + entries_per_warp : num_entries );
128 const size_type femBeg = m_A.graph.row_map[ blockIdx.x ];
129 const size_type femEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
132 for (
size_type l = tid; l < dim; l += nid) {
137 rows[tid] = invalid_row;
141 for (
size_type femEntry=femBeg; femEntry<femEnd; femEntry+=m_block_size) {
143 femEntry + m_block_size < femEnd ? m_block_size : femEnd - femEntry;
150 for (
size_type col = 0; col < block_size; ++col) {
152 const size_type femColumn = m_A.graph.entries( femEntry + col );
153 const VectorScalar *
const x = & m_x( 0, femColumn );
154 const MatrixScalar *
const A = & m_A.values( 0, femEntry + col );
157 for (
size_type l = tid; l < dim; l += nid) {
158 sh_x[col + l * m_block_size] = x[l];
159 sh_A[col + l * m_block_size] = A[l];
167 for (
size_type l = lBeg; l < lEnd; l += blockDim.x) {
171 m_A.block.coord(l,i,j,k);
172 const TensorScalar v = m_A.block.value( l );
180 if (threadIdx.x == 0) {
181 if (i == rows[tid+31])
184 sh_y[rows[tid+31]] += vals[tid+31];
188 for (
size_type col = 0; col < block_size; ++col) {
189 y += v * ( sh_A[col+
j] * sh_x[col+k] + sh_A[col+k] * sh_x[col+
j] );
198 if (threadIdx.x >= 1 && i == rows[tid- 1]) vals[tid] += vals[tid- 1];
199 if (threadIdx.x >= 2 && i == rows[tid- 2]) vals[tid] += vals[tid- 2];
200 if (threadIdx.x >= 4 && i == rows[tid- 4]) vals[tid] += vals[tid- 4];
201 if (threadIdx.x >= 8 && i == rows[tid- 8]) vals[tid] += vals[tid- 8];
202 if (threadIdx.x >= 16 && i == rows[tid-16]) vals[tid] += vals[tid-16];
207 if (threadIdx.x < 31 && i != rows[tid + 1])
208 sh_y[i] += vals[tid];
215 if (threadIdx.x == 31) {
216 rows[threadIdx.y] = rows[tid];
217 vals[threadIdx.y] = vals[tid];
223 if (threadIdx.y == 0 && threadIdx.x < blockDim.y) {
225 if (threadIdx.x >= 1 && i == rows[tid- 1]) vals[tid] += vals[tid- 1];
226 if (threadIdx.x >= 2 && i == rows[tid- 2]) vals[tid] += vals[tid- 2];
227 if (threadIdx.x >= 4 && i == rows[tid- 4]) vals[tid] += vals[tid- 4];
228 if (threadIdx.x >= 8 && i == rows[tid- 8]) vals[tid] += vals[tid- 8];
229 if (threadIdx.x >= 16 && i == rows[tid-16]) vals[tid] += vals[tid-16];
231 if ((threadIdx.x == blockDim.y-1) ||
232 (threadIdx.x < blockDim.y-1 && i != rows[tid+1]))
233 sh_y[i] += vals[tid];
237 rows[tid] = invalid_row;
246 for (
size_type l = tid; l < dim; l += nid) {
247 m_y( l, blockIdx.x ) = sh_y[ l ];
261 const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
270 std::min( num_warp_max, stoch_entries / warp_size );
271 const dim3 dBlock( warp_size , num_warp, 1 );
272 const dim3 dGrid( fem_rows, 1, 1 );
274 const size_type num_thread = dBlock.x*dBlock.y;
276 (stoch_entries + num_thread-1) / num_thread;
280 const size_type size_vals =
sizeof(VectorScalar) * num_thread;
282 Kokkos::Cuda().impl_internal_space_instance()->m_maxShmemPerBlock / 2;
284 ((shcap-size_rows-size_vals) / (
sizeof(VectorScalar)*stoch_rows) - 1) / 2;
285 if (bs % 2 == 0) --bs;
289 sizeof(VectorScalar) * ((2*block_size+1) * stoch_rows) +
290 size_vals + size_rows;
295 std::cout <<
"Multiply< BlockCrsMatrix< CooProductTensor ... > >::apply"
297 <<
" grid(" << dGrid.x <<
"," << dGrid.y <<
")" << std::endl
298 <<
" block(" << dBlock.x <<
"," << dBlock.y <<
")" << std::endl
299 <<
" block_size(" << block_size <<
")" << std::endl
300 <<
" shmem(" << shmem/1024 <<
" kB)" << std::endl
301 <<
" fem_rows(" << fem_rows <<
")" << std::endl
302 <<
" fem_nnz(" << fem_nnz <<
")" << std::endl
303 <<
" stoch_rows(" << stoch_rows <<
")" << std::endl
304 <<
" stoch_nnz(" << stoch_entries <<
")" << std::endl
305 <<
" threads_per_block(" << num_thread <<
")" << std::endl
306 <<
" entries_per_thread(" << entries_per_thread <<
")" << std::endl
311 Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
312 ( CooKernel( A, x, y, entries_per_thread, block_size ) );
Stokhos::Multiply< BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >, MatrixScalar, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > >::CooKernel::m_block_size const size_type m_block_size
Stokhos::Multiply< BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >, MatrixScalar, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > >::CooKernel::m_A const matrix_type m_A
Stokhos::Multiply< BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >, MatrixScalar, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > >::CooKernel::m_x const vector_type m_x
Stokhos::Multiply< BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >, MatrixScalar, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > >::CooKernel::operator() __device__ void operator()(void) const
Stokhos::Multiply< BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >, MatrixScalar, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > >::CooKernel::CooKernel CooKernel(const matrix_type &A, const vector_type &x, const vector_type &y, const size_type entries_per_thread, const size_type block_size)
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
Stokhos::Multiply< BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >, MatrixScalar, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > >::tensor_type CooProductTensor< TensorScalar, execution_space, Pack > tensor_type
Stokhos::Multiply< BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >, MatrixScalar, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > >::apply static void apply(const matrix_type &A, const vector_type &x, const vector_type &y)
Stokhos::Multiply< BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >, MatrixScalar, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > >::vector_type Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type
Stokhos::Multiply< BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >, MatrixScalar, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > >::rows_type int rows_type
Stokhos::Multiply< BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >, MatrixScalar, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > >::CooKernel::m_entries_per_thread const size_type m_entries_per_thread
Stokhos::Multiply< BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >, MatrixScalar, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > >::CooKernel::m_y const vector_type m_y
CRS matrix of dense blocks.
Sparse product tensor using 'COO'-like storage format.
Stokhos::Multiply< BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >, MatrixScalar, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > >::matrix_type BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type
Stokhos::Multiply< BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >, MatrixScalar, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > >::execution_space Kokkos::Cuda execution_space
Stokhos::Multiply< BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >, MatrixScalar, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda >, Kokkos::View< VectorScalar **, Kokkos::LayoutLeft, Kokkos::Cuda > >::size_type execution_space::size_type size_type