#ifndef STOKHOS_CUDA_COO_PRODUCT_TENSOR_HPP
#define STOKHOS_CUDA_COO_PRODUCT_TENSOR_HPP

#include <algorithm>
#include <iostream>

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_CooProductTensor.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {

//----------------------------------------------------------------------------
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar,
          bool Pack >
class Multiply<
  BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda >,
  Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef CooProductTensor< TensorScalar, execution_space, Pack > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space >
    matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda >
    vector_type;

  // Row indices for the warp-level segmented reduction; invalid_row
  // marks a slot that holds no pending partial sum.
  typedef int rows_type;
  static const rows_type invalid_row = -1;

  class CooKernel {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_entries_per_thread;
    const size_type m_block_size;

    CooKernel( const matrix_type & A,
               const vector_type & x,
               const vector_type & y,
               const size_type entries_per_thread,
               const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ),
        m_entries_per_thread( entries_per_thread ),
        m_block_size( block_size ) {}
    // Compute y(:,blockIdx.x) = A*x for one FEM row per CUDA block:
    __device__
    void operator()(void) const
    {
      // Number of stochastic basis entries and sparse tensor entries:
      const size_type dim = m_A.block.dimension();
      const size_type num_entries = m_A.block.entry_count();

      // Thread count and linearized thread index within the block:
      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
      // Carve the dynamic shared memory into blocks of x and A, the
      // y accumulation, and the row/value reduction buffers:
      VectorScalar * const sh_x =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_A = sh_x + m_block_size*dim;
      VectorScalar * const sh_y = sh_A + m_block_size*dim;
      volatile VectorScalar * const vals = sh_y + dim;
      volatile rows_type * const rows =
        reinterpret_cast<volatile rows_type*>(vals + nid);
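      // Illustrative layout (hypothetical sizes, not from this file):
      // with dim == 20, m_block_size == 5, and nid == 256 threads the
      // partitioning is
      //   sh_x : 100 scalars (5 FEM columns of x, length 20 each)
      //   sh_A : 100 scalars (5 FEM columns of A)
      //   sh_y :  20 scalars (one output column)
      //   vals : 256 scalars (one partial sum per thread)
      //   rows : 256 ints    (row index owning each partial sum)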
      // Product tensor entries which this warp will iterate:
      const size_type entries_per_warp = blockDim.x * m_entries_per_thread;
      const size_type lBeg = threadIdx.y * entries_per_warp + threadIdx.x;
      const size_type lEnd = ( lBeg + entries_per_warp < num_entries ?
                               lBeg + entries_per_warp : num_entries );

      // blockIdx.x == row in the deterministic (finite element) system:
      const size_type femBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type femEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      // Zero the shared-memory accumulation of y:
      for (size_type l = tid; l < dim; l += nid) {
        sh_y[l] = 0.0;
      }

      // Mark every reduction slot as holding no pending row:
      rows[tid] = invalid_row;
      // Loop over columns in the discrete (finite element) system,
      // m_block_size columns at a time:
      for (size_type femEntry=femBeg; femEntry<femEnd; femEntry+=m_block_size) {
        const size_type block_size =
          femEntry + m_block_size < femEnd ? m_block_size : femEnd - femEntry;

        // Wait until x and A from the previous pass have been consumed
        // before overwriting them:
        __syncthreads();
        // Coalesced read of blocks of x and A into shared memory:
        for (size_type col = 0; col < block_size; ++col) {

          const size_type femColumn = m_A.graph.entries( femEntry + col );
          const VectorScalar * const x = & m_x( 0, femColumn );
          const MatrixScalar * const A = & m_A.values( 0, femEntry + col );

          // Coalesced, block-strided copy from global memory:
          for (size_type l = tid; l < dim; l += nid) {
            sh_x[col + l * m_block_size] = x[l];
            sh_A[col + l * m_block_size] = A[l];
          }
        }

        __syncthreads(); // wait for x and A to be read before using them
        // Each thread processes its strided share of this warp's
        // sparse tensor entries:
        for (size_type l = lBeg; l < lEnd; l += blockDim.x) {

          // Read the (i,j,k) coordinates and value of this entry,
          // scaling j and k to index the interleaved shared blocks:
          size_type i, j, k;
          m_A.block.coord(l,i,j,k);
          const TensorScalar v = m_A.block.value( l );
          j *= m_block_size;
          k *= m_block_size;

          // Thread 0 resolves the carry left by lane 31 on the previous
          // pass: keep accumulating if it belongs to the same row i,
          // otherwise flush the completed row sum into sh_y:
          VectorScalar y = 0;
          if (threadIdx.x == 0) {
            if (i == rows[tid+31])
              y = vals[tid+31];
            else if (rows[tid+31] != invalid_row)
              sh_y[rows[tid+31]] += vals[tid+31];
          }
          // Accumulate v * ( A_j*x_k + A_k*x_j ) over the FEM columns
          // currently staged in shared memory:
          for (size_type col = 0; col < block_size; ++col) {
            y += v * ( sh_A[col+j] * sh_x[col+k] +
                       sh_A[col+k] * sh_x[col+j] );
          }

          // Publish this thread's row index and partial sum:
          rows[tid] = i;
          vals[tid] = y;
          // Warp-synchronous segmented scan: sum neighboring partial
          // sums that belong to the same (sorted) tensor row i:
          if (threadIdx.x >=  1 && i == rows[tid- 1]) vals[tid] += vals[tid- 1];
          if (threadIdx.x >=  2 && i == rows[tid- 2]) vals[tid] += vals[tid- 2];
          if (threadIdx.x >=  4 && i == rows[tid- 4]) vals[tid] += vals[tid- 4];
          if (threadIdx.x >=  8 && i == rows[tid- 8]) vals[tid] += vals[tid- 8];
          if (threadIdx.x >= 16 && i == rows[tid-16]) vals[tid] += vals[tid-16];
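          // Illustration (hypothetical lane values): if lanes 0..3 hold
          // rows {7,7,7,9} with partial sums {a,b,c,d}, the steps above
          // leave vals == {a, a+b, a+b+c, d}; lane 2, the last lane of
          // row 7, flushes a+b+c below, while row 9 keeps accumulating.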
          // The last lane of each completed row segment adds its total
          // into sh_y; lane 31's segment may continue, so it is carried
          // into the next pass instead:
          if (threadIdx.x < 31 && i != rows[tid + 1])
            sh_y[i] += vals[tid];
        }
        // Each warp's lane 31 still holds a pending (row, value) carry;
        // hand these off to the first blockDim.y reduction slots:
        if (threadIdx.x == 31) {
          rows[threadIdx.y] = rows[tid];
          vals[threadIdx.y] = vals[tid];
        }

        __syncthreads();
        // The first warp combines the per-warp carries with the same
        // segmented-scan pattern and flushes the completed rows
        // (skipping slots from warps that had no entries):
        if (threadIdx.y == 0 && threadIdx.x < blockDim.y) {
          const rows_type i = rows[tid];
          if (threadIdx.x >=  1 && i == rows[tid- 1]) vals[tid] += vals[tid- 1];
          if (threadIdx.x >=  2 && i == rows[tid- 2]) vals[tid] += vals[tid- 2];
          if (threadIdx.x >=  4 && i == rows[tid- 4]) vals[tid] += vals[tid- 4];
          if (threadIdx.x >=  8 && i == rows[tid- 8]) vals[tid] += vals[tid- 8];
          if (threadIdx.x >= 16 && i == rows[tid-16]) vals[tid] += vals[tid-16];

          if ( i != invalid_row &&
               ((threadIdx.x == blockDim.y-1) ||
                (threadIdx.x < blockDim.y-1 && i != rows[tid+1])) )
            sh_y[i] += vals[tid];
        }
        // Reset the carry slots before the next block of FEM columns:
        rows[tid] = invalid_row;
      }

      __syncthreads();
      // Coalesced write of the accumulated result to global memory:
      for (size_type l = tid; l < dim; l += nid) {
        m_y( l, blockIdx.x ) = sh_y[ l ];
      }
    }
  };
  //--------------------------------------------------------------------------

  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
    const size_type fem_rows = A.graph.row_map.extent(0) - 1;
    const size_type fem_nnz = A.values.extent(1);
    const size_type stoch_rows = A.block.dimension();
    const size_type stoch_entries = A.block.entry_count();
    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;

    // Launch configuration: warp-wide x-dimension, up to num_warp_max
    // warps per block, one block per finite-element row.  The cap of 16
    // is an assumed value; tune it for the target device.
    const size_type num_warp_max = 16;
    const size_type num_warp =
      std::min( num_warp_max, stoch_entries / warp_size );
    const dim3 dBlock( warp_size, num_warp, 1 );
    const dim3 dGrid( fem_rows, 1, 1 );

    const size_type num_thread = dBlock.x*dBlock.y;
    const size_type entries_per_thread =
      (stoch_entries + num_thread-1) / num_thread;
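    // Worked example (illustrative numbers): stoch_entries == 4000 and
    // warp_size == 32 give num_warp == min(16, 125) == 16, hence
    // num_thread == 512 and entries_per_thread == ceil(4000/512) == 8.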
    // Use at most half of the shared-memory capacity so two blocks can
    // be resident per multiprocessor, and solve for the largest FEM
    // block size whose staging buffers still fit:
    const size_type size_rows = sizeof(rows_type) * num_thread;
    const size_type size_vals = sizeof(VectorScalar) * num_thread;
    const size_type shcap =
      Kokkos::Cuda().impl_internal_space_instance()->
        m_deviceProp.sharedMemPerBlock / 2;
    size_type bs =
      ((shcap-size_rows-size_vals) / (sizeof(VectorScalar)*stoch_rows) - 1) / 2;
    if (bs % 2 == 0) --bs; // an odd block size reduces bank conflicts
    const size_type block_size = bs;

    // sh_x and sh_A each hold block_size columns of length stoch_rows,
    // sh_y holds one more, plus the vals/rows reduction buffers:
    const size_type shmem =
      sizeof(VectorScalar) * ((2*block_size+1) * stoch_rows) +
      size_vals + size_rows;
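    // Worked example (illustrative numbers): with a 48 kB device limit,
    // shcap == 24576 bytes.  For stoch_rows == 100, num_thread == 512,
    // 8-byte VectorScalar, and 4-byte rows_type: size_vals == 4096,
    // size_rows == 2048, bs == ((24576-2048-4096)/800 - 1)/2 == 11
    // (already odd), so block_size == 11 and
    // shmem == 8*(23*100) + 4096 + 2048 == 24544 bytes <= shcap.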
    std::cout << "Multiply< BlockCrsMatrix< CooProductTensor ... > >::apply"
              << std::endl
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << "  block_size(" << block_size << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  fem_rows(" << fem_rows << ")" << std::endl
              << "  fem_nnz(" << fem_nnz << ")" << std::endl
              << "  stoch_rows(" << stoch_rows << ")" << std::endl
              << "  stoch_nnz(" << stoch_entries << ")" << std::endl
              << "  threads_per_block(" << num_thread << ")" << std::endl
              << "  entries_per_thread(" << entries_per_thread << ")"
              << std::endl;
    // Launch the kernel; entries_per_thread and block_size are passed
    // through to the functor:
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
      ( CooKernel( A, x, y, entries_per_thread, block_size ) );
  }
};

//----------------------------------------------------------------------------

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_COO_PRODUCT_TENSOR_HPP */
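//----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the original header).  It
// shows how this specialization's static apply() would be reached; the
// tensor and matrix fill are elided, and the names "stoch_dim" and
// "fem_rows" below are hypothetical placeholders.
//
//   typedef Stokhos::CooProductTensor<double, Kokkos::Cuda, true> tensor_type;
//   typedef Stokhos::BlockCrsMatrix<tensor_type, double, Kokkos::Cuda>
//     matrix_type;
//   typedef Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::Cuda>
//     vector_type;
//   typedef Stokhos::Multiply<matrix_type, vector_type, vector_type>
//     multiply_type;
//
//   matrix_type A;   // A.block == tensor, A.graph/A.values filled elsewhere
//   vector_type x("x", stoch_dim, fem_rows);  // one column per FEM row
//   vector_type y("y", stoch_dim, fem_rows);
//   multiply_type::apply(A, x, y);  // launches CooKernel on Kokkos::Cuda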