#ifndef STOKHOS_CRSPRODUCTTENSOR_HPP
#define STOKHOS_CRSPRODUCTTENSOR_HPP

#include "Kokkos_Core.hpp"
namespace Stokhos {

template< typename ValueType,
          class ExecutionSpace,
          class Memory = void >
class CrsProductTensor {
public:

  typedef ExecutionSpace execution_space;
  // ... (value_type, size_type, and memory_type typedefs elided)

  typedef typename Kokkos::ViewTraits< size_type*, execution_space, void, void >::host_mirror_space
    host_mirror_space;
  // Host vectorization parameters (host_vectorsize, use_intrinsics,
  // num_entry_align) are chosen per platform; the other branches of this
  // #if chain are elided
#elif defined(__MIC__)
  // ...
#if defined( KOKKOS_ENABLE_CUDA )
  static const bool is_cuda =
    std::is_same<ExecutionSpace,Kokkos::Cuda>::value;
#else
  static const bool is_cuda = false;
#endif
  // ... (vectorsize selects cuda_vectorsize or host_vectorsize accordingly)
  typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space, memory_type >
    vec_type;
  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type >
    coord_array_type;
  typedef Kokkos::View< size_type*[2], Kokkos::LayoutLeft, execution_space, memory_type >
    coord2_array_type;
  typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space, memory_type >
    value_array_type;
  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type >
    entry_array_type;
  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type >
    row_map_array_type;
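  // How an entry of the sparse 3-tensor C(i,j,k) is laid out in these views
  // (a sketch inferred from the create() routine below, not a normative
  // spec): row i owns the index range [m_row_map(i), m_row_map(i+1)), of
  // which the first m_num_entry(i) slots hold real entries (the rest is
  // alignment padding). For a valid slot n:
  //
  //   m_coord2(n,0) = j,  m_coord2(n,1) = k  // the (j,k) pair, stored once
  //   m_coord(n)    = (k << 16) | j          // same pair packed in one word
  //   m_value(n)    = C(i,j,k), halved when j == k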
  KOKKOS_INLINE_FUNCTION ~CrsProductTensor();
  KOKKOS_INLINE_FUNCTION CrsProductTensor();

  template <class M>
  KOKKOS_INLINE_FUNCTION
  CrsProductTensor( const CrsProductTensor<value_type,execution_space,M> & rhs );

  template <class M>
  KOKKOS_INLINE_FUNCTION
  CrsProductTensor & operator=( const CrsProductTensor<value_type,execution_space,M> & rhs );

  /// Dimension of the tensor.
  KOKKOS_INLINE_FUNCTION size_type dimension() const;

  /// Is the tensor empty.
  KOKKOS_INLINE_FUNCTION bool is_empty() const;

  /// Number of sparse entries.
  KOKKOS_INLINE_FUNCTION size_type entry_count() const;

  /// Maximum sparse entries for any coordinate.
  KOKKOS_INLINE_FUNCTION size_type entry_maximum() const;

  /// Begin entries with a coordinate 'i'.
  KOKKOS_INLINE_FUNCTION size_type entry_begin( size_type i ) const;

  /// End entries with a coordinate 'i'.
  KOKKOS_INLINE_FUNCTION size_type entry_end( size_type i ) const;

  /// Number of entries with a coordinate 'i'.
  KOKKOS_INLINE_FUNCTION size_type num_entry( size_type i ) const;

  /// Coordinates of an entry.
  KOKKOS_INLINE_FUNCTION const size_type& coord( const size_type entry, const size_type c ) const;

  /// Coordinates of an entry (packed form).
  KOKKOS_INLINE_FUNCTION const size_type& coord( const size_type entry ) const;

  /// Value of an entry.
  KOKKOS_INLINE_FUNCTION const value_type& value( const size_type entry ) const;

  /// Number of non-zeros.
  KOKKOS_INLINE_FUNCTION size_type num_non_zeros() const;

  /// Number of flops per multiply-add.
  KOKKOS_INLINE_FUNCTION size_type num_flops() const;

  /// Average number of entries per row.
  KOKKOS_INLINE_FUNCTION size_type avg_entries_per_row() const;
  template <typename OrdinalType>
  static CrsProductTensor
  create( const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
          const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
          const Teuchos::ParameterList& params = Teuchos::ParameterList() )
  {
    typedef Stokhos::Sparse3Tensor<OrdinalType,ValueType> Cijk_type;
    const OrdinalType dimension = basis.size();
    // ...

    // Count the non-zeros contributed to each tensor row
    std::vector< size_t > coord_work( dimension, (size_t) 0 );
    for (typename Cijk_type::i_iterator i_it = Cijk.i_begin();
         i_it != Cijk.i_end(); ++i_it) {
      OrdinalType i = index(i_it);
      for (typename Cijk_type::ik_iterator k_it = Cijk.k_begin(i_it);
           k_it != Cijk.k_end(i_it); ++k_it) {
        OrdinalType k = index(k_it);
        for (typename Cijk_type::ikj_iterator j_it = Cijk.j_begin(k_it);
             j_it != Cijk.j_end(k_it); ++j_it) {
          OrdinalType j = index(j_it);
          if (j >= k) ++coord_work[i];  // store each unordered (j,k) pair once
        }
      }
    }

    // Pad each row so its entry count is a multiple of the alignment size
    for ( size_type i = 0; i < dimension; ++i ) {
      const size_t rem = coord_work[i] % tensor_align;
      if (rem > 0) {
        const size_t pad = tensor_align - rem;
        coord_work[i] += pad;
      }
    }
    // Sort rows by entry count (see the CijkRowCount comparator)
    std::vector< CijkRowCount > row_count( dimension );
    for ( size_type i = 0; i < dimension; ++i ) {
      row_count[i].count = coord_work[i];
      row_count[i].basis = i;
    }
    // ... (sort row_count)

    // Permutation from original basis index to sorted row index
    std::vector<size_type> sorted_row_map( dimension );
    for ( size_type i = 0; i < dimension; ++i ) {
      coord_work[i] = row_count[i].count;
      sorted_row_map[ row_count[i].basis ] = i;
    }
    // Allocate the tensor and create host mirrors of its views for filling
    CrsProductTensor tensor;
    // ... (device view allocations, sized by the padded entry count, elided)
    typename coord_array_type::host_mirror_type
      host_coord = Kokkos::create_mirror_view( tensor.m_coord );
    typename coord2_array_type::host_mirror_type
      host_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
    typename value_array_type::host_mirror_type
      host_value = Kokkos::create_mirror_view( tensor.m_value );
    typename entry_array_type::host_mirror_type
      host_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
    typename entry_array_type::host_mirror_type
      host_row_map = Kokkos::create_mirror_view( tensor.m_row_map );

    // Scan the padded per-row counts into row offsets
    size_type sum = 0;
    host_row_map(0) = 0;
    for ( size_type i = 0; i < dimension; ++i ) {
      sum += coord_work[i];
      host_row_map(i+1) = sum;
      host_num_entry(i) = 0;
    }

    // Reset coord_work to each row's starting offset
    for ( size_type iCoord = 0; iCoord < dimension; ++iCoord )
      coord_work[iCoord] = host_row_map[iCoord];
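    // Worked example of the scan above (hypothetical counts): if the padded
    // per-row counts are coord_work = {8, 8, 16}, the loop yields
    // host_row_map = {0, 8, 16, 32}, i.e. row i occupies the index range
    // [host_row_map(i), host_row_map(i+1)); coord_work is then reset to each
    // row's starting offset so the fill loop below can bump it per entry.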
    // Fill the tensor arrays on the host
    for (typename Cijk_type::i_iterator i_it = Cijk.i_begin();
         i_it != Cijk.i_end(); ++i_it) {
      OrdinalType i = index(i_it);
      const size_type row = sorted_row_map[i];
      for (typename Cijk_type::ik_iterator k_it = Cijk.k_begin(i_it);
           k_it != Cijk.k_end(i_it); ++k_it) {
        OrdinalType k = index(k_it);
        for (typename Cijk_type::ikj_iterator j_it = Cijk.j_begin(k_it);
             j_it != Cijk.j_end(k_it); ++j_it) {
          OrdinalType j = index(j_it);
          ValueType c = Stokhos::value(j_it);
          if (j >= k) {
            const size_type n = coord_work[row]; ++coord_work[row];
            host_value(n) = (j != k) ? c : 0.5*c;
            host_coord2(n,0) = j;
            host_coord2(n,1) = k;
            host_coord(n) = ( k << 16 ) | j;  // both coordinates packed in one word
            ++host_num_entry(row);
          }
        }
      }
      // ... (round host_num_entry(row) up to a multiple of num_entry_align)
    }

    // ... (copy the host arrays to the device views and finish initializing
    //      the tensor; returns the populated tensor)
  }
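  // Why the diagonal is halved (sketch): the multiply kernels below all
  // evaluate, for each tensor row i,
  //
  //   y[i] += alpha * sum_n value(n) * ( a[j(n)]*x[k(n)] + a[k(n)]*x[j(n)] ),
  //
  // which counts each stored off-diagonal pair once in both orderings of the
  // symmetric C(i,j,k). A diagonal entry (j == k) would be counted twice by
  // the same expression, so it is stored at half weight:
  // 0.5*c * (a[j]*x[j] + a[j]*x[j]) = c * a[j]*x[j].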
  // Tensor with a single entry, used for mean-based preconditioning
  static CrsProductTensor createMeanBased()
  {
    CrsProductTensor tensor;
    // ... (device view allocations for a single-entry tensor elided)
    typename coord_array_type::host_mirror_type
      host_coord = Kokkos::create_mirror_view( tensor.m_coord );
    typename coord2_array_type::host_mirror_type
      host_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
    typename value_array_type::host_mirror_type
      host_value = Kokkos::create_mirror_view( tensor.m_value );
    typename entry_array_type::host_mirror_type
      host_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
    typename entry_array_type::host_mirror_type
      host_row_map = Kokkos::create_mirror_view( tensor.m_row_map );

    // ... (row map for the single row elided)
    host_num_entry(0) = 1;
    // ... (host_coord and host_value for the single entry elided)
    host_coord2(0,0) = 0;
    host_coord2(0,1) = 0;

    // ... (copy the host arrays to the device and return the tensor)
  }
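  // Degenerate "mean based" tensor: a single entry in row 0 at (j,k) = (0,0),
  // so block multiplication collapses to scaling by the mean (zeroth-order)
  // coefficient. Note that by the diagonal-halving convention above, the
  // stored value for this j == k entry is half of C(0,0,0).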
  // ... (static create_mirror_view member elided)

  template < class DstDevice, class DstMemory >
  static void deep_copy( const CrsProductTensor<ValueType,DstDevice,DstMemory> & dst,
                         const CrsProductTensor & src );
};
template< class Device, typename OrdinalType, typename ValueType >
CrsProductTensor<ValueType, Device>
create_product_tensor(
  const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
  const Teuchos::ParameterList& params = Teuchos::ParameterList() )
{
  return CrsProductTensor<ValueType, Device>::create( basis, Cijk, params );
}

template< class Device, typename OrdinalType, typename ValueType,
          class Memory >
CrsProductTensor<ValueType, Device, Memory>
create_product_tensor(
  const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
  const Teuchos::ParameterList& params = Teuchos::ParameterList() )
{
  return CrsProductTensor<ValueType, Device, Memory>::create(
    basis, Cijk, params );
}
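// Usage sketch (hedged; `basis` and `Cijk` stand for already-constructed
// Stokhos objects held by pointer, and are illustrative, not part of this
// header):
//
//   typedef Kokkos::DefaultExecutionSpace Device;
//   Stokhos::CrsProductTensor<double,Device> tensor =
//     Stokhos::create_product_tensor<Device>( *basis, *Cijk );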
template< class Device, typename OrdinalType, typename ValueType >
CrsProductTensor<ValueType, Device>
create_mean_based_product_tensor()
{
  return CrsProductTensor<ValueType, Device>::createMeanBased();
}

template< class Device, typename OrdinalType, typename ValueType,
          class Memory >
CrsProductTensor<ValueType, Device, Memory>
create_mean_based_product_tensor()
{
  return CrsProductTensor<ValueType, Device, Memory>::createMeanBased();
}
template < class ValueType, class Device, class Memory >
typename CrsProductTensor<ValueType,Device,Memory>::host_mirror_type
create_mirror_view( const CrsProductTensor<ValueType,Device,Memory> & src )
{
  return CrsProductTensor<ValueType,Device,Memory>::create_mirror_view( src );
}
template < class ValueType,
           class DstDevice, class DstMemory,
           class SrcDevice, class SrcMemory >
void
deep_copy( const CrsProductTensor<ValueType,DstDevice,DstMemory> & dst,
           const CrsProductTensor<ValueType,SrcDevice,SrcMemory> & src )
{
  CrsProductTensor<ValueType,SrcDevice,SrcMemory>::deep_copy( dst, src );
}
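// Host mirror / copy sketch, following the usual Kokkos mirror idiom
// (hedged; `tensor` is an illustrative, already-created device tensor):
//
//   typedef Stokhos::CrsProductTensor<double,Device> tensor_type;
//   tensor_type::host_mirror_type host_tensor =
//     Stokhos::create_mirror_view( tensor );
//   Stokhos::deep_copy( host_tensor, tensor );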
// Block-multiply kernel specialized for CrsProductTensor
template < typename ValueType, typename Device >
class BlockMultiply< CrsProductTensor< ValueType , Device > >
{
public:

  typedef typename Device::execution_space execution_space ;
  typedef CrsProductTensor< ValueType , execution_space > tensor_type ;
  typedef typename tensor_type::size_type size_type ;

// Whether to use manual or compiler auto-vectorization
#if defined(__INTEL_COMPILER) && !defined(__CUDA_ARCH__)
#define USE_AUTO_VECTORIZATION 1
#else
#define USE_AUTO_VECTORIZATION 0
#endif

#if defined(__INTEL_COMPILER) && USE_AUTO_VECTORIZATION
  // Version leaving vectorization to the Intel compiler
  template< typename MatrixValue , typename VectorValue >
  KOKKOS_INLINE_FUNCTION
  static void apply( const tensor_type & tensor ,
                     const MatrixValue * const a ,
                     const VectorValue * const x ,
                           VectorValue * const y ,
                     const VectorValue & alpha = VectorValue(1) )
  {
    const size_type * cj = &tensor.coord(0,0);
    const size_type * ck = &tensor.coord(0,1);
    const size_type nDim = tensor.dimension();

    for ( size_type iy = 0 ; iy < nDim ; ++iy ) {
      const size_type nEntry = tensor.num_entry(iy);
      const size_type iEntryBeg = tensor.entry_begin(iy);
      const size_type iEntryEnd = iEntryBeg + nEntry;
      VectorValue ytmp = 0;

#pragma simd vectorlength(tensor_type::vectorsize)
#pragma ivdep
#pragma vector aligned
      for (size_type iEntry = iEntryBeg; iEntry < iEntryEnd; ++iEntry) {
        const size_type j = cj[iEntry];
        const size_type k = ck[iEntry];
        ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
      }

      y[iy] += alpha * ytmp ;
    }
  }
#elif defined(__MIC__)
  // Version specialized for MIC with manual vectorization via TinyVec
  template< typename MatrixValue , typename VectorValue >
  KOKKOS_INLINE_FUNCTION
  static void apply( const tensor_type & tensor ,
                     const MatrixValue * const a ,
                     const VectorValue * const x ,
                           VectorValue * const y ,
                     const VectorValue & alpha = VectorValue(1) )
  {
    const size_type nDim = tensor.dimension();
    for ( size_type iy = 0 ; iy < nDim ; ++iy ) {

      const size_type nEntry = tensor.num_entry(iy);
      const size_type iEntryBeg = tensor.entry_begin(iy);
      const size_type iEntryEnd = iEntryBeg + nEntry;
      size_type iEntry = iEntryBeg;

      VectorValue ytmp = 0 ;

      const size_type nBlock = nEntry / tensor_type::vectorsize;
      const size_type nEntryB = nBlock * tensor_type::vectorsize;
      const size_type iEnd = iEntryBeg + nEntryB;

      typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
      TV vy;
      vy.zero();
      for (size_type block = 0; block < nBlock; ++block, iEntry += tensor_type::vectorsize) {
        const size_type *j = &tensor.coord(iEntry,0);
        const size_type *k = &tensor.coord(iEntry,1);
        TV aj(a, j), ak(a, k), xj(x, j), xk(x, k),
           c(&(tensor.value(iEntry)));

        // vy += c * ( aj*xk + ak*xj )
        aj.times_equal(xk);
        aj.multiply_add(ak, xj);
        vy.multiply_add(c, aj);
      }
      ytmp += vy.sum();

      // Handle the remaining (< vectorsize) entries with a width-8 TinyVec
      const size_type rem = iEntryEnd - iEntry;
      if (rem > 0) {
        typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> TV2;
        const size_type *j = &tensor.coord(iEntry,0);
        const size_type *k = &tensor.coord(iEntry,1);
        TV2 aj(a, j), ak(a, k), xj(x, j), xk(x, k),
            c(&(tensor.value(iEntry)));

        // ytmp += c * ( aj*xk + ak*xj )
        aj.times_equal(xk);
        aj.multiply_add(ak, xj);
        aj.times_equal(c);
        ytmp += aj.sum();
      }

      y[iy] += alpha * ytmp ;
    }
  }
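  // Block/remainder arithmetic used above, with hypothetical numbers: for
  // vectorsize = 16 and nEntry = 40, nBlock = 40/16 = 2 full TinyVec blocks
  // cover nEntryB = 32 entries, and the remaining rem = 8 entries go through
  // the width-8 TinyVec (TV2) cleanup path.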
#else

  // General version: TinyVec blocking plus a scalar tail loop
  template< typename MatrixValue , typename VectorValue >
  KOKKOS_INLINE_FUNCTION
  static void apply( const tensor_type & tensor ,
                     const MatrixValue * const a ,
                     const VectorValue * const x ,
                           VectorValue * const y ,
                     const VectorValue & alpha = VectorValue(1) )
  {
    const size_type nDim = tensor.dimension();
    for ( size_type iy = 0 ; iy < nDim ; ++iy ) {

      const size_type nEntry = tensor.num_entry(iy);
      const size_type iEntryBeg = tensor.entry_begin(iy);
      const size_type iEntryEnd = iEntryBeg + nEntry;
      size_type iEntry = iEntryBeg;

      VectorValue ytmp = 0 ;

      // Do entries with a blocked loop of size vectorsize
      if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
        const size_type nBlock = nEntry / tensor_type::vectorsize;
        const size_type nEntryB = nBlock * tensor_type::vectorsize;
        const size_type iEnd = iEntryBeg + nEntryB;

        typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
        TV vy;
        vy.zero();
        for (; iEntry < iEnd; iEntry += tensor_type::vectorsize) {
          const size_type *j = &tensor.coord(iEntry,0);
          const size_type *k = &tensor.coord(iEntry,1);
          TV aj(a, j), ak(a, k), xj(x, j), xk(x, k), c(&(tensor.value(iEntry)));

          // vy += c * ( aj*xk + ak*xj )
          aj.times_equal(xk);
          aj.multiply_add(ak, xj);
          vy.multiply_add(c, aj);
        }
        ytmp += vy.sum();
      }

      // Do remaining entries with a scalar loop
      for ( ; iEntry < iEntryEnd; ++iEntry) {
        const size_type j = tensor.coord(iEntry,0);
        const size_type k = tensor.coord(iEntry,1);
        ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
      }

      y[iy] += alpha * ytmp ;
    }
  }

#endif
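  // The general version degrades gracefully: when vectorsize == 1, or a row
  // has fewer than vectorsize entries, the blocked TinyVec branch is skipped
  // entirely and the scalar tail loop performs the whole row.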
  // Block dimensions of the kernel (both equal the tensor dimension)
  static KOKKOS_INLINE_FUNCTION
  size_type matrix_size( const tensor_type & tensor )
  { return tensor.dimension(); }

  static KOKKOS_INLINE_FUNCTION
  size_type vector_size( const tensor_type & tensor )
  { return tensor.dimension(); }
};
template< typename ValueType , typename MatrixValue , typename VectorValue ,
          typename Device >
class MultiplyImpl {
public:

  typedef typename Device::execution_space execution_space ;
  typedef CrsProductTensor< ValueType , execution_space > tensor_type ;
  typedef StochasticProductTensor< ValueType , tensor_type , execution_space > BlockSpec ;
  typedef typename BlockSpec::size_type size_type ;
  typedef BlockCrsMatrix< BlockSpec , MatrixValue , execution_space > matrix_type ;
  typedef Kokkos::View< VectorValue** , Kokkos::LayoutLeft , execution_space >
    block_vector_type ;

  // ... (compile-time BlockSize constant for the team algorithm, elided)

  const matrix_type m_A ;
  const block_vector_type m_x ;
  const block_vector_type m_y ;

  MultiplyImpl( const matrix_type & A , const block_vector_type & x ,
                const block_vector_type & y )
    : m_A( A ), m_x( x ), m_y( y ) {}
  // Flat (non-team) multiply: one thread per block row
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type iBlockRow ) const
  {
    // Prefer that y[ m_A.block.dimension() ] be scratch space on the local
    // thread, but cannot dynamically allocate
    VectorValue * const y = & m_y(0,iBlockRow);

    const size_type iEntryBegin = m_A.graph.row_map[ iBlockRow ];
    const size_type iEntryEnd   = m_A.graph.row_map[ iBlockRow + 1 ];

    // Leading dimension guaranteed contiguous for LayoutLeft
    for ( size_type j = 0 ; j < m_A.block.dimension() ; ++j ) { y[j] = 0 ; }

    for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
      const VectorValue * const x = & m_x( 0 , m_A.graph.entries(iEntry) );
      const MatrixValue * const a = & m_A.values( 0 , iEntry );

      // Apply the block tensor kernel: y += A-block(iEntry) * x-block
      BlockMultiply< BlockSpec >::apply( m_A.block , a , x , y );
    }
  }
  // Compute this thread's [work_begin, work_end) slice, aligning chunk
  // boundaries to a cache line's worth of VectorValue entries
  KOKKOS_INLINE_FUNCTION
  std::pair< size_type , size_type >
  compute_work_range( const size_type work_count ,
                      const size_type thread_count ,
                      const size_type thread_rank ) const
  {
    enum { work_align = 64 / sizeof(VectorValue) };
    enum { work_shift = Stokhos::power_of_two< work_align >::value }; // log2(work_align)
    enum { work_mask = work_align - 1 };

    const size_type work_per_thread =
      ( ( ( ( work_count + work_mask ) >> work_shift ) + thread_count - 1 ) /
        thread_count ) << work_shift ;

    const size_type work_begin =
      std::min( thread_rank * work_per_thread , work_count );
    const size_type work_end =
      std::min( work_begin + work_per_thread , work_count );

    return std::make_pair( work_begin , work_end );
  }
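  // Worked example (hypothetical numbers): for VectorValue = double,
  // work_align = 64/8 = 8 and work_shift = 3. With work_count = 100 and
  // thread_count = 4: ((100+7)>>3) = 13 aligned chunks, (13+4-1)/4 = 4
  // chunks per thread, so work_per_thread = 4<<3 = 32 and the thread ranges
  // are [0,32), [32,64), [64,96), [96,100); every interior boundary falls on
  // a 64-byte line.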
#if defined(__MIC__)

  // Team-parallel block algorithm specialized for MIC: one team per block
  // row, each thread covering a compute_work_range() slice of tensor rows
  KOKKOS_INLINE_FUNCTION
  void operator()( const typename Kokkos::TeamPolicy< execution_space >::member_type & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A.graph.row_map.extent(0) - 1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const std::pair<size_type,size_type> work_range =
      compute_work_range( m_A.block.dimension(), num_thread, thread_idx );

    VectorValue * const y = & m_y(0,iBlockRow);

    // Zero this thread's slice of y (contiguous for LayoutLeft)
    for ( size_type j = work_range.first ; j < work_range.second ; ++j )
      y[j] = 0 ;

    const tensor_type & tensor = m_A.block.tensor();
    const size_type iBlockEntryBeg = m_A.graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    // Pointers to the A and x blocks for the current group of columns
    const MatrixValue* sh_A[BlockSize];
    const VectorValue* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for ( size_type block = 0; block < numBlock; ++block, iBlockEntry += BlockSize ) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
        sh_x[col] = & m_x( 0 , iBlockColumn );
        sh_A[col] = & m_A.values( 0 , iBlockEntry + col );
      }

      for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

        const size_type nEntry = tensor.num_entry(iy);
        const size_type iEntryBeg = tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
        size_type iEntry = iEntryBeg;

        VectorValue ytmp = 0 ;

        const size_type nBlock = nEntry / tensor_type::vectorsize;
        const size_type nEntryB = nBlock * tensor_type::vectorsize;
        const size_type iEnd = iEntryBeg + nEntryB;

        typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
        typedef TinyVec<MatrixValue,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
        typedef TinyVec<VectorValue,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
        VecTV vy;
        vy.zero();
        for (; iEntry < iEnd; iEntry += tensor_type::vectorsize) {
          const size_type *j = &tensor.coord(iEntry,0);
          const size_type *k = &tensor.coord(iEntry,1);
          ValTV c(&(tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV xj(sh_x[col], j), xk(sh_x[col], k);

            // vy += c * ( aj*xk + ak*xj )
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            vy.multiply_add(c, aj);
          }
        }
        ytmp += vy.sum();

        // Tail: remaining entries handled with the width-8 TinyVec types
        if (iEntry < iEntryEnd) {
          typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> ValTV2;
          typedef TinyVec<MatrixValue,8,tensor_type::use_intrinsics> MatTV2;
          typedef TinyVec<VectorValue,8,tensor_type::use_intrinsics> VecTV2;
          const size_type *j = &tensor.coord(iEntry,0);
          const size_type *k = &tensor.coord(iEntry,1);
          ValTV2 c(&(tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);

            // ytmp += c * ( aj*xk + ak*xj )
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            aj.times_equal(c);
            ytmp += aj.sum();
          }
        }

        y[iy] += ytmp ;
      }

      device.team_barrier();
    }
  }
#else

  // Team-parallel block algorithm, general version: TinyVec blocking when
  // vectorsize > 1, plus a scalar tail loop
  KOKKOS_INLINE_FUNCTION
  void operator()( const typename Kokkos::TeamPolicy< execution_space >::member_type & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A.graph.row_map.extent(0) - 1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const std::pair<size_type,size_type> work_range =
      compute_work_range( m_A.block.dimension(), num_thread, thread_idx );

    VectorValue * const y = & m_y(0,iBlockRow);

    // Zero this thread's slice of y (contiguous for LayoutLeft)
    for ( size_type j = work_range.first ; j < work_range.second ; ++j )
      y[j] = 0 ;

    const tensor_type & tensor = m_A.block.tensor();
    const size_type iBlockEntryBeg = m_A.graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const MatrixValue* sh_A[BlockSize];
    const VectorValue* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for ( size_type block = 0; block < numBlock; ++block, iBlockEntry += BlockSize ) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
        sh_x[col] = & m_x( 0 , iBlockColumn );
        sh_A[col] = & m_A.values( 0 , iBlockEntry + col );
      }

      for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

        const size_type nEntry = tensor.num_entry(iy);
        const size_type iEntryBeg = tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
        size_type iEntry = iEntryBeg;

        VectorValue ytmp = 0 ;

        // Do entries with a blocked loop of size vectorsize
        if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
          const size_type nBlock = nEntry / tensor_type::vectorsize;
          const size_type nEntryB = nBlock * tensor_type::vectorsize;
          const size_type iEnd = iEntryBeg + nEntryB;

          typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
          typedef TinyVec<MatrixValue,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
          typedef TinyVec<VectorValue,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
          VecTV vy;
          vy.zero();
          for (; iEntry < iEnd; iEntry += tensor_type::vectorsize) {
            const size_type *j = &tensor.coord(iEntry,0);
            const size_type *k = &tensor.coord(iEntry,1);
            ValTV c(&(tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj*xk + ak*xj )
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              vy.multiply_add(c, aj);
            }
          }
          ytmp += vy.sum();
        }

        // Do remaining entries with a scalar loop
        for ( ; iEntry < iEntryEnd; ++iEntry) {
          const size_type j = tensor.coord(iEntry,0);
          const size_type k = tensor.coord(iEntry,1);
          const ValueType cijk = tensor.value(iEntry);

          for ( size_type col = 0; col < block_size; ++col ) {
            ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
                             sh_A[col][k] * sh_x[col][j] );
          }
        }

        y[iy] += ytmp ;
      }

      device.team_barrier();
    }
  }

#endif
  static void apply( const matrix_type & A ,
                     const block_vector_type & x ,
                     const block_vector_type & y )
  {
#if defined(__MIC__)
    const bool use_block_algorithm = true;
#else
    const bool use_block_algorithm = false;
#endif

    const size_t row_count = A.graph.row_map.extent(0) - 1 ;
    if (use_block_algorithm) {
#if defined(__MIC__)   // platform-tuned team size
      const size_t team_size = 4;
#else
      const size_t team_size = 2;
#endif
      const size_t league_size = row_count;
      Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
      Kokkos::parallel_for( config , MultiplyImpl(A,x,y) );
    }
    else {
      Kokkos::parallel_for( row_count , MultiplyImpl(A,x,y) );
    }
  }
};
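// Dispatch summary: the block algorithm launches one team per block row
// (league_size = row_count) through Kokkos::TeamPolicy and the team
// operator() above, while the fallback is a flat Kokkos::parallel_for over
// block rows using operator()(size_type).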
} // namespace Stokhos

#endif // STOKHOS_CRSPRODUCTTENSOR_HPP