10 #ifndef STOKHOS_CRSPRODUCTTENSOR_HPP
11 #define STOKHOS_CRSPRODUCTTENSOR_HPP
13 #include "Kokkos_Core.hpp"
45 template<
typename ValueType,
class ExecutionSpace,
class Memory =
void >
54 typedef typename Kokkos::ViewTraits< size_type*, execution_space,void,void >::host_mirror_space
host_mirror_space ;
62 #elif defined(__MIC__)
73 #if defined( KOKKOS_ENABLE_CUDA )
74 std::is_same<ExecutionSpace,Kokkos::Cuda>::value;
87 typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space, memory_type >
vec_type;
88 typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type >
coord_array_type;
89 typedef Kokkos::View< size_type*[2], Kokkos::LayoutLeft, execution_space, memory_type >
coord2_array_type;
90 typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space, memory_type >
value_array_type;
91 typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type >
entry_array_type;
92 typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type >
row_map_array_type;
123 KOKKOS_INLINE_FUNCTION
126 KOKKOS_INLINE_FUNCTION
140 KOKKOS_INLINE_FUNCTION
154 KOKKOS_INLINE_FUNCTION
172 KOKKOS_INLINE_FUNCTION
176 KOKKOS_INLINE_FUNCTION
180 KOKKOS_INLINE_FUNCTION
185 KOKKOS_INLINE_FUNCTION
190 KOKKOS_INLINE_FUNCTION
195 KOKKOS_INLINE_FUNCTION
200 KOKKOS_INLINE_FUNCTION
205 KOKKOS_INLINE_FUNCTION
210 KOKKOS_INLINE_FUNCTION
215 KOKKOS_INLINE_FUNCTION
220 KOKKOS_INLINE_FUNCTION
225 KOKKOS_INLINE_FUNCTION
230 KOKKOS_INLINE_FUNCTION
234 template <
typename OrdinalType>
253 std::vector< size_t > coord_work( dimension, (
size_t) 0 );
255 for (
typename Cijk_type::i_iterator i_it=Cijk.
i_begin();
256 i_it!=Cijk.
i_end(); ++i_it) {
257 OrdinalType i = index(i_it);
258 for (
typename Cijk_type::ik_iterator k_it = Cijk.
k_begin(i_it);
259 k_it != Cijk.
k_end(i_it); ++k_it) {
260 OrdinalType k = index(k_it);
261 for (
typename Cijk_type::ikj_iterator j_it = Cijk.
j_begin(k_it);
262 j_it != Cijk.
j_end(k_it); ++j_it) {
263 OrdinalType
j = index(j_it);
280 coord_work[i] += pad;
286 std::vector< CijkRowCount > row_count( dimension );
288 row_count[i].count = coord_work[i];
289 row_count[i].basis = i;
296 std::vector<size_type> sorted_row_map( dimension );
298 coord_work[i] = row_count[i].count;
299 sorted_row_map[ row_count[i].basis ] = i;
319 typename coord_array_type::HostMirror
321 typename coord2_array_type::HostMirror
323 typename value_array_type::HostMirror
325 typename entry_array_type::HostMirror
327 typename entry_array_type::HostMirror
334 sum += coord_work[i];
335 host_row_map(i+1) =
sum;
336 host_num_entry(i) = 0;
340 coord_work[iCoord] = host_row_map[iCoord];
349 for (
typename Cijk_type::i_iterator i_it=Cijk.
i_begin();
350 i_it!=Cijk.
i_end(); ++i_it) {
351 OrdinalType i = index(i_it);
353 for (
typename Cijk_type::ik_iterator k_it = Cijk.
k_begin(i_it);
354 k_it != Cijk.
k_end(i_it); ++k_it) {
355 OrdinalType k = index(k_it);
356 for (
typename Cijk_type::ikj_iterator j_it = Cijk.
j_begin(k_it);
357 j_it != Cijk.
j_end(k_it); ++j_it) {
358 OrdinalType
j = index(j_it);
359 ValueType c = Stokhos::value(j_it);
361 const size_type n = coord_work[row]; ++coord_work[row];
362 host_value(n) = (j != k) ? c : 0.5*c;
363 host_coord2(n,0) =
j;
364 host_coord2(n,1) = k;
365 host_coord(n) = ( k << 16 ) | j;
366 ++host_num_entry(row);
372 host_num_entry(row) =
416 typename coord_array_type::HostMirror
418 typename coord2_array_type::HostMirror
420 typename value_array_type::HostMirror
422 typename entry_array_type::HostMirror
424 typename entry_array_type::HostMirror
430 host_num_entry(0) = 1;
434 host_coord2(0,0) = 0;
435 host_coord2(0,1) = 0;
467 template <
class DstDevice,
class DstMemory >
480 template<
class Device,
typename OrdinalType,
typename ValueType>
481 CrsProductTensor<ValueType, Device>
490 template<
class Device,
typename OrdinalType,
typename ValueType,
492 CrsProductTensor<ValueType, Device, Memory>
499 basis, Cijk, params );
502 template<
class Device,
typename OrdinalType,
typename ValueType>
503 CrsProductTensor<ValueType, Device>
509 template<
class Device,
typename OrdinalType,
typename ValueType,
511 CrsProductTensor<ValueType, Device, Memory>
517 template <
class ValueType,
class Device,
class Memory >
519 typename CrsProductTensor<ValueType,Device,Memory>::HostMirror
525 template <
class ValueType,
526 class DstDevice,
class DstMemory,
527 class SrcDevice,
class SrcMemory >
535 template <
typename ValueType,
typename Device >
546 #define USE_AUTO_VECTORIZATION 1
548 #define USE_AUTO_VECTORIZATION 0
551 #if defined(__INTEL_COMPILER) && USE_AUTO_VECTORIZATION
554 template<
typename MatrixValue ,
typename VectorValue >
555 KOKKOS_INLINE_FUNCTION
556 static void apply(
const tensor_type & tensor ,
557 const MatrixValue *
const a ,
558 const VectorValue *
const x ,
559 VectorValue *
const y ,
560 const VectorValue & alpha = VectorValue(1) )
564 const size_type * cj = &tensor.coord(0,0);
565 const size_type * ck = &tensor.coord(0,1);
566 const size_type nDim = tensor.dimension();
568 for ( size_type iy = 0 ; iy < nDim ; ++iy ) {
569 const size_type nEntry = tensor.num_entry(iy);
570 const size_type iEntryBeg = tensor.entry_begin(iy);
571 const size_type iEntryEnd = iEntryBeg + nEntry;
572 VectorValue ytmp = 0;
574 #pragma simd vectorlength(tensor_type::vectorsize)
576 #pragma vector aligned
577 for (size_type iEntry = iEntryBeg; iEntry<iEntryEnd; ++iEntry) {
578 const size_type
j = cj[iEntry];
579 const size_type k = ck[iEntry];
580 ytmp += tensor.value(iEntry) * ( a[
j] * x[k] + a[k] * x[
j] );
583 y[iy] += alpha * ytmp ;
587 #elif defined(__MIC__)
590 template<
typename MatrixValue ,
typename VectorValue >
591 KOKKOS_INLINE_FUNCTION
592 static void apply(
const tensor_type & tensor ,
593 const MatrixValue *
const a ,
594 const VectorValue *
const x ,
595 VectorValue *
const y ,
596 const VectorValue & alpha = VectorValue(1) )
598 const size_type nDim = tensor.dimension();
599 for ( size_type iy = 0 ; iy < nDim ; ++iy ) {
601 const size_type nEntry = tensor.num_entry(iy);
602 const size_type iEntryBeg = tensor.entry_begin(iy);
603 const size_type iEntryEnd = iEntryBeg + nEntry;
604 size_type iEntry = iEntryBeg;
606 VectorValue ytmp = 0 ;
608 const size_type nBlock = nEntry / tensor_type::vectorsize;
609 const size_type nEntryB = nBlock * tensor_type::vectorsize;
610 const size_type iEnd = iEntryBeg + nEntryB;
612 typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
615 for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
616 const size_type *
j = &tensor.coord(iEntry,0);
617 const size_type *k = &tensor.coord(iEntry,1);
618 TV aj(a, j), ak(a, k), xj(x, j), xk(x, k),
619 c(&(tensor.value(iEntry)));
623 aj.multiply_add(ak, xj);
624 vy.multiply_add(c, aj);
631 const size_type rem = iEntryEnd-iEntry;
633 typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> TV2;
634 const size_type *j = &tensor.coord(iEntry,0);
635 const size_type *k = &tensor.coord(iEntry,1);
636 TV2 aj(a, j), ak(a, k), xj(x, j), xk(x, k),
637 c(&(tensor.value(iEntry)));
641 aj.multiply_add(ak, xj);
646 y[iy] += alpha * ytmp ;
653 template<
typename MatrixValue ,
typename VectorValue >
654 KOKKOS_INLINE_FUNCTION
656 const MatrixValue *
const a ,
657 const VectorValue *
const x ,
658 VectorValue *
const y ,
659 const VectorValue & alpha = VectorValue(1) )
662 for (
size_type iy = 0 ; iy < nDim ; ++iy ) {
666 const size_type iEntryEnd = iEntryBeg + nEntry;
669 VectorValue ytmp = 0 ;
672 if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
673 const size_type nBlock = nEntry / tensor_type::vectorsize;
674 const size_type nEntryB = nBlock * tensor_type::vectorsize;
675 const size_type iEnd = iEntryBeg + nEntryB;
680 for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
683 TV aj(a, j), ak(a, k), xj(x, j), xk(x, k), c(&(tensor.
value(iEntry)));
687 aj.multiply_add(ak, xj);
688 vy.multiply_add(c, aj);
694 for ( ; iEntry<iEntryEnd; ++iEntry) {
698 ytmp += tensor.
value(iEntry) * ( a[
j] * x[k] + a[k] * x[
j] );
701 y[iy] += alpha * ytmp ;
706 KOKKOS_INLINE_FUNCTION
710 KOKKOS_INLINE_FUNCTION
723 template<
typename ValueType ,
typename MatrixValue ,
typename VectorValue ,
732 typedef Kokkos::View< VectorValue** , Kokkos::LayoutLeft , execution_space >
block_vector_type ;
753 KOKKOS_INLINE_FUNCTION
758 VectorValue *
const y = &
m_y(0,iBlockRow);
766 for (
size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
767 const VectorValue *
const x = &
m_x( 0 ,
m_A.
graph.entries(iEntry) );
768 const MatrixValue *
const a = &
m_A.
values( 0 , iEntry );
779 KOKKOS_INLINE_FUNCTION
780 std::pair< size_type , size_type >
785 enum { work_align = 64 /
sizeof(VectorValue) };
787 enum { work_mask = work_align - 1 };
790 ( ( ( ( work_count + work_mask ) >> work_shift ) + thread_count - 1 ) /
791 thread_count ) << work_shift ;
794 std::min( thread_rank * work_per_thread , work_count );
796 std::min( work_begin + work_per_thread , work_count );
798 return std::make_pair( work_begin , work_end );
805 KOKKOS_INLINE_FUNCTION
806 void operator()(
const typename Kokkos::TeamPolicy< execution_space >::member_type &
device )
const
808 const size_type iBlockRow = device.league_rank();
812 if (iBlockRow >= row_count)
815 const size_type num_thread = device.team_size();
816 const size_type thread_idx = device.team_rank();
817 std::pair<size_type,size_type> work_range =
822 VectorValue *
const y = &
m_y(0,iBlockRow);
825 for (
size_type j = work_range.first ; j < work_range.second ; ++j )
834 (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
836 const MatrixValue* sh_A[BlockSize];
837 const VectorValue* sh_x[BlockSize];
840 for (
size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
842 block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
844 for (
size_type col = 0; col < block_size; ++col ) {
846 sh_x[col] = &
m_x( 0 , iBlockColumn );
847 sh_A[col] = &
m_A.
values( 0 , iBlockEntry + col );
850 for (
size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
854 const size_type iEntryEnd = iEntryBeg + nEntry;
857 VectorValue ytmp = 0 ;
862 const size_type iEnd = iEntryBeg + nEntryB;
872 ValTV c(&(tensor.
value(iEntry)));
874 for (
size_type col = 0; col < block_size; ++col ) {
875 MatTV aj(sh_A[col], j), ak(sh_A[col], k);
876 VecTV xj(sh_x[col], j), xk(sh_x[col], k);
880 aj.multiply_add(ak, xj);
881 vy.multiply_add(c, aj);
895 ValTV2 c(&(tensor.
value(iEntry)));
897 for (
size_type col = 0; col < block_size; ++col ) {
898 MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
899 VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);
903 aj.multiply_add(ak, xj);
914 device.team_barrier();
925 KOKKOS_INLINE_FUNCTION
926 void operator()(
const typename Kokkos::TeamPolicy< execution_space >::member_type & device )
const
928 const size_type iBlockRow = device.league_rank();
932 if (iBlockRow >= row_count)
935 const size_type num_thread = device.team_size();
936 const size_type thread_idx = device.team_rank();
937 std::pair<size_type,size_type> work_range =
942 VectorValue *
const y = &
m_y(0,iBlockRow);
945 for (
size_type j = work_range.first ; j < work_range.second ; ++j )
954 (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
956 const MatrixValue* sh_A[BlockSize];
957 const VectorValue* sh_x[BlockSize];
960 for (
size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
962 block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
964 for (
size_type col = 0; col < block_size; ++col ) {
966 sh_x[col] = &
m_x( 0 , iBlockColumn );
967 sh_A[col] = &
m_A.
values( 0 , iBlockEntry + col );
970 for (
size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
974 const size_type iEntryEnd = iEntryBeg + nEntry;
977 VectorValue ytmp = 0 ;
980 if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
983 const size_type iEnd = iEntryBeg + nEntryB;
993 ValTV c(&(tensor.
value(iEntry)));
995 for (
size_type col = 0; col < block_size; ++col ) {
996 MatTV aj(sh_A[col], j), ak(sh_A[col], k);
997 VecTV xj(sh_x[col], j), xk(sh_x[col], k);
1001 aj.multiply_add(ak, xj);
1002 vy.multiply_add(c, aj);
1009 for ( ; iEntry<iEntryEnd; ++iEntry) {
1014 for (
size_type col = 0; col < block_size; ++col ) {
1015 ytmp += cijk * ( sh_A[col][
j] * sh_x[col][k] +
1016 sh_A[col][k] * sh_x[col][
j] );
1026 device.team_barrier();
1042 const bool use_block_algorithm =
true;
1044 const bool use_block_algorithm =
false;
1047 const size_t row_count = A.
graph.row_map.extent(0) - 1 ;
1048 if (use_block_algorithm) {
1050 const size_t team_size = 4;
1052 const size_t team_size = 2;
1054 const size_t league_size = row_count;
1055 Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
1059 Kokkos::parallel_for( row_count ,
MultiplyImpl(A,x,y) );
static const size_type num_entry_align
static void deep_copy(const CrsProductTensor< ValueType, DstDevice, DstMemory > &dst, const CrsProductTensor &src)
BlockSpec::size_type size_type
static KOKKOS_INLINE_FUNCTION size_type vector_size(const tensor_type &tensor)
Bases defined by combinatorial product of polynomial bases.
KOKKOS_INLINE_FUNCTION size_type num_non_zeros() const
Number of non-zero's.
k_iterator k_begin() const
Iterator pointing to first k entry.
KOKKOS_INLINE_FUNCTION size_type dimension() const
Dimension of the tensor.
CrsProductTensor< ValueType, Device > create_mean_based_product_tensor()
KOKKOS_INLINE_FUNCTION const size_type & coord(const size_type entry, const size_type c) const
Coordinates of an entry.
Data structure storing a sparse 3-tensor C(i,j,k) in a a compressed format.
CrsProductTensor< ValueType, execution_space > tensor_type
static const bool is_cuda
KOKKOS_INLINE_FUNCTION size_type num_flops() const
Number flop's per multiply-add.
kj_iterator j_begin(const k_iterator &k) const
Iterator pointing to first j entry for given k.
static void apply(const matrix_type &A, const block_vector_type &x, const block_vector_type &y)
entry_array_type m_num_entry
row_map_array_type m_row_map
KOKKOS_INLINE_FUNCTION size_type entry_end(size_type i) const
End entries with a coordinate 'i'.
static KOKKOS_INLINE_FUNCTION size_type matrix_size(const tensor_type &tensor)
KOKKOS_INLINE_FUNCTION const value_type & value(const size_type entry) const
Value of an entry.
BlockCrsMatrix< BlockSpec, MatrixValue, execution_space > matrix_type
KOKKOS_INLINE_FUNCTION const size_type & coord(const size_type entry) const
Coordinates of an entry.
kj_iterator j_end(const k_iterator &k) const
Iterator pointing to last j entry for given k.
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
static CrsProductTensor create(const Stokhos::ProductBasis< OrdinalType, ValueType > &basis, const Stokhos::Sparse3Tensor< OrdinalType, ValueType > &Cijk, const Teuchos::ParameterList ¶ms=Teuchos::ParameterList())
const block_vector_type m_x
static KOKKOS_INLINE_FUNCTION void apply(const tensor_type &tensor, const MatrixValue *const a, const VectorValue *const x, VectorValue *const y, const VectorValue &alpha=VectorValue(1))
size_type m_avg_entries_per_row
Sparse product tensor with replicated entries to provide subsets with a given coordinate.
static const size_type tensor_align
static const size_type host_vectorsize
KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy< execution_space >::member_type &device) const
Kokkos::View< value_type *, Kokkos::LayoutLeft, execution_space, memory_type > vec_type
KOKKOS_INLINE_FUNCTION CrsProductTensor & operator=(const CrsProductTensor< value_type, execution_space, M > &rhs)
KOKKOS_INLINE_FUNCTION size_type entry_begin(size_type i) const
Begin entries with a coordinate 'i'.
KOKKOS_INLINE_FUNCTION size_type entry_count() const
Number of sparse entries.
CrsProductTensor< ValueType, Device, Memory >::HostMirror create_mirror_view(const CrsProductTensor< ValueType, Device, Memory > &src)
CrsProductTensor< value_type, host_mirror_space > HostMirror
ExecutionSpace execution_space
bool operator()(const CijkRowCount &a, const CijkRowCount &b) const
KOKKOS_INLINE_FUNCTION PCE< Storage > max(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
KOKKOS_INLINE_FUNCTION size_type avg_entries_per_row() const
Number average number of entries per row.
KOKKOS_INLINE_FUNCTION CrsProductTensor(const CrsProductTensor< value_type, execution_space, M > &rhs)
KOKKOS_INLINE_FUNCTION size_type entry_maximum() const
Maximum sparse entries for any coordinate.
Kokkos::View< VectorValue **, Kokkos::LayoutLeft, execution_space > block_vector_type
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
k_iterator k_end() const
Iterator pointing to last k entry.
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< view_type >::value, typename CijkType< view_type >::type >::type cijk(const view_type &view)
Stokhos::Sparse3Tensor< int, double > Cijk_type
KOKKOS_INLINE_FUNCTION CrsProductTensor()
static HostMirror create_mirror_view(const CrsProductTensor &tensor)
KOKKOS_INLINE_FUNCTION void operator()(const size_type iBlockRow) const
CrsProductTensor< ValueType, Device > create_product_tensor(const Stokhos::ProductBasis< OrdinalType, ValueType > &basis, const Stokhos::Sparse3Tensor< OrdinalType, ValueType > &Cijk, const Teuchos::ParameterList ¶ms=Teuchos::ParameterList())
KOKKOS_INLINE_FUNCTION bool is_empty() const
Is the tensor empty.
Kokkos::View< value_type *, Kokkos::LayoutLeft, execution_space, memory_type > value_array_type
void deep_copy(const CrsProductTensor< ValueType, DstDevice, DstMemory > &dst, const CrsProductTensor< ValueType, SrcDevice, SrcMemory > &src)
tensor_type::size_type size_type
CRS matrix of dense blocks.
KOKKOS_INLINE_FUNCTION size_type num_entry(size_type i) const
Number of entries with a coordinate 'i'.
MultiplyImpl(const matrix_type &A, const block_vector_type &x, const block_vector_type &y)
Kokkos::View< size_type *, Kokkos::LayoutLeft, execution_space, memory_type > coord_array_type
CrsProductTensor< ValueType, execution_space > tensor_type
Kokkos::ViewTraits< size_type *, execution_space, void, void >::host_mirror_space host_mirror_space
static const size_type vectorsize
Kokkos::View< size_type *, Kokkos::LayoutLeft, execution_space, memory_type > row_map_array_type
const block_vector_type m_y
Kokkos::View< size_type *[2], Kokkos::LayoutLeft, execution_space, memory_type > coord2_array_type
KOKKOS_INLINE_FUNCTION std::pair< size_type, size_type > compute_work_range(const size_type work_count, const size_type thread_count, const size_type thread_rank) const
KOKKOS_INLINE_FUNCTION void zero()
kji_iterator i_begin(const kj_iterator &j) const
Iterator pointing to first i entry for given j and k.
coord2_array_type m_coord2
kji_iterator i_end(const kj_iterator &j) const
Iterator pointing to last i entry for given j and k.
Kokkos::View< size_type *, Kokkos::LayoutLeft, execution_space, memory_type > entry_array_type
KOKKOS_INLINE_FUNCTION ~CrsProductTensor()
std::enable_if< Kokkos::is_view_uq_pce< Kokkos::View< RD, RP...> >::value &&Kokkos::is_view_uq_pce< Kokkos::View< XD, XP...> >::value >::type sum(const Kokkos::View< RD, RP...> &r, const Kokkos::View< XD, XP...> &x)
virtual ordinal_type size() const =0
Return total size of basis.
Stokhos::CrsMatrix< ValueType, Device, Layout >::HostMirror create_mirror_view(const Stokhos::CrsMatrix< ValueType, Device, Layout > &A)
StochasticProductTensor< ValueType, tensor_type, execution_space > BlockSpec
static const bool use_intrinsics
tensor_type::size_type size_type
static const size_type cuda_vectorsize
static CrsProductTensor createMeanBased()