#ifndef STOKHOS_CRSPRODUCTTENSOR_HPP
#define STOKHOS_CRSPRODUCTTENSOR_HPP

#include "Kokkos_Core.hpp"
// ...
template< typename ValueType,
          class ExecutionSpace,
          class Memory = void >
class CrsProductTensor {
public:

  typedef ExecutionSpace execution_space;
  typedef ValueType value_type;
  typedef Memory memory_type;
  typedef int size_type;

  typedef typename Kokkos::ViewTraits< size_type*, execution_space, void, void >::host_mirror_space
    host_mirror_space;
  typedef CrsProductTensor< value_type, host_mirror_space > HostMirror;

  // Vectorsize used in the multiply algorithm, chosen per architecture,
  // and the alignment (in number of entries) of each CRS row
#if defined(__AVX__)
  static const size_type host_vectorsize = 32 / sizeof(value_type);
  static const bool use_intrinsics = true;
  static const size_type num_entry_align = 1;
#elif defined(__MIC__)
  static const size_type host_vectorsize = 16;
  static const bool use_intrinsics = true;
  static const size_type num_entry_align = 8; // avoids use of mask instructions
#else
  static const size_type host_vectorsize = 2;
  static const bool use_intrinsics = false;
  static const size_type num_entry_align = 1;
#endif

#if defined( KOKKOS_ENABLE_CUDA )
  static const bool is_cuda =
    Kokkos::Impl::is_same< ExecutionSpace, Kokkos::Cuda >::value;
#else
  static const bool is_cuda = false;
#endif
  static const size_type cuda_vectorsize = 32;
  static const size_type vectorsize = is_cuda ? cuda_vectorsize : host_vectorsize;

  // Alignment in terms of number of entries of CRS rows
  static const size_type tensor_align = vectorsize;
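  // For example, with value_type = double on an AVX build:
  // host_vectorsize = 32/sizeof(double) = 4, so each CRS row is padded
  // to a multiple of 4 entries (tensor_align).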
private:

  typedef Kokkos::View< value_type*,   Kokkos::LayoutLeft, execution_space, memory_type > vec_type;
  typedef Kokkos::View< size_type*,    Kokkos::LayoutLeft, execution_space, memory_type > coord_array_type;
  typedef Kokkos::View< size_type*[2], Kokkos::LayoutLeft, execution_space, memory_type > coord2_array_type;
  typedef Kokkos::View< value_type*,   Kokkos::LayoutLeft, execution_space, memory_type > value_array_type;
  typedef Kokkos::View< size_type*,    Kokkos::LayoutLeft, execution_space, memory_type > entry_array_type;
  typedef Kokkos::View< size_type*,    Kokkos::LayoutLeft, execution_space, memory_type > row_map_array_type;

  coord_array_type   m_coord;      // packed (j,k) coordinate of each entry
  coord2_array_type  m_coord2;     // unpacked (j,k) coordinates of each entry
  value_array_type   m_value;      // value of each entry
  entry_array_type   m_num_entry;  // number of real (non-padding) entries per row
  row_map_array_type m_row_map;    // CRS row offsets into the entry arrays
  size_type          m_dim;        // tensor dimension
  size_type          m_entry_max;  // maximum entries in any row
  size_type          m_nnz;        // number of non-zeros
  size_type          m_flops;      // flops per multiply-add
  size_type          m_avg_entries_per_row;

public:
  KOKKOS_INLINE_FUNCTION
  ~CrsProductTensor() {}

  KOKKOS_INLINE_FUNCTION
  CrsProductTensor()
    : m_dim(0), m_entry_max(0), m_nnz(0), m_flops(0), m_avg_entries_per_row(0) {}

  template< class M >
  KOKKOS_INLINE_FUNCTION
  CrsProductTensor( const CrsProductTensor< value_type, execution_space, M >& rhs )
  { *this = rhs; }

  template< class M >
  KOKKOS_INLINE_FUNCTION
  CrsProductTensor& operator=( const CrsProductTensor< value_type, execution_space, M >& rhs )
  {
    m_coord = rhs.m_coord; m_coord2 = rhs.m_coord2; m_value = rhs.m_value;
    m_num_entry = rhs.m_num_entry; m_row_map = rhs.m_row_map;
    m_dim = rhs.m_dim; m_entry_max = rhs.m_entry_max; m_nnz = rhs.m_nnz;
    m_flops = rhs.m_flops; m_avg_entries_per_row = rhs.m_avg_entries_per_row;
    return *this;
  }

  // Dimension of the tensor
  KOKKOS_INLINE_FUNCTION
  size_type dimension() const { return m_dim; }

  // Is the tensor empty
  KOKKOS_INLINE_FUNCTION
  bool is_empty() const { return m_dim == 0; }

  // Number of sparse entries
  KOKKOS_INLINE_FUNCTION
  size_type entry_count() const { return m_coord.extent(0); }

  // Maximum sparse entries for any coordinate
  KOKKOS_INLINE_FUNCTION
  size_type entry_maximum() const { return m_entry_max; }

  // Begin entries with a coordinate 'i'
  KOKKOS_INLINE_FUNCTION
  size_type entry_begin( size_type i ) const { return m_row_map[i]; }

  // End entries with a coordinate 'i'
  KOKKOS_INLINE_FUNCTION
  size_type entry_end( size_type i ) const { return m_row_map[i] + m_num_entry(i); }

  // Number of entries with a coordinate 'i'
  KOKKOS_INLINE_FUNCTION
  size_type num_entry( size_type i ) const { return m_num_entry(i); }

  // Coordinates of an entry
  KOKKOS_INLINE_FUNCTION
  const size_type& coord( const size_type entry, const size_type c ) const
  { return m_coord2( entry, c ); }

  // Packed coordinates of an entry
  KOKKOS_INLINE_FUNCTION
  const size_type& coord( const size_type entry ) const { return m_coord( entry ); }

  // Value of an entry
  KOKKOS_INLINE_FUNCTION
  const value_type& value( const size_type entry ) const { return m_value( entry ); }

  // Number of non-zeros
  KOKKOS_INLINE_FUNCTION
  size_type num_non_zeros() const { return m_nnz; }

  // Number of flops per multiply-add
  KOKKOS_INLINE_FUNCTION
  size_type num_flops() const { return m_flops; }

  // Average number of entries per row
  KOKKOS_INLINE_FUNCTION
  size_type avg_entries_per_row() const { return m_avg_entries_per_row; }
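  // Iteration sketch over the CRS structure using the accessors above
  // (illustrative only; 'tensor' is an assumed instance):
  //
  //   for (size_type i = 0; i < tensor.dimension(); ++i) {
  //     const size_type beg = tensor.entry_begin(i);
  //     for (size_type n = beg; n < beg + tensor.num_entry(i); ++n) {
  //       const size_type j = tensor.coord(n,0);
  //       const size_type k = tensor.coord(n,1);
  //       const value_type c = tensor.value(n);  // contributes to C(i,j,k)
  //     }
  //   }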
  struct CijkRowCount {
    unsigned count;
    unsigned basis;
    CijkRowCount() : count(0), basis(0) {}
  };

  struct CompareCijkRowCount {
    bool operator()( const CijkRowCount& a, const CijkRowCount& b ) const
    { return a.count < b.count; }
  };

  template <typename OrdinalType>
  static CrsProductTensor
  create( const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
          const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
          const Teuchos::ParameterList& params = Teuchos::ParameterList() )
  {
    typedef Stokhos::Sparse3Tensor<OrdinalType,ValueType> Cijk_type;

    const size_type dimension = basis.size();

    // Compute the number of retained non-zeros (j >= k) for each row i
    std::vector< size_t > coord_work( dimension, (size_t) 0 );
    size_type entry_count = 0;
    for (typename Cijk_type::i_iterator i_it = Cijk.i_begin();
         i_it != Cijk.i_end(); ++i_it) {
      OrdinalType i = index(i_it);
      for (typename Cijk_type::ik_iterator k_it = Cijk.k_begin(i_it);
           k_it != Cijk.k_end(i_it); ++k_it) {
        OrdinalType k = index(k_it);
        for (typename Cijk_type::ikj_iterator j_it = Cijk.j_begin(k_it);
             j_it != Cijk.j_end(k_it); ++j_it) {
          OrdinalType j = index(j_it);
          if (j >= k) {
            ++coord_work[i];
            ++entry_count;
          }
        }
      }
    }

    // Pad each row to a size divisible by the alignment size
    for ( size_type i = 0; i < dimension; ++i ) {
      const size_t rem = coord_work[i] % tensor_align;
      if (rem > 0) {
        const size_t pad = tensor_align - rem;
        coord_work[i] += pad;
        entry_count += pad;
      }
    }
    // Sort rows by non-zero count to improve load balance.  (The actual
    // sort is disabled in this version, so sorted_row_map reduces to the
    // identity permutation.)
    std::vector< CijkRowCount > row_count( dimension );
    for ( size_type i = 0; i < dimension; ++i ) {
      row_count[i].count = coord_work[i];
      row_count[i].basis = i;
    }
    std::vector< size_type > sorted_row_map( dimension );
    for ( size_type i = 0; i < dimension; ++i ) {
      coord_work[i] = row_count[i].count;
      sorted_row_map[ row_count[i].basis ] = i;
    }
    // Allocate the tensor arrays of length entry_count and create host
    // mirrors (views if already in host memory)
    CrsProductTensor tensor;
    // ... (allocation of m_coord, m_coord2, m_value, m_num_entry, m_row_map) ...
    typename coord_array_type::HostMirror host_coord =
      Kokkos::create_mirror_view( tensor.m_coord );
    typename coord2_array_type::HostMirror host_coord2 =
      Kokkos::create_mirror_view( tensor.m_coord2 );
    typename value_array_type::HostMirror host_value =
      Kokkos::create_mirror_view( tensor.m_value );
    typename entry_array_type::HostMirror host_num_entry =
      Kokkos::create_mirror_view( tensor.m_num_entry );
    typename entry_array_type::HostMirror host_row_map =
      Kokkos::create_mirror_view( tensor.m_row_map );

    // Compute the row map as a prefix sum of the padded row sizes
    size_type sum = 0;
    host_row_map(0) = 0;
    for ( size_type i = 0; i < dimension; ++i ) {
      sum += coord_work[i];
      host_row_map(i+1) = sum;
      host_num_entry(i) = 0;
    }

    // Reset coord_work to hold the current fill position of each row
    for ( size_type iCoord = 0; iCoord < dimension; ++iCoord )
      coord_work[iCoord] = host_row_map[iCoord];
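    // Illustration with hypothetical padded row sizes {8, 16, 8}:
    // host_row_map becomes {0, 8, 24, 32}, so row i occupies entries
    // [host_row_map(i), host_row_map(i+1)), of which the first
    // host_num_entry(i) are real and the remainder are alignment padding.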
    // Fill the tensor, keeping only entries with j >= k: each stored
    // entry is symmetrized during the multiply, so the diagonal (j == k)
    // is scaled by one half
    for (typename Cijk_type::i_iterator i_it = Cijk.i_begin();
         i_it != Cijk.i_end(); ++i_it) {
      OrdinalType i = index(i_it);
      const size_type row = sorted_row_map[i];
      for (typename Cijk_type::ik_iterator k_it = Cijk.k_begin(i_it);
           k_it != Cijk.k_end(i_it); ++k_it) {
        OrdinalType k = index(k_it);
        for (typename Cijk_type::ikj_iterator j_it = Cijk.j_begin(k_it);
             j_it != Cijk.j_end(k_it); ++j_it) {
          OrdinalType j = index(j_it);
          ValueType c = Stokhos::value(j_it);
          if (j >= k) {
            const size_type n = coord_work[row]; ++coord_work[row];
            host_value(n) = (j != k) ? c : 0.5*c;
            host_coord2(n,0) = j;
            host_coord2(n,1) = k;
            host_coord(n) = ( k << 16 ) | j;  // pack both coordinates into one word
            ++host_num_entry(row);
          }
        }
      }
      // Align the entry count of the row
      host_num_entry(row) =
        (host_num_entry(row) + num_entry_align - 1) & ~(num_entry_align - 1);
    }

    // ... (copy mirrors to the device and set the size members) ...

    return tensor;
  }
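  // Note on the packed coordinate written by create() above: it assumes
  // both j and k fit in 16 bits, and a consumer can recover them as
  // (illustrative sketch only):
  //
  //   const size_type kj = tensor.coord(n);
  //   const size_type j  = kj & 0xFFFF;
  //   const size_type k  = kj >> 16;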
  // Construct a dimension-1 tensor holding the single entry C(0,0,0),
  // used for mean-based preconditioning
  static CrsProductTensor createMeanBased()
  {
    const size_type dimension = 1;
    const size_type entry_count = 1;
    CrsProductTensor tensor;
    // ... (allocation as in create()) ...
    typename coord_array_type::HostMirror host_coord =
      Kokkos::create_mirror_view( tensor.m_coord );
    typename coord2_array_type::HostMirror host_coord2 =
      Kokkos::create_mirror_view( tensor.m_coord2 );
    typename value_array_type::HostMirror host_value =
      Kokkos::create_mirror_view( tensor.m_value );
    typename entry_array_type::HostMirror host_num_entry =
      Kokkos::create_mirror_view( tensor.m_num_entry );
    typename entry_array_type::HostMirror host_row_map =
      Kokkos::create_mirror_view( tensor.m_row_map );

    host_row_map(0) = 0;
    host_row_map(1) = 1;
    host_num_entry(0) = 1;
    host_coord(0) = 0;
    host_coord2(0,0) = 0;
    host_coord2(0,1) = 0;
    host_value(0) = 0.5;  // diagonal entry, halved as in create()

    // ... (copy mirrors to the device and set the size members) ...

    return tensor;
  }
  // Copy the tensor to a host mirror (a view if already in host memory)
  static HostMirror create_mirror_view( const CrsProductTensor& tensor )
  {
    HostMirror host_tensor;
    // ... (mirror each view and copy the scalar members) ...
    return host_tensor;
  }

  // Deep copy each component view between devices
  template < class DstDevice, class DstMemory >
  static void deep_copy( const CrsProductTensor< ValueType, DstDevice, DstMemory >& dst,
                         const CrsProductTensor& src )
  {
    Kokkos::deep_copy( dst.m_coord, src.m_coord );
    Kokkos::deep_copy( dst.m_coord2, src.m_coord2 );
    Kokkos::deep_copy( dst.m_value, src.m_value );
    Kokkos::deep_copy( dst.m_num_entry, src.m_num_entry );
    Kokkos::deep_copy( dst.m_row_map, src.m_row_map );
  }
};
template< class Device, typename OrdinalType, typename ValueType >
CrsProductTensor<ValueType, Device>
create_product_tensor(
  const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
  const Teuchos::ParameterList& params = Teuchos::ParameterList() )
{
  return CrsProductTensor<ValueType, Device>::create( basis, Cijk, params );
}

template< class Device, typename OrdinalType, typename ValueType, class Memory >
CrsProductTensor<ValueType, Device, Memory>
create_product_tensor(
  const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
  const Teuchos::ParameterList& params = Teuchos::ParameterList() )
{
  return CrsProductTensor<ValueType, Device, Memory>::create(
    basis, Cijk, params );
}
template< class Device, typename OrdinalType, typename ValueType >
CrsProductTensor<ValueType, Device>
create_mean_based_product_tensor()
{
  return CrsProductTensor<ValueType, Device>::createMeanBased();
}

template< class Device, typename OrdinalType, typename ValueType, class Memory >
CrsProductTensor<ValueType, Device, Memory>
create_mean_based_product_tensor()
{
  return CrsProductTensor<ValueType, Device, Memory>::createMeanBased();
}
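// A minimal usage sketch; the basis and Cijk construction are assumptions
// standing in for any concrete Stokhos product basis and its triple-product
// tensor:
//
//   typedef Kokkos::DefaultExecutionSpace exec_space;
//   Teuchos::RCP<const Stokhos::ProductBasis<int,double> > basis = /* ... */;
//   Teuchos::RCP<Stokhos::Sparse3Tensor<int,double> > Cijk =
//     basis->computeTripleProductTensor();
//   CrsProductTensor<double, exec_space> tensor =
//     create_product_tensor<exec_space>( *basis, *Cijk );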
template < class ValueType, class Device, class Memory >
inline
typename CrsProductTensor<ValueType,Device,Memory>::HostMirror
create_mirror_view( const CrsProductTensor<ValueType,Device,Memory>& src )
{
  return CrsProductTensor<ValueType,Device,Memory>::create_mirror_view( src );
}
template < class ValueType,
           class DstDevice, class DstMemory,
           class SrcDevice, class SrcMemory >
void
deep_copy( const CrsProductTensor<ValueType,DstDevice,DstMemory>& dst,
           const CrsProductTensor<ValueType,SrcDevice,SrcMemory>& src )
{
  CrsProductTensor<ValueType,DstDevice,DstMemory>::deep_copy( dst, src );
}
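// Host mirror round trip, sketched under the same assumptions as the usage
// example above:
//
//   typedef CrsProductTensor<double, exec_space> tensor_type;
//   tensor_type::HostMirror host_tensor = create_mirror_view( tensor );
//   deep_copy( host_tensor, tensor );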
template < typename ValueType, typename Device >
class BlockMultiply< CrsProductTensor< ValueType, Device > >
{
public:

  typedef Device execution_space;
  typedef CrsProductTensor< ValueType, execution_space > tensor_type;
  typedef typename tensor_type::size_type size_type;

// Whether to use manual or automatic vectorization
#ifdef __MIC__
#define USE_AUTO_VECTORIZATION 1
#else
#define USE_AUTO_VECTORIZATION 0
#endif
#if defined(__INTEL_COMPILER) && USE_AUTO_VECTORIZATION

  // Version leveraging Intel auto-vectorization
  template< typename MatrixValue, typename VectorValue >
  KOKKOS_INLINE_FUNCTION
  static void apply( const tensor_type& tensor,
                     const MatrixValue* const a,
                     const VectorValue* const x,
                     VectorValue* const y,
                     const VectorValue& alpha = VectorValue(1) )
  {
    // The Intel compiler can't vectorize through the coord() calls,
    // so extract the raw coordinate pointers
    const size_type* cj = &tensor.coord(0,0);
    const size_type* ck = &tensor.coord(0,1);
    const size_type nDim = tensor.dimension();

    for ( size_type iy = 0; iy < nDim; ++iy ) {
      const size_type nEntry = tensor.num_entry(iy);
      const size_type iEntryBeg = tensor.entry_begin(iy);
      const size_type iEntryEnd = iEntryBeg + nEntry;
      VectorValue ytmp = 0;

#pragma simd vectorlength(tensor_type::vectorsize)
#pragma ivdep
#pragma vector aligned
      for ( size_type iEntry = iEntryBeg; iEntry < iEntryEnd; ++iEntry ) {
        const size_type j = cj[iEntry];
        const size_type k = ck[iEntry];
        ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
      }

      y[iy] += alpha * ytmp;
    }
  }
#elif defined(__MIC__)

  // Version specific to the MIC architecture using manual vectorization
  template< typename MatrixValue, typename VectorValue >
  KOKKOS_INLINE_FUNCTION
  static void apply( const tensor_type& tensor,
                     const MatrixValue* const a,
                     const VectorValue* const x,
                     VectorValue* const y,
                     const VectorValue& alpha = VectorValue(1) )
  {
    const size_type nDim = tensor.dimension();
    for ( size_type iy = 0; iy < nDim; ++iy ) {

      const size_type nEntry = tensor.num_entry(iy);
      const size_type iEntryBeg = tensor.entry_begin(iy);
      const size_type iEntryEnd = iEntryBeg + nEntry;
            size_type iEntry    = iEntryBeg;

      VectorValue ytmp = 0;

      const size_type nBlock = nEntry / tensor_type::vectorsize;
      const size_type nEntryB = nBlock * tensor_type::vectorsize;
      const size_type iEnd = iEntryBeg + nEntryB;

      typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
      TV vy;
      vy.zero();
      for ( size_type block = 0; block < nBlock; ++block, iEntry += tensor_type::vectorsize ) {
        const size_type* j = &tensor.coord(iEntry,0);
        const size_type* k = &tensor.coord(iEntry,1);
        TV aj(a, j), ak(a, k), xj(x, j), xk(x, k),
           c(&(tensor.value(iEntry)));

        // vy += c * ( aj*xk + ak*xj )
        aj.times_equal(xk);
        aj.multiply_add(ak, xj);
        vy.multiply_add(c, aj);
      }
      ytmp += vy.sum();

      // Remaining entries; the entry count is padded to a multiple of 8
      const size_type rem = iEntryEnd - iEntry;
      if (rem >= 8) {
        typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> TV2;
        const size_type* j = &tensor.coord(iEntry,0);
        const size_type* k = &tensor.coord(iEntry,1);
        TV2 aj(a, j), ak(a, k), xj(x, j), xk(x, k),
            c(&(tensor.value(iEntry)));

        // ytmp += c * ( aj*xk + ak*xj )
        aj.times_equal(xk);
        aj.multiply_add(ak, xj);
        aj.times_equal(c);
        ytmp += aj.sum();
      }

      y[iy] += alpha * ytmp;
    }
  }
#else

  // General version
  template< typename MatrixValue, typename VectorValue >
  KOKKOS_INLINE_FUNCTION
  static void apply( const tensor_type& tensor,
                     const MatrixValue* const a,
                     const VectorValue* const x,
                     VectorValue* const y,
                     const VectorValue& alpha = VectorValue(1) )
  {
    const size_type nDim = tensor.dimension();
    for ( size_type iy = 0; iy < nDim; ++iy ) {

      const size_type nEntry = tensor.num_entry(iy);
      const size_type iEntryBeg = tensor.entry_begin(iy);
      const size_type iEntryEnd = iEntryBeg + nEntry;
            size_type iEntry    = iEntryBeg;

      VectorValue ytmp = 0;

      // Do entries with a blocked loop of size vectorsize
      if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
        const size_type nBlock = nEntry / tensor_type::vectorsize;
        const size_type nEntryB = nBlock * tensor_type::vectorsize;
        const size_type iEnd = iEntryBeg + nEntryB;

        typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
        TV vy;
        vy.zero();
        for ( ; iEntry < iEnd; iEntry += tensor_type::vectorsize ) {
          const size_type* j = &tensor.coord(iEntry,0);
          const size_type* k = &tensor.coord(iEntry,1);
          TV aj(a, j), ak(a, k), xj(x, j), xk(x, k),
             c(&(tensor.value(iEntry)));

          // vy += c * ( aj*xk + ak*xj )
          aj.times_equal(xk);
          aj.multiply_add(ak, xj);
          vy.multiply_add(c, aj);
        }
        ytmp += vy.sum();
      }

      // Do remaining entries with a scalar loop
      for ( ; iEntry < iEntryEnd; ++iEntry ) {
        const size_type j = tensor.coord(iEntry,0);
        const size_type k = tensor.coord(iEntry,1);

        ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
      }

      y[iy] += alpha * ytmp;
    }
  }

#endif
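  // Why create() halves the diagonal: each stored entry (j,k) with j >= k
  // contributes c*( a[j]*x[k] + a[k]*x[j] ) above, so
  //   y[i] = sum_{j >= k} c_n * ( a[j]*x[k] + a[k]*x[j] ),
  // with c_n = C(i,j,k) for j > k and C(i,j,j)/2 on the diagonal,
  // reproduces y[i] = sum_{j,k} C(i,j,k)*a[j]*x[k] for symmetric C.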
  KOKKOS_INLINE_FUNCTION
  static size_type matrix_size( const tensor_type& tensor )
  { return tensor.dimension(); }

  KOKKOS_INLINE_FUNCTION
  static size_type vector_size( const tensor_type& tensor )
  { return tensor.dimension(); }
};
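// Sketch of a direct call, under the same assumptions as the sketches
// above; the coefficient buffers a, x, y are assumed to each hold
// tensor.dimension() contiguous values (in practice apply() is driven by
// the block CRS multiply below):
//
//   BlockMultiply< CrsProductTensor<double, exec_space> >::apply( tensor, a, x, y );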
// Multiply of a block CRS matrix against CrsProductTensor blocks, with a
// team-based variant that processes blocks of FEM columns together to
// reduce global reads of the sparse 3-tensor

template< typename ValueType, typename MatrixValue, typename VectorValue,
          typename Device >
class MultiplyImpl {
public:

  typedef Device execution_space;
  typedef CrsProductTensor< ValueType, execution_space > tensor_type;
  typedef StochasticProductTensor< ValueType, tensor_type, execution_space > BlockSpec;
  typedef typename BlockSpec::size_type size_type;
  typedef Kokkos::View< VectorValue**, Kokkos::LayoutLeft, execution_space >
    block_vector_type;
  typedef BlockCrsMatrix< BlockSpec, MatrixValue, execution_space > matrix_type;

  const matrix_type m_A;
  const block_vector_type m_x;
  const block_vector_type m_y;

  MultiplyImpl( const matrix_type& A,
                const block_vector_type& x,
                const block_vector_type& y )
    : m_A( A ), m_x( x ), m_y( y ) {}
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type iBlockRow ) const
  {
    // Prefer that y[ m_A.block.dimension() ] be scratch space on the
    // local thread, but it cannot be dynamically allocated
    VectorValue* const y = &m_y(0,iBlockRow);

    const size_type iEntryBegin = m_A.graph.row_map[ iBlockRow ];
    const size_type iEntryEnd   = m_A.graph.row_map[ iBlockRow + 1 ];

    // Leading dimension is guaranteed contiguous for LayoutLeft
    for ( size_type j = 0; j < m_A.block.dimension(); ++j )
      y[j] = 0;

    for ( size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry ) {
      const VectorValue* const x = &m_x( 0, m_A.graph.entries(iEntry) );
      const MatrixValue* const a = &m_A.values( 0, iEntry );

      BlockMultiply< BlockSpec >::apply( m_A.block, a, x, y );
    }
  }
  // Compute a work range (begin,end) such that adjacent threads write to
  // separate cache lines
  KOKKOS_INLINE_FUNCTION
  std::pair< size_type, size_type >
  compute_work_range( const size_type work_count,
                      const size_type thread_count,
                      const size_type thread_rank ) const
  {
    enum { work_align = 64 / sizeof(VectorValue) };
    enum { work_shift = Kokkos::Impl::power_of_two< work_align >::value };
    enum { work_mask  = work_align - 1 };

    const size_type work_per_thread =
      ( ( ( ( work_count + work_mask ) >> work_shift ) + thread_count - 1 ) /
        thread_count ) << work_shift;

    const size_type work_begin =
      std::min( thread_rank * work_per_thread, work_count );
    const size_type work_end =
      std::min( work_begin + work_per_thread, work_count );

    return std::make_pair( work_begin, work_end );
  }
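  // Worked example (illustrative numbers): with VectorValue = double,
  // work_align = 8, work_shift = 3, work_mask = 7.  For work_count = 100
  // and thread_count = 4: ceil(100/8) = 13 aligned chunks, ceil(13/4) = 4
  // chunks per thread, so work_per_thread = 32 and the thread ranges are
  // [0,32), [32,64), [64,96), [96,100).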
#if defined(__MIC__)

  // A MIC-specific version of the block algorithm, where "block" here
  // means processing multiple FEM columns at a time
  KOKKOS_INLINE_FUNCTION
  void operator()( const typename Kokkos::TeamPolicy< execution_space >::member_type& device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for a valid row
    const size_type row_count = m_A.graph.row_map.extent(0) - 1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    std::pair<size_type,size_type> work_range =
      compute_work_range( m_A.block.dimension(), num_thread, thread_idx );

    // Prefer that y[ m_A.block.dimension() ] be scratch space on the
    // local thread, but it cannot be dynamically allocated
    VectorValue* const y = &m_y(0,iBlockRow);

    // Leading dimension is guaranteed contiguous for LayoutLeft
    for ( size_type j = work_range.first; j < work_range.second; ++j )
      y[j] = 0;

    const tensor_type& tensor = m_A.block.tensor();

    const size_type iBlockEntryBeg = m_A.graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 9;
    const size_type numBlock =
      (iBlockEntryEnd - iBlockEntryBeg + BlockSize - 1) / BlockSize;

    const MatrixValue* sh_A[BlockSize];
    const VectorValue* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for ( size_type block = 0; block < numBlock; ++block, iBlockEntry += BlockSize ) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd - iBlockEntry : BlockSize;

      // Gather pointers to the block of FEM columns
      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
        sh_x[col] = &m_x( 0, iBlockColumn );
        sh_A[col] = &m_A.values( 0, iBlockEntry + col );
      }

      for ( size_type iy = work_range.first; iy < work_range.second; ++iy ) {

        const size_type nEntry = tensor.num_entry(iy);
        const size_type iEntryBeg = tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
              size_type iEntry    = iEntryBeg;

        VectorValue ytmp = 0;

        // Do entries with a blocked loop of size vectorsize
        const size_type nBlock = nEntry / tensor_type::vectorsize;
        const size_type nEntryB = nBlock * tensor_type::vectorsize;
        const size_type iEnd = iEntryBeg + nEntryB;

        typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
        typedef TinyVec<MatrixValue,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
        typedef TinyVec<VectorValue,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
        VecTV vy;
        vy.zero();
        for ( size_type vec = 0; vec < nBlock; ++vec, iEntry += tensor_type::vectorsize ) {
          const size_type* j = &tensor.coord(iEntry,0);
          const size_type* k = &tensor.coord(iEntry,1);
          ValTV c(&(tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV xj(sh_x[col], j), xk(sh_x[col], k);

            // vy += c * ( aj*xk + ak*xj )
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            vy.multiply_add(c, aj);
          }
        }
        ytmp += vy.sum();

        // Remaining entries; the entry count is padded to a multiple of 8
        const size_type rem = iEntryEnd - iEntry;
        if (rem >= 8) {
          typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> ValTV2;
          typedef TinyVec<MatrixValue,8,tensor_type::use_intrinsics> MatTV2;
          typedef TinyVec<VectorValue,8,tensor_type::use_intrinsics> VecTV2;
          const size_type* j = &tensor.coord(iEntry,0);
          const size_type* k = &tensor.coord(iEntry,1);
          ValTV2 c(&(tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);

            // ytmp += c * ( aj*xk + ak*xj )
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            aj.times_equal(c);
            ytmp += aj.sum();
          }
        }

        y[iy] += ytmp;
      }

      // Wait for all threads to finish this block of FEM columns before
      // the pointer arrays are refilled
      device.team_barrier();
    }
  }
#else

  // General team-based version of the block algorithm
  KOKKOS_INLINE_FUNCTION
  void operator()( const typename Kokkos::TeamPolicy< execution_space >::member_type& device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for a valid row
    const size_type row_count = m_A.graph.row_map.extent(0) - 1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    std::pair<size_type,size_type> work_range =
      compute_work_range( m_A.block.dimension(), num_thread, thread_idx );

    // Prefer that y[ m_A.block.dimension() ] be scratch space on the
    // local thread, but it cannot be dynamically allocated
    VectorValue* const y = &m_y(0,iBlockRow);

    // Leading dimension is guaranteed contiguous for LayoutLeft
    for ( size_type j = work_range.first; j < work_range.second; ++j )
      y[j] = 0;

    const tensor_type& tensor = m_A.block.tensor();

    const size_type iBlockEntryBeg = m_A.graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 9; // FEM columns per block (as in the MIC variant)
    const size_type numBlock =
      (iBlockEntryEnd - iBlockEntryBeg + BlockSize - 1) / BlockSize;

    const MatrixValue* sh_A[BlockSize];
    const VectorValue* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for ( size_type block = 0; block < numBlock; ++block, iBlockEntry += BlockSize ) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd - iBlockEntry : BlockSize;

      // Gather pointers to the block of FEM columns
      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
        sh_x[col] = &m_x( 0, iBlockColumn );
        sh_A[col] = &m_A.values( 0, iBlockEntry + col );
      }

      for ( size_type iy = work_range.first; iy < work_range.second; ++iy ) {

        const size_type nEntry = tensor.num_entry(iy);
        const size_type iEntryBeg = tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
              size_type iEntry    = iEntryBeg;

        VectorValue ytmp = 0;

        // Do entries with a blocked loop of size vectorsize
        if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
          const size_type nBlock = nEntry / tensor_type::vectorsize;
          const size_type nEntryB = nBlock * tensor_type::vectorsize;
          const size_type iEnd = iEntryBeg + nEntryB;

          typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
          typedef TinyVec<MatrixValue,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
          typedef TinyVec<VectorValue,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
          VecTV vy;
          vy.zero();
          for ( ; iEntry < iEnd; iEntry += tensor_type::vectorsize ) {
            const size_type* j = &tensor.coord(iEntry,0);
            const size_type* k = &tensor.coord(iEntry,1);
            ValTV c(&(tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj*xk + ak*xj )
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              vy.multiply_add(c, aj);
            }
          }
          ytmp += vy.sum();
        }

        // Do remaining entries with a scalar loop
        for ( ; iEntry < iEntryEnd; ++iEntry ) {
          const size_type j = tensor.coord(iEntry,0);
          const size_type k = tensor.coord(iEntry,1);
          ValueType cijk = tensor.value(iEntry);

          for ( size_type col = 0; col < block_size; ++col ) {
            ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
                             sh_A[col][k] * sh_x[col][j] );
          }
        }

        y[iy] += ytmp;
      }

      // Wait for all threads to finish this block of FEM columns before
      // the pointer arrays are refilled
      device.team_barrier();
    }
  }

#endif
  static void apply( const matrix_type& A,
                     const block_vector_type& x,
                     const block_vector_type& y )
  {
#ifdef __MIC__
    const bool use_block_algorithm = true;
#else
    const bool use_block_algorithm = false;
#endif

    const size_t row_count = A.graph.row_map.extent(0) - 1;
    if (use_block_algorithm) {
#ifdef __MIC__
      const size_t team_size = 4;  // 4 hyperthreads per core on MIC
#else
      const size_t team_size = 2;  // 2 hyperthreads per core
#endif
      const size_t league_size = row_count;
      Kokkos::TeamPolicy< execution_space > config( league_size, team_size );
      Kokkos::parallel_for( config, MultiplyImpl(A,x,y) );
    }
    else {
      Kokkos::parallel_for( row_count, MultiplyImpl(A,x,y) );
    }
  }
};
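// Sketch of driving the block multiply; matrix assembly and the sizes
// pce_size and num_rows are assumptions:
//
//   typedef MultiplyImpl<double,double,double,exec_space> impl_type;
//   impl_type::matrix_type A /* = assembled block CRS matrix */;
//   impl_type::block_vector_type x( "x", pce_size, num_rows );
//   impl_type::block_vector_type y( "y", pce_size, num_rows );
//   impl_type::apply( A, x, y );  // y = A*x, one PCE block per FEM row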
// ...

#endif /* #ifndef STOKHOS_CRSPRODUCTTENSOR_HPP */