10 #ifndef KOKKOS_CRSMATRIX_UQ_PCE_HPP
11 #define KOKKOS_CRSMATRIX_UQ_PCE_HPP
16 #include "KokkosSparse_CrsMatrix.hpp"
17 #include "KokkosSparse_spmv.hpp"
28 template <
typename ViewType,
typename Enabled =
void>
33 template <
typename ViewType>
36 std::enable_if_t<ViewType::memory_traits::is_random_access> > {
38 static constexpr
unsigned M0 = ViewType::memory_traits::impl_value;
39 static constexpr
unsigned M1 =
40 M0 & (Kokkos::Unmanaged | Kokkos::Atomic | Kokkos::Restrict | Kokkos::Aligned);
41 typedef Kokkos::View<
typename ViewType::data_type,
42 typename ViewType::array_layout,
43 typename ViewType::device_type,
44 Kokkos::MemoryTraits<M1> >
type;
56 template <
typename MatrixDevice,
57 typename MatrixStorage,
58 typename MatrixOrdinal,
59 typename MatrixMemory,
61 typename InputStorage,
63 typename OutputStorage,
70 Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
72 Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
121 : m_A_values( A.values )
122 , m_A_graph( A.graph )
125 , m_tensor( Kokkos::
cijk(A.values) )
137 KOKKOS_INLINE_FUNCTION
144 const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
145 const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
149 for (
size_type j = 0 ;
j < m_tensor.dimension() ; ++
j )
152 for (
size_type j = 0 ;
j < m_tensor.dimension() ; ++
j )
155 for (
size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
156 const input_scalar *
const x = & m_x( 0 , m_A_graph.entries(iEntry) );
160 m_tensor , A , x , y , m_a );
175 typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
176 KOKKOS_INLINE_FUNCTION
177 void operator()(
const team_member &
device )
const
179 const size_type iBlockRow = device.league_rank();
182 const size_type row_count = m_A_graph.row_map.extent(0)-1;
183 if (iBlockRow >= row_count)
186 const size_type num_thread = device.team_size();
187 const size_type thread_idx = device.team_rank();
188 const Kokkos::pair<size_type,size_type> work_range =
189 details::compute_work_range<output_scalar>(
190 device, m_tensor.dimension(), num_thread, thread_idx);
194 output_scalar *
const y = & m_y(0,iBlockRow);
197 if ( m_b == output_scalar(0) )
198 for ( size_type
j = work_range.first ;
j < work_range.second ; ++
j )
201 for ( size_type
j = work_range.first ;
j < work_range.second ; ++
j )
204 const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
205 const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
206 const size_type BlockSize = 9;
207 const size_type numBlock =
208 (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
210 const matrix_scalar* sh_A[BlockSize];
211 const input_scalar* sh_x[BlockSize];
213 size_type iBlockEntry = iBlockEntryBeg;
214 for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
215 const size_type block_size =
216 block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
218 for ( size_type col = 0; col < block_size; ++col ) {
219 const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
220 sh_x[col] = & m_x( 0 , iBlockColumn );
221 sh_A[col] = & m_A_values( iBlockEntry + col , 0);
224 for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
226 const size_type nEntry = m_tensor.num_entry(iy);
227 const size_type iEntryBeg = m_tensor.entry_begin(iy);
228 const size_type iEntryEnd = iEntryBeg + nEntry;
229 size_type iEntry = iEntryBeg;
231 output_scalar ytmp = 0 ;
234 const size_type nBlock = nEntry / tensor_type::vectorsize;
235 const size_type nEntryB = nBlock * tensor_type::vectorsize;
236 const size_type iEnd = iEntryBeg + nEntryB;
238 typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
239 typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
240 typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
243 for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
244 const size_type *
j = &m_tensor.coord(iEntry,0);
245 const size_type *k = &m_tensor.coord(iEntry,1);
246 ValTV c(&(m_tensor.value(iEntry)));
248 for ( size_type col = 0; col < block_size; ++col ) {
249 MatTV aj(sh_A[col], j), ak(sh_A[col], k);
250 VecTV xj(sh_x[col], j), xk(sh_x[col], k);
254 aj.multiply_add(ak, xj);
255 vy.multiply_add(c, aj);
262 const size_type rem = iEntryEnd-iEntry;
264 typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
265 typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
266 typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
267 const size_type *j = &m_tensor.coord(iEntry,0);
268 const size_type *k = &m_tensor.coord(iEntry,1);
269 ValTV2 c(&(m_tensor.value(iEntry)));
271 for ( size_type col = 0; col < block_size; ++col ) {
272 MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
273 VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);
277 aj.multiply_add(ak, xj);
283 y[iy] += m_a * ytmp ;
288 device.team_barrier();
304 typedef typename Kokkos::TeamPolicy< execution_space >::member_type
team_member ;
305 KOKKOS_INLINE_FUNCTION
308 const size_type iBlockRow = device.league_rank();
311 const size_type row_count = m_A_graph.row_map.extent(0)-1;
312 if (iBlockRow >= row_count)
315 const size_type num_thread = device.team_size();
316 const size_type thread_idx = device.team_rank();
317 const Kokkos::pair<size_type,size_type> work_range =
318 details::compute_work_range<output_scalar>(
319 device, m_tensor.dimension(), num_thread, thread_idx);
327 for (
size_type j = work_range.first ; j < work_range.second ; ++j )
330 for (
size_type j = work_range.first ; j < work_range.second ; ++j )
333 const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
334 const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
337 (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
343 for (
size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
345 block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
347 for (
size_type col = 0; col < block_size; ++col ) {
348 const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
349 sh_x[col] = & m_x( 0 , iBlockColumn );
350 sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
353 for (
size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
355 const size_type nEntry = m_tensor.num_entry(iy);
356 const size_type iEntryBeg = m_tensor.entry_begin(iy);
357 const size_type iEntryEnd = iEntryBeg + nEntry;
363 if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
364 const size_type nBlock = nEntry / tensor_type::vectorsize;
365 const size_type nEntryB = nBlock * tensor_type::vectorsize;
366 const size_type iEnd = iEntryBeg + nEntryB;
373 for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
374 const size_type *j = &m_tensor.coord(iEntry,0);
375 const size_type *k = &m_tensor.coord(iEntry,1);
376 ValTV c(&(m_tensor.value(iEntry)));
378 for (
size_type col = 0; col < block_size; ++col ) {
379 MatTV aj(sh_A[col], j), ak(sh_A[col], k);
380 VecTV xj(sh_x[col], j), xk(sh_x[col], k);
384 aj.multiply_add(ak, xj);
385 vy.multiply_add(c, aj);
392 for ( ; iEntry<iEntryEnd; ++iEntry) {
393 const size_type j = m_tensor.coord(iEntry,0);
394 const size_type k = m_tensor.coord(iEntry,1);
397 for (
size_type col = 0; col < block_size; ++col ) {
398 ytmp += cijk * ( sh_A[col][
j] * sh_x[col][k] +
399 sh_A[col][k] * sh_x[col][
j] );
404 y[iy] += m_a * ytmp ;
409 device.team_barrier();
427 const bool use_block_algorithm =
true;
429 const bool use_block_algorithm =
false;
432 const size_t row_count = A.graph.row_map.extent(0) - 1 ;
433 if (use_block_algorithm) {
435 const size_t team_size = 4;
437 const size_t team_size = 2;
439 const size_t league_size = row_count;
440 Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
441 Kokkos::parallel_for( config ,
Multiply(A,x,y,a,b) );
444 Kokkos::parallel_for( row_count ,
Multiply(A,x,y,a,b) );
453 template <
typename MatrixDevice,
454 typename MatrixStorage,
455 typename MatrixOrdinal,
456 typename MatrixMemory,
458 typename InputStorage,
460 typename OutputStorage,
461 typename ... OutputP>
467 Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
469 Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
480 typedef KokkosSparse::CrsMatrix<
const MatrixValue,
518 : m_A_values( A.values )
519 , m_A_graph( A.graph )
522 , m_tensor( Kokkos::
cijk(A.values) )
534 KOKKOS_INLINE_FUNCTION
537 const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
538 const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
544 for (
size_type col=0; col<num_col; ++col)
545 for (
size_type j = 0 ; j < m_tensor.dimension() ; ++
j )
546 m_y(j, iBlockRow, col) = 0 ;
548 for (
size_type col=0; col<num_col; ++col)
549 for (
size_type j = 0 ; j < m_tensor.dimension() ; ++
j )
550 m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
556 for (
size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
558 const size_type iBlockCol = m_A_graph.entries(iEntry);
560 for (
size_type col=0; col<num_col; ++col) {
562 const input_scalar *
const x = &m_x( 0, iBlockCol, col );
580 typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
582 KOKKOS_INLINE_FUNCTION
583 void operator()(
const team_member & device )
const
585 const size_type iBlockRow = device.league_rank();
588 const size_type row_count = m_A_graph.row_map.extent(0)-1;
589 if (iBlockRow >= row_count)
592 const size_type num_thread = device.team_size();
593 const size_type thread_idx = device.team_rank();
594 const Kokkos::pair<size_type,size_type> work_range =
595 details::compute_work_range<output_scalar>(
596 device, m_tensor.dimension(), num_thread, thread_idx);
598 const size_type num_col = m_y.extent(2);
601 if ( m_b == output_scalar(0) )
602 for (size_type col=0; col<num_col; ++col)
603 for ( size_type j = work_range.first ; j < work_range.second ; ++j )
604 m_y(j, iBlockRow, col) = 0 ;
606 for (size_type col=0; col<num_col; ++col)
607 for ( size_type j = work_range.first ; j < work_range.second ; ++j )
608 m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
610 const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
611 const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
612 const size_type BlockSize = 9;
613 const size_type numBlock =
614 (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
616 const matrix_scalar* sh_A[BlockSize];
617 const input_scalar* sh_x[BlockSize];
619 size_type iBlockEntry = iBlockEntryBeg;
620 for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
621 const size_type block_size =
622 block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
625 for (size_type vec_col=0; vec_col<num_col; ++vec_col) {
627 output_scalar *
const y = & m_y( 0 , iBlockRow , vec_col );
629 for ( size_type col = 0; col < block_size; ++col ) {
630 const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
631 sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
632 sh_A[col] = & m_A_values( iBlockEntry + col , 0);
635 for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ){
637 const size_type nEntry = m_tensor.num_entry(iy);
638 const size_type iEntryBeg = m_tensor.entry_begin(iy);
639 const size_type iEntryEnd = iEntryBeg + nEntry;
640 size_type iEntry = iEntryBeg;
642 output_scalar ytmp = 0 ;
645 const size_type nBlock = nEntry / tensor_type::vectorsize;
646 const size_type nEntryB = nBlock * tensor_type::vectorsize;
647 const size_type iEnd = iEntryBeg + nEntryB;
649 typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
650 typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
651 typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
654 for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
655 const size_type *j = &m_tensor.coord(iEntry,0);
656 const size_type *k = &m_tensor.coord(iEntry,1);
657 ValTV c(&(m_tensor.value(iEntry)));
659 for ( size_type col = 0; col < block_size; ++col ) {
660 MatTV aj(sh_A[col], j), ak(sh_A[col], k);
661 VecTV xj(sh_x[col], j), xk(sh_x[col], k);
665 aj.multiply_add(ak, xj);
666 vy.multiply_add(c, aj);
673 const size_type rem = iEntryEnd-iEntry;
675 typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
676 typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
677 typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
678 const size_type *j = &m_tensor.coord(iEntry,0);
679 const size_type *k = &m_tensor.coord(iEntry,1);
680 ValTV2 c(&(m_tensor.value(iEntry)));
682 for ( size_type col = 0; col < block_size; ++col ) {
683 MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
684 VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);
688 aj.multiply_add(ak, xj);
694 y[iy] += m_a * ytmp ;
701 device.team_barrier();
718 typedef typename Kokkos::TeamPolicy< execution_space >::member_type
team_member ;
720 KOKKOS_INLINE_FUNCTION
723 const size_type iBlockRow = device.league_rank();
726 const size_type row_count = m_A_graph.row_map.extent(0)-1;
727 if (iBlockRow >= row_count)
730 const size_type num_thread = device.team_size();
731 const size_type thread_idx = device.team_rank();
732 const Kokkos::pair<size_type,size_type> work_range =
733 details::compute_work_range<output_scalar>(
734 device, m_tensor.dimension(), num_thread, thread_idx);
740 for (
size_type col=0; col<num_col; ++col)
741 for (
size_type j = work_range.first ; j < work_range.second ; ++j )
742 m_y(j, iBlockRow, col) = 0 ;
744 for (
size_type col=0; col<num_col; ++col)
745 for (
size_type j = work_range.first ; j < work_range.second ; ++j )
746 m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
748 const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
749 const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
752 (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
758 for (
size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
760 block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
763 for (
size_type vec_col=0; vec_col<num_col; ++vec_col) {
767 for (
size_type col = 0; col < block_size; ++col ) {
768 const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
769 sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
770 sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
773 for (
size_type iy = work_range.first ; iy < work_range.second ; ++iy ){
775 const size_type nEntry = m_tensor.num_entry(iy);
776 const size_type iEntryBeg = m_tensor.entry_begin(iy);
777 const size_type iEntryEnd = iEntryBeg + nEntry;
783 if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize){
784 const size_type nBlock = nEntry / tensor_type::vectorsize;
785 const size_type nEntryB = nBlock * tensor_type::vectorsize;
786 const size_type iEnd = iEntryBeg + nEntryB;
793 for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
794 const size_type *j = &m_tensor.coord(iEntry,0);
795 const size_type *k = &m_tensor.coord(iEntry,1);
796 ValTV c(&(m_tensor.value(iEntry)));
798 for (
size_type col = 0; col < block_size; ++col ) {
799 MatTV aj(sh_A[col], j), ak(sh_A[col], k);
800 VecTV xj(sh_x[col], j), xk(sh_x[col], k);
804 aj.multiply_add(ak, xj);
805 vy.multiply_add(c, aj);
812 for ( ; iEntry<iEntryEnd; ++iEntry) {
813 const size_type j = m_tensor.coord(iEntry,0);
814 const size_type k = m_tensor.coord(iEntry,1);
817 for (
size_type col = 0; col < block_size; ++col ) {
818 ytmp += cijk * ( sh_A[col][
j] * sh_x[col][k] +
819 sh_A[col][k] * sh_x[col][
j] );
824 y[iy] += m_a * ytmp ;
831 device.team_barrier();
849 const bool use_block_algorithm =
true;
851 const bool use_block_algorithm =
false;
854 const size_t row_count = A.graph.row_map.extent(0) - 1 ;
855 if (use_block_algorithm) {
857 const size_t team_size = 4;
859 const size_t team_size = 2;
861 const size_t league_size = row_count;
862 Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
863 Kokkos::parallel_for( config ,
Multiply(A,x,y,a,b) );
866 Kokkos::parallel_for( row_count ,
Multiply(A,x,y,a,b) );
875 template <
typename MatrixDevice,
876 typename MatrixStorage,
877 typename MatrixOrdinal,
878 typename MatrixMemory,
880 typename InputStorage,
882 typename OutputStorage,
883 typename ... OutputP>
889 Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
891 Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
931 template <
typename MatrixDevice,
932 typename MatrixStorage,
933 typename MatrixOrdinal,
934 typename MatrixMemory,
936 typename InputStorage,
938 typename OutputStorage,
939 typename ... OutputP>
945 Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
947 Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
983 template <
typename MatrixType,
typename InputViewType,
typename OutputViewType>
990 template <
typename MatrixDevice,
991 typename MatrixStorage,
992 typename MatrixOrdinal,
993 typename MatrixMemory,
995 typename InputStorage,
997 typename OutputStorage,
998 typename ... OutputP>
1004 Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
1006 Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
1015 typedef KokkosSparse::CrsMatrix<
const MatrixValue,
1032 template <
int BlockSize>
1033 struct BlockKernel {
1055 : m_A_values( A.values )
1056 , m_A_graph( A.graph )
1062 , numBlocks( dim / BlockSize )
1063 , rem( dim % BlockSize )
1064 , dim_block( numBlocks*BlockSize )
1067 KOKKOS_INLINE_FUNCTION
1070 #if defined(__INTEL_COMPILER)&& ! defined(__CUDA_ARCH__)
1071 output_scalar s[BlockSize] __attribute__((aligned(64))) = {};
1076 const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
1077 const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
1079 for (; pce_block < dim_block; pce_block+=BlockSize) {
1082 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1086 for (
size_type k = 0; k < BlockSize; ++k)
1089 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1093 for (
size_type k = 0; k < BlockSize; ++k)
1095 for (
size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
1097 const size_type col = m_A_graph.entries(iEntry);
1099 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1103 for (
size_type k = 0; k < BlockSize; ++k)
1106 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1110 for (
size_type k = 0; k < BlockSize; ++k) {
1119 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1126 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1132 for (
size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
1134 const size_type col = m_A_graph.entries(iEntry);
1136 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1143 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1175 : m_A_values( A.values )
1176 , m_A_graph( A.graph )
1184 KOKKOS_INLINE_FUNCTION
1187 const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
1188 const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
1191 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1198 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1204 for (
size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
1206 const size_type col = m_A_graph.entries(iEntry);
1208 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1225 const size_t row_count = A.graph.row_map.extent(0) - 1 ;
1229 #if defined (__MIC__)
1231 Kokkos::parallel_for( row_count , Kernel(A,x,y,a,b) );
1233 Kokkos::parallel_for( row_count , BlockKernel<64>(A,x,y,a,b) );
1235 Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
1237 Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
1239 Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
1241 Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
1244 Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
1246 Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
1248 Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
1250 Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
1259 template <
typename MatrixDevice,
1260 typename MatrixStorage,
1261 typename MatrixOrdinal,
1262 typename MatrixMemory,
1263 typename MatrixSize,
1264 typename InputStorage,
1265 typename ... InputP,
1266 typename OutputStorage,
1267 typename ... OutputP>
1273 Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
1275 Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
1284 typedef KokkosSparse::CrsMatrix<
const MatrixValue,
1306 InputP... > input_vector_1d_type;
1308 OutputP... > output_vector_1d_type;
1310 output_vector_1d_type> MeanMultiply1D;
1313 input_vector_1d_type x_col =
1314 Kokkos::subview(x, Kokkos::ALL(), i);
1315 output_vector_1d_type y_col =
1316 Kokkos::subview(y, Kokkos::ALL(), i);
1317 MeanMultiply1D::apply( A, x_col, y_col, a, b );
1326 template <
typename MatrixDevice,
1327 typename MatrixStorage,
1328 typename MatrixOrdinal,
1329 typename MatrixMemory,
1330 typename MatrixSize,
1331 typename InputStorage,
1332 typename ... InputP,
1333 typename OutputStorage,
1334 typename ... OutputP>
1340 Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
1342 Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
1382 template <
typename MatrixDevice,
1383 typename MatrixStorage,
1384 typename MatrixOrdinal,
1385 typename MatrixMemory,
1386 typename MatrixSize,
1387 typename InputStorage,
1388 typename ... InputP,
1389 typename OutputStorage,
1390 typename ... OutputP>
1396 Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
1398 Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
1436 namespace KokkosSparse {
1439 #if KOKKOSKERNELS_VERSION >= 40199
1440 typename ExecutionSpace,
1442 #if KOKKOSKERNELS_VERSION >= 40299
1447 typename MatrixType,
1449 typename ... InputP,
1450 typename OutputType,
1451 typename ... OutputP>
1452 typename std::enable_if<
1455 #if KOKKOSKERNELS_VERSION >= 40299
1456 && KokkosSparse::is_crs_matrix_v<MatrixType>
1457 && (Kokkos::View< OutputType, OutputP... >::rank() == 1)
1461 #
if KOKKOSKERNELS_VERSION >= 40199
1462 const ExecutionSpace& space,
1464 #
if KOKKOSKERNELS_VERSION < 40299
1465 KokkosKernels::Experimental::Controls,
1471 const MatrixType& A,
1472 const Kokkos::View< InputType, InputP... >& x,
1474 const Kokkos::View< OutputType, OutputP... >& y
1475 #
if KOKKOSKERNELS_VERSION < 40299
1480 typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
1481 typedef Kokkos::View< InputType, InputP... > InputVectorType;
1483 OutputVectorType> multiply_type;
1485 OutputVectorType> mean_multiply_type;
1487 #if KOKKOSKERNELS_VERSION >= 40199
1488 if(space != ExecutionSpace()) {
1490 "Stokhos spmv not implemented for non-default execution space instance");
1495 "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
1500 "Stokhos spmv not implemented for non-constant a or b");
1503 mean_multiply_type::apply( A, x, y,
1504 Sacado::Value<AlphaType>::eval(a),
1505 Sacado::Value<BetaType>::eval(b) );
1508 multiply_type::apply( A, x, y,
1509 Sacado::Value<AlphaType>::eval(a),
1510 Sacado::Value<BetaType>::eval(b) );
1514 #if KOKKOSKERNELS_VERSION >= 40199
1515 typename ExecutionSpace,
1517 #if KOKKOSKERNELS_VERSION >= 40299
1522 typename MatrixType,
1524 typename ... InputP,
1525 typename OutputType,
1526 typename ... OutputP>
1527 typename std::enable_if<
1530 #if KOKKOSKERNELS_VERSION >= 40299
1531 && KokkosSparse::is_crs_matrix_v<MatrixType>
1532 && (Kokkos::View< OutputType, OutputP... >::rank() == 2)
1536 #
if KOKKOSKERNELS_VERSION >= 40199
1537 const ExecutionSpace& space,
1539 #
if KOKKOSKERNELS_VERSION < 40299
1540 KokkosKernels::Experimental::Controls,
1546 const MatrixType& A,
1547 const Kokkos::View< InputType, InputP... >& x,
1549 const Kokkos::View< OutputType, OutputP... >& y
1550 #
if KOKKOSKERNELS_VERSION < 40299
1555 #if KOKKOSKERNELS_VERSION >= 40199
1556 if(space != ExecutionSpace()) {
1558 "Stokhos spmv not implemented for non-default execution space instance");
1563 "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
1565 if (y.extent(1) == 1) {
1566 auto y_1D = subview(y, Kokkos::ALL(), 0);
1567 auto x_1D = subview(x, Kokkos::ALL(), 0);
1568 #if KOKKOSKERNELS_VERSION >= 40299
1569 spmv(space, handle, mode, a, A, x_1D, b, y_1D);
1570 #elif (KOKKOSKERNELS_VERSION < 40299) && (KOKKOSKERNELS_VERSION >= 40199)
1571 spmv(space, KokkosKernels::Experimental::Controls(), mode, a, A, x_1D, b, y_1D, RANK_ONE());
1573 spmv(KokkosKernels::Experimental::Controls(), mode, a, A, x_1D, b, y_1D, RANK_ONE());
1577 typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
1578 typedef Kokkos::View< InputType, InputP... > InputVectorType;
1581 OutputVectorType> multiply_type;
1583 OutputVectorType> mean_multiply_type;
1587 "Stokhos spmv not implemented for non-constant a or b");
1590 typename InputVectorType::const_type x_const = x;
1593 mean_multiply_type::apply( A, x_const, y,
1594 Sacado::Value<AlphaType>::eval(a),
1595 Sacado::Value<BetaType>::eval(b));
1598 multiply_type::apply( A, x_const, y,
1599 Sacado::Value<AlphaType>::eval(a),
1600 Sacado::Value<BetaType>::eval(b));
KokkosSparse::CrsMatrix< const MatrixValue, MatrixOrdinal, MatrixDevice, MatrixMemory, MatrixSize > matrix_type
const input_array_type v_x
Kokkos::FlatArrayType< matrix_values_type >::type matrix_array_type
Impl::RemoveRandomAccess< typename input_vector_type::array_type >::type input_array_type
Impl::RemoveRandomAccess< typename output_vector_type::array_type >::type output_array_type
static void apply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a=input_scalar(1), const output_scalar &b=output_scalar(0))
Kokkos::View< const InputVectorValue *, InputP... > input_vector_type
Multiply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a, const output_scalar &b)
static void apply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a=input_scalar(1), const output_scalar &b=output_scalar(0))
Sacado::UQ::PCE< InputStorage > InputVectorValue
KokkosSparse::CrsMatrix< MatrixValue, MatrixOrdinal, MatrixDevice, MatrixMemory, MatrixSize > matrix_type
InputVectorValue::value_type input_scalar
OutputVectorValue::value_type output_scalar
const matrix_graph_type m_A_graph
Kokkos::View< OutputVectorValue *, OutputP... > output_vector_type
tensor_type::value_type tensor_scalar
Sacado::UQ::PCE< MatrixStorage > MatrixValue
Sacado::UQ::PCE< OutputStorage > OutputVectorValue
Sacado::UQ::PCE< OutputStorage > OutputVectorValue
OutputVectorValue::value_type output_scalar
const output_array_type v_y
MatrixValue::ordinal_type size_type
Sacado::UQ::PCE< MatrixStorage > MatrixValue
Kokkos::View< OutputVectorValue **, OutputP... > output_vector_type
Kokkos::DefaultExecutionSpace execution_space
const matrix_array_type m_A_values
Kokkos::View< OutputVectorValue **, OutputP... > output_vector_type
InputVectorValue::value_type input_scalar
const input_array_type v_x
Sacado::UQ::PCE< MatrixStorage > MatrixValue
OutputVectorValue::value_type output_scalar
Sacado::UQ::PCE< InputStorage > InputVectorValue
Impl::RemoveRandomAccess< typename input_vector_type::array_type >::type input_array_type
Sacado::UQ::PCE< MatrixStorage > MatrixValue
const input_array_type m_x
const size_type dim_block
const matrix_graph_type m_A_graph
Kokkos::View< const InputVectorValue *, InputP... > input_vector_type
Kokkos::View< const InputVectorValue **, InputP... > input_vector_type
matrix_type::const_type const_matrix_type
const tensor_type m_tensor
InputVectorValue::value_type input_scalar
Kokkos::View< OutputVectorValue *, OutputP... > output_vector_type
KokkosSparse::CrsMatrix< MatrixValue, MatrixOrdinal, MatrixDevice, MatrixMemory, MatrixSize > matrix_type
tensor_type::size_type size_type
Kokkos::CijkType< matrix_values_type >::type tensor_type
Kokkos::View< const InputVectorValue *, InputP... > input_vector_type
Kokkos::View< typename ViewType::data_type, typename ViewType::array_layout, typename ViewType::device_type, Kokkos::MemoryTraits< M1 > > type
MatrixValue::value_type matrix_scalar
KokkosSparse::CrsMatrix< MatrixValue, MatrixOrdinal, MatrixDevice, MatrixMemory, MatrixSize > matrix_type
const matrix_array_type m_A_values
KOKKOS_INLINE_FUNCTION void raise_error(const char *msg)
Sacado::UQ::PCE< InputStorage > InputVectorValue
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< View< T, P...> >::value, unsigned >::type dimension_scalar(const View< T, P...> &view)
KOKKOS_INLINE_FUNCTION void operator()(const size_type iBlockRow) const
Sacado::UQ::PCE< OutputStorage > OutputVectorValue
matrix_type::const_type const_matrix_type
const matrix_array_type m_A_values
const matrix_graph_type m_A_graph
Kokkos::View< const InputVectorValue *, InputP... > input_vector_type
MatrixDevice::execution_space execution_space
matrix_type::values_type matrix_values_type
matrix_type::StaticCrsGraphType matrix_graph_type
OutputVectorValue::value_type output_scalar
matrix_type::values_type matrix_values_type
const matrix_graph_type m_A_graph
MatrixDevice::execution_space execution_space
static void apply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a=input_scalar(1), const output_scalar &b=output_scalar(0))
MatrixDevice::execution_space execution_space
Kokkos::View< OutputVectorValue *, OutputP... > output_vector_type
matrix_type::const_type const_matrix_type
KokkosSparse::CrsMatrix< const MatrixValue, MatrixOrdinal, MatrixDevice, MatrixMemory, MatrixSize > matrix_type
matrix_type::values_type matrix_values_type
matrix_values_type::array_type matrix_array_type
Kokkos::View< const InputVectorValue **, InputP... > input_vector_type
Impl::RemoveRandomAccess< typename input_vector_type::array_type >::type input_array_type
Kokkos::FlatArrayType< matrix_values_type >::type matrix_array_type
Multiply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a, const output_scalar &b)
Kokkos::View< const InputVectorValue **, InputP... > input_vector_type
Sacado::UQ::PCE< InputStorage > InputVectorValue
tensor_type::size_type size_type
KOKKOS_INLINE_FUNCTION void operator()(const size_type iBlockRow) const
Kokkos::View< OutputVectorValue *, OutputP... > output_vector_type
KOKKOS_INLINE_FUNCTION void operator()(const team_member &device) const
KOKKOS_INLINE_FUNCTION void operator()(const size_type iBlockRow) const
static void apply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a=input_scalar(1), const output_scalar &b=output_scalar(0))
Kokkos::View< const InputVectorValue **, InputP... > input_vector_type
Impl::RemoveRandomAccess< typename output_vector_type::array_type >::type output_array_type
InputVectorValue::value_type input_scalar
matrix_values_type::array_type matrix_array_type
KOKKOS_INLINE_FUNCTION bool is_constant(const T &x)
const output_array_type v_y
MatrixValue::ordinal_type size_type
Sacado::UQ::PCE< MatrixStorage > MatrixValue
static void apply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a=input_scalar(1), const output_scalar &b=output_scalar(0))
Impl::RemoveRandomAccess< typename output_vector_type::array_type >::type output_array_type
const matrix_array_type m_A_values
static void apply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a=input_scalar(1), const output_scalar &b=output_scalar(0))
matrix_type::StaticCrsGraphType matrix_graph_type
InputVectorValue::value_type input_scalar
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< view_type >::value, typename CijkType< view_type >::type >::type cijk(const view_type &view)
Kokkos::TeamPolicy< execution_space >::member_type team_member
const size_type numBlocks
Sacado::UQ::PCE< MatrixStorage > MatrixValue
Sacado::UQ::PCE< InputStorage > InputVectorValue
KokkosSparse::CrsMatrix< const MatrixValue, MatrixOrdinal, MatrixDevice, MatrixMemory, MatrixSize > matrix_type
Sacado::UQ::PCE< OutputStorage > OutputVectorValue
KOKKOS_INLINE_FUNCTION void operator()(const size_type iBlockRow) const
MatrixDevice::execution_space execution_space
KokkosSparse::CrsMatrix< const MatrixValue, MatrixOrdinal, MatrixDevice, MatrixMemory, MatrixSize > matrix_type
Sacado::UQ::PCE< OutputStorage > OutputVectorValue
static void apply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a=input_scalar(1), const output_scalar &b=output_scalar(0))
Sacado::UQ::PCE< OutputStorage > OutputVectorValue
Impl::RemoveRandomAccess< typename input_vector_type::array_type >::type input_array_type
InputVectorValue::value_type input_scalar
std::enable_if< Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value >::type spmv(KokkosKernels::Experimental::Controls, const char mode[], const AlphaType &a, const MatrixType &A, const Kokkos::View< InputType, InputP... > &x, const BetaType &b, const Kokkos::View< OutputType, OutputP... > &y, const RANK_ONE)
KokkosSparse::CrsMatrix< MatrixValue, MatrixOrdinal, MatrixDevice, MatrixMemory, MatrixSize > matrix_type
const input_array_type m_x
const tensor_type m_tensor
Sacado::UQ::PCE< MatrixStorage > MatrixValue
OutputVectorValue::value_type output_scalar
const output_array_type m_y
matrix_type::StaticCrsGraphType matrix_graph_type
Sacado::UQ::PCE< InputStorage > InputVectorValue
tensor_type::value_type tensor_scalar
MatrixValue::value_type matrix_scalar
MatrixDevice::execution_space execution_space
Kokkos::View< OutputVectorValue **, OutputP... > output_vector_type
Kokkos::CijkType< matrix_values_type >::type tensor_type
Sacado::UQ::PCE< OutputStorage > OutputVectorValue
matrix_type::const_type const_matrix_type
static void apply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a=input_scalar(1), const output_scalar &b=output_scalar(0))
KOKKOS_INLINE_FUNCTION void zero()
InputVectorValue::value_type input_scalar
MatrixValue::value_type matrix_scalar
InputVectorValue::value_type input_scalar
OutputVectorValue::value_type output_scalar
const output_array_type m_y
Kokkos::View< OutputVectorValue **, OutputP... > output_vector_type
Kokkos::TeamPolicy< execution_space >::member_type team_member
OutputVectorValue::value_type output_scalar
Sacado::UQ::PCE< InputStorage > InputVectorValue
Sacado::UQ::PCE< MatrixStorage > MatrixValue
Sacado::UQ::PCE< OutputStorage > OutputVectorValue
OutputVectorValue::value_type output_scalar
Impl::RemoveRandomAccess< typename output_vector_type::array_type >::type output_array_type
Sacado::UQ::PCE< InputStorage > InputVectorValue
KOKKOS_INLINE_FUNCTION void operator()(const team_member &device) const
BlockKernel(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a, const output_scalar &b)
Kernel(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a, const output_scalar &b)