42 #ifndef KOKKOS_CRSMATRIX_UQ_PCE_HPP
43 #define KOKKOS_CRSMATRIX_UQ_PCE_HPP
48 #include "KokkosSparse_CrsMatrix.hpp"
49 #include "KokkosSparse_spmv.hpp"
66 template <
typename MatrixDevice,
67 typename MatrixStorage,
68 typename MatrixOrdinal,
69 typename MatrixMemory,
71 typename InputStorage,
73 typename OutputStorage,
80 Kokkos::View< Sacado::UQ::PCE<InputStorage>*,
82 Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
131 : m_A_values( A.values )
132 , m_A_graph( A.graph )
135 , m_tensor( Kokkos::
cijk(A.values) )
147 KOKKOS_INLINE_FUNCTION
154 const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
155 const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
159 for (
size_type j = 0 ;
j < m_tensor.dimension() ; ++
j )
162 for (
size_type j = 0 ;
j < m_tensor.dimension() ; ++
j )
165 for (
size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
166 const input_scalar *
const x = & m_x( 0 , m_A_graph.entries(iEntry) );
170 m_tensor , A , x , y , m_a );
185 typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
186 KOKKOS_INLINE_FUNCTION
187 void operator()(
const team_member &
device )
const
189 const size_type iBlockRow = device.league_rank();
192 const size_type row_count = m_A_graph.row_map.extent(0)-1;
193 if (iBlockRow >= row_count)
196 const size_type num_thread = device.team_size();
197 const size_type thread_idx = device.team_rank();
198 const Kokkos::pair<size_type,size_type> work_range =
199 details::compute_work_range<output_scalar>(
200 device, m_tensor.dimension(), num_thread, thread_idx);
204 output_scalar *
const y = & m_y(0,iBlockRow);
207 if ( m_b == output_scalar(0) )
208 for ( size_type
j = work_range.first ;
j < work_range.second ; ++
j )
211 for ( size_type
j = work_range.first ;
j < work_range.second ; ++
j )
214 const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
215 const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
216 const size_type BlockSize = 9;
217 const size_type numBlock =
218 (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
220 const matrix_scalar* sh_A[BlockSize];
221 const input_scalar* sh_x[BlockSize];
223 size_type iBlockEntry = iBlockEntryBeg;
224 for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
225 const size_type block_size =
226 block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
228 for ( size_type col = 0; col < block_size; ++col ) {
229 const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
230 sh_x[col] = & m_x( 0 , iBlockColumn );
231 sh_A[col] = & m_A_values( iBlockEntry + col , 0);
234 for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
236 const size_type nEntry = m_tensor.num_entry(iy);
237 const size_type iEntryBeg = m_tensor.entry_begin(iy);
238 const size_type iEntryEnd = iEntryBeg + nEntry;
239 size_type iEntry = iEntryBeg;
241 output_scalar ytmp = 0 ;
244 const size_type nBlock = nEntry / tensor_type::vectorsize;
245 const size_type nEntryB = nBlock * tensor_type::vectorsize;
246 const size_type iEnd = iEntryBeg + nEntryB;
248 typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
249 typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
250 typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
253 for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
254 const size_type *
j = &m_tensor.coord(iEntry,0);
255 const size_type *k = &m_tensor.coord(iEntry,1);
256 ValTV c(&(m_tensor.value(iEntry)));
258 for ( size_type col = 0; col < block_size; ++col ) {
259 MatTV aj(sh_A[col], j), ak(sh_A[col], k);
260 VecTV xj(sh_x[col], j), xk(sh_x[col], k);
264 aj.multiply_add(ak, xj);
265 vy.multiply_add(c, aj);
272 const size_type rem = iEntryEnd-iEntry;
274 typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
275 typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
276 typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
277 const size_type *j = &m_tensor.coord(iEntry,0);
278 const size_type *k = &m_tensor.coord(iEntry,1);
279 ValTV2 c(&(m_tensor.value(iEntry)));
281 for ( size_type col = 0; col < block_size; ++col ) {
282 MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
283 VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);
287 aj.multiply_add(ak, xj);
293 y[iy] += m_a * ytmp ;
298 device.team_barrier();
314 typedef typename Kokkos::TeamPolicy< execution_space >::member_type
team_member ;
315 KOKKOS_INLINE_FUNCTION
318 const size_type iBlockRow = device.league_rank();
321 const size_type row_count = m_A_graph.row_map.extent(0)-1;
322 if (iBlockRow >= row_count)
325 const size_type num_thread = device.team_size();
326 const size_type thread_idx = device.team_rank();
327 const Kokkos::pair<size_type,size_type> work_range =
328 details::compute_work_range<output_scalar>(
329 device, m_tensor.dimension(), num_thread, thread_idx);
337 for (
size_type j = work_range.first ; j < work_range.second ; ++j )
340 for (
size_type j = work_range.first ; j < work_range.second ; ++j )
343 const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
344 const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
347 (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
353 for (
size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
355 block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
357 for (
size_type col = 0; col < block_size; ++col ) {
358 const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
359 sh_x[col] = & m_x( 0 , iBlockColumn );
360 sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
363 for (
size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
365 const size_type nEntry = m_tensor.num_entry(iy);
366 const size_type iEntryBeg = m_tensor.entry_begin(iy);
367 const size_type iEntryEnd = iEntryBeg + nEntry;
373 if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
374 const size_type nBlock = nEntry / tensor_type::vectorsize;
375 const size_type nEntryB = nBlock * tensor_type::vectorsize;
376 const size_type iEnd = iEntryBeg + nEntryB;
383 for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
384 const size_type *j = &m_tensor.coord(iEntry,0);
385 const size_type *k = &m_tensor.coord(iEntry,1);
386 ValTV c(&(m_tensor.value(iEntry)));
388 for (
size_type col = 0; col < block_size; ++col ) {
389 MatTV aj(sh_A[col], j), ak(sh_A[col], k);
390 VecTV xj(sh_x[col], j), xk(sh_x[col], k);
394 aj.multiply_add(ak, xj);
395 vy.multiply_add(c, aj);
402 for ( ; iEntry<iEntryEnd; ++iEntry) {
403 const size_type j = m_tensor.coord(iEntry,0);
404 const size_type k = m_tensor.coord(iEntry,1);
407 for (
size_type col = 0; col < block_size; ++col ) {
408 ytmp += cijk * ( sh_A[col][
j] * sh_x[col][k] +
409 sh_A[col][k] * sh_x[col][
j] );
414 y[iy] += m_a * ytmp ;
419 device.team_barrier();
437 const bool use_block_algorithm =
true;
439 const bool use_block_algorithm =
false;
442 const size_t row_count = A.graph.row_map.extent(0) - 1 ;
443 if (use_block_algorithm) {
445 const size_t team_size = 4;
447 const size_t team_size = 2;
449 const size_t league_size = row_count;
450 Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
451 Kokkos::parallel_for( config ,
Multiply(A,x,y,a,b) );
454 Kokkos::parallel_for( row_count ,
Multiply(A,x,y,a,b) );
463 template <
typename MatrixDevice,
464 typename MatrixStorage,
465 typename MatrixOrdinal,
466 typename MatrixMemory,
468 typename InputStorage,
470 typename OutputStorage,
471 typename ... OutputP>
477 Kokkos::View< Sacado::UQ::PCE<InputStorage>**,
479 Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
528 : m_A_values( A.values )
529 , m_A_graph( A.graph )
532 , m_tensor( Kokkos::
cijk(A.values) )
544 KOKKOS_INLINE_FUNCTION
547 const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
548 const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
554 for (
size_type col=0; col<num_col; ++col)
555 for (
size_type j = 0 ; j < m_tensor.dimension() ; ++
j )
556 m_y(j, iBlockRow, col) = 0 ;
558 for (
size_type col=0; col<num_col; ++col)
559 for (
size_type j = 0 ; j < m_tensor.dimension() ; ++
j )
560 m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
566 for (
size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
568 const size_type iBlockCol = m_A_graph.entries(iEntry);
570 for (
size_type col=0; col<num_col; ++col) {
572 const input_scalar *
const x = &m_x( 0, iBlockCol, col );
590 typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
592 KOKKOS_INLINE_FUNCTION
593 void operator()(
const team_member & device )
const
595 const size_type iBlockRow = device.league_rank();
598 const size_type row_count = m_A_graph.row_map.extent(0)-1;
599 if (iBlockRow >= row_count)
602 const size_type num_thread = device.team_size();
603 const size_type thread_idx = device.team_rank();
604 const Kokkos::pair<size_type,size_type> work_range =
605 details::compute_work_range<output_scalar>(
606 device, m_tensor.dimension(), num_thread, thread_idx);
608 const size_type num_col = m_y.extent(2);
611 if ( m_b == output_scalar(0) )
612 for (size_type col=0; col<num_col; ++col)
613 for ( size_type j = work_range.first ; j < work_range.second ; ++j )
614 m_y(j, iBlockRow, col) = 0 ;
616 for (size_type col=0; col<num_col; ++col)
617 for ( size_type j = work_range.first ; j < work_range.second ; ++j )
618 m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
620 const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
621 const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
622 const size_type BlockSize = 9;
623 const size_type numBlock =
624 (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
626 const matrix_scalar* sh_A[BlockSize];
627 const input_scalar* sh_x[BlockSize];
629 size_type iBlockEntry = iBlockEntryBeg;
630 for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
631 const size_type block_size =
632 block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
635 for (size_type vec_col=0; vec_col<num_col; ++vec_col) {
637 output_scalar *
const y = & m_y( 0 , iBlockRow , vec_col );
639 for ( size_type col = 0; col < block_size; ++col ) {
640 const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
641 sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
642 sh_A[col] = & m_A_values( iBlockEntry + col , 0);
645 for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ){
647 const size_type nEntry = m_tensor.num_entry(iy);
648 const size_type iEntryBeg = m_tensor.entry_begin(iy);
649 const size_type iEntryEnd = iEntryBeg + nEntry;
650 size_type iEntry = iEntryBeg;
652 output_scalar ytmp = 0 ;
655 const size_type nBlock = nEntry / tensor_type::vectorsize;
656 const size_type nEntryB = nBlock * tensor_type::vectorsize;
657 const size_type iEnd = iEntryBeg + nEntryB;
659 typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
660 typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
661 typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
664 for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
665 const size_type *j = &m_tensor.coord(iEntry,0);
666 const size_type *k = &m_tensor.coord(iEntry,1);
667 ValTV c(&(m_tensor.value(iEntry)));
669 for ( size_type col = 0; col < block_size; ++col ) {
670 MatTV aj(sh_A[col], j), ak(sh_A[col], k);
671 VecTV xj(sh_x[col], j), xk(sh_x[col], k);
675 aj.multiply_add(ak, xj);
676 vy.multiply_add(c, aj);
683 const size_type rem = iEntryEnd-iEntry;
685 typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
686 typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
687 typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
688 const size_type *j = &m_tensor.coord(iEntry,0);
689 const size_type *k = &m_tensor.coord(iEntry,1);
690 ValTV2 c(&(m_tensor.value(iEntry)));
692 for ( size_type col = 0; col < block_size; ++col ) {
693 MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
694 VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);
698 aj.multiply_add(ak, xj);
704 y[iy] += m_a * ytmp ;
711 device.team_barrier();
728 typedef typename Kokkos::TeamPolicy< execution_space >::member_type
team_member ;
730 KOKKOS_INLINE_FUNCTION
733 const size_type iBlockRow = device.league_rank();
736 const size_type row_count = m_A_graph.row_map.extent(0)-1;
737 if (iBlockRow >= row_count)
740 const size_type num_thread = device.team_size();
741 const size_type thread_idx = device.team_rank();
742 const Kokkos::pair<size_type,size_type> work_range =
743 details::compute_work_range<output_scalar>(
744 device, m_tensor.dimension(), num_thread, thread_idx);
750 for (
size_type col=0; col<num_col; ++col)
751 for (
size_type j = work_range.first ; j < work_range.second ; ++j )
752 m_y(j, iBlockRow, col) = 0 ;
754 for (
size_type col=0; col<num_col; ++col)
755 for (
size_type j = work_range.first ; j < work_range.second ; ++j )
756 m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
758 const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
759 const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
762 (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
768 for (
size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
770 block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
773 for (
size_type vec_col=0; vec_col<num_col; ++vec_col) {
777 for (
size_type col = 0; col < block_size; ++col ) {
778 const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
779 sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
780 sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
783 for (
size_type iy = work_range.first ; iy < work_range.second ; ++iy ){
785 const size_type nEntry = m_tensor.num_entry(iy);
786 const size_type iEntryBeg = m_tensor.entry_begin(iy);
787 const size_type iEntryEnd = iEntryBeg + nEntry;
793 if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize){
794 const size_type nBlock = nEntry / tensor_type::vectorsize;
795 const size_type nEntryB = nBlock * tensor_type::vectorsize;
796 const size_type iEnd = iEntryBeg + nEntryB;
803 for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
804 const size_type *j = &m_tensor.coord(iEntry,0);
805 const size_type *k = &m_tensor.coord(iEntry,1);
806 ValTV c(&(m_tensor.value(iEntry)));
808 for (
size_type col = 0; col < block_size; ++col ) {
809 MatTV aj(sh_A[col], j), ak(sh_A[col], k);
810 VecTV xj(sh_x[col], j), xk(sh_x[col], k);
814 aj.multiply_add(ak, xj);
815 vy.multiply_add(c, aj);
822 for ( ; iEntry<iEntryEnd; ++iEntry) {
823 const size_type j = m_tensor.coord(iEntry,0);
824 const size_type k = m_tensor.coord(iEntry,1);
827 for (
size_type col = 0; col < block_size; ++col ) {
828 ytmp += cijk * ( sh_A[col][
j] * sh_x[col][k] +
829 sh_A[col][k] * sh_x[col][
j] );
834 y[iy] += m_a * ytmp ;
841 device.team_barrier();
859 const bool use_block_algorithm =
true;
861 const bool use_block_algorithm =
false;
864 const size_t row_count = A.graph.row_map.extent(0) - 1 ;
865 if (use_block_algorithm) {
867 const size_t team_size = 4;
869 const size_t team_size = 2;
871 const size_t league_size = row_count;
872 Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
873 Kokkos::parallel_for( config ,
Multiply(A,x,y,a,b) );
876 Kokkos::parallel_for( row_count ,
Multiply(A,x,y,a,b) );
881 template <
typename MatrixType,
typename InputViewType,
typename OutputViewType>
888 template <
typename MatrixDevice,
889 typename MatrixStorage,
890 typename MatrixOrdinal,
891 typename MatrixMemory,
893 typename InputStorage,
895 typename OutputStorage,
896 typename ... OutputP>
902 Kokkos::View< Sacado::UQ::PCE<InputStorage>*,
904 Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
930 template <
int BlockSize>
953 : m_A_values( A.values )
954 , m_A_graph( A.graph )
960 , numBlocks( dim / BlockSize )
961 , rem( dim % BlockSize )
962 , dim_block( numBlocks*BlockSize )
965 KOKKOS_INLINE_FUNCTION
968 #if defined(__INTEL_COMPILER)&& ! defined(__CUDA_ARCH__)
969 output_scalar s[BlockSize] __attribute__((aligned(64))) = {};
974 const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
975 const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
977 for (; pce_block < dim_block; pce_block+=BlockSize) {
980 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
984 for (
size_type k = 0; k < BlockSize; ++k)
987 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
991 for (
size_type k = 0; k < BlockSize; ++k)
993 for (
size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
995 const size_type col = m_A_graph.entries(iEntry);
997 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1001 for (
size_type k = 0; k < BlockSize; ++k)
1004 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1008 for (
size_type k = 0; k < BlockSize; ++k) {
1017 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1024 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1030 for (
size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
1032 const size_type col = m_A_graph.entries(iEntry);
1034 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1041 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1073 : m_A_values( A.values )
1074 , m_A_graph( A.graph )
1082 KOKKOS_INLINE_FUNCTION
1085 const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
1086 const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
1089 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1096 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1102 for (
size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
1104 const size_type col = m_A_graph.entries(iEntry);
1106 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1123 const size_t row_count = A.graph.row_map.extent(0) - 1 ;
1127 #if defined (__MIC__)
1129 Kokkos::parallel_for( row_count , Kernel(A,x,y,a,b) );
1131 Kokkos::parallel_for( row_count , BlockKernel<64>(A,x,y,a,b) );
1133 Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
1135 Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
1137 Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
1139 Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
1142 Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
1144 Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
1146 Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
1148 Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
1157 template <
typename MatrixDevice,
1158 typename MatrixStorage,
1159 typename MatrixOrdinal,
1160 typename MatrixMemory,
1161 typename MatrixSize,
1162 typename InputStorage,
1163 typename ... InputP,
1164 typename OutputStorage,
1165 typename ... OutputP>
1171 Kokkos::View< Sacado::UQ::PCE<InputStorage>**,
1173 Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
1204 InputP... > input_vector_1d_type;
1206 OutputP... > output_vector_1d_type;
1208 output_vector_1d_type> MeanMultiply1D;
1211 input_vector_1d_type x_col =
1212 Kokkos::subview(x, Kokkos::ALL(), i);
1213 output_vector_1d_type y_col =
1214 Kokkos::subview(y, Kokkos::ALL(), i);
1215 MeanMultiply1D::apply( A, x_col, y_col, a, b );
1222 namespace KokkosSparse {
1224 template <
typename AlphaType,
1226 typename MatrixType,
1228 typename ... InputP,
1229 typename OutputType,
1230 typename ... OutputP>
1231 typename std::enable_if<
1238 const MatrixType& A,
1239 const Kokkos::View< InputType, InputP... >& x,
1241 const Kokkos::View< OutputType, OutputP... >& y,
1244 typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
1245 typedef Kokkos::View< InputType, InputP... > InputVectorType;
1247 OutputVectorType> multiply_type;
1249 OutputVectorType> mean_multiply_type;
1253 "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
1258 "Stokhos spmv not implemented for non-constant a or b");
1261 mean_multiply_type::apply( A, x, y,
1262 Sacado::Value<AlphaType>::eval(a),
1263 Sacado::Value<BetaType>::eval(b) );
1266 multiply_type::apply( A, x, y,
1267 Sacado::Value<AlphaType>::eval(a),
1268 Sacado::Value<BetaType>::eval(b) );
1271 template <
typename AlphaType,
1273 typename MatrixType,
1275 typename ... InputP,
1276 typename OutputType,
1277 typename ... OutputP>
1278 typename std::enable_if<
1285 const MatrixType& A,
1286 const Kokkos::View< InputType, InputP... >& x,
1288 const Kokkos::View< OutputType, OutputP... >& y,
1293 "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
1295 if (y.extent(1) == 1) {
1296 auto y_1D = subview(y, Kokkos::ALL(), 0);
1297 auto x_1D = subview(x, Kokkos::ALL(), 0);
1298 spmv(mode, a, A, x_1D, b, y_1D, RANK_ONE());
1301 typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
1302 typedef Kokkos::View< InputType, InputP... > InputVectorType;
1304 OutputVectorType> multiply_type;
1306 OutputVectorType> mean_multiply_type;
1310 "Stokhos spmv not implemented for non-constant a or b");
1313 mean_multiply_type::apply( A, x, y,
1314 Sacado::Value<AlphaType>::eval(a),
1315 Sacado::Value<BetaType>::eval(b));
1318 multiply_type::apply( A, x, y,
1319 Sacado::Value<AlphaType>::eval(a),
1320 Sacado::Value<BetaType>::eval(b));
const matrix_array_type m_A_values
Kokkos::FlatArrayType< matrix_values_type >::type matrix_array_type
Kokkos::CijkType< matrix_values_type >::type tensor_type
input_vector_type::array_type input_array_type
MatrixDevice execution_space
InputVectorValue::value_type input_scalar
InputVectorValue::value_type input_scalar
const matrix_graph_type m_A_graph
static void apply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a=input_scalar(1), const output_scalar &b=output_scalar(0))
Kokkos::View< InputVectorValue *, InputP... > input_vector_type
Sacado::UQ::PCE< OutputStorage > OutputVectorValue
const matrix_graph_type m_A_graph
Sacado::UQ::PCE< MatrixStorage > MatrixValue
const input_array_type v_x
tensor_type::size_type size_type
const input_array_type v_x
Kokkos::View< InputVectorValue *, InputP... > input_vector_type
const output_array_type m_y
tensor_type::value_type tensor_scalar
const matrix_array_type m_A_values
matrix_type::values_type matrix_values_type
output_vector_type::array_type output_array_type
KOKKOS_INLINE_FUNCTION void operator()(const size_type iBlockRow) const
MatrixDevice execution_space
input_vector_type::array_type input_array_type
KOKKOS_INLINE_FUNCTION void operator()(const team_member &device) const
Kokkos::View< InputVectorValue **, InputP... > input_vector_type
MatrixDevice execution_space
Sacado::UQ::PCE< InputStorage > InputVectorValue
Sacado::UQ::PCE< InputStorage > InputVectorValue
KokkosSparse::CrsMatrix< MatrixValue, MatrixOrdinal, MatrixDevice, MatrixMemory, MatrixSize > matrix_type
matrix_type::values_type matrix_values_type
KOKKOS_INLINE_FUNCTION void raise_error(const char *msg)
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< View< T, P...> >::value, unsigned >::type dimension_scalar(const View< T, P...> &view)
const tensor_type m_tensor
static void apply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a=input_scalar(1), const output_scalar &b=output_scalar(0))
Kokkos::TeamPolicy< execution_space >::member_type team_member
KOKKOS_INLINE_FUNCTION void operator()(const size_type iBlockRow) const
Sacado::UQ::PCE< MatrixStorage > MatrixValue
input_vector_type::array_type input_array_type
const tensor_type m_tensor
tensor_type::value_type tensor_scalar
matrix_type::StaticCrsGraphType matrix_graph_type
const output_array_type m_y
const size_type dim_block
const matrix_array_type m_A_values
static void apply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a=input_scalar(1), const output_scalar &b=output_scalar(0))
KOKKOS_INLINE_FUNCTION void operator()(const size_type iBlockRow) const
input_vector_type::array_type input_array_type
Kokkos::View< OutputVectorValue **, OutputP... > output_vector_type
Multiply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a, const output_scalar &b)
const input_array_type m_x
Sacado::UQ::PCE< MatrixStorage > MatrixValue
KOKKOS_INLINE_FUNCTION bool is_constant(const T &x)
output_vector_type::array_type output_array_type
matrix_type::values_type matrix_values_type
Sacado::UQ::PCE< OutputStorage > OutputVectorValue
const size_type numBlocks
MatrixValue::value_type matrix_scalar
Sacado::UQ::PCE< OutputStorage > OutputVectorValue
Kokkos::CijkType< matrix_values_type >::type tensor_type
const matrix_graph_type m_A_graph
KokkosSparse::CrsMatrix< MatrixValue, MatrixOrdinal, MatrixDevice, MatrixMemory, MatrixSize > matrix_type
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< view_type >::value, typename CijkType< view_type >::type >::type cijk(const view_type &view)
InputVectorValue::value_type input_scalar
MatrixValue::ordinal_type size_type
MatrixValue::ordinal_type size_type
Sacado::UQ::PCE< OutputStorage > OutputVectorValue
KOKKOS_INLINE_FUNCTION void operator()(const size_type iBlockRow) const
matrix_values_type::array_type matrix_array_type
const matrix_array_type m_A_values
Kokkos::TeamPolicy< execution_space >::member_type team_member
MatrixValue::value_type matrix_scalar
KokkosSparse::CrsMatrix< MatrixValue, MatrixOrdinal, MatrixDevice, MatrixMemory, MatrixSize > matrix_type
Sacado::UQ::PCE< MatrixStorage > MatrixValue
matrix_type::StaticCrsGraphType matrix_graph_type
const output_array_type v_y
Kernel(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a, const output_scalar &b)
matrix_type::StaticCrsGraphType matrix_graph_type
Multiply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a, const output_scalar &b)
Kokkos::View< OutputVectorValue *, OutputP... > output_vector_type
output_vector_type::array_type output_array_type
Kokkos::FlatArrayType< matrix_values_type >::type matrix_array_type
Kokkos::View< OutputVectorValue *, OutputP... > output_vector_type
OutputVectorValue::value_type output_scalar
static void apply(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a=input_scalar(1), const output_scalar &b=output_scalar(0))
InputVectorValue::value_type input_scalar
const input_array_type m_x
KOKKOS_INLINE_FUNCTION void zero()
tensor_type::size_type size_type
OutputVectorValue::value_type output_scalar
OutputVectorValue::value_type output_scalar
KOKKOS_INLINE_FUNCTION void operator()(const team_member &device) const
output_vector_type::array_type output_array_type
Kokkos::View< OutputVectorValue **, OutputP... > output_vector_type
MatrixDevice execution_space
MatrixDevice execution_space
OutputVectorValue::value_type output_scalar
Kokkos::View< InputVectorValue **, InputP... > input_vector_type
KokkosSparse::CrsMatrix< MatrixValue, MatrixOrdinal, MatrixDevice, MatrixMemory, MatrixSize > matrix_type
BlockKernel(const matrix_type &A, const input_vector_type &x, const output_vector_type &y, const input_scalar &a, const output_scalar &b)
std::enable_if< Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value >::type spmv(const char mode[], const AlphaType &a, const MatrixType &A, const Kokkos::View< InputType, InputP... > &x, const BetaType &b, const Kokkos::View< OutputType, OutputP... > &y, const RANK_ONE)
const matrix_graph_type m_A_graph
MatrixValue::value_type matrix_scalar
const output_array_type v_y
matrix_values_type::array_type matrix_array_type
Sacado::UQ::PCE< InputStorage > InputVectorValue
Sacado::UQ::PCE< InputStorage > InputVectorValue