#ifndef KOKKOS_CRSMATRIX_UQ_PCE_HPP
#define KOKKOS_CRSMATRIX_UQ_PCE_HPP

#include "KokkosSparse_CrsMatrix.hpp"
#include "KokkosSparse_spmv.hpp"
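
// Specializations of the Stokhos Multiply and MeanMultiply kernels for
// KokkosSparse::CrsMatrix and Kokkos::View containers whose scalar type is
// Sacado::UQ::PCE (polynomial chaos expansion).  Multiply contracts each
// matrix entry with the input vector through the sparse 3-tensor (Cijk)
// attached to the matrix values; MeanMultiply covers matrices that store
// only the mean (a single PCE coefficient) per entry.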
// Kernel implementing y = a*A*x + b*y where A is a CrsMatrix of
// Sacado::UQ::PCE scalars and x, y are rank-1 views of Sacado::UQ::PCE
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal, MatrixDevice,
                                         MatrixMemory, MatrixSize >,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>*, InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>*, OutputP... >
              >
  : m_A_values( A.values )
  , m_A_graph( A.graph )
  , m_tensor( Kokkos::cijk(A.values) )
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type iBlockRow ) const
  {
    // Pointer to the PCE coefficients of y for this block row
    output_scalar * const y = & m_y( 0 , iBlockRow );

    const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
    const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];

    // y = b*y (zero it when b == 0)
    if ( m_b == output_scalar(0) )
      for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
        y[j] = 0 ;
    else
      for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
        y[j] = m_b * y[j] ;

    // Loop over the columns of A in this block row
    for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
      const matrix_scalar * const A = & m_A_values( iEntry , 0 );
      const input_scalar * const x  = & m_x( 0 , m_A_graph.entries(iEntry) );

      BlockMultiply< tensor_type >::apply(
        m_tensor , A , x , y , m_a );
    }
  }
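  // Team-based kernels: each Kokkos team owns one block row, and the team's
  // threads split the PCE coefficients among themselves through
  // details::compute_work_range.  Columns of the row are processed in chunks
  // of BlockSize so that the per-column matrix and x pointers can be reused
  // across the whole sparse-tensor loop.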
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;

  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    // Range of PCE coefficients handled by this thread
    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);

    output_scalar * const y = & m_y( 0 , iBlockRow );
    // y = b*y (zero it when b == 0) over this thread's coefficient range
    if ( m_b == output_scalar(0) )
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = 0 ;
    else
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = m_b * y[j] ;
    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 9;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const matrix_scalar* sh_A[BlockSize];
    const input_scalar* sh_x[BlockSize];
    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      // Gather pointers to the matrix and x blocks for this chunk of columns
      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
        sh_x[col] = & m_x( 0 , iBlockColumn );
        sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
      }
      for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

        const size_type nEntry = m_tensor.num_entry(iy);
        const size_type iEntryBeg = m_tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
        size_type iEntry = iEntryBeg;

        output_scalar ytmp = 0 ;

        const size_type nBlock = nEntry / tensor_type::vectorsize;
        const size_type nEntryB = nBlock * tensor_type::vectorsize;
        const size_type iEnd = iEntryBeg + nEntryB;

        typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
        typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
        typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
        VecTV vy;
        vy.zero();

        // Vectorized loop over the tensor entries, vectorsize at a time
        for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
          const size_type *j = &m_tensor.coord(iEntry,0);
          const size_type *k = &m_tensor.coord(iEntry,1);
          ValTV c(&(m_tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV xj(sh_x[col], j), xk(sh_x[col], k);

            // vy += c * ( aj*xk + ak*xj )
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            vy.multiply_add(c, aj);
          }
        }
        ytmp += vy.sum();
        // Handle the remaining entries with a fixed-width TinyVec of length 8
        const size_type rem = iEntryEnd-iEntry;
        if (rem > 0) {
          typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
          typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
          typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
          const size_type *j = &m_tensor.coord(iEntry,0);
          const size_type *k = &m_tensor.coord(iEntry,1);
          ValTV2 c(&(m_tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);

            // ytmp += c * ( aj*xk + ak*xj )
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            aj.times_equal(c);
            ytmp += aj.sum();
          }
        }

        y[iy] += m_a * ytmp ;
      }
      device.team_barrier();
    }
  }
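  // Alternative team-based kernel: same setup as above, but tensor entries
  // that do not fill a complete TinyVec are handled with a plain scalar loop.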
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;

  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);

    output_scalar * const y = & m_y( 0 , iBlockRow );
    // y = b*y (zero it when b == 0) over this thread's coefficient range
    if ( m_b == output_scalar(0) )
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = 0 ;
    else
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = m_b * y[j] ;
    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 9;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
    size_type iBlockEntry = iBlockEntryBeg;
    for ( size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
        sh_x[col] = & m_x( 0 , iBlockColumn );
        sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
      }
      for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

        const size_type nEntry = m_tensor.num_entry(iy);
        const size_type iEntryBeg = m_tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
        size_type iEntry = iEntryBeg;

        output_scalar ytmp = 0 ;
        if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
          const size_type nBlock = nEntry / tensor_type::vectorsize;
          const size_type nEntryB = nBlock * tensor_type::vectorsize;
          const size_type iEnd = iEntryBeg + nEntryB;
          VecTV vy;
          vy.zero();

          for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
            const size_type *j = &m_tensor.coord(iEntry,0);
            const size_type *k = &m_tensor.coord(iEntry,1);
            ValTV c(&(m_tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj*xk + ak*xj )
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              vy.multiply_add(c, aj);
            }
          }
          ytmp += vy.sum();
        }
        // Do the remaining entries with a scalar loop
        for ( ; iEntry<iEntryEnd; ++iEntry) {
          const size_type j = m_tensor.coord(iEntry,0);
          const size_type k = m_tensor.coord(iEntry,1);
          const tensor_scalar cijk = m_tensor.value(iEntry);

          for ( size_type col = 0; col < block_size; ++col ) {
            ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
                             sh_A[col][k] * sh_x[col][j] );
          }
        }

        y[iy] += m_a * ytmp ;
      }
      device.team_barrier();
    }
  }
  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    // The team-based block algorithm generally pays off on wide-vector
    // architectures such as MIC
#if defined(__MIC__)
    const bool use_block_algorithm = true;
#else
    const bool use_block_algorithm = false;
#endif

    const size_t row_count = A.graph.row_map.extent(0) - 1 ;
    if (use_block_algorithm) {
#if defined(__MIC__)
      const size_t team_size = 4;
#else
      const size_t team_size = 2;
#endif
      const size_t league_size = row_count;
      Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
      Kokkos::parallel_for( config , Multiply(A,x,y,a,b) );
    }
    else {
      Kokkos::parallel_for( row_count , Multiply(A,x,y,a,b) );
    }
  }
};
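// Usage sketch (illustrative): apply() computes y = a*A*x + b*y for a PCE
// matrix and rank-1 PCE vectors, e.g.
//
//   Stokhos::Multiply< matrix_type, input_vector_type, output_vector_type >
//     ::apply( A, x, y, 1.0, 0.0 );   // y = A*x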
// Kernel implementing y = a*A*x + b*y where A is a CrsMatrix of
// Sacado::UQ::PCE scalars and x, y are rank-2 (multi-vector) PCE views
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal, MatrixDevice,
                                         MatrixMemory, MatrixSize >,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>**, InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>**, OutputP... >
              >
  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize > matrix_type;
  : m_A_values( A.values )
  , m_A_graph( A.graph )
  , m_tensor( Kokkos::cijk(A.values) )
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type iBlockRow ) const
  {
    const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
    const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type num_col = m_y.extent(2);

    // y = b*y (zero it when b == 0) for every vector column
    if ( m_b == output_scalar(0) )
      for ( size_type col=0; col<num_col; ++col)
        for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
          m_y(j, iBlockRow, col) = 0 ;
    else
      for ( size_type col=0; col<num_col; ++col)
        for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
          m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
    // Loop over the columns of A in this block row
    for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
      const matrix_scalar * const A = & m_A_values( iEntry , 0 );
      const size_type iBlockCol = m_A_graph.entries(iEntry);

      for ( size_type col=0; col<num_col; ++col) {
        output_scalar * const y      = & m_y( 0 , iBlockRow , col );
        const input_scalar * const x = & m_x( 0 , iBlockCol , col );
        BlockMultiply< tensor_type >::apply( m_tensor , A , x , y , m_a );
      }
    }
  }
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;

  // Team-based kernel: one team per block row; threads split the PCE
  // coefficients via details::compute_work_range
  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);
    const size_type num_col = m_y.extent(2);

    // y = b*y (zero it when b == 0) over this thread's coefficient range
    if ( m_b == output_scalar(0) )
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = 0 ;
    else
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 9;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const matrix_scalar* sh_A[BlockSize];
    const input_scalar* sh_x[BlockSize];
    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
      // Loop over the columns of x and y
      for (size_type vec_col=0; vec_col<num_col; ++vec_col) {

        output_scalar * const y = & m_y( 0 , iBlockRow , vec_col );

        // Gather pointers to the matrix and x blocks for this chunk of columns
        for ( size_type col = 0; col < block_size; ++col ) {
          const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
          sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
          sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
        }
        for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

          const size_type nEntry = m_tensor.num_entry(iy);
          const size_type iEntryBeg = m_tensor.entry_begin(iy);
          const size_type iEntryEnd = iEntryBeg + nEntry;
          size_type iEntry = iEntryBeg;

          output_scalar ytmp = 0 ;

          const size_type nBlock = nEntry / tensor_type::vectorsize;
          const size_type nEntryB = nBlock * tensor_type::vectorsize;
          const size_type iEnd = iEntryBeg + nEntryB;

          typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
          typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
          typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
          VecTV vy;
          vy.zero();

          for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
            const size_type *j = &m_tensor.coord(iEntry,0);
            const size_type *k = &m_tensor.coord(iEntry,1);
            ValTV c(&(m_tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj*xk + ak*xj )
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              vy.multiply_add(c, aj);
            }
          }
          ytmp += vy.sum();
          // Handle the remaining entries with a fixed-width TinyVec of length 8
          const size_type rem = iEntryEnd-iEntry;
          if (rem > 0) {
            typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
            typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
            typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
            const size_type *j = &m_tensor.coord(iEntry,0);
            const size_type *k = &m_tensor.coord(iEntry,1);
            ValTV2 c(&(m_tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);

              // ytmp += c * ( aj*xk + ak*xj )
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              aj.times_equal(c);
              ytmp += aj.sum();
            }
          }

          y[iy] += m_a * ytmp ;
        }
      }

      device.team_barrier();
    }
  }
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;

  // Alternative team-based kernel: tail tensor entries are handled with a
  // plain scalar loop
  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);

    const size_type num_col = m_y.extent(2);
    // y = b*y (zero it when b == 0) over this thread's coefficient range
    if ( m_b == output_scalar(0) )
      for ( size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = 0 ;
    else
      for ( size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 9;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
    size_type iBlockEntry = iBlockEntryBeg;
    for ( size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
      // Loop over the columns of x and y
      for ( size_type vec_col=0; vec_col<num_col; ++vec_col) {

        output_scalar * const y = & m_y( 0 , iBlockRow , vec_col );

        for ( size_type col = 0; col < block_size; ++col ) {
          const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
          sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
          sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
        }
        for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

          const size_type nEntry = m_tensor.num_entry(iy);
          const size_type iEntryBeg = m_tensor.entry_begin(iy);
          const size_type iEntryEnd = iEntryBeg + nEntry;
          size_type iEntry = iEntryBeg;

          output_scalar ytmp = 0 ;
          if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
            const size_type nBlock = nEntry / tensor_type::vectorsize;
            const size_type nEntryB = nBlock * tensor_type::vectorsize;
            const size_type iEnd = iEntryBeg + nEntryB;
            VecTV vy;
            vy.zero();

            for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
              const size_type *j = &m_tensor.coord(iEntry,0);
              const size_type *k = &m_tensor.coord(iEntry,1);
              ValTV c(&(m_tensor.value(iEntry)));

              for ( size_type col = 0; col < block_size; ++col ) {
                MatTV aj(sh_A[col], j), ak(sh_A[col], k);
                VecTV xj(sh_x[col], j), xk(sh_x[col], k);

                // vy += c * ( aj*xk + ak*xj )
                aj.times_equal(xk);
                aj.multiply_add(ak, xj);
                vy.multiply_add(c, aj);
              }
            }
            ytmp += vy.sum();
          }
          // Do the remaining entries with a scalar loop
          for ( ; iEntry<iEntryEnd; ++iEntry) {
            const size_type j = m_tensor.coord(iEntry,0);
            const size_type k = m_tensor.coord(iEntry,1);
            const tensor_scalar cijk = m_tensor.value(iEntry);

            for ( size_type col = 0; col < block_size; ++col ) {
              ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
                               sh_A[col][k] * sh_x[col][j] );
            }
          }

          y[iy] += m_a * ytmp ;
        }
      }

      device.team_barrier();
    }
  }
  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
#if defined(__MIC__)
    const bool use_block_algorithm = true;
#else
    const bool use_block_algorithm = false;
#endif

    const size_t row_count = A.graph.row_map.extent(0) - 1 ;
    if (use_block_algorithm) {
#if defined(__MIC__)
      const size_t team_size = 4;
#else
      const size_t team_size = 2;
#endif
      const size_t league_size = row_count;
      Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
      Kokkos::parallel_for( config , Multiply(A,x,y,a,b) );
    }
    else {
      Kokkos::parallel_for( row_count , Multiply(A,x,y,a,b) );
    }
  }
};
// Kernel handling a non-const matrix scalar type (rank-1 x and y); the matrix
// is converted to its const counterpart and forwarded to the kernels above
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal, MatrixDevice,
                                         MatrixMemory, MatrixSize >,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>*, InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>*, OutputP... >
              >
// Rank-2 (multi-vector) version of the preceding specialization for a
// non-const matrix scalar type
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal, MatrixDevice,
                                         MatrixMemory, MatrixSize >,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>**, InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>**, OutputP... >
              >
// Kernel implementing y = a*A*x + b*y when A stores only the mean (a single
// PCE coefficient) per entry
template <typename MatrixType,
          typename InputViewType,
          typename OutputViewType>
class MeanMultiply;
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal, MatrixDevice,
                                             MatrixMemory, MatrixSize >,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>*, InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>*, OutputP... >
                  >
  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize > matrix_type;
  template <int BlockSize>
  struct BlockKernel {
    : m_A_values( A.values )
    , m_A_graph( A.graph )
    , numBlocks( dim / BlockSize )
    , rem( dim % BlockSize )
    , dim_block( numBlocks*BlockSize )
    KOKKOS_INLINE_FUNCTION
    void operator()( const size_type iBlockRow ) const
    {
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
      output_scalar s[BlockSize] __attribute__((aligned(64))) = {};
#else
      output_scalar s[BlockSize] = {};
#endif

      const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
      const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];
      for (; pce_block < dim_block; pce_block+=BlockSize) {

#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
        for ( size_type k = 0; k < BlockSize; ++k)

#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
        for ( size_type k = 0; k < BlockSize; ++k)

        for ( size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
          const size_type col = m_A_graph.entries(iEntry);

#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
          for ( size_type k = 0; k < BlockSize; ++k)

#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
          for ( size_type k = 0; k < BlockSize; ++k) {

#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)

#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)

      for ( size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
        const size_type col = m_A_graph.entries(iEntry);

#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)

#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
    : m_A_values( A.values )
    , m_A_graph( A.graph )

    KOKKOS_INLINE_FUNCTION
    void operator()( const size_type iBlockRow ) const
    {
      const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
      const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)

#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)

      for ( size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
        const size_type col = m_A_graph.entries(iEntry);

#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
    const size_t row_count = A.graph.row_map.extent(0) - 1 ;

#if defined (__MIC__)
    Kokkos::parallel_for( row_count , Kernel(A,x,y,a,b) );
    Kokkos::parallel_for( row_count , BlockKernel<64>(A,x,y,a,b) );
    Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
    Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
    Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
    Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );

    Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
    Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
    Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
    Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
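    // With a mean-only matrix every PCE coefficient of x is scaled by the
    // same scalar matrix entry, so the multiply reduces to an ordinary scalar
    // CRS multiply applied to each coefficient; the block kernels above
    // simply process BlockSize coefficients at a time to aid vectorization.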
// Rank-2 (multi-vector) version of the mean-based kernel
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal, MatrixDevice,
                                             MatrixMemory, MatrixSize >,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>**, InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>**, OutputP... >
                  >

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize > matrix_type;
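  // Each column of x and y is handled by the rank-1 MeanMultiply kernel
  // applied to the corresponding subviews.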
    typedef Kokkos::View< const InputVectorValue*,
                          InputP... > input_vector_1d_type;
    typedef Kokkos::View< OutputVectorValue*,
                          OutputP... > output_vector_1d_type;
    typedef MeanMultiply<matrix_type, input_vector_1d_type,
                         output_vector_1d_type> MeanMultiply1D;

    const size_type num_col = x.extent(1);
    for (size_type i=0; i<num_col; ++i) {
      input_vector_1d_type x_col =
        Kokkos::subview(x, Kokkos::ALL(), i);
      output_vector_1d_type y_col =
        Kokkos::subview(y, Kokkos::ALL(), i);
      MeanMultiply1D::apply( A, x_col, y_col, a, b );
    }
// Mean-based kernel for a non-const matrix scalar type (rank-1 x and y)
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal, MatrixDevice,
                                             MatrixMemory, MatrixSize >,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>*, InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>*, OutputP... >
                  >
// Mean-based kernel for a non-const matrix scalar type (rank-2 x and y)
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal, MatrixDevice,
                                             MatrixMemory, MatrixSize >,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>**, InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>**, OutputP... >
                  >
namespace KokkosSparse {

// Overload of KokkosSparse::spmv for rank-1 Kokkos::View's of Sacado::UQ::PCE
template <typename AlphaType, typename BetaType,
          typename MatrixType,
          typename InputType, typename ... InputP,
          typename OutputType, typename ... OutputP>
typename std::enable_if<
  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value >::type
spmv( const char mode[], const AlphaType& a,
      const MatrixType& A,
      const Kokkos::View< InputType, InputP... >& x,
      const BetaType& b,
      const Kokkos::View< OutputType, OutputP... >& y,
      const RANK_ONE )
{
  typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
  typedef Kokkos::View< InputType, InputP... > InputVectorType;
  typedef Stokhos::Multiply<MatrixType, InputVectorType,
                            OutputVectorType> multiply_type;
  typedef Stokhos::MeanMultiply<MatrixType, InputVectorType,
                                OutputVectorType> mean_multiply_type;
1477 "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
1482 "Stokhos spmv not implemented for non-constant a or b");
  if (Kokkos::dimension_scalar(A.values) == 1) {
    mean_multiply_type::apply( A, x, y,
                               Sacado::Value<AlphaType>::eval(a),
                               Sacado::Value<BetaType>::eval(b) );
  }
  else {
    multiply_type::apply( A, x, y,
                          Sacado::Value<AlphaType>::eval(a),
                          Sacado::Value<BetaType>::eval(b) );
  }
}
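// Usage sketch (illustrative): once the matrix values and the x/y views carry
// a Sacado::UQ::PCE scalar type with an attached Cijk tensor, this overload
// is reached through the usual KokkosSparse interface, e.g.
//
//   KokkosSparse::spmv( "N", 1.0, A, x, 0.0, y );   // y = A*x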
// Overload of KokkosSparse::spmv taking a Controls argument; forwards to the
// rank-1 overload above
template <typename AlphaType, typename BetaType,
          typename MatrixType,
          typename InputType, typename ... InputP,
          typename OutputType, typename ... OutputP>
typename std::enable_if<
  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value >::type
spmv( KokkosKernels::Experimental::Controls,
      const char mode[], const AlphaType& a,
      const MatrixType& A,
      const Kokkos::View< InputType, InputP... >& x,
      const BetaType& b,
      const Kokkos::View< OutputType, OutputP... >& y,
      const RANK_ONE )
{
  spmv(mode, a, A, x, b, y, RANK_ONE());
}
// Overload of KokkosSparse::spmv for rank-2 (multi-vector) Kokkos::View's of
// Sacado::UQ::PCE
template <typename AlphaType, typename BetaType,
          typename MatrixType,
          typename InputType, typename ... InputP,
          typename OutputType, typename ... OutputP>
typename std::enable_if<
  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value >::type
spmv( const char mode[], const AlphaType& a,
      const MatrixType& A,
      const Kokkos::View< InputType, InputP... >& x,
      const BetaType& b,
      const Kokkos::View< OutputType, OutputP... >& y,
      const RANK_TWO )
{
  if (mode[0] != 'N')
    Kokkos::Impl::raise_error(
      "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");

  // A single-column multi-vector is handled by the rank-1 overload
  if (y.extent(1) == 1) {
    auto y_1D = subview(y, Kokkos::ALL(), 0);
    auto x_1D = subview(x, Kokkos::ALL(), 0);
    spmv(mode, a, A, x_1D, b, y_1D, RANK_ONE());
  }
  else {
    typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
    typedef Kokkos::View< InputType, InputP... > InputVectorType;
    typedef Stokhos::Multiply<MatrixType, typename InputVectorType::const_type,
                              OutputVectorType> multiply_type;
    typedef Stokhos::MeanMultiply<MatrixType, typename InputVectorType::const_type,
                                  OutputVectorType> mean_multiply_type;

    if (!Sacado::is_constant(a) || !Sacado::is_constant(b))
      Kokkos::Impl::raise_error(
        "Stokhos spmv not implemented for non-constant a or b");
    typename InputVectorType::const_type x_const = x;

    if (Kokkos::dimension_scalar(A.values) == 1) {
      mean_multiply_type::apply( A, x_const, y,
                                 Sacado::Value<AlphaType>::eval(a),
                                 Sacado::Value<BetaType>::eval(b));
    }
    else {
      multiply_type::apply( A, x_const, y,
                            Sacado::Value<AlphaType>::eval(a),
                            Sacado::Value<BetaType>::eval(b));
    }
  }
}
// Overload of KokkosSparse::spmv taking a Controls argument; forwards to the
// rank-2 overload above
template <typename AlphaType, typename BetaType,
          typename MatrixType,
          typename InputType, typename ... InputP,
          typename OutputType, typename ... OutputP>
typename std::enable_if<
  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value >::type
spmv( KokkosKernels::Experimental::Controls,
      const char mode[], const AlphaType& a,
      const MatrixType& A,
      const Kokkos::View< InputType, InputP... >& x,
      const BetaType& b,
      const Kokkos::View< OutputType, OutputP... >& y,
      const RANK_TWO )
{
  spmv(mode, a, A, x, b, y, RANK_TWO());
}