43 #ifndef KOKKOS_CRSMATRIX_H_
44 #define KOKKOS_CRSMATRIX_H_
56 #include <Kokkos_Core.hpp>
57 #include <Kokkos_StaticCrsGraph.hpp>
58 #include <Kokkos_MV.hpp>
60 #ifdef KOKKOS_USE_CUSPARSE
61 # include <cusparse.h>
63 #endif // KOKKOS_USE_CUSPARSE
67 # include <mkl_spblas.h>
69 #endif // KOKKOS_USE_MKL
72 #include <impl/Kokkos_Error.hpp>
113 template<
class MatrixType>
136 KOKKOS_INLINE_FUNCTION
141 values_ (values), colidx_ (colidx__), stride_ (stride),
length (count)
150 KOKKOS_INLINE_FUNCTION
152 const typename MatrixType::index_type& colidx__,
156 values_ (&values(idx)), colidx_ (&colidx__(idx)), stride_ (stride),
length (count)
170 KOKKOS_INLINE_FUNCTION
172 return values_[i*stride_];
179 KOKKOS_INLINE_FUNCTION
181 return colidx_[i*stride_];
193 template<
class MatrixType>
196 typedef const typename MatrixType::non_const_value_type
value_type;
198 typedef const typename MatrixType::non_const_ordinal_type
ordinal_type;
216 KOKKOS_INLINE_FUNCTION
221 values_ (values), colidx_ (colidx__), stride_ (stride),
length (count)
230 KOKKOS_INLINE_FUNCTION
232 const typename MatrixType::index_type& colidx__,
236 values_ (&values(idx)), colidx_ (&colidx__(idx)), stride_ (stride),
length (count)
250 KOKKOS_INLINE_FUNCTION
252 return values_[i*stride_];
259 KOKKOS_INLINE_FUNCTION
261 return colidx_[i*stride_];
271 struct DeviceConfig {
274 Dim3(
const size_t x_,
const size_t y_ = 1,
const size_t z_ = 1) :
275 x(x_), y(y_), z(z_) {}
280 size_t num_threads_per_block;
282 DeviceConfig(
const size_t num_blocks_ = 0,
283 const size_t threads_per_block_x_ = 0,
284 const size_t threads_per_block_y_ = 0,
285 const size_t threads_per_block_z_ = 1) :
286 block_dim(threads_per_block_x_,threads_per_block_y_,threads_per_block_z_),
287 num_blocks(num_blocks_),
288 num_threads_per_block(block_dim.x * block_dim.y * block_dim.z)
305 template<
typename ScalarType,
306 typename OrdinalType,
308 class MemoryTraits = void,
309 typename SizeType =
size_t>
312 typedef typename Kokkos::ViewTraits<ScalarType*,Device,void,void>::host_mirror_space host_mirror_space ;
314 typedef Device device_type;
315 typedef ScalarType value_type;
316 typedef OrdinalType ordinal_type;
317 typedef MemoryTraits memory_traits;
318 typedef SizeType size_type;
369 typedef typename index_type ::non_const_value_type non_const_ordinal_type;
371 #ifdef KOKKOS_USE_CUSPARSE
372 cusparseHandle_t cusparse_handle;
373 cusparseMatDescr_t cusparse_descr;
374 #endif // KOKKOS_USE_CUSPARSE
381 DeviceConfig dev_config;
389 : graph(), values(), _numRows (0), _numCols (0), _nnz (0)
395 template<
typename SType,
415 , values( arg_label , arg_graph.entries.dimension_0() )
416 , _numRows( arg_graph.row_map.dimension_0() - 1 )
417 , _numCols( maximum_entry( arg_graph ) + 1 )
418 , _nnz( arg_graph.entries.dimension_0() )
456 import (label, nrows, ncols, annz, val, rows, cols);
460 #ifdef KOKKOS_USE_CUSPARSE
464 cusparseCreate (&cusparse_handle);
469 cusparseCreateMatDescr (&cusparse_descr);
470 #endif // KOKKOS_USE_CUSPARSE
497 graph.row_map = rows;
498 graph.entries = cols;
500 #ifdef KOKKOS_USE_CUSPARSE
501 cusparseCreate(&cusparse_handle);
502 cusparseCreateMatDescr(&cusparse_descr);
503 #endif // KOKKOS_USE_CUSPARSE
525 _numRows (graph_.row_map.dimension_0()-1),
527 _nnz (graph_.entries.dimension_0())
529 #ifdef KOKKOS_USE_CUSPARSE
530 cusparseCreate(&cusparse_handle);
531 cusparseCreateMatDescr(&cusparse_descr);
532 #endif // KOKKOS_USE_CUSPARSE
536 import (
const std::string &label,
549 OrdinalType target_nnz,
550 OrdinalType varianz_nel_row,
551 OrdinalType width_row);
557 OrdinalType cols_per_row);
560 generate (
const std::string &label);
563 generateHostGraph (OrdinalType nrows,
565 OrdinalType cols_per_row);
570 insertInGraph(OrdinalType rowi, OrdinalType col)
572 insertInGraph(rowi, &col, 1);
583 KOKKOS_INLINE_FUNCTION
585 sumIntoValues (
const OrdinalType rowi,
586 const OrdinalType cols[],
589 const bool force_atomic =
false)
const
591 SparseRowView<CrsMatrix> row_view = this->
row (rowi);
592 const int length = row_view.length;
593 for (
size_t i = 0; i < ncol; ++i) {
594 for (
int j = 0; j < length; ++j) {
595 if (row_view.colidx(j) == cols[i]) {
597 atomic_add(&row_view.value(j), vals[i]);
599 row_view.value(j) += vals[i];
607 KOKKOS_INLINE_FUNCTION
609 replaceValues (
const OrdinalType rowi,
610 const OrdinalType cols[],
613 const bool force_atomic =
false)
const
615 SparseRowView<CrsMatrix> row_view = this->
row (rowi);
616 const int length = row_view.length;
617 for (
size_t i = 0; i < ncol; ++i) {
618 for (
int j = 0; j < length; ++j) {
619 if (row_view.colidx(j) == cols[i]) {
621 atomic_assign(&row_view.value(j), vals[i]);
623 row_view.value(j) = vals[i];
642 template<
typename aScalarType,
typename aOrdinalType,
class aDevice,
class aMemoryTraits,
typename aSizeType>
644 operator= (
const CrsMatrix<aScalarType,aOrdinalType,aDevice,aMemoryTraits, aSizeType>& mtx)
646 _numRows = mtx.numRows();
647 _numCols = mtx.numCols();
651 dev_config = mtx.dev_config;
656 KOKKOS_INLINE_FUNCTION
661 KOKKOS_INLINE_FUNCTION
666 KOKKOS_INLINE_FUNCTION
667 ordinal_type
nnz()
const {
674 KOKKOS_INLINE_FUNCTION
676 const int start = graph.row_map(i);
677 const int count = graph.row_map(i+1) - start;
685 #if defined ( KOKKOS_EXPRESSION_CHECK )
692 KOKKOS_INLINE_FUNCTION
694 const int start = graph.row_map(i);
695 const int count = graph.row_map(i+1) - start;
697 #if defined ( KOKKOS_EXPRESSION_CHECK )
705 ordinal_type _numRows;
706 ordinal_type _numCols;
716 std::vector<OrdinalType> h_entries_;
717 std::vector<OrdinalType> rows_;
728 insertInGraph (
const OrdinalType rowi, OrdinalType *cols,
const size_t ncol)
730 OrdinalType*
const start = &h_entries_[rows_[rowi]];
731 OrdinalType*
const end = &h_entries_[rows_[rowi+1]];
732 for (
size_t i = 0; i < ncol; ++i) {
733 OrdinalType *iter = start;
734 while (iter < end && *iter != -1 && *iter != cols[i]) {
746 assert (iter != end );
756 template<
typename ScalarType ,
typename OrdinalType,
class Device,
class MemoryTraits,
typename SizeType >
758 CrsMatrix<ScalarType , OrdinalType, Device, MemoryTraits, SizeType >::
759 import (
const std::string &label,
767 std::string str = label;
768 values = values_type (str.append (
".values"), annz);
776 std::vector<int> row_lengths (_numRows, 0);
779 for (
int i = 0; i < _numRows; ++i) {
780 row_lengths[i] = rows[i + 1] - rows[i];
784 graph = Kokkos::create_staticcrsgraph<StaticCrsGraphType> (str.append (
".graph"), row_lengths);
785 typename values_type::HostMirror h_values = Kokkos::create_mirror_view (values);
786 typename index_type::HostMirror h_entries = Kokkos::create_mirror_view (graph.entries);
791 for (OrdinalType i = 0; i < _nnz; ++i) {
793 h_values(i) = val[i];
795 h_entries(i) = cols[i];
802 template<
typename ScalarType,
typename OrdinalType,
class Device,
class MemoryTraits,
typename SizeType>
808 OrdinalType target_nnz,
809 OrdinalType varianz_nel_row,
810 OrdinalType width_row)
815 graph.row_map =
row_map_type (
"CrsMatrix::rowPtr", nrows + 1);
816 typename row_map_type::HostMirror h_row_map = Kokkos::create_mirror_view (graph.row_map);
822 OrdinalType elements_per_row = target_nnz / nrows;
826 for (
int rowi = 0; rowi < nrows; ++rowi) {
831 _nnz = h_row_map(nrows);
833 graph.entries =
index_type(
"CrsMatrix::colInd", _nnz);
834 typename values_type::HostMirror h_values = Kokkos::create_mirror_view(values);
835 typename index_type::HostMirror h_entries = Kokkos::create_mirror_view(graph.entries);
837 for(
int rowi = 0; rowi < nrows; rowi++) {
838 for(
int k = h_row_map(rowi); k < h_row_map(rowi + 1); k++) {
856 template<
typename ScalarType,
typename OrdinalType,
class Device,
class MemoryTraits,
typename SizeType>
862 OrdinalType cols_per_row)
866 _nnz = nrows*cols_per_row;
868 std::string str = label;
869 values = values_type (str.append (
".values"), _nnz);
872 std::vector<int> row_lengths (_numRows, 0);
875 for (
int i = 0; i < _numRows; ++i) {
876 row_lengths[i] = cols_per_row;
880 graph = Kokkos::create_staticcrsgraph<StaticCrsGraphType> (str.append (
".graph"), row_lengths);
881 typename values_type::HostMirror h_values = Kokkos::create_mirror_view (values);
882 typename index_type::HostMirror h_entries = Kokkos::create_mirror_view (graph.entries);
887 for (OrdinalType i = 0; i < _nnz; ++i) {
888 h_values(i) = ScalarType();
889 h_entries(i) = OrdinalType();
895 template<
typename ScalarType,
typename OrdinalType,
class Device,
class MemoryTraits,
typename SizeType>
901 size_t ptr_from= 0, ptr_to = 0;
903 while ( ptr_from < h_entries_.size()) {
904 size_t row_stop = rows_[cur_row+1];
905 while (ptr_from < row_stop) {
906 if ( h_entries_[ptr_from] == OrdinalType(-1)) {
909 h_entries_[ptr_to++] = h_entries_[ptr_from++];
912 rows_[++cur_row] = ptr_to;
914 OrdinalType nrows = rows_.size()-1;
915 OrdinalType nnz_ = ptr_to;
917 h_entries_.resize(nnz_);
920 for (OrdinalType i=0; i<nrows; ++i )
921 std::sort(&h_entries_[i], &h_entries_[i+1]);
924 import(label, nrows, nrows, nnz_, NULL, &rows_[0], &h_entries_[0]);
929 template<
typename ScalarType,
typename OrdinalType,
class Device,
class MemoryTraits,
typename SizeType>
931 CrsMatrix<ScalarType, OrdinalType, Device, MemoryTraits, SizeType >::
932 generateHostGraph ( OrdinalType nrows,
934 OrdinalType cols_per_row)
938 _nnz = nrows*cols_per_row;
940 h_entries_.resize(_nnz, OrdinalType(-1));
941 rows_.resize(_numRows+1);
943 for (OrdinalType i = 0; i < _numRows; ++i)
944 rows_[i+1] = rows_[i]+cols_per_row;
948 template<
class DeviceType>
949 inline int RowsPerThread(
const int NNZPerRow) {
950 if(NNZPerRow == 0)
return 1;
952 while(result*NNZPerRow <= 2048) {
957 #ifdef KOKKOS_HAVE_CUDA
959 inline int RowsPerThread<Kokkos::Cuda>(
const int NNZPerRow) {
966 template<
class DeviceType ,
typename ScalarType ,
int NNZPerRow=27>
967 struct MV_MultiplyShflThreadsPerRow {
970 typedef typename Kokkos::Impl::remove_const< ScalarType >::type value_type ;
972 #ifdef KOKKOS_HAVE_CUDA
973 enum { shfl_possible =
974 Kokkos::Impl::is_same< DeviceType , Kokkos::Cuda >::value &&
976 Kokkos::Impl::is_same< value_type , unsigned int >::value ||
977 Kokkos::Impl::is_same< value_type , int >::value ||
978 Kokkos::Impl::is_same< value_type , float >::value ||
979 Kokkos::Impl::is_same< value_type , double >::value
981 #else // NOT KOKKOS_HAVE_CUDA
982 enum { shfl_possible = 0 };
983 #endif // KOKKOS_HAVE_CUDA
987 #if defined( __CUDA_ARCH__ )
988 enum { device_value = shfl_possible && ( 300 <= __CUDA_ARCH__ ) ?
996 enum { device_value = 1 };
999 #ifdef KOKKOS_HAVE_CUDA
1000 inline static int host_value()
1001 {
return shfl_possible && ( 300 <= Kokkos::Cuda::device_arch() ) ?
1008 #else // NOT KOKKOS_HAVE_CUDA
1009 inline static int host_value() {
return 1; }
1010 #endif // KOKKOS_HAVE_CUDA
1015 template<
class RangeVector,
1022 int ExplicitVectorLength = 8>
1023 struct MV_MultiplyFunctor {
1024 typedef typename CrsMatrix::device_type device_type ;
1025 typedef typename CrsMatrix::ordinal_type size_type ;
1029 typedef typename team_policy::member_type team_member ;
1031 typedef Vectorization<device_type,ExplicitVectorLength> vectorization;
1039 int rows_per_thread;
1041 MV_MultiplyFunctor(
const CoeffVector1 beta_,
1042 const CoeffVector2 alpha_,
1043 const CrsMatrix m_A_,
1044 const DomainVector m_x_,
1045 const RangeVector m_y_,
1047 const int rows_per_thread_):
1048 beta(beta_), alpha(alpha_), m_A(m_A_), m_x(m_x_), m_y(m_y_), n(n_), rows_per_thread(rows_per_thread_) {}
1052 template<
int UNROLL>
1053 KOKKOS_INLINE_FUNCTION
1054 void strip_mine (
const team_member & dev,
const size_type & iRow,
const size_type& kk)
const {
1056 value_type sum[UNROLL];
1058 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1061 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1064 for (size_type k = 0 ; k < UNROLL ; ++k) {
1078 const SparseRowView<CrsMatrix> row = m_A.row(iRow);
1080 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1083 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1086 #ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
1087 #pragma loop count (15)
1089 for (size_type iEntry = vectorization::begin(); iEntry < row.length; iEntry += vectorization::increment ) {
1090 const value_type val = row.value(iEntry);
1091 const size_type ind = row.colidx(iEntry);
1093 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1096 for (size_type k = 0; k < UNROLL; ++k) {
1097 sum[k] += val * m_x(ind, kk + k);
1102 for (
int ii=0; ii < UNROLL; ++ii) {
1103 sum[ii] = -vectorization::reduce(sum[ii]);
1106 for (
int ii=0; ii < UNROLL; ++ii) {
1107 sum[ii] = vectorization::reduce(sum[ii]);
1110 if (vectorization::is_lane_0(dev)) {
1111 if(doalpha * doalpha != 1) {
1112 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1115 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1118 for (size_type k = 0; k < UNROLL; ++k) {
1119 sum[k] *= alpha(kk + k);
1124 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1127 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1130 for (size_type k = 0; k < UNROLL; ++k) {
1131 m_y(iRow, kk + k) = sum[k];
1133 }
else if(dobeta == 1) {
1134 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1137 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1140 for (size_type k = 0; k < UNROLL; ++k) {
1141 m_y(iRow, kk + k) += sum[k];
1143 }
else if (dobeta == -1) {
1144 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1147 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1150 for (size_type k = 0; k < UNROLL; ++k) {
1151 m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k];
1154 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1157 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1160 for (size_type k = 0; k < UNROLL; ++k) {
1161 m_y(iRow, kk + k) = beta(kk + k) * m_y(iRow, kk + k) + sum[k] ;
1167 KOKKOS_INLINE_FUNCTION
1168 void strip_mine_1 (
const team_member & dev,
const size_type& iRow)
const {
1171 const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow);
1173 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1176 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1179 #ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
1180 #pragma loop count (15)
1182 for(size_type iEntry = vectorization::begin(); iEntry < row.length; iEntry += vectorization::increment) {
1183 sum += row.value(iEntry) * m_x(row.colidx(iEntry),0);
1186 sum = vectorization::reduce(sum);
1188 if (vectorization::is_lane_0(dev)) {
1191 sum *= value_type(-1);
1192 else if(doalpha * doalpha != 1) {
1197 m_y(iRow, 0) = sum ;
1198 }
else if (dobeta == 1) {
1199 m_y(iRow, 0) += sum ;
1200 }
else if (dobeta == -1) {
1201 m_y(iRow, 0) = -m_y(iRow, 0) + sum;
1203 m_y(iRow, 0) = beta(0) * m_y(iRow, 0) + sum;
1209 KOKKOS_INLINE_FUNCTION
1210 void operator()(
const team_member & dev)
const {
1211 for(
int loop = 0; loop < rows_per_thread; loop++) {
1212 const size_type iRow = vectorization::global_thread_rank(dev) * rows_per_thread + loop;
1213 if(iRow>=m_A.numRows())
return;
1217 #ifdef KOKKOS_FAST_COMPILE
1218 for (; kk + 4 <= n; kk += 4) {
1219 strip_mine<4>(dev, iRow, kk);
1221 for( ; kk < n; ++kk) {
1222 strip_mine<1>(dev, iRow, kk);
1225 # ifdef __CUDA_ARCH__
1226 if ((n > 8) && (n % 8 == 1)) {
1227 strip_mine<9>(dev, iRow, kk);
1230 for(; kk + 8 <= n; kk += 8)
1231 strip_mine<8>(dev, iRow, kk);
1234 # else // NOT a CUDA device
1235 if ((n > 16) && (n % 16 == 1)) {
1236 strip_mine<17>(dev, iRow, kk);
1240 for (; kk + 16 <= n; kk += 16) {
1241 strip_mine<16>(dev, iRow, kk);
1247 strip_mine<15>(dev, iRow, kk);
1251 strip_mine<14>(dev, iRow, kk);
1255 strip_mine<13>(dev, iRow, kk);
1259 strip_mine<12>(dev, iRow, kk);
1263 strip_mine<11>(dev, iRow, kk);
1267 strip_mine<10>(dev, iRow, kk);
1271 strip_mine<9>(dev, iRow, kk);
1275 strip_mine<8>(dev, iRow, kk);
1277 # endif // __CUDA_ARCH__
1279 strip_mine<7>(dev, iRow, kk);
1283 strip_mine<6>(dev, iRow, kk);
1287 strip_mine<5>(dev, iRow, kk);
1291 strip_mine<4>(dev, iRow, kk);
1295 strip_mine<3>(dev, iRow, kk);
1299 strip_mine<2>(dev, iRow, kk);
1303 strip_mine_1(dev, iRow);
1306 #endif // KOKKOS_FAST_COMPILE
1311 template<
class RangeVector,
1318 int ExplicitVectorLength = 8>
1319 struct MV_MultiplySingleFunctor {
1320 typedef typename CrsMatrix::device_type device_type ;
1321 typedef typename CrsMatrix::ordinal_type size_type ;
1325 typedef typename team_policy::member_type team_member ;
1328 typedef Vectorization<device_type,ExplicitVectorLength> vectorization;
1335 int rows_per_thread;
1337 MV_MultiplySingleFunctor(
1338 const CoeffVector1 beta_,
1339 const CoeffVector2 alpha_,
1340 const CrsMatrix m_A_,
1341 const DomainVector m_x_,
1342 const RangeVector m_y_,
1343 const int rows_per_thread_):
1344 beta(beta_), alpha(alpha_),
1345 m_A(m_A_), m_x(m_x_), m_y(m_y_),
1346 rows_per_thread(rows_per_thread_) {}
1348 KOKKOS_INLINE_FUNCTION
1349 void operator()(
const team_member & dev)
const {
1351 for(
int loop = 0; loop < rows_per_thread; loop++) {
1352 const size_type iRow = vectorization::global_thread_rank(dev)*rows_per_thread + loop;
1353 if(iRow>=m_A.numRows())
return;
1354 const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow);
1355 const size_type row_length = row.length ;
1358 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1361 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1364 #ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
1365 #pragma loop count (15)
1367 for (size_type iEntry = vectorization::begin(); iEntry < row_length; iEntry += vectorization::increment) {
1368 sum += row.value(iEntry) * m_x(row.colidx(iEntry));
1371 sum = vectorization::reduce(sum);
1374 if (vectorization::is_lane_0(dev)) {
1375 if (doalpha == -1) sum *= value_type(-1);
1376 else if (doalpha * doalpha != 1) {
1382 }
else if (dobeta == 1) {
1384 }
else if (dobeta == -1) {
1385 m_y(iRow) = -m_y(iRow) + sum;
1387 m_y(iRow) = beta(0) * m_y(iRow) + sum;
1396 template <
class RangeVector,
1402 MV_Multiply_Check_Compatibility (
const CoeffVector1 &betav,
1403 const RangeVector &y,
1404 const CoeffVector2 &alphav,
1406 const DomainVector &x,
1410 typename DomainVector::size_type numVecs = x.dimension_1();
1411 typename DomainVector::size_type numRows = A.numRows();
1412 typename DomainVector::size_type numCols = A.numCols();
1414 if (y.dimension_1() != numVecs) {
1415 std::ostringstream msg;
1416 msg <<
"Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of y and x do not match\n";
1417 msg <<
"\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) <<
") b("
1418 << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) <<
") a("
1419 << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) <<
") x("
1420 << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) <<
") x("
1421 << DomainVector::memory_space::query_label(x.ptr_on_device()) <<
")\n";
1422 msg <<
"\t Dimensions are: y(" << y.dimension_0() <<
"," << y.dimension_1() <<
") x(" << x.dimension_0() <<
"," << x.dimension_1() <<
")\n";
1423 Impl::throw_runtime_exception( msg.str() );
1425 if (numRows > y.dimension_0()) {
1426 std::ostringstream msg;
1427 msg <<
"Error in CRSMatrix - Vector Multiply (y = by + aAx): dimensions of y and A do not match\n";
1428 msg <<
"\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) <<
") b("
1429 << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) <<
") a("
1430 << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) <<
") x("
1431 << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) <<
") x("
1432 << DomainVector::memory_space::query_label(x.ptr_on_device()) <<
")\n";
1433 msg <<
"\t Dimensions are: y(" << y.dimension_0() <<
"," << y.dimension_1() <<
") A(" << A.numRows() <<
"," << A.numCols() <<
")\n";
1434 Impl::throw_runtime_exception( msg.str() );
1436 if (numCols > x.dimension_0()) {
1437 std::ostringstream msg;
1438 msg <<
"Error in CRSMatrix - Vector Multiply (y = by + aAx): dimensions of x and A do not match\n";
1439 msg <<
"\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) <<
") b("
1440 << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) <<
") a("
1441 << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) <<
") x("
1442 << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) <<
") x("
1443 << DomainVector::memory_space::query_label(x.ptr_on_device()) <<
")\n";
1444 msg <<
"\t Dimensions are: x(" << x.dimension_0() <<
"," << x.dimension_1() <<
") A(" << A.numRows() <<
"," << A.numCols() <<
")\n";
1445 Impl::throw_runtime_exception( msg.str() );
1448 if (betav.dimension_0()!=numVecs) {
1449 std::ostringstream msg;
1450 msg <<
"Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of y and b do not match\n";
1451 msg <<
"\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) <<
") b("
1452 << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) <<
") a("
1453 << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) <<
") x("
1454 << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) <<
") x("
1455 << DomainVector::memory_space::query_label(x.ptr_on_device()) <<
")\n";
1456 msg <<
"\t Dimensions are: y(" << y.dimension_0() <<
"," << y.dimension_1() <<
") b(" << betav.dimension_0() <<
")\n";
1457 Impl::throw_runtime_exception( msg.str() );
1461 if(alphav.dimension_0()!=numVecs) {
1462 std::ostringstream msg;
1463 msg <<
"Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of x and b do not match\n";
1464 msg <<
"\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) <<
") b("
1465 << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) <<
") a("
1466 << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) <<
") x("
1467 << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) <<
") x("
1468 << DomainVector::memory_space::query_label(x.ptr_on_device()) <<
")\n";
1469 msg <<
"\t Dimensions are: x(" << x.dimension_0() <<
"," << x.dimension_1() <<
") b(" << betav.dimension_0() <<
")\n";
1470 Impl::throw_runtime_exception( msg.str() );
1477 template<
class RangeVector,
1484 int NNZPerRow = 27 >
1485 struct MV_MultiplyTransposeFunctor {
1486 typedef typename CrsMatrix::device_type device_type ;
1487 typedef typename CrsMatrix::ordinal_type size_type ;
1491 typedef MV_MultiplyShflThreadsPerRow< device_type , value_type , NNZPerRow > ShflThreadsPerRow ;
1500 KOKKOS_INLINE_FUNCTION
1501 void operator()(
int i)
const {
1502 const size_type iRow = i/ShflThreadsPerRow::device_value;
1503 const int lane = i%ShflThreadsPerRow::device_value;
1505 const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow);
1507 for (size_type iEntry = lane; iEntry < row.length; iEntry += ShflThreadsPerRow::device_value ) {
1508 const value_type val = row.value(iEntry);
1509 const size_type ind = row.colidx(iEntry);
1512 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1515 for (size_type k = 0; k < n; ++k) {
1516 atomic_add(&m_y(ind,k), value_type(alpha(k) * val * m_x(iRow, k)));
1519 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1522 for (size_type k = 0; k < n; ++k) {
1523 atomic_add(&m_y(ind,k), value_type(val * m_x(iRow, k)));
1531 template<
class RangeVector,
1538 int NNZPerRow = 27 >
1539 struct MV_MultiplyTransposeSingleFunctor {
1540 typedef typename CrsMatrix::device_type device_type ;
1541 typedef typename CrsMatrix::ordinal_type size_type ;
1545 typedef MV_MultiplyShflThreadsPerRow< device_type , value_type , NNZPerRow > ShflThreadsPerRow ;
1554 KOKKOS_INLINE_FUNCTION
1555 void operator()(
int i)
const {
1556 const size_type iRow = i/ShflThreadsPerRow::device_value;
1557 const int lane = i%ShflThreadsPerRow::device_value;
1559 const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow);
1561 for (size_type iEntry = lane; iEntry < row.length; iEntry += ShflThreadsPerRow::device_value ) {
1562 const value_type val = row.value(iEntry);
1563 const size_type ind = row.colidx(iEntry);
1566 atomic_add(&m_y(ind), value_type(alpha(0) * val * m_x(iRow)));
1568 atomic_add(&m_y(ind), value_type(val * m_x(iRow)));
1574 template <
class RangeVector,
1582 MV_MultiplyTranspose (
typename Kokkos::Impl::enable_if<DomainVector::Rank == 2, const CoeffVector1>::type& betav,
1583 const RangeVector &y,
1584 const CoeffVector2 &alphav,
1585 const TCrsMatrix &A,
1586 const DomainVector &x)
1589 if(A.numRows() == -1)
return;
1593 MV_MulScalar(y,betav,y);
1595 MV_MulScalar(y,static_cast<typename RangeVector::const_value_type> (dobeta),y);
1599 typedef View<
typename RangeVector::non_const_data_type ,
1600 typename RangeVector::array_layout ,
1601 typename RangeVector::device_type ,
1602 typename RangeVector::memory_traits >
1605 typedef View<
typename DomainVector::const_data_type ,
1606 typename DomainVector::array_layout ,
1607 typename DomainVector::device_type ,
1608 Kokkos::MemoryRandomAccess >
1611 typedef View<
typename CoeffVector1::const_data_type ,
1612 typename CoeffVector1::array_layout ,
1613 typename CoeffVector1::device_type ,
1614 Kokkos::MemoryRandomAccess >
1617 typedef View<
typename CoeffVector2::const_data_type ,
1618 typename CoeffVector2::array_layout ,
1619 typename CoeffVector2::device_type ,
1620 Kokkos::MemoryRandomAccess >
1623 typedef CrsMatrix<
typename TCrsMatrix::const_value_type,
1624 typename TCrsMatrix::ordinal_type,
1625 typename TCrsMatrix::device_type,
1626 typename TCrsMatrix::memory_traits,
1627 typename TCrsMatrix::size_type> CrsMatrixType;
1669 typedef MV_MultiplyTransposeFunctor<RangeVectorType, CrsMatrixType, DomainVectorType, CoeffVector1Type, CoeffVector2Type, 2, 2 >
1674 int numVecs = x.dimension_1();
1675 CoeffVector1 beta = betav;
1676 CoeffVector2 alpha = alphav;
1679 alpha = CoeffVector2(
"CrsMatrix::auto_a", numVecs);
1680 typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha);
1681 typename CoeffVector2::value_type s_a = (
typename CoeffVector2::value_type) doalpha;
1683 for (
int i = 0; i < numVecs; ++i)
1690 beta = CoeffVector1(
"CrsMatrix::auto_b", numVecs);
1691 typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta);
1692 typename CoeffVector1::value_type s_b = (
typename CoeffVector1::value_type) dobeta;
1694 for(
int i = 0; i < numVecs; i++)
1701 MV_MulScalar(y,betav,y);
1704 MV_MulScalar(y,static_cast<typename RangeVector::const_value_type> (dobeta),y);
1707 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
1713 op.n = x.dimension_1();
1720 template<
class RangeVector,
class CrsMatrix,
class DomainVector,
class CoeffVector1,
class CoeffVector2>
1722 MV_MultiplyTranspose (
const CoeffVector1& betav,
1723 const RangeVector& y,
1724 const CoeffVector2& alphav,
1726 const DomainVector& x,
1732 MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, 0>(betav, y, alphav, A , x);
1734 MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 0>(betav, y, alphav, A , x);
1735 else if(alpha == -1)
1736 MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 0 > (betav, y, alphav, A , x);
1738 MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 0>(betav, y, alphav, A , x);
1739 }
else if(beta == 1) {
1743 MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 1>(betav, y, alphav, A , x);
1744 else if(alpha == -1)
1745 MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 1 > (betav, y, alphav, A , x);
1747 MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 1>(betav, y, alphav, A , x);
1748 }
else if(beta == -1) {
1750 MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, -1>(betav, y, alphav, A , x);
1752 MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, -1 > (betav, y, alphav, A , x);
1753 else if(alpha == -1)
1754 MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, -1 > (betav, y, alphav, A , x);
1756 MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, -1 > (betav, y, alphav, A , x);
1759 MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, 2>(betav, y, alphav, A , x);
1761 MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 2>(betav, y, alphav, A , x);
1762 else if(alpha == -1)
1763 MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 2 > (betav, y, alphav, A , x);
1765 MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 2>(betav, y, alphav, A , x);
1769 template<
class RangeVector,
class CrsMatrix,
class DomainVector>
1771 MV_MultiplyTranspose (
typename RangeVector::const_value_type s_b,
1772 const RangeVector& y,
1773 typename DomainVector::const_value_type s_a,
1775 const DomainVector& x)
1790 int numVecs = x.dimension_1();
1794 return MV_MultiplyTranspose (a, y, a, A, x, 0, 0);
1796 return MV_MultiplyTranspose (a, y, a, A, x, 0, 1);
1797 else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
1798 return MV_MultiplyTranspose (a, y, a, A, x, 0, -1);
1800 a = aVector(
"a", numVecs);
1801 typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
1802 for (
int i = 0; i < numVecs; ++i) {
1806 return MV_MultiplyTranspose (a, y, a, A, x, 0, 2);
1808 }
else if (s_b == 1) {
1810 return MV_MultiplyTranspose (a, y, a, A, x, 1, 0);
1812 return MV_MultiplyTranspose (a, y, a, A, x, 1, 1);
1813 else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
1814 return MV_MultiplyTranspose (a, y, a, A, x, 1, -1);
1816 a = aVector(
"a", numVecs);
1817 typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
1818 for (
int i = 0; i < numVecs; ++i) {
1822 return MV_MultiplyTranspose (a, y, a, A, x, 1, 2);
1824 }
else if (s_b == static_cast<typename RangeVector::const_value_type> (-1)) {
1826 return MV_MultiplyTranspose (a, y, a, A, x, -1, 0);
1828 return MV_MultiplyTranspose (a, y, a, A, x, -1, 1);
1829 else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
1830 return MV_MultiplyTranspose (a, y, a, A, x, -1, -1);
1832 a = aVector(
"a", numVecs);
1833 typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
1834 for (
int i = 0; i < numVecs; ++i) {
1838 return MV_MultiplyTranspose (a, y, a, A, x, -1, 2);
1841 b = aVector(
"b", numVecs);
1842 typename aVector::HostMirror h_b = Kokkos::create_mirror_view(b);
1843 for (
int i = 0; i < numVecs; ++i) {
1849 return MV_MultiplyTranspose (b, y, a, A, x, 2, 0);
1851 return MV_MultiplyTranspose (b, y, a, A, x, 2, 1);
1852 else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
1853 return MV_MultiplyTranspose (b, y, a, A, x, 2, -1);
1855 a = aVector(
"a", numVecs);
1856 typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
1857 for (
int i = 0; i < numVecs; ++i) {
1861 return MV_MultiplyTranspose (b, y, a, A, x, 2, 2);
1866 template<
class DeviceType >
1869 mv_multiply_team_policy(
const int nrow ,
const int rows_per_thread,
const int increment )
1871 #ifdef KOKKOS_HAVE_CUDA
1872 const int teamsize = Impl::is_same< DeviceType , Kokkos::Cuda>::value ? 256 : 1;
1874 const int teamsize = 1;
1876 const int nteams = (((nrow+rows_per_thread-1)/rows_per_thread)
1877 *increment+teamsize-1)/teamsize;
1882 template<
class RangeVector,
1890 MV_MultiplySingle (
typename Kokkos::Impl::enable_if<DomainVector::Rank == 1, const CoeffVector1>::type& betav,
1891 const RangeVector &y,
1892 const CoeffVector2 &alphav,
1893 const TCrsMatrix& A,
1894 const DomainVector& x)
1896 if(A.numRows()<=0)
return;
1899 V_MulScalar(y,betav,y);
1902 V_MulScalar(y,
typename RangeVector::value_type(dobeta),y);
1906 typedef View<
typename RangeVector::non_const_data_type ,
1907 typename RangeVector::array_layout ,
1908 typename RangeVector::device_type ,
1909 typename RangeVector::memory_traits >
1912 typedef View<
typename DomainVector::const_data_type ,
1913 typename DomainVector::array_layout ,
1914 typename DomainVector::device_type ,
1916 Kokkos::MemoryRandomAccess >
1919 typedef View<
typename CoeffVector1::const_data_type ,
1920 typename CoeffVector1::array_layout ,
1921 typename CoeffVector1::device_type ,
1922 Kokkos::MemoryRandomAccess >
1925 typedef View<
typename CoeffVector2::const_data_type ,
1926 typename CoeffVector2::array_layout ,
1927 typename CoeffVector2::device_type ,
1928 Kokkos::MemoryRandomAccess >
1931 typedef CrsMatrix<
typename TCrsMatrix::const_value_type,
1932 typename TCrsMatrix::ordinal_type,
1933 typename TCrsMatrix::device_type,
1934 typename TCrsMatrix::memory_traits,
1935 typename TCrsMatrix::size_type>
1938 Impl::MV_Multiply_Check_Compatibility(betav,y,alphav,A,x,doalpha,dobeta);
1940 const int NNZPerRow = A.nnz()/A.numRows();
1942 #ifndef KOKKOS_FAST_COMPILE
1945 typedef Vectorization<typename RangeVector::device_type,32> vec_type;
1946 typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
1947 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
1949 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
1951 OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
1953 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
1955 }
else if(NNZPerRow>=48) {
1956 typedef Vectorization<typename RangeVector::device_type,16> vec_type;
1957 typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
1958 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
1960 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
1962 OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
1964 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
1966 }
else if(NNZPerRow>=24) {
1967 typedef Vectorization<typename RangeVector::device_type,8> vec_type;
1968 typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
1969 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
1971 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
1973 OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
1975 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
1977 }
else if(NNZPerRow>=12) {
1978 typedef Vectorization<typename RangeVector::device_type,4> vec_type;
1979 typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
1980 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
1982 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
1984 OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
1986 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
1988 }
else if(NNZPerRow>=4) {
1989 typedef Vectorization<typename RangeVector::device_type,2> vec_type;
1990 typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
1991 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
1993 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
1995 OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
1997 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2000 typedef Vectorization<typename RangeVector::device_type,1> vec_type;
2001 typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2002 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
2004 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2006 OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2008 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2011 #else // NOT KOKKOS_FAST_COMPILE
2012 typedef Vectorization<typename RangeVector::device_type,8> vec_type;
2013 typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2014 CoeffVector1Type, CoeffVector2Type, 2, 2, vec_type::increment > OpType ;
2016 int numVecs = x.dimension_1();
2017 CoeffVector1 beta = betav;
2018 CoeffVector2 alpha = alphav;
2021 alpha = CoeffVector2(
"CrsMatrix::auto_a", numVecs);
2022 typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha);
2023 typename CoeffVector2::value_type s_a = (
typename CoeffVector2::value_type) doalpha;
2025 for(
int i = 0; i < numVecs; i++)
2031 beta = CoeffVector1(
"CrsMatrix::auto_b", numVecs);
2032 typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta);
2033 typename CoeffVector1::value_type s_b = (
typename CoeffVector1::value_type) dobeta;
2035 for(
int i = 0; i < numVecs; i++)
2041 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2043 OpType op(beta,alpha,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2045 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2048 #endif // KOKKOS_FAST_COMPILE
2052 template <
class RangeVector,
2060 MV_Multiply (
typename Kokkos::Impl::enable_if<DomainVector::Rank == 2, const CoeffVector1>::type& betav,
2061 const RangeVector &y,
2062 const CoeffVector2 &alphav,
2063 const TCrsMatrix &A,
2064 const DomainVector &x)
2067 if(A.numRows() <= 0)
return;
2071 MV_MulScalar(y,betav,y);
2073 MV_MulScalar(y,static_cast<typename RangeVector::const_value_type> (dobeta),y);
2077 typedef View<
typename RangeVector::non_const_data_type ,
2078 typename RangeVector::array_layout ,
2079 typename RangeVector::device_type ,
2080 typename RangeVector::memory_traits >
2083 typedef View<
typename DomainVector::const_data_type ,
2084 typename DomainVector::array_layout ,
2085 typename DomainVector::device_type ,
2086 Kokkos::MemoryRandomAccess >
2089 typedef View<
typename CoeffVector1::const_data_type ,
2090 typename CoeffVector1::array_layout ,
2091 typename CoeffVector1::device_type ,
2092 Kokkos::MemoryRandomAccess >
2095 typedef View<
typename CoeffVector2::const_data_type ,
2096 typename CoeffVector2::array_layout ,
2097 typename CoeffVector2::device_type ,
2098 Kokkos::MemoryRandomAccess >
2101 typedef CrsMatrix<
typename TCrsMatrix::const_value_type,
2102 typename TCrsMatrix::ordinal_type,
2103 typename TCrsMatrix::device_type,
2104 typename TCrsMatrix::memory_traits,
2105 typename TCrsMatrix::size_type> CrsMatrixType;
2107 Impl::MV_Multiply_Check_Compatibility(betav,y,alphav,A,x,doalpha,dobeta);
2109 const int NNZPerRow = A.nnz()/A.numRows();
2111 #ifndef KOKKOS_FAST_COMPILE
2113 if(x.dimension_1()==1) {
2114 typedef View<typename DomainVectorType::const_value_type*,typename DomainVector::array_layout ,typename DomainVectorType::device_type> DomainVector1D;
2115 typedef View<typename RangeVectorType::value_type*,typename RangeVector::array_layout ,typename RangeVectorType::device_type,typename RangeVector::memory_traits> RangeVector1D;
2116 RangeVector1D y_sub = RangeVector1D(y.ptr_on_device(),y.dimension_0());
2117 DomainVector1D x_sub = DomainVector1D(x.ptr_on_device(),x.dimension_0());
2119 return MV_MultiplySingle<RangeVector1D,TCrsMatrix,DomainVector1D,CoeffVector1,CoeffVector2,doalpha,dobeta>
2120 (betav,y_sub,alphav,A,x_sub);
2126 typedef Vectorization<typename RangeVector::device_type,8> vec_type;
2127 typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2128 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
2130 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2132 OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2134 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2136 }
else if(NNZPerRow>=48) {
2137 typedef Vectorization<typename RangeVector::device_type,8> vec_type;
2138 typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2139 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
2141 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2143 OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2145 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2147 }
else if(NNZPerRow>=16) {
2148 typedef Vectorization<typename RangeVector::device_type,8> vec_type;
2149 typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2150 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
2152 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2154 OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2156 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2158 }
else if(NNZPerRow>=12) {
2159 typedef Vectorization<typename RangeVector::device_type,4> vec_type;
2160 typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2161 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
2163 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2165 OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2167 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2169 }
else if(NNZPerRow>=4) {
2170 typedef Vectorization<typename RangeVector::device_type,2> vec_type;
2171 typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2172 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
2174 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2176 OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2178 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2181 typedef Vectorization<typename RangeVector::device_type,1> vec_type;
2182 typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2183 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
2185 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2187 OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2189 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2194 #else // NOT KOKKOS_FAST_COMPILE
2196 typedef Vectorization<typename RangeVector::device_type,8> vec_type;
2197 typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2198 CoeffVector1Type, CoeffVector2Type, 2, 2, vec_type::increment >
2201 int numVecs = x.dimension_1();
2202 CoeffVector1 beta = betav;
2203 CoeffVector2 alpha = alphav;
2206 alpha = CoeffVector2(
"CrsMatrix::auto_a", numVecs);
2207 typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha);
2208 typename CoeffVector2::value_type s_a = (
typename CoeffVector2::value_type) doalpha;
2210 for (
int i = 0; i < numVecs; ++i)
2217 beta = CoeffVector1(
"CrsMatrix::auto_b", numVecs);
2218 typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta);
2219 typename CoeffVector1::value_type s_b = (
typename CoeffVector1::value_type) dobeta;
2221 for(
int i = 0; i < numVecs; i++)
2227 const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2229 OpType op(beta,alpha,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2232 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2235 #endif // KOKKOS_FAST_COMPILE
2239 template<
class RangeVector,
2247 MV_Multiply (
typename Kokkos::Impl::enable_if<DomainVector::Rank == 1, const CoeffVector1>::type& betav,
2248 const RangeVector &y,
2249 const CoeffVector2 &alphav,
2250 const TCrsMatrix& A,
2251 const DomainVector& x) {
2252 return MV_MultiplySingle<RangeVector,TCrsMatrix,DomainVector,CoeffVector1,CoeffVector2,doalpha,dobeta>
2253 (betav,y,alphav,A,x);
2256 template<
class RangeVector,
class CrsMatrix,
class DomainVector,
class CoeffVector1,
class CoeffVector2>
2258 MV_Multiply (
const CoeffVector1& betav,
2259 const RangeVector& y,
2260 const CoeffVector2& alphav,
2262 const DomainVector& x,
2268 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, 0>(betav, y, alphav, A , x);
2270 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 0>(betav, y, alphav, A , x);
2271 else if(alpha == -1)
2272 MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 0 > (betav, y, alphav, A , x);
2274 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 0>(betav, y, alphav, A , x);
2275 }
else if(beta == 1) {
2279 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 1>(betav, y, alphav, A , x);
2280 else if(alpha == -1)
2281 MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 1 > (betav, y, alphav, A , x);
2283 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 1>(betav, y, alphav, A , x);
2284 }
else if(beta == -1) {
2286 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, -1>(betav, y, alphav, A , x);
2288 MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, -1 > (betav, y, alphav, A , x);
2289 else if(alpha == -1)
2290 MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, -1 > (betav, y, alphav, A , x);
2292 MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, -1 > (betav, y, alphav, A , x);
2295 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, 2>(betav, y, alphav, A , x);
2297 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 2>(betav, y, alphav, A , x);
2298 else if(alpha == -1)
2299 MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 2 > (betav, y, alphav, A , x);
2301 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 2>(betav, y, alphav, A , x);
2305 template <
class RangeVector,
class CrsMatrix,
class DomainVector,
2306 class Value1,
class Layout1,
class Device1,
class MemoryManagement1,
2307 class Value2,
class Layout2,
class Device2,
class MemoryManagement2>
2310 const RangeVector& y,
2313 const DomainVector& x)
2315 return MV_Multiply (betav, y, alphav, A, x, 2, 2);
2318 template <
class RangeVector,
class CrsMatrix,
class DomainVector,
2319 class Value1,
class Layout1,
class Device1,
class MemoryManagement1>
2321 MV_Multiply (
const RangeVector& y,
2324 const DomainVector& x)
2326 return MV_Multiply (alphav, y, alphav, A, x, 0, 2);
2329 template<
class RangeVector,
class CrsMatrix,
class DomainVector>
2331 MV_Multiply (
const RangeVector& y,
2333 const DomainVector& x)
2344 #ifdef KOKKOS_USE_CUSPARSE
2345 if (CuSparse::MV_Multiply_Try_CuSparse (0.0, y, 1.0, A, x)) {
2348 #endif // KOKKOS_USE_CUSPARSE
2349 #ifdef KOKKOS_USE_MKL
2350 if (MV_Multiply_Try_MKL (0.0, y, 1.0, A, x)) {
2353 #endif // KOKKOS_USE_MKL
2357 return MV_Multiply (a, y, a, A, x, 0, 1);
2360 template<
class RangeVector,
class CrsMatrix,
class DomainVector>
2362 MV_Multiply (
const RangeVector& y,
2363 typename DomainVector::const_value_type s_a,
2365 const DomainVector& x)
2367 #ifdef KOKKOS_USE_CUSPARSE
2368 if (CuSparse::MV_Multiply_Try_CuSparse (0.0, y, s_a, A, x)) {
2371 #endif // KOKKOS_USE_CUSPARSE
2372 #ifdef KOKKOS_USE_MKL
2373 if (MV_Multiply_Try_MKL (0.0, y, s_a, A, x)) {
2376 #endif // KOKKOS_USE_MKL
2379 const int numVecs = x.dimension_1();
2381 if ((s_a < 1) && (s_a != 0)) {
2382 return MV_Multiply (a, y, a, A, x, 0, -1);
2383 }
else if (s_a == 1) {
2384 return MV_Multiply (a, y, a, A, x, 0, 1);
2388 a = aVector(
"a", numVecs);
2389 typename aVector::HostMirror h_a = Kokkos::create_mirror_view (a);
2390 for (
int i = 0; i < numVecs; ++i) {
2394 return MV_Multiply (a, y, a, A, x, 0, 2);
2398 template<
class RangeVector,
class CrsMatrix,
class DomainVector>
2400 MV_Multiply (
typename RangeVector::const_value_type s_b,
2401 const RangeVector& y,
2402 typename DomainVector::const_value_type s_a,
2404 const DomainVector& x)
2406 #ifdef KOKKOS_USE_CUSPARSE
2407 if (CuSparse::MV_Multiply_Try_CuSparse (s_b, y, s_a, A, x)) {
2410 #endif // KOKKOSE_USE_CUSPARSE
2411 #ifdef KOKKOS_USE_MKL
2412 if (MV_Multiply_Try_MKL (s_b, y, s_a, A, x)) {
2415 #endif // KOKKOS_USE_MKL
2419 int numVecs = x.dimension_1();
2425 return MV_Multiply (a, y, a, A, x, 0, 0);
2427 return MV_Multiply (a, y, a, A, x, 0, 1);
2428 else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
2429 return MV_Multiply (a, y, a, A, x, 0, -1);
2431 a = aVector(
"a", numVecs);
2432 typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
2433 for (
int i = 0; i < numVecs; ++i) {
2437 return MV_Multiply (a, y, a, A, x, 0, 2);
2439 }
else if (s_b == 1) {
2441 return MV_Multiply (a, y, a, A, x, 1, 0);
2443 return MV_Multiply (a, y, a, A, x, 1, 1);
2444 else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
2445 return MV_Multiply (a, y, a, A, x, 1, -1);
2447 a = aVector(
"a", numVecs);
2448 typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
2449 for (
int i = 0; i < numVecs; ++i) {
2453 return MV_Multiply (a, y, a, A, x, 1, 2);
2455 }
else if (s_b == static_cast<typename RangeVector::const_value_type> (-1)) {
2457 return MV_Multiply (a, y, a, A, x, -1, 0);
2459 return MV_Multiply (a, y, a, A, x, -1, 1);
2460 else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
2461 return MV_Multiply (a, y, a, A, x, -1, -1);
2463 a = aVector(
"a", numVecs);
2464 typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
2465 for (
int i = 0; i < numVecs; ++i) {
2469 return MV_Multiply (a, y, a, A, x, -1, 2);
2472 b = aVector(
"b", numVecs);
2473 typename aVector::HostMirror h_b = Kokkos::create_mirror_view(b);
2474 for (
int i = 0; i < numVecs; ++i) {
2480 return MV_Multiply (b, y, a, A, x, 2, 0);
2482 return MV_Multiply (b, y, a, A, x, 2, 1);
2483 else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
2484 return MV_Multiply (b, y, a, A, x, 2, -1);
2486 a = aVector(
"a", numVecs);
2487 typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
2488 for (
int i = 0; i < numVecs; ++i) {
2492 return MV_Multiply (b, y, a, A, x, 2, 2);
2497 namespace KokkosCrsMatrix {
2511 template <
class CrsMatrixDst,
class CrsMatrixSrc>
2512 void deep_copy (CrsMatrixDst A, CrsMatrixSrc B) {
values_type::non_const_value_type non_const_value_type
Nonconst version of the type of the entries in the sparse matrix.
KOKKOS_INLINE_FUNCTION SparseRowViewConst< CrsMatrix > rowConst(int i) const
Return a const view of row i of the matrix.
void deep_copy(const View< DT, DL, DD, DM, DS > &dst, typename Impl::enable_if<(Impl::is_same< typename ViewTraits< DT, DL, DD, DM >::non_const_value_type, typename ViewTraits< DT, DL, DD, DM >::value_type >::value), typename ViewTraits< DT, DL, DD, DM >::const_value_type >::type &value)
Deep copy a value into a view.
KOKKOS_INLINE_FUNCTION value_type & value(const int &i) const
(Const) reference to the value of entry i in this row of the sparse matrix.
KOKKOS_INLINE_FUNCTION SparseRowView(const typename MatrixType::values_type &values, const typename MatrixType::index_type &colidx__, const int &stride, const int &count, const int &idx)
Constructor.
Const view of a row of a sparse matrix.
View of a row of a sparse matrix.
const int length
Number of entries in the row.
KOKKOS_INLINE_FUNCTION ordinal_type numCols() const
The number of columns in the sparse matrix.
KOKKOS_INLINE_FUNCTION ordinal_type & colidx(const int &i) const
Reference to the column index of entry i in this row of the sparse matrix.
CrsMatrix(const std::string &label, OrdinalType nrows, OrdinalType ncols, OrdinalType annz, ScalarType *val, OrdinalType *rows, OrdinalType *cols, bool pad=false)
Constructor that copies raw arrays of host data in coordinate format.
Kokkos::StaticCrsGraph< OrdinalType, Kokkos::LayoutLeft, Device, SizeType > StaticCrsGraphType
Type of the graph structure of the sparse matrix.
KOKKOS_INLINE_FUNCTION SparseRowViewConst(value_type *const values, ordinal_type *const colidx__, const int stride, const int count)
Constructor.
values_type::const_value_type const_value_type
Const version of the type of the entries in the sparse matrix.
KOKKOS_INLINE_FUNCTION ordinal_type & colidx(const int &i) const
(Const) reference to the column index of entry i in this row of the sparse matrix.
KOKKOS_INLINE_FUNCTION ordinal_type nnz() const
The number of stored entries in the sparse matrix.
CrsMatrix(const CrsMatrix< SType, OType, DType, MTType, IType > &B)
Copy Constructor.
KOKKOS_INLINE_FUNCTION SparseRowViewConst(const typename MatrixType::values_type &values, const typename MatrixType::index_type &colidx__, const int &stride, const int &count, const int &idx)
Constructor.
CrsMatrix(const std::string &arg_label, const StaticCrsGraphType &arg_graph)
Construct with a graph that will be shared.
KOKKOS_INLINE_FUNCTION ordinal_type numRows() const
The number of rows in the sparse matrix.
Compressed sparse row implementation of a sparse matrix.
CrsMatrix(const std::string &label, OrdinalType nrows, OrdinalType ncols, OrdinalType annz, values_type vals, row_map_type rows, index_type cols)
Constructor that accepts a row map, column indices, and values.
const MatrixType::non_const_value_type value_type
The type of the values in the row.
StaticCrsGraphType::entries_type index_type
Type of column indices in the sparse matrix.
Kokkos::View< value_type *, Kokkos::LayoutRight, device_type, MemoryTraits > values_type
Kokkos Array type of the entries (values) in the sparse matrix.
cuSPARSE implementation of Kokkos::CrsMatrix kernels.
MatrixType::value_type value_type
The type of the values in the row.
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, typename Impl::enable_if< !Impl::is_integral< ExecPolicy >::value >::type *=0)
Execute functor in parallel according to the execution policy.
CrsMatrix(const std::string &label, OrdinalType ncols, values_type vals, StaticCrsGraphType graph_)
Constructor that accepts a a static graph, and values.
const MatrixType::non_const_ordinal_type ordinal_type
The type of the column indices in the row.
StaticCrsGraphType::row_map_type row_map_type
Type of the "row map" (which contains the offset for each row's data).
Intel MKL version of Kokkos' sparse matrix interface.
KOKKOS_INLINE_FUNCTION SparseRowView(value_type *const values, ordinal_type *const colidx__, const int stride, const int count)
Constructor.
CrsMatrix()
Default constructor; constructs an empty sparse matrix.
const int length
Number of entries in the row.
KOKKOS_INLINE_FUNCTION SparseRowView< CrsMatrix > row(int i) const
Return a view of row i of the matrix.
Execution policy for parallel work over a league of teams of threads.
KOKKOS_INLINE_FUNCTION value_type & value(const int &i) const
Reference to the value of entry i in this row of the sparse matrix.
void generate(const std::string &label, OrdinalType nrows, OrdinalType ncols, OrdinalType target_nnz, OrdinalType varianz_nel_row, OrdinalType width_row)
This is a method only for testing that creates a random sparse matrix.
MatrixType::ordinal_type ordinal_type
The type of the column indices in the row.
CrsMatrix< ScalarType, OrdinalType, host_mirror_space, MemoryTraits > HostMirror
Type of a host-memory mirror of the sparse matrix.