Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Stokhos_Cuda_CrsMatrix.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef STOKHOS_CUDA_CRSMATRIX_HPP
11 #define STOKHOS_CUDA_CRSMATRIX_HPP
12 
13 #include "Stokhos_ConfigDefs.h"
14 #ifdef HAVE_STOKHOS_CUSPARSE
15 
16 #include <utility>
17 #include <sstream>
18 #include <stdexcept>
19 
20 #include <cuda_runtime.h>
21 //#include <cusparse.h>
22 #include <cusparse_v2.h>
23 
24 #include "Kokkos_Core.hpp"
25 
26 #include "Stokhos_Multiply.hpp"
27 #include "Stokhos_CrsMatrix.hpp"
28 
29 // Use new cuSPARSE SpMv/SpMM functions only in CUDA 11 or greater.
30 // (while they exist in some versions of CUDA 10, they appear to not always
31 // product correct results).
32 #if CUSPARSE_VERSION >= 11401
33  #define USE_NEW_SPMV 1
34  #define NEW_SPMV_ALG_DEFAULTS \
35  cusparseSpMVAlg_t alg_spmv = CUSPARSE_SPMV_ALG_DEFAULT;
36  #define NEW_SPMM_ALG_DEFAULTS \
37  cusparseSpMMAlg_t alg_spmm = CUSPARSE_SPMM_ALG_DEFAULT;
38 #elif CUSPARSE_VERSION >= 11000
39  #define USE_NEW_SPMV 1
40  #define NEW_SPMV_ALG_DEFAULTS \
41  cusparseSpMVAlg_t alg_spmv = CUSPARSE_MV_ALG_DEFAULT;
42  #define NEW_SPMM_ALG_DEFAULTS \
43  cusparseSpMMAlg_t alg_spmm = CUSPARSE_MM_ALG_DEFAULT;
44 #else
45  #define USE_NEW_SPMV 0
46 #endif
47 
48 namespace Stokhos {
49 
50 class CudaSparseSingleton {
51 public:
52 
53  cusparseStatus_t status;
54  cusparseHandle_t handle;
55  cusparseMatDescr_t descra;
56 
57  static CudaSparseSingleton & singleton();
58 
59 private:
60 
61  CudaSparseSingleton()
62  {
63  status = cusparseCreate(&handle);
64  if(status != CUSPARSE_STATUS_SUCCESS)
65  {
66  throw std::runtime_error( std::string("ERROR - CUSPARSE Library Initialization failed" ) );
67  }
68 
69  status = cusparseCreateMatDescr(&descra);
70  if(status != CUSPARSE_STATUS_SUCCESS)
71  {
72  throw std::runtime_error( std::string("ERROR - CUSPARSE Library Matrix descriptor failed" ) );
73  }
74 
75  cusparseSetMatType(descra , CUSPARSE_MATRIX_TYPE_GENERAL);
76  cusparseSetMatIndexBase(descra , CUSPARSE_INDEX_BASE_ZERO);
77  }
78 
79  CudaSparseSingleton( const CudaSparseSingleton & );
80  CudaSparseSingleton & operator = ( const CudaSparseSingleton & );
81 };
82 
83 CudaSparseSingleton & CudaSparseSingleton::singleton()
84 {
85  static CudaSparseSingleton s ; return s ;
86 }
87 
88 template<>
89 class Multiply<
90  CrsMatrix< float , Kokkos::Cuda > ,
91  Kokkos::View< float* , Kokkos::Cuda > ,
92  Kokkos::View< float* , Kokkos::Cuda > ,
93  void,
94  IntegralRank<1> >
95 {
96 public:
97  typedef Kokkos::Cuda execution_space ;
98  typedef execution_space::size_type size_type ;
99  typedef Kokkos::View< float* , execution_space > vector_type ;
100  typedef CrsMatrix< float , execution_space > matrix_type ;
101 
102  //--------------------------------------------------------------------------
103 
104  static void apply( const matrix_type & A ,
105  const vector_type & x ,
106  const vector_type & y )
107  {
108  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
109  const float alpha = 1 , beta = 0 ;
110  const int n = A.graph.row_map.extent(0) - 1 ;
111  const int nz = A.graph.entries.extent(0);
112 
113 #if USE_NEW_SPMV
114  NEW_SPMV_ALG_DEFAULTS
115  using offset_type = typename matrix_type::graph_type::size_type;
116  using entry_type = typename matrix_type::graph_type::data_type;
117  const cusparseIndexType_t myCusparseOffsetType =
118  sizeof(offset_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
119  const cusparseIndexType_t myCusparseEntryType =
120  sizeof(entry_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
121  cusparseSpMatDescr_t A_cusparse;
122  cusparseCreateCsr(
123  &A_cusparse, n, n, nz,
124  (void*)A.graph.row_map.data(), (void*)A.graph.entries.data(),
125  (void*)A.values.data(), myCusparseOffsetType, myCusparseEntryType,
126  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
127  cusparseDnVecDescr_t x_cusparse, y_cusparse;
128  cusparseCreateDnVec(&x_cusparse, n, (void*)x.data(), CUDA_R_32F);
129  cusparseCreateDnVec(&y_cusparse, n, (void*)y.data(), CUDA_R_32F);
130  size_t bufferSize = 0;
131  void* dBuffer = NULL;
132  cusparseSpMV_bufferSize(
133  s.handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A_cusparse,
134  x_cusparse, &beta, y_cusparse, CUDA_R_32F, alg_spmv,
135  &bufferSize);
136  cudaMalloc(&dBuffer, bufferSize);
137  cusparseStatus_t status =
138  cusparseSpMV(s.handle ,
139  CUSPARSE_OPERATION_NON_TRANSPOSE ,
140  &alpha ,
141  A_cusparse ,
142  x_cusparse ,
143  &beta ,
144  y_cusparse ,
145  CUDA_R_32F ,
146  alg_spmv ,
147  dBuffer );
148  cudaFree(dBuffer);
149  cusparseDestroyDnVec(x_cusparse);
150  cusparseDestroyDnVec(y_cusparse);
151  cusparseDestroySpMat(A_cusparse);
152 #else
153  cusparseStatus_t status =
154  cusparseScsrmv( s.handle ,
155  CUSPARSE_OPERATION_NON_TRANSPOSE ,
156  n , n , nz ,
157  &alpha ,
158  s.descra ,
159  A.values.data() ,
160  A.graph.row_map.data() ,
161  A.graph.entries.data() ,
162  x.data() ,
163  &beta ,
164  y.data() );
165 #endif
166 
167  if ( CUSPARSE_STATUS_SUCCESS != status ) {
168  throw std::runtime_error( std::string("ERROR - cusparseScsrmv " ) );
169  }
170  }
171 };
172 
173 template<>
174 class Multiply<
175  CrsMatrix< double , Kokkos::Cuda > ,
176  Kokkos::View< double* , Kokkos::Cuda > ,
177  Kokkos::View< double* , Kokkos::Cuda > ,
178  void,
179  IntegralRank<1> >
180 {
181 public:
182  typedef Kokkos::Cuda execution_space ;
183  typedef execution_space::size_type size_type ;
184  typedef Kokkos::View< double* , execution_space > vector_type ;
185  typedef CrsMatrix< double , execution_space > matrix_type ;
186 
187  //--------------------------------------------------------------------------
188 
189  static void apply( const matrix_type & A ,
190  const vector_type & x ,
191  const vector_type & y )
192  {
193  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
194  const double alpha = 1 , beta = 0 ;
195  const int n = A.graph.row_map.extent(0) - 1 ;
196  const int nz = A.graph.entries.extent(0);
197 
198 #if USE_NEW_SPMV
199  NEW_SPMV_ALG_DEFAULTS
200  using offset_type = typename matrix_type::graph_type::size_type;
201  using entry_type = typename matrix_type::graph_type::data_type;
202  const cusparseIndexType_t myCusparseOffsetType =
203  sizeof(offset_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
204  const cusparseIndexType_t myCusparseEntryType =
205  sizeof(entry_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
206  cusparseSpMatDescr_t A_cusparse;
207  cusparseCreateCsr(
208  &A_cusparse, n, n, nz,
209  (void*)A.graph.row_map.data(), (void*)A.graph.entries.data(),
210  (void*)A.values.data(), myCusparseOffsetType, myCusparseEntryType,
211  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
212  cusparseDnVecDescr_t x_cusparse, y_cusparse;
213  cusparseCreateDnVec(&x_cusparse, n, (void*)x.data(), CUDA_R_64F);
214  cusparseCreateDnVec(&y_cusparse, n, (void*)y.data(), CUDA_R_64F);
215  size_t bufferSize = 0;
216  void* dBuffer = NULL;
217  cusparseSpMV_bufferSize(
218  s.handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A_cusparse,
219  x_cusparse, &beta, y_cusparse, CUDA_R_64F, alg_spmv,
220  &bufferSize);
221  cudaMalloc(&dBuffer, bufferSize);
222  cusparseStatus_t status =
223  cusparseSpMV(s.handle ,
224  CUSPARSE_OPERATION_NON_TRANSPOSE ,
225  &alpha ,
226  A_cusparse ,
227  x_cusparse ,
228  &beta ,
229  y_cusparse ,
230  CUDA_R_64F ,
231  alg_spmv ,
232  dBuffer );
233  cudaFree(dBuffer);
234  cusparseDestroyDnVec(x_cusparse);
235  cusparseDestroyDnVec(y_cusparse);
236  cusparseDestroySpMat(A_cusparse);
237 #else
238  cusparseStatus_t status =
239  cusparseDcsrmv( s.handle ,
240  CUSPARSE_OPERATION_NON_TRANSPOSE ,
241  n , n , nz ,
242  &alpha ,
243  s.descra ,
244  A.values.data() ,
245  A.graph.row_map.data() ,
246  A.graph.entries.data() ,
247  x.data() ,
248  &beta ,
249  y.data() );
250 #endif
251 
252  if ( CUSPARSE_STATUS_SUCCESS != status ) {
253  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
254  }
255  }
256 };
257 
258 template <typename Ordinal>
259 class Multiply<
260  CrsMatrix< float , Kokkos::Cuda > ,
261  Kokkos::View< float** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
262  Kokkos::View< float** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
263  std::vector<Ordinal> ,
264  IntegralRank<2> >
265 {
266 public:
267  typedef Kokkos::Cuda execution_space;
268  typedef execution_space::size_type size_type;
269  typedef Kokkos::View< float*, Kokkos::LayoutLeft, execution_space > vector_type;
270  typedef Kokkos::View< float**, Kokkos::LayoutLeft, execution_space > multi_vector_type;
271  typedef CrsMatrix< float , execution_space > matrix_type;
272 
273  //--------------------------------------------------------------------------
274 
275  static void apply( const matrix_type & A ,
276  const multi_vector_type & x ,
277  const multi_vector_type & y ,
278  const std::vector<Ordinal> & col_indices )
279  {
280  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
281  const float alpha = 1 , beta = 0 ;
282  const int n = A.graph.row_map.extent(0) - 1 ;
283  const int nz = A.graph.entries.extent(0);
284  const size_t ncol = col_indices.size();
285 
286  // Copy columns of x into a contiguous vector
287  vector_type xx( Kokkos::ViewAllocateWithoutInitializing("xx"), n * ncol );
288  vector_type yy( Kokkos::ViewAllocateWithoutInitializing("yy"), n * ncol );
289 
290  for (size_t col=0; col<ncol; col++) {
291  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
292  vector_type xx_view = Kokkos::subview( xx , span );
293  vector_type x_col =
294  Kokkos::subview( x, Kokkos::ALL(), col_indices[col] );
295  Kokkos::deep_copy(xx_view, x_col);
296  }
297 
298  // Sparse matrix-times-multivector
299 #if USE_NEW_SPMV
300  NEW_SPMM_ALG_DEFAULTS
301  using offset_type = typename matrix_type::graph_type::size_type;
302  using entry_type = typename matrix_type::graph_type::data_type;
303  const cusparseIndexType_t myCusparseOffsetType =
304  sizeof(offset_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
305  const cusparseIndexType_t myCusparseEntryType =
306  sizeof(entry_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
307  cusparseSpMatDescr_t A_cusparse;
308  cusparseCreateCsr(
309  &A_cusparse, n, n, nz,
310  (void*)A.graph.row_map.data(), (void*)A.graph.entries.data(),
311  (void*)A.values.data(), myCusparseOffsetType, myCusparseEntryType,
312  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
313  cusparseDnMatDescr_t x_cusparse, y_cusparse;
314  cusparseCreateDnMat(
315  &x_cusparse, n, ncol, n,
316  (void*)xx.data(), CUDA_R_32F, CUSPARSE_ORDER_COL);
317  cusparseCreateDnMat(
318  &y_cusparse, n, ncol, n,
319  (void*)yy.data(), CUDA_R_32F, CUSPARSE_ORDER_COL);
320  size_t bufferSize = 0;
321  void* dBuffer = NULL;
322  cusparseSpMM_bufferSize(
323  s.handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
324  CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A_cusparse,
325  x_cusparse, &beta, y_cusparse, CUDA_R_32F, alg_spmm,
326  &bufferSize);
327  cudaMalloc(&dBuffer, bufferSize);
328  cusparseStatus_t status =
329  cusparseSpMM(s.handle ,
330  CUSPARSE_OPERATION_NON_TRANSPOSE ,
331  CUSPARSE_OPERATION_NON_TRANSPOSE ,
332  &alpha ,
333  A_cusparse ,
334  x_cusparse ,
335  &beta ,
336  y_cusparse ,
337  CUDA_R_32F ,
338  alg_spmm ,
339  dBuffer );
340  cudaFree(dBuffer);
341  cusparseDestroyDnMat(x_cusparse);
342  cusparseDestroyDnMat(y_cusparse);
343  cusparseDestroySpMat(A_cusparse);
344 #else
345  cusparseStatus_t status =
346  cusparseScsrmm( s.handle ,
347  CUSPARSE_OPERATION_NON_TRANSPOSE ,
348  n , ncol , n , nz ,
349  &alpha ,
350  s.descra ,
351  A.values.data() ,
352  A.graph.row_map.data() ,
353  A.graph.entries.data() ,
354  xx.data() ,
355  n ,
356  &beta ,
357  yy.data() ,
358  n );
359 #endif
360 
361  if ( CUSPARSE_STATUS_SUCCESS != status ) {
362  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
363  }
364 
365  // Copy columns out of continguous multivector
366  for (size_t col=0; col<ncol; col++) {
367  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
368  vector_type yy_view = Kokkos::subview( yy , span );
369  vector_type y_col =
370  Kokkos::subview( y, Kokkos::ALL(), col_indices[col] );
371  Kokkos::deep_copy(y_col, yy_view );
372  }
373  }
374 };
375 
376 #define USE_CUSPARSE 1
377 #if USE_CUSPARSE
378 
379 template <typename Ordinal>
380 class Multiply<
381  CrsMatrix< double , Kokkos::Cuda > ,
382  Kokkos::View< double** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
383  Kokkos::View< double** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
384  std::vector<Ordinal> ,
385  IntegralRank<2> >
386 {
387 public:
388  typedef Kokkos::Cuda execution_space;
389  typedef execution_space::size_type size_type;
390  typedef Kokkos::View< double*, Kokkos::LayoutLeft, execution_space > vector_type;
391  typedef Kokkos::View< double**, Kokkos::LayoutLeft, execution_space > multi_vector_type;
392  typedef CrsMatrix< double , execution_space > matrix_type;
393 
394  //--------------------------------------------------------------------------
395 
396 #define USE_TRANSPOSE 0
397 #if USE_TRANSPOSE
398 
399  // A version that copies the vectors to a transposed 2D view and calls
400  // new CUSPARSE function for transpose layout. Seems to be somewhat
401  // slower????
402 
403  struct GatherTranspose {
404  typedef Kokkos::Cuda execution_space;
405  typedef execution_space::size_type size_type;
406 
407  multi_vector_type m_xt;
408  const multi_vector_type m_x;
409  const Kokkos::View<Ordinal*,execution_space> m_col;
410  const size_type m_ncol;
411 
412  GatherTranspose( multi_vector_type& xt,
413  const multi_vector_type& x,
414  const Kokkos::View<Ordinal*,execution_space>& col ) :
415  m_xt(xt), m_x(x), m_col(col), m_ncol(col.extent(0)) {}
416 
417  __device__
418  inline void operator() (size_type i) const {
419  for (size_type j=0; j<m_ncol; ++j)
420  m_xt(j,i) = m_x(i,m_col(j));
421  }
422 
423  static void apply( multi_vector_type& xt,
424  const multi_vector_type& x,
425  const Kokkos::View<Ordinal*,execution_space>& col ) {
426  const size_type n = x.extent(0);
427  Kokkos::parallel_for( n , GatherTranspose(xt,x,col) );
428  }
429  };
430 
431  static void apply( const matrix_type & A ,
432  const multi_vector_type & x ,
433  const multi_vector_type & y ,
434  const std::vector<Ordinal> & col_indices )
435  {
436  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
437  const double alpha = 1 , beta = 0 ;
438  const int n = A.graph.row_map.extent(0) - 1 ;
439  const int nz = A.graph.entries.extent(0);
440  const size_t ncol = col_indices.size();
441 
442  // Copy col_indices to the device
443  Kokkos::View<Ordinal*,execution_space> col_indices_dev(
444  Kokkos::ViewAllocateWithoutInitializing("col_indices"), ncol);
445  typename Kokkos::View<Ordinal*,execution_space>::HostMirror col_indices_host =
446  Kokkos::create_mirror_view(col_indices_dev);
447  for (size_t i=0; i<ncol; ++i)
448  col_indices_host(i) = col_indices[i];
449  Kokkos::deep_copy(col_indices_dev, col_indices_host);
450 
451  // Copy columns of x into a contiguous multi-vector and transpose
452  multi_vector_type xx(
453  Kokkos::ViewAllocateWithoutInitializing("xx"), ncol , n );
454  GatherTranspose::apply(xx, x, col_indices_dev);
455 
456  // Temporary to store result (this is not transposed)
457  multi_vector_type yy(
458  Kokkos::ViewAllocateWithoutInitializing("yy"), n , ncol );
459 
460  // Sparse matrix-times-multivector
461  cusparseStatus_t status =
462  cusparseDcsrmm2( s.handle ,
463  CUSPARSE_OPERATION_NON_TRANSPOSE ,
464  CUSPARSE_OPERATION_TRANSPOSE ,
465  n , ncol , n , nz ,
466  &alpha ,
467  s.descra ,
468  A.values.data() ,
469  A.graph.row_map.data() ,
470  A.graph.entries.data() ,
471  xx.data() ,
472  ncol ,
473  &beta ,
474  yy.data() ,
475  n );
476 
477  if ( CUSPARSE_STATUS_SUCCESS != status ) {
478  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
479  }
480 
481  // Copy columns out of continguous multivector
482  for (size_t col=0; col<ncol; col++) {
483  vector_type yy_view =
484  Kokkos::subview( yy , Kokkos::ALL(), col );
485  vector_type y_col =
486  Kokkos::subview( y, Kokkos::ALL(), col_indices[col] );
487  Kokkos::deep_copy(y_col, yy_view );
488  }
489  }
490 #else
491  static void apply( const matrix_type & A ,
492  const multi_vector_type & x ,
493  const multi_vector_type & y ,
494  const std::vector<Ordinal> & col_indices )
495  {
496  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
497  const double alpha = 1 , beta = 0 ;
498  const int n = A.graph.row_map.extent(0) - 1 ;
499  const int nz = A.graph.entries.extent(0);
500  const size_t ncol = col_indices.size();
501 
502  // Copy columns of x into a contiguous vector
503  vector_type xx( Kokkos::ViewAllocateWithoutInitializing("xx"), n * ncol );
504  vector_type yy( Kokkos::ViewAllocateWithoutInitializing("yy"), n * ncol );
505 
506  for (size_t col=0; col<ncol; col++) {
507  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
508  vector_type xx_view = Kokkos::subview( xx , span );
509  vector_type x_col =
510  Kokkos::subview( x, Kokkos::ALL(), col_indices[col] );
511  Kokkos::deep_copy(xx_view, x_col);
512  }
513 
514  // Sparse matrix-times-multivector
515 #if USE_NEW_SPMV
516  NEW_SPMM_ALG_DEFAULTS
517  using offset_type = typename matrix_type::graph_type::size_type;
518  using entry_type = typename matrix_type::graph_type::data_type;
519  const cusparseIndexType_t myCusparseOffsetType =
520  sizeof(offset_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
521  const cusparseIndexType_t myCusparseEntryType =
522  sizeof(entry_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
523  cusparseSpMatDescr_t A_cusparse;
524  cusparseCreateCsr(
525  &A_cusparse, n, n, nz,
526  (void*)A.graph.row_map.data(), (void*)A.graph.entries.data(),
527  (void*)A.values.data(), myCusparseOffsetType, myCusparseEntryType,
528  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
529  cusparseDnMatDescr_t x_cusparse, y_cusparse;
530  cusparseCreateDnMat(
531  &x_cusparse, n, ncol, n,
532  (void*)xx.data(), CUDA_R_64F, CUSPARSE_ORDER_COL);
533  cusparseCreateDnMat(
534  &y_cusparse, n, ncol, n,
535  (void*)yy.data(), CUDA_R_64F, CUSPARSE_ORDER_COL);
536  size_t bufferSize = 0;
537  void* dBuffer = NULL;
538  cusparseSpMM_bufferSize(
539  s.handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
540  CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A_cusparse,
541  x_cusparse, &beta, y_cusparse, CUDA_R_64F, alg_spmm,
542  &bufferSize);
543  cudaMalloc(&dBuffer, bufferSize);
544  cusparseStatus_t status =
545  cusparseSpMM(s.handle ,
546  CUSPARSE_OPERATION_NON_TRANSPOSE ,
547  CUSPARSE_OPERATION_NON_TRANSPOSE ,
548  &alpha ,
549  A_cusparse ,
550  x_cusparse ,
551  &beta ,
552  y_cusparse ,
553  CUDA_R_64F ,
554  alg_spmm ,
555  dBuffer );
556  cudaFree(dBuffer);
557  cusparseDestroyDnMat(x_cusparse);
558  cusparseDestroyDnMat(y_cusparse);
559  cusparseDestroySpMat(A_cusparse);
560 #else
561  cusparseStatus_t status =
562  cusparseDcsrmm( s.handle ,
563  CUSPARSE_OPERATION_NON_TRANSPOSE ,
564  n , ncol , n , nz ,
565  &alpha ,
566  s.descra ,
567  A.values.data() ,
568  A.graph.row_map.data() ,
569  A.graph.entries.data() ,
570  xx.data() ,
571  n ,
572  &beta ,
573  yy.data() ,
574  n );
575 #endif
576 
577  if ( CUSPARSE_STATUS_SUCCESS != status ) {
578  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
579  }
580 
581  // Copy columns out of continguous multivector
582  for (size_t col=0; col<ncol; col++) {
583  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
584  vector_type yy_view = Kokkos::subview( yy , span );
585  vector_type y_col =
586  Kokkos::subview( y, Kokkos::ALL(), col_indices[col] );
587  Kokkos::deep_copy(y_col, yy_view );
588  }
589  }
590 #endif
591 };
592 
593 template <>
594 class Multiply<
595  CrsMatrix< float , Kokkos::Cuda > ,
596  Kokkos::View< float** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
597  Kokkos::View< float** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
598  void ,
599  IntegralRank<2> >
600 {
601 public:
602  typedef Kokkos::Cuda execution_space;
603  typedef execution_space::size_type size_type;
604  typedef Kokkos::View< float**, Kokkos::LayoutLeft, execution_space > multi_vector_type;
605  typedef CrsMatrix< float , execution_space > matrix_type;
606 
607  //--------------------------------------------------------------------------
608 
609  static void apply( const matrix_type & A ,
610  const multi_vector_type & x ,
611  const multi_vector_type & y )
612  {
613  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
614  const float alpha = 1 , beta = 0 ;
615  const int n = A.graph.row_map.extent(0) - 1 ;
616  const int nz = A.graph.entries.extent(0);
617  const size_t ncol = x.extent(1);
618 
619  // Sparse matrix-times-multivector
620 #if USE_NEW_SPMV
621  NEW_SPMM_ALG_DEFAULTS
622  using offset_type = typename matrix_type::graph_type::size_type;
623  using entry_type = typename matrix_type::graph_type::data_type;
624  const cusparseIndexType_t myCusparseOffsetType =
625  sizeof(offset_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
626  const cusparseIndexType_t myCusparseEntryType =
627  sizeof(entry_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
628  cusparseSpMatDescr_t A_cusparse;
629  cusparseCreateCsr(
630  &A_cusparse, n, n, nz,
631  (void*)A.graph.row_map.data(), (void*)A.graph.entries.data(),
632  (void*)A.values.data(), myCusparseOffsetType, myCusparseEntryType,
633  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
634  cusparseDnMatDescr_t x_cusparse, y_cusparse;
635  cusparseCreateDnMat(
636  &x_cusparse, n, ncol, x.stride(1),
637  (void*)x.data(), CUDA_R_32F, CUSPARSE_ORDER_COL);
638  cusparseCreateDnMat(
639  &y_cusparse, n, ncol, y.stride(1),
640  (void*)y.data(), CUDA_R_32F, CUSPARSE_ORDER_COL);
641  size_t bufferSize = 0;
642  void* dBuffer = NULL;
643  cusparseSpMM_bufferSize(
644  s.handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
645  CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A_cusparse,
646  x_cusparse, &beta, y_cusparse, CUDA_R_32F, alg_spmm,
647  &bufferSize);
648  cudaMalloc(&dBuffer, bufferSize);
649  cusparseStatus_t status =
650  cusparseSpMM(s.handle ,
651  CUSPARSE_OPERATION_NON_TRANSPOSE ,
652  CUSPARSE_OPERATION_NON_TRANSPOSE ,
653  &alpha ,
654  A_cusparse ,
655  x_cusparse ,
656  &beta ,
657  y_cusparse ,
658  CUDA_R_32F ,
659  alg_spmm ,
660  dBuffer );
661  cudaFree(dBuffer);
662  cusparseDestroyDnMat(x_cusparse);
663  cusparseDestroyDnMat(y_cusparse);
664  cusparseDestroySpMat(A_cusparse);
665 #else
666  cusparseStatus_t status =
667  cusparseScsrmm( s.handle ,
668  CUSPARSE_OPERATION_NON_TRANSPOSE ,
669  n , ncol , n , nz ,
670  &alpha ,
671  s.descra ,
672  A.values.data() ,
673  A.graph.row_map.data() ,
674  A.graph.entries.data() ,
675  x.data() ,
676  n ,
677  &beta ,
678  y.data() ,
679  n );
680 #endif
681 
682  if ( CUSPARSE_STATUS_SUCCESS != status ) {
683  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
684  }
685  }
686 };
687 
688 template <>
689 class Multiply<
690  CrsMatrix< double , Kokkos::Cuda > ,
691  Kokkos::View< double** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
692  Kokkos::View< double** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
693  void ,
694  IntegralRank<2> >
695 {
696 public:
697  typedef Kokkos::Cuda execution_space;
698  typedef execution_space::size_type size_type;
699  typedef Kokkos::View< double**, Kokkos::LayoutLeft, execution_space > multi_vector_type;
700  typedef CrsMatrix< double , execution_space > matrix_type;
701 
702  //--------------------------------------------------------------------------
703 
704  static void apply( const matrix_type & A ,
705  const multi_vector_type & x ,
706  const multi_vector_type & y )
707  {
708  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
709  const double alpha = 1 , beta = 0 ;
710  const int n = A.graph.row_map.extent(0) - 1 ;
711  const int nz = A.graph.entries.extent(0);
712  const size_t ncol = x.extent(1);
713 
714  // Sparse matrix-times-multivector
715 #if USE_NEW_SPMV
716  NEW_SPMM_ALG_DEFAULTS
717  using offset_type = typename matrix_type::graph_type::size_type;
718  using entry_type = typename matrix_type::graph_type::data_type;
719  const cusparseIndexType_t myCusparseOffsetType =
720  sizeof(offset_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
721  const cusparseIndexType_t myCusparseEntryType =
722  sizeof(entry_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
723  cusparseSpMatDescr_t A_cusparse;
724  cusparseCreateCsr(
725  &A_cusparse, n, n, nz,
726  (void*)A.graph.row_map.data(), (void*)A.graph.entries.data(),
727  (void*)A.values.data(), myCusparseOffsetType, myCusparseEntryType,
728  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
729  cusparseDnMatDescr_t x_cusparse, y_cusparse;
730  cusparseCreateDnMat(
731  &x_cusparse, n, ncol, x.stride(1),
732  (void*)x.data(), CUDA_R_64F, CUSPARSE_ORDER_COL);
733  cusparseCreateDnMat(
734  &y_cusparse, n, ncol, y.stride(1),
735  (void*)y.data(), CUDA_R_64F, CUSPARSE_ORDER_COL);
736  size_t bufferSize = 0;
737  void* dBuffer = NULL;
738  cusparseSpMM_bufferSize(
739  s.handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
740  CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A_cusparse,
741  x_cusparse, &beta, y_cusparse, CUDA_R_64F, alg_spmm,
742  &bufferSize);
743  cudaMalloc(&dBuffer, bufferSize);
744  cusparseStatus_t status =
745  cusparseSpMM(s.handle ,
746  CUSPARSE_OPERATION_NON_TRANSPOSE ,
747  CUSPARSE_OPERATION_NON_TRANSPOSE ,
748  &alpha ,
749  A_cusparse ,
750  x_cusparse ,
751  &beta ,
752  y_cusparse ,
753  CUDA_R_64F ,
754  alg_spmm ,
755  dBuffer );
756  cudaFree(dBuffer);
757  cusparseDestroyDnMat(x_cusparse);
758  cusparseDestroyDnMat(y_cusparse);
759  cusparseDestroySpMat(A_cusparse);
760 #else
761  cusparseStatus_t status =
762  cusparseDcsrmm( s.handle ,
763  CUSPARSE_OPERATION_NON_TRANSPOSE ,
764  n , ncol , n , nz ,
765  &alpha ,
766  s.descra ,
767  A.values.data() ,
768  A.graph.row_map.data() ,
769  A.graph.entries.data() ,
770  x.data() ,
771  n ,
772  &beta ,
773  y.data() ,
774  n );
775 #endif
776 
777  if ( CUSPARSE_STATUS_SUCCESS != status ) {
778  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
779  }
780  }
781 };
782 
783 #else
784 // Working on creating a version that doesn't copy vectors to a contiguous
785 // 2-D view and doesn't call CUSPARSE. Not done yet.
786 template <typename Ordinal>
787 class Multiply<
788  CrsMatrix< double , Kokkos::Cuda > ,
789  Kokkos::View< double** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
790  Kokkos::View< double** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
791  std::vector<Ordinal> ,
792  IntegralRank<2> >
793 {
794 public:
795  typedef Kokkos::Cuda execution_space;
796  typedef execution_space::size_type size_type;
797  typedef Kokkos::View< double*, Kokkos::LayoutLeft, execution_space > vector_type;
798  typedef Kokkos::View< double**, Kokkos::LayoutLeft, execution_space > multi_vector_type;
799  typedef CrsMatrix< double , execution_space > matrix_type;
800  typedef Kokkos::View< size_type*, execution_space > column_indices_type;
801 
802  const matrix_type m_A;
803  const multi_vector_type m_x;
804  multi_vector_type m_y;
805  const column_indices_type m_col;
806  const size_type m_num_col;
807 
808  Multiply( const matrix_type& A,
809  const multi_vector_type& x,
810  multi_vector_type& y,
811  const column_indices_type& col) :
812  m_A(A),
813  m_x(x),
814  m_y(y),
815  m_col(col),
816  m_num_col(col.extent(0)) {}
817 
818  __device__
819  inline void operator() ( const size_type iRow ) const {
820  const size_type iEntryBegin = m_A.graph.row_map[iRow];
821  const size_type iEntryEnd = m_A.graph.row_map[iRow+1];
822 
823  for (size_type j=0; j<m_num_col; j++) {
824  size_type iCol = m_col_indices[j];
825 
826  scalar_type sum = 0.0;
827 
828  for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
829  sum += m_A.values(iEntry) * m_x( m_A.graph.entries(iEntry), iCol );
830  }
831 
832  m_y( iRow, iCol ) = sum;
833 
834  }
835  }
836 
837  //--------------------------------------------------------------------------
838 
839  static void apply( const matrix_type & A ,
840  const multi_vector_type & x ,
841  const multi_vector_type & y ,
842  const std::vector<Ordinal> & col_indices )
843  {
844  // Copy col_indices to the device
845  Kokkos::View<Ordinal*,execution_space> col_indices_dev(
846  Kokkos::ViewAllocateWithoutInitializing("col_indices"), ncol);
847  typename Kokkos::View<Ordinal*,execution_space>::HostMirror col_indices_host =
848  Kokkos::create_mirror_view(col_indices_dev);
849  for (size_t i=0; i<ncol; ++i)
850  col_indices_host(i) = col_indices[i];
851  Kokkos::deep_copy(col_indices_dev, col_indices_host);
852 
853  const size_t n = A.graph.row_map.extent(0) - 1 ;
854  Kokkos::parallel_for( n , Multiply(A,x,y,col_indices_dev) );
855  }
856 };
857 
858 #endif
859 
860 template<>
861 class Multiply<
862  CrsMatrix< float , Kokkos::Cuda > ,
863  std::vector< Kokkos::View< float* , Kokkos::Cuda > >,
864  std::vector< Kokkos::View< float* , Kokkos::Cuda > >,
865  void,
866  IntegralRank<1> >
867 {
868 public:
869  typedef Kokkos::Cuda execution_space ;
870  typedef execution_space::size_type size_type ;
871  typedef Kokkos::View< float* , execution_space > vector_type ;
872  typedef CrsMatrix< float , execution_space > matrix_type ;
873 
874  //--------------------------------------------------------------------------
875 
876  static void apply( const matrix_type & A ,
877  const std::vector<vector_type> & x ,
878  const std::vector<vector_type> & y )
879  {
880  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
881  const float alpha = 1 , beta = 0 ;
882  const int n = A.graph.row_map.extent(0) - 1 ;
883  const int nz = A.graph.entries.extent(0);
884  const size_t ncol = x.size();
885 
886  // Copy columns of x into a contiguous vector
887  vector_type xx( Kokkos::ViewAllocateWithoutInitializing("xx"), n * ncol );
888  vector_type yy( Kokkos::ViewAllocateWithoutInitializing("yy"), n * ncol );
889 
890  for (size_t col=0; col<ncol; col++) {
891  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
892  vector_type xx_view = Kokkos::subview( xx , span );
893  Kokkos::deep_copy(xx_view, x[col]);
894  }
895 
896  // Sparse matrix-times-multivector
897 #if USE_NEW_SPMV
898  NEW_SPMM_ALG_DEFAULTS
899  using offset_type = typename matrix_type::graph_type::size_type;
900  using entry_type = typename matrix_type::graph_type::data_type;
901  const cusparseIndexType_t myCusparseOffsetType =
902  sizeof(offset_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
903  const cusparseIndexType_t myCusparseEntryType =
904  sizeof(entry_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
905  cusparseSpMatDescr_t A_cusparse;
906  cusparseCreateCsr(
907  &A_cusparse, n, n, nz,
908  (void*)A.graph.row_map.data(), (void*)A.graph.entries.data(),
909  (void*)A.values.data(), myCusparseOffsetType, myCusparseEntryType,
910  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
911  cusparseDnMatDescr_t x_cusparse, y_cusparse;
912  cusparseCreateDnMat(
913  &x_cusparse, n, ncol, n,
914  (void*)xx.data(), CUDA_R_32F, CUSPARSE_ORDER_COL);
915  cusparseCreateDnMat(
916  &y_cusparse, n, ncol, n,
917  (void*)yy.data(), CUDA_R_32F, CUSPARSE_ORDER_COL);
918  size_t bufferSize = 0;
919  void* dBuffer = NULL;
920  cusparseSpMM_bufferSize(
921  s.handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
922  CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A_cusparse,
923  x_cusparse, &beta, y_cusparse, CUDA_R_32F, alg_spmm,
924  &bufferSize);
925  cudaMalloc(&dBuffer, bufferSize);
926  cusparseStatus_t status =
927  cusparseSpMM(s.handle ,
928  CUSPARSE_OPERATION_NON_TRANSPOSE ,
929  CUSPARSE_OPERATION_NON_TRANSPOSE ,
930  &alpha ,
931  A_cusparse ,
932  x_cusparse ,
933  &beta ,
934  y_cusparse ,
935  CUDA_R_32F ,
936  alg_spmm ,
937  dBuffer );
938  cudaFree(dBuffer);
939  cusparseDestroyDnMat(x_cusparse);
940  cusparseDestroyDnMat(y_cusparse);
941  cusparseDestroySpMat(A_cusparse);
942 #else
943  cusparseStatus_t status =
944  cusparseScsrmm( s.handle ,
945  CUSPARSE_OPERATION_NON_TRANSPOSE ,
946  n , ncol , n , nz ,
947  &alpha ,
948  s.descra ,
949  A.values.data() ,
950  A.graph.row_map.data() ,
951  A.graph.entries.data() ,
952  xx.data() ,
953  n ,
954  &beta ,
955  yy.data() ,
956  n );
957 #endif
958 
959  if ( CUSPARSE_STATUS_SUCCESS != status ) {
960  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
961  }
962 
963  // Copy columns out of continguous multivector
964  for (size_t col=0; col<ncol; col++) {
965  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
966  vector_type yy_view = Kokkos::subview( yy , span );
967  Kokkos::deep_copy(y[col], yy_view );
968  }
969  }
970 };
971 
972 template<>
973 class Multiply<
974  CrsMatrix< double , Kokkos::Cuda > ,
975  std::vector< Kokkos::View< double* , Kokkos::Cuda > >,
976  std::vector< Kokkos::View< double* , Kokkos::Cuda > >,
977  void,
978  IntegralRank<1> >
979 {
980 public:
981  typedef Kokkos::Cuda execution_space ;
982  typedef execution_space::size_type size_type ;
983  typedef Kokkos::View< double* , execution_space > vector_type ;
984  typedef CrsMatrix< double , execution_space > matrix_type ;
985 
986  //--------------------------------------------------------------------------
987 
988  static void apply( const matrix_type & A ,
989  const std::vector<vector_type> & x ,
990  const std::vector<vector_type> & y )
991  {
992  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
993  const double alpha = 1 , beta = 0 ;
994  const int n = A.graph.row_map.extent(0) - 1 ;
995  const int nz = A.graph.entries.extent(0);
996  const size_t ncol = x.size();
997 
998  // Copy columns of x into a contiguous vector
999  vector_type xx( Kokkos::ViewAllocateWithoutInitializing("xx"), n * ncol );
1000  vector_type yy( Kokkos::ViewAllocateWithoutInitializing("yy"), n * ncol );
1001 
1002  for (size_t col=0; col<ncol; col++) {
1003  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
1004  vector_type xx_view = Kokkos::subview( xx , span );
1005  Kokkos::deep_copy(xx_view, x[col]);
1006  }
1007 
1008  // Sparse matrix-times-multivector
1009 #if USE_NEW_SPMV
1010  NEW_SPMM_ALG_DEFAULTS
1011  using offset_type = typename matrix_type::graph_type::size_type;
1012  using entry_type = typename matrix_type::graph_type::data_type;
1013  const cusparseIndexType_t myCusparseOffsetType =
1014  sizeof(offset_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
1015  const cusparseIndexType_t myCusparseEntryType =
1016  sizeof(entry_type) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
1017  cusparseSpMatDescr_t A_cusparse;
1018  cusparseCreateCsr(
1019  &A_cusparse, n, n, nz,
1020  (void*)A.graph.row_map.data(), (void*)A.graph.entries.data(),
1021  (void*)A.values.data(), myCusparseOffsetType, myCusparseEntryType,
1022  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
1023  cusparseDnMatDescr_t x_cusparse, y_cusparse;
1024  cusparseCreateDnMat(
1025  &x_cusparse, n, ncol, n,
1026  (void*)xx.data(), CUDA_R_64F, CUSPARSE_ORDER_COL);
1027  cusparseCreateDnMat(
1028  &y_cusparse, n, ncol, n,
1029  (void*)yy.data(), CUDA_R_64F, CUSPARSE_ORDER_COL);
1030  size_t bufferSize = 0;
1031  void* dBuffer = NULL;
1032  cusparseSpMM_bufferSize(
1033  s.handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1034  CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A_cusparse,
1035  x_cusparse, &beta, y_cusparse, CUDA_R_64F, alg_spmm,
1036  &bufferSize);
1037  cudaMalloc(&dBuffer, bufferSize);
1038  cusparseStatus_t status =
1039  cusparseSpMM(s.handle ,
1040  CUSPARSE_OPERATION_NON_TRANSPOSE ,
1041  CUSPARSE_OPERATION_NON_TRANSPOSE ,
1042  &alpha ,
1043  A_cusparse ,
1044  x_cusparse ,
1045  &beta ,
1046  y_cusparse ,
1047  CUDA_R_64F ,
1048  alg_spmm ,
1049  dBuffer );
1050  cudaFree(dBuffer);
1051  cusparseDestroyDnMat(x_cusparse);
1052  cusparseDestroyDnMat(y_cusparse);
1053  cusparseDestroySpMat(A_cusparse);
1054 #else
1055  cusparseStatus_t status =
1056  cusparseDcsrmm( s.handle ,
1057  CUSPARSE_OPERATION_NON_TRANSPOSE ,
1058  n , ncol , n , nz ,
1059  &alpha ,
1060  s.descra ,
1061  A.values.data() ,
1062  A.graph.row_map.data() ,
1063  A.graph.entries.data() ,
1064  xx.data() ,
1065  n ,
1066  &beta ,
1067  yy.data() ,
1068  n );
1069 #endif
1070 
1071  if ( CUSPARSE_STATUS_SUCCESS != status ) {
1072  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
1073  }
1074 
1075  // Copy columns out of continguous multivector
1076  for (size_t col=0; col<ncol; col++) {
1077  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
1078  vector_type yy_view = Kokkos::subview( yy , span );
1079  Kokkos::deep_copy(y[col], yy_view );
1080  }
1081  }
1082 };
1083 
1084 //----------------------------------------------------------------------------
1085 
1086 } // namespace Stokhos
1087 
1088 #endif /* #ifdef HAVE_STOKHOS_CUSPARSE */
1089 
1090 #endif /* #ifndef STOKHOS_CUDA_CRSMATRIX_HPP */
Kokkos::DefaultExecutionSpace execution_space
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
int n
std::enable_if< Kokkos::is_view_uq_pce< Kokkos::View< RD, RP...> >::value &&Kokkos::is_view_uq_pce< Kokkos::View< XD, XP...> >::value >::type sum(const Kokkos::View< RD, RP...> &r, const Kokkos::View< XD, XP...> &x)
Stokhos::CrsMatrix< ValueType, Device, Layout >::HostMirror create_mirror_view(const Stokhos::CrsMatrix< ValueType, Device, Layout > &A)