Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Stokhos_Cuda_CrsMatrix.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #ifndef STOKHOS_CUDA_CRSMATRIX_HPP
43 #define STOKHOS_CUDA_CRSMATRIX_HPP
44 
45 #include "Stokhos_ConfigDefs.h"
46 #ifdef HAVE_STOKHOS_CUSPARSE
47 
48 #include <utility>
49 #include <sstream>
50 #include <stdexcept>
51 
52 #include <cuda_runtime.h>
53 //#include <cusparse.h>
54 #include <cusparse_v2.h>
55 
56 #include "Kokkos_Core.hpp"
57 
58 #include "Stokhos_Multiply.hpp"
59 #include "Stokhos_CrsMatrix.hpp"
60 
61 namespace Stokhos {
62 
63 class CudaSparseSingleton {
64 public:
65 
66  cusparseStatus_t status;
67  cusparseHandle_t handle;
68  cusparseMatDescr_t descra;
69 
70  static CudaSparseSingleton & singleton();
71 
72 private:
73 
74  CudaSparseSingleton()
75  {
76  status = cusparseCreate(&handle);
77  if(status != CUSPARSE_STATUS_SUCCESS)
78  {
79  throw std::runtime_error( std::string("ERROR - CUSPARSE Library Initialization failed" ) );
80  }
81 
82  status = cusparseCreateMatDescr(&descra);
83  if(status != CUSPARSE_STATUS_SUCCESS)
84  {
85  throw std::runtime_error( std::string("ERROR - CUSPARSE Library Matrix descriptor failed" ) );
86  }
87 
88  cusparseSetMatType(descra , CUSPARSE_MATRIX_TYPE_GENERAL);
89  cusparseSetMatIndexBase(descra , CUSPARSE_INDEX_BASE_ZERO);
90  }
91 
92  CudaSparseSingleton( const CudaSparseSingleton & );
93  CudaSparseSingleton & operator = ( const CudaSparseSingleton & );
94 };
95 
96 CudaSparseSingleton & CudaSparseSingleton::singleton()
97 {
98  static CudaSparseSingleton s ; return s ;
99 }
100 
101 template<>
102 class Multiply<
103  CrsMatrix< float , Kokkos::Cuda > ,
104  Kokkos::View< float* , Kokkos::Cuda > ,
105  Kokkos::View< float* , Kokkos::Cuda > ,
106  void,
107  IntegralRank<1> >
108 {
109 public:
110  typedef Kokkos::Cuda execution_space ;
111  typedef execution_space::size_type size_type ;
112  typedef Kokkos::View< float* , execution_space > vector_type ;
113  typedef CrsMatrix< float , execution_space > matrix_type ;
114 
115  //--------------------------------------------------------------------------
116 
117  static void apply( const matrix_type & A ,
118  const vector_type & x ,
119  const vector_type & y )
120  {
121  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
122  const float alpha = 1 , beta = 0 ;
123  const int n = A.graph.row_map.extent(0) - 1 ;
124  const int nz = A.graph.entries.extent(0);
125 
126  cusparseStatus_t status =
127  cusparseScsrmv( s.handle ,
128  CUSPARSE_OPERATION_NON_TRANSPOSE ,
129  n , n , nz ,
130  &alpha ,
131  s.descra ,
132  A.values.data() ,
133  A.graph.row_map.data() ,
134  A.graph.entries.data() ,
135  x.data() ,
136  &beta ,
137  y.data() );
138 
139  if ( CUSPARSE_STATUS_SUCCESS != status ) {
140  throw std::runtime_error( std::string("ERROR - cusparseScsrmv " ) );
141  }
142  }
143 };
144 
145 template<>
146 class Multiply<
147  CrsMatrix< double , Kokkos::Cuda > ,
148  Kokkos::View< double* , Kokkos::Cuda > ,
149  Kokkos::View< double* , Kokkos::Cuda > ,
150  void,
151  IntegralRank<1> >
152 {
153 public:
154  typedef Kokkos::Cuda execution_space ;
155  typedef execution_space::size_type size_type ;
156  typedef Kokkos::View< double* , execution_space > vector_type ;
157  typedef CrsMatrix< double , execution_space > matrix_type ;
158 
159  //--------------------------------------------------------------------------
160 
161  static void apply( const matrix_type & A ,
162  const vector_type & x ,
163  const vector_type & y )
164  {
165  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
166  const double alpha = 1 , beta = 0 ;
167  const int n = A.graph.row_map.extent(0) - 1 ;
168  const int nz = A.graph.entries.extent(0);
169 
170  cusparseStatus_t status =
171  cusparseDcsrmv( s.handle ,
172  CUSPARSE_OPERATION_NON_TRANSPOSE ,
173  n , n , nz ,
174  &alpha ,
175  s.descra ,
176  A.values.data() ,
177  A.graph.row_map.data() ,
178  A.graph.entries.data() ,
179  x.data() ,
180  &beta ,
181  y.data() );
182 
183  if ( CUSPARSE_STATUS_SUCCESS != status ) {
184  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
185  }
186  }
187 };
188 
189 template <typename Ordinal>
190 class Multiply<
191  CrsMatrix< float , Kokkos::Cuda > ,
192  Kokkos::View< float** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
193  Kokkos::View< float** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
194  std::vector<Ordinal> ,
195  IntegralRank<2> >
196 {
197 public:
198  typedef Kokkos::Cuda execution_space;
199  typedef execution_space::size_type size_type;
200  typedef Kokkos::View< float*, Kokkos::LayoutLeft, execution_space > vector_type;
201  typedef Kokkos::View< float**, Kokkos::LayoutLeft, execution_space > multi_vector_type;
202  typedef CrsMatrix< float , execution_space > matrix_type;
203 
204  //--------------------------------------------------------------------------
205 
206  static void apply( const matrix_type & A ,
207  const multi_vector_type & x ,
208  const multi_vector_type & y ,
209  const std::vector<Ordinal> & col_indices )
210  {
211  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
212  const float alpha = 1 , beta = 0 ;
213  const int n = A.graph.row_map.extent(0) - 1 ;
214  const int nz = A.graph.entries.extent(0);
215  const size_t ncol = col_indices.size();
216 
217  // Copy columns of x into a contiguous vector
218  vector_type xx( Kokkos::ViewAllocateWithoutInitializing("xx"), n * ncol );
219  vector_type yy( Kokkos::ViewAllocateWithoutInitializing("yy"), n * ncol );
220 
221  for (size_t col=0; col<ncol; col++) {
222  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
223  vector_type xx_view = Kokkos::subview( xx , span );
224  vector_type x_col =
225  Kokkos::subview( x, Kokkos::ALL(), col_indices[col] );
226  Kokkos::deep_copy(xx_view, x_col);
227  }
228 
229  // Sparse matrix-times-multivector
230  cusparseStatus_t status =
231  cusparseScsrmm( s.handle ,
232  CUSPARSE_OPERATION_NON_TRANSPOSE ,
233  n , ncol , n , nz ,
234  &alpha ,
235  s.descra ,
236  A.values.data() ,
237  A.graph.row_map.data() ,
238  A.graph.entries.data() ,
239  xx.data() ,
240  n ,
241  &beta ,
242  yy.data() ,
243  n );
244 
245  if ( CUSPARSE_STATUS_SUCCESS != status ) {
246  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
247  }
248 
249  // Copy columns out of continguous multivector
250  for (size_t col=0; col<ncol; col++) {
251  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
252  vector_type yy_view = Kokkos::subview( yy , span );
253  vector_type y_col =
254  Kokkos::subview( y, Kokkos::ALL(), col_indices[col] );
255  Kokkos::deep_copy(y_col, yy_view );
256  }
257  }
258 };
259 
260 #define USE_CUSPARSE 1
261 #if USE_CUSPARSE
262 
263 template <typename Ordinal>
264 class Multiply<
265  CrsMatrix< double , Kokkos::Cuda > ,
266  Kokkos::View< double** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
267  Kokkos::View< double** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
268  std::vector<Ordinal> ,
269  IntegralRank<2> >
270 {
271 public:
272  typedef Kokkos::Cuda execution_space;
273  typedef execution_space::size_type size_type;
274  typedef Kokkos::View< double*, Kokkos::LayoutLeft, execution_space > vector_type;
275  typedef Kokkos::View< double**, Kokkos::LayoutLeft, execution_space > multi_vector_type;
276  typedef CrsMatrix< double , execution_space > matrix_type;
277 
278  //--------------------------------------------------------------------------
279 
280 #define USE_TRANSPOSE 0
281 #if USE_TRANSPOSE
282 
283  // A version that copies the vectors to a transposed 2D view and calls
284  // new CUSPARSE function for transpose layout. Seems to be somewhat
285  // slower????
286 
287  struct GatherTranspose {
288  typedef Kokkos::Cuda execution_space;
289  typedef execution_space::size_type size_type;
290 
291  multi_vector_type m_xt;
292  const multi_vector_type m_x;
293  const Kokkos::View<Ordinal*,execution_space> m_col;
294  const size_type m_ncol;
295 
296  GatherTranspose( multi_vector_type& xt,
297  const multi_vector_type& x,
298  const Kokkos::View<Ordinal*,execution_space>& col ) :
299  m_xt(xt), m_x(x), m_col(col), m_ncol(col.extent(0)) {}
300 
301  __device__
302  inline void operator() (size_type i) const {
303  for (size_type j=0; j<m_ncol; ++j)
304  m_xt(j,i) = m_x(i,m_col(j));
305  }
306 
307  static void apply( multi_vector_type& xt,
308  const multi_vector_type& x,
309  const Kokkos::View<Ordinal*,execution_space>& col ) {
310  const size_type n = x.extent(0);
311  Kokkos::parallel_for( n , GatherTranspose(xt,x,col) );
312  }
313  };
314 
315  static void apply( const matrix_type & A ,
316  const multi_vector_type & x ,
317  const multi_vector_type & y ,
318  const std::vector<Ordinal> & col_indices )
319  {
320  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
321  const double alpha = 1 , beta = 0 ;
322  const int n = A.graph.row_map.extent(0) - 1 ;
323  const int nz = A.graph.entries.extent(0);
324  const size_t ncol = col_indices.size();
325 
326  // Copy col_indices to the device
327  Kokkos::View<Ordinal*,execution_space> col_indices_dev(
328  Kokkos::ViewAllocateWithoutInitializing("col_indices"), ncol);
329  typename Kokkos::View<Ordinal*,execution_space>::HostMirror col_indices_host =
330  Kokkos::create_mirror_view(col_indices_dev);
331  for (size_t i=0; i<ncol; ++i)
332  col_indices_host(i) = col_indices[i];
333  Kokkos::deep_copy(col_indices_dev, col_indices_host);
334 
335  // Copy columns of x into a contiguous multi-vector and transpose
336  multi_vector_type xx(
337  Kokkos::ViewAllocateWithoutInitializing("xx"), ncol , n );
338  GatherTranspose::apply(xx, x, col_indices_dev);
339 
340  // Temporary to store result (this is not transposed)
341  multi_vector_type yy(
342  Kokkos::ViewAllocateWithoutInitializing("yy"), n , ncol );
343 
344  // Sparse matrix-times-multivector
345  cusparseStatus_t status =
346  cusparseDcsrmm2( s.handle ,
347  CUSPARSE_OPERATION_NON_TRANSPOSE ,
348  CUSPARSE_OPERATION_TRANSPOSE ,
349  n , ncol , n , nz ,
350  &alpha ,
351  s.descra ,
352  A.values.data() ,
353  A.graph.row_map.data() ,
354  A.graph.entries.data() ,
355  xx.data() ,
356  ncol ,
357  &beta ,
358  yy.data() ,
359  n );
360 
361  if ( CUSPARSE_STATUS_SUCCESS != status ) {
362  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
363  }
364 
365  // Copy columns out of continguous multivector
366  for (size_t col=0; col<ncol; col++) {
367  vector_type yy_view =
368  Kokkos::subview( yy , Kokkos::ALL(), col );
369  vector_type y_col =
370  Kokkos::subview( y, Kokkos::ALL(), col_indices[col] );
371  Kokkos::deep_copy(y_col, yy_view );
372  }
373  }
374 #else
375  static void apply( const matrix_type & A ,
376  const multi_vector_type & x ,
377  const multi_vector_type & y ,
378  const std::vector<Ordinal> & col_indices )
379  {
380  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
381  const double alpha = 1 , beta = 0 ;
382  const int n = A.graph.row_map.extent(0) - 1 ;
383  const int nz = A.graph.entries.extent(0);
384  const size_t ncol = col_indices.size();
385 
386  // Copy columns of x into a contiguous vector
387  vector_type xx( Kokkos::ViewAllocateWithoutInitializing("xx"), n * ncol );
388  vector_type yy( Kokkos::ViewAllocateWithoutInitializing("yy"), n * ncol );
389 
390  for (size_t col=0; col<ncol; col++) {
391  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
392  vector_type xx_view = Kokkos::subview( xx , span );
393  vector_type x_col =
394  Kokkos::subview( x, Kokkos::ALL(), col_indices[col] );
395  Kokkos::deep_copy(xx_view, x_col);
396  }
397 
398  // Sparse matrix-times-multivector
399  cusparseStatus_t status =
400  cusparseDcsrmm( s.handle ,
401  CUSPARSE_OPERATION_NON_TRANSPOSE ,
402  n , ncol , n , nz ,
403  &alpha ,
404  s.descra ,
405  A.values.data() ,
406  A.graph.row_map.data() ,
407  A.graph.entries.data() ,
408  xx.data() ,
409  n ,
410  &beta ,
411  yy.data() ,
412  n );
413 
414  if ( CUSPARSE_STATUS_SUCCESS != status ) {
415  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
416  }
417 
418  // Copy columns out of continguous multivector
419  for (size_t col=0; col<ncol; col++) {
420  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
421  vector_type yy_view = Kokkos::subview( yy , span );
422  vector_type y_col =
423  Kokkos::subview( y, Kokkos::ALL(), col_indices[col] );
424  Kokkos::deep_copy(y_col, yy_view );
425  }
426  }
427 #endif
428 };
429 
430 template <>
431 class Multiply<
432  CrsMatrix< float , Kokkos::Cuda > ,
433  Kokkos::View< float** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
434  Kokkos::View< float** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
435  void ,
436  IntegralRank<2> >
437 {
438 public:
439  typedef Kokkos::Cuda execution_space;
440  typedef execution_space::size_type size_type;
441  typedef Kokkos::View< float**, Kokkos::LayoutLeft, execution_space > multi_vector_type;
442  typedef CrsMatrix< float , execution_space > matrix_type;
443 
444  //--------------------------------------------------------------------------
445 
446  static void apply( const matrix_type & A ,
447  const multi_vector_type & x ,
448  const multi_vector_type & y )
449  {
450  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
451  const float alpha = 1 , beta = 0 ;
452  const int n = A.graph.row_map.extent(0) - 1 ;
453  const int nz = A.graph.entries.extent(0);
454  const size_t ncol = x.extent(1);
455 
456  // Sparse matrix-times-multivector
457  cusparseStatus_t status =
458  cusparseScsrmm( s.handle ,
459  CUSPARSE_OPERATION_NON_TRANSPOSE ,
460  n , ncol , n , nz ,
461  &alpha ,
462  s.descra ,
463  A.values.data() ,
464  A.graph.row_map.data() ,
465  A.graph.entries.data() ,
466  x.data() ,
467  n ,
468  &beta ,
469  y.data() ,
470  n );
471 
472  if ( CUSPARSE_STATUS_SUCCESS != status ) {
473  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
474  }
475  }
476 };
477 
478 template <>
479 class Multiply<
480  CrsMatrix< double , Kokkos::Cuda > ,
481  Kokkos::View< double** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
482  Kokkos::View< double** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
483  void ,
484  IntegralRank<2> >
485 {
486 public:
487  typedef Kokkos::Cuda execution_space;
488  typedef execution_space::size_type size_type;
489  typedef Kokkos::View< double**, Kokkos::LayoutLeft, execution_space > multi_vector_type;
490  typedef CrsMatrix< double , execution_space > matrix_type;
491 
492  //--------------------------------------------------------------------------
493 
494  static void apply( const matrix_type & A ,
495  const multi_vector_type & x ,
496  const multi_vector_type & y )
497  {
498  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
499  const double alpha = 1 , beta = 0 ;
500  const int n = A.graph.row_map.extent(0) - 1 ;
501  const int nz = A.graph.entries.extent(0);
502  const size_t ncol = x.extent(1);
503 
504  // Sparse matrix-times-multivector
505  cusparseStatus_t status =
506  cusparseDcsrmm( s.handle ,
507  CUSPARSE_OPERATION_NON_TRANSPOSE ,
508  n , ncol , n , nz ,
509  &alpha ,
510  s.descra ,
511  A.values.data() ,
512  A.graph.row_map.data() ,
513  A.graph.entries.data() ,
514  x.data() ,
515  n ,
516  &beta ,
517  y.data() ,
518  n );
519 
520  if ( CUSPARSE_STATUS_SUCCESS != status ) {
521  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
522  }
523  }
524 };
525 
526 #else
527 // Working on creating a version that doesn't copy vectors to a contiguous
528 // 2-D view and doesn't call CUSPARSE. Not done yet.
529 template <typename Ordinal>
530 class Multiply<
531  CrsMatrix< double , Kokkos::Cuda > ,
532  Kokkos::View< double** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
533  Kokkos::View< double** , Kokkos::LayoutLeft, Kokkos::Cuda > ,
534  std::vector<Ordinal> ,
535  IntegralRank<2> >
536 {
537 public:
538  typedef Kokkos::Cuda execution_space;
539  typedef execution_space::size_type size_type;
540  typedef Kokkos::View< double*, Kokkos::LayoutLeft, execution_space > vector_type;
541  typedef Kokkos::View< double**, Kokkos::LayoutLeft, execution_space > multi_vector_type;
542  typedef CrsMatrix< double , execution_space > matrix_type;
543  typedef Kokkos::View< size_type*, execution_space > column_indices_type;
544 
545  const matrix_type m_A;
546  const multi_vector_type m_x;
547  multi_vector_type m_y;
548  const column_indices_type m_col;
549  const size_type m_num_col;
550 
551  Multiply( const matrix_type& A,
552  const multi_vector_type& x,
553  multi_vector_type& y,
554  const column_indices_type& col) :
555  m_A(A),
556  m_x(x),
557  m_y(y),
558  m_col(col),
559  m_num_col(col.extent(0)) {}
560 
561  __device__
562  inline void operator() ( const size_type iRow ) const {
563  const size_type iEntryBegin = m_A.graph.row_map[iRow];
564  const size_type iEntryEnd = m_A.graph.row_map[iRow+1];
565 
566  for (size_type j=0; j<m_num_col; j++) {
567  size_type iCol = m_col_indices[j];
568 
569  scalar_type sum = 0.0;
570 
571  for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
572  sum += m_A.values(iEntry) * m_x( m_A.graph.entries(iEntry), iCol );
573  }
574 
575  m_y( iRow, iCol ) = sum;
576 
577  }
578  }
579 
580  //--------------------------------------------------------------------------
581 
582  static void apply( const matrix_type & A ,
583  const multi_vector_type & x ,
584  const multi_vector_type & y ,
585  const std::vector<Ordinal> & col_indices )
586  {
587  // Copy col_indices to the device
588  Kokkos::View<Ordinal*,execution_space> col_indices_dev(
589  Kokkos::ViewAllocateWithoutInitializing("col_indices"), ncol);
590  typename Kokkos::View<Ordinal*,execution_space>::HostMirror col_indices_host =
591  Kokkos::create_mirror_view(col_indices_dev);
592  for (size_t i=0; i<ncol; ++i)
593  col_indices_host(i) = col_indices[i];
594  Kokkos::deep_copy(col_indices_dev, col_indices_host);
595 
596  const size_t n = A.graph.row_map.extent(0) - 1 ;
597  Kokkos::parallel_for( n , Multiply(A,x,y,col_indices_dev) );
598  }
599 };
600 
601 #endif
602 
603 template<>
604 class Multiply<
605  CrsMatrix< float , Kokkos::Cuda > ,
606  std::vector< Kokkos::View< float* , Kokkos::Cuda > >,
607  std::vector< Kokkos::View< float* , Kokkos::Cuda > >,
608  void,
609  IntegralRank<1> >
610 {
611 public:
612  typedef Kokkos::Cuda execution_space ;
613  typedef execution_space::size_type size_type ;
614  typedef Kokkos::View< float* , execution_space > vector_type ;
615  typedef CrsMatrix< float , execution_space > matrix_type ;
616 
617  //--------------------------------------------------------------------------
618 
619  static void apply( const matrix_type & A ,
620  const std::vector<vector_type> & x ,
621  const std::vector<vector_type> & y )
622  {
623  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
624  const float alpha = 1 , beta = 0 ;
625  const int n = A.graph.row_map.extent(0) - 1 ;
626  const int nz = A.graph.entries.extent(0);
627  const size_t ncol = x.size();
628 
629  // Copy columns of x into a contiguous vector
630  vector_type xx( Kokkos::ViewAllocateWithoutInitializing("xx"), n * ncol );
631  vector_type yy( Kokkos::ViewAllocateWithoutInitializing("yy"), n * ncol );
632 
633  for (size_t col=0; col<ncol; col++) {
634  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
635  vector_type xx_view = Kokkos::subview( xx , span );
636  Kokkos::deep_copy(xx_view, x[col]);
637  }
638 
639  // Sparse matrix-times-multivector
640  cusparseStatus_t status =
641  cusparseScsrmm( s.handle ,
642  CUSPARSE_OPERATION_NON_TRANSPOSE ,
643  n , ncol , n , nz ,
644  &alpha ,
645  s.descra ,
646  A.values.data() ,
647  A.graph.row_map.data() ,
648  A.graph.entries.data() ,
649  xx.data() ,
650  n ,
651  &beta ,
652  yy.data() ,
653  n );
654 
655  if ( CUSPARSE_STATUS_SUCCESS != status ) {
656  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
657  }
658 
659  // Copy columns out of continguous multivector
660  for (size_t col=0; col<ncol; col++) {
661  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
662  vector_type yy_view = Kokkos::subview( yy , span );
663  Kokkos::deep_copy(y[col], yy_view );
664  }
665  }
666 };
667 
668 template<>
669 class Multiply<
670  CrsMatrix< double , Kokkos::Cuda > ,
671  std::vector< Kokkos::View< double* , Kokkos::Cuda > >,
672  std::vector< Kokkos::View< double* , Kokkos::Cuda > >,
673  void,
674  IntegralRank<1> >
675 {
676 public:
677  typedef Kokkos::Cuda execution_space ;
678  typedef execution_space::size_type size_type ;
679  typedef Kokkos::View< double* , execution_space > vector_type ;
680  typedef CrsMatrix< double , execution_space > matrix_type ;
681 
682  //--------------------------------------------------------------------------
683 
684  static void apply( const matrix_type & A ,
685  const std::vector<vector_type> & x ,
686  const std::vector<vector_type> & y )
687  {
688  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
689  const double alpha = 1 , beta = 0 ;
690  const int n = A.graph.row_map.extent(0) - 1 ;
691  const int nz = A.graph.entries.extent(0);
692  const size_t ncol = x.size();
693 
694  // Copy columns of x into a contiguous vector
695  vector_type xx( Kokkos::ViewAllocateWithoutInitializing("xx"), n * ncol );
696  vector_type yy( Kokkos::ViewAllocateWithoutInitializing("yy"), n * ncol );
697 
698  for (size_t col=0; col<ncol; col++) {
699  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
700  vector_type xx_view = Kokkos::subview( xx , span );
701  Kokkos::deep_copy(xx_view, x[col]);
702  }
703 
704  // Sparse matrix-times-multivector
705  cusparseStatus_t status =
706  cusparseDcsrmm( s.handle ,
707  CUSPARSE_OPERATION_NON_TRANSPOSE ,
708  n , ncol , n , nz ,
709  &alpha ,
710  s.descra ,
711  A.values.data() ,
712  A.graph.row_map.data() ,
713  A.graph.entries.data() ,
714  xx.data() ,
715  n ,
716  &beta ,
717  yy.data() ,
718  n );
719 
720  if ( CUSPARSE_STATUS_SUCCESS != status ) {
721  throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
722  }
723 
724  // Copy columns out of continguous multivector
725  for (size_t col=0; col<ncol; col++) {
726  const std::pair< size_t , size_t > span( n * col , n * ( col + 1 ) );
727  vector_type yy_view = Kokkos::subview( yy , span );
728  Kokkos::deep_copy(y[col], yy_view );
729  }
730  }
731 };
732 
733 //----------------------------------------------------------------------------
734 
735 } // namespace Stokhos
736 
737 #endif /* #ifdef HAVE_STOKHOS_CUSPARSE */
738 
739 #endif /* #ifndef STOKHOS_CUDA_CRSMATRIX_HPP */
Kokkos::DefaultExecutionSpace execution_space
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
int n
std::enable_if< Kokkos::is_view_uq_pce< Kokkos::View< RD, RP...> >::value &&Kokkos::is_view_uq_pce< Kokkos::View< XD, XP...> >::value >::type sum(const Kokkos::View< RD, RP...> &r, const Kokkos::View< XD, XP...> &x)
Stokhos::CrsMatrix< ValueType, Device, Layout >::HostMirror create_mirror_view(const Stokhos::CrsMatrix< ValueType, Device, Layout > &A)