Kokkos Core Kernels Package  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Kokkos_CrsMatrix.hpp
Go to the documentation of this file.
1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos: Node API and Parallel Node Kernels
6 // Copyright (2008) Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
39 //
40 // ************************************************************************
41 //@HEADER
42 */
43 #ifndef KOKKOS_CRSMATRIX_H_
44 #define KOKKOS_CRSMATRIX_H_
45 
48 
49 // FIXME (mfh 29 Sep 2013) There should never be a reason for using
50 // assert() in these routines. Routines that _could_ fail should
51 // either return an error code (if they are device functions) or throw
52 // an exception (if they are not device functions).
53 #include <assert.h>
54 #include <algorithm>
55 
56 #include <Kokkos_Core.hpp>
57 #include <Kokkos_StaticCrsGraph.hpp>
58 #include <Kokkos_MV.hpp>
59 
60 #ifdef KOKKOS_USE_CUSPARSE
61 # include <cusparse.h>
63 #endif // KOKKOS_USE_CUSPARSE
64 
65 #ifdef KOKKOS_USE_MKL
66 # include <mkl.h>
67 # include <mkl_spblas.h>
68 # include <Kokkos_CrsMatrix_MKL.hpp>
69 #endif // KOKKOS_USE_MKL
70 
71 //#include <Kokkos_Vectorization.hpp>
72 #include <impl/Kokkos_Error.hpp>
73 
74 namespace Kokkos {
75 
113 template<class MatrixType>
116  typedef typename MatrixType::value_type value_type;
118  typedef typename MatrixType::ordinal_type ordinal_type;
119 
120 private:
122  value_type* values_;
124  ordinal_type* colidx_;
126  const int stride_;
127 
128 public:
136  KOKKOS_INLINE_FUNCTION
137  SparseRowView (value_type* const values,
138  ordinal_type* const colidx__,
139  const int stride,
140  const int count) :
141  values_ (values), colidx_ (colidx__), stride_ (stride), length (count)
142  {}
150  KOKKOS_INLINE_FUNCTION
151  SparseRowView (const typename MatrixType::values_type& values,
152  const typename MatrixType::index_type& colidx__,
153  const int& stride,
154  const int& count,
155  const int& idx) :
156  values_ (&values(idx)), colidx_ (&colidx__(idx)), stride_ (stride), length (count)
157  {}
158 
164  const int length;
165 
170  KOKKOS_INLINE_FUNCTION
171  value_type& value (const int& i) const {
172  return values_[i*stride_];
173  }
174 
179  KOKKOS_INLINE_FUNCTION
180  ordinal_type& colidx (const int& i) const {
181  return colidx_[i*stride_];
182  }
183 };
184 
185 
193 template<class MatrixType>
196  typedef const typename MatrixType::non_const_value_type value_type;
198  typedef const typename MatrixType::non_const_ordinal_type ordinal_type;
199 
200 private:
202  value_type* values_;
204  ordinal_type* colidx_;
206  const int stride_;
207 
208 public:
216  KOKKOS_INLINE_FUNCTION
218  ordinal_type* const colidx__,
219  const int stride,
220  const int count) :
221  values_ (values), colidx_ (colidx__), stride_ (stride), length (count)
222  {}
230  KOKKOS_INLINE_FUNCTION
231  SparseRowViewConst (const typename MatrixType::values_type& values,
232  const typename MatrixType::index_type& colidx__,
233  const int& stride,
234  const int& count,
235  const int& idx) :
236  values_ (&values(idx)), colidx_ (&colidx__(idx)), stride_ (stride), length (count)
237  {}
238 
244  const int length;
245 
250  KOKKOS_INLINE_FUNCTION
251  value_type& value (const int& i) const {
252  return values_[i*stride_];
253  }
254 
259  KOKKOS_INLINE_FUNCTION
260  ordinal_type& colidx (const int& i) const {
261  return colidx_[i*stride_];
262  }
263 };
264 
265 // A simple struct for storing a kernel launch configuration.
266 // This is currently used by CrsMatrix to allow the user to have some control
267 // over how kernels are launched, however it is currently only exercised by
268 // Stokhos. This is a simpler case of "state" needed by TPLs, and at this point
269 // is just a hack until we figure out how to support state in a general,
270 // extensible way.
271 struct DeviceConfig {
272  struct Dim3 {
273  size_t x, y, z;
274  Dim3(const size_t x_, const size_t y_ = 1, const size_t z_ = 1) :
275  x(x_), y(y_), z(z_) {}
276  };
277 
278  Dim3 block_dim;
279  size_t num_blocks;
280  size_t num_threads_per_block;
281 
282  DeviceConfig(const size_t num_blocks_ = 0,
283  const size_t threads_per_block_x_ = 0,
284  const size_t threads_per_block_y_ = 0,
285  const size_t threads_per_block_z_ = 1) :
286  block_dim(threads_per_block_x_,threads_per_block_y_,threads_per_block_z_),
287  num_blocks(num_blocks_),
288  num_threads_per_block(block_dim.x * block_dim.y * block_dim.z)
289  {}
290 };
291 
292 
305 template<typename ScalarType,
306  typename OrdinalType,
307  class Device,
308  class MemoryTraits = void,
309  typename SizeType = size_t>
310 class CrsMatrix {
311 private:
312  typedef typename Kokkos::ViewTraits<ScalarType*,Device,void,void>::host_mirror_space host_mirror_space ;
313 public:
314  typedef Device device_type;
315  typedef ScalarType value_type;
316  typedef OrdinalType ordinal_type;
317  typedef MemoryTraits memory_traits;
318  typedef SizeType size_type;
319 
322 
355 
358 
366  typedef typename values_type::const_value_type const_value_type;
368  typedef typename values_type::non_const_value_type non_const_value_type;
369  typedef typename index_type ::non_const_value_type non_const_ordinal_type;
370 
371 #ifdef KOKKOS_USE_CUSPARSE
372  cusparseHandle_t cusparse_handle;
373  cusparseMatDescr_t cusparse_descr;
374 #endif // KOKKOS_USE_CUSPARSE
375  StaticCrsGraphType graph;
376  values_type values;
377 
378  // Launch configuration that can be used by overloads/specializations of
379  // MV_multiply(). This is a hack and needs to be replaced by a general
380  // state mechanism.
381  DeviceConfig dev_config;
382 
389  : graph(), values(), _numRows (0), _numCols (0), _nnz (0)
390  {}
391 
392  //------------------------------------
395  template<typename SType,
396  typename OType,
397  class DType,
398  class MTType,
399  typename IType>
401  graph = B.graph;
402  values = B.values;
403  _numRows = B.numRows();
404  _numCols = B.numCols();
405  _nnz = B.nnz();
406  }
407 
408  //------------------------------------
412  CrsMatrix( const std::string & arg_label ,
413  const StaticCrsGraphType & arg_graph )
414  : graph( arg_graph )
415  , values( arg_label , arg_graph.entries.dimension_0() )
416  , _numRows( arg_graph.row_map.dimension_0() - 1 )
417  , _numCols( maximum_entry( arg_graph ) + 1 )
418  , _nnz( arg_graph.entries.dimension_0() )
419  {}
420 
421  //------------------------------------
422 
447  CrsMatrix (const std::string &label,
448  OrdinalType nrows,
449  OrdinalType ncols,
450  OrdinalType annz,
451  ScalarType* val,
452  OrdinalType* rows,
453  OrdinalType* cols,
454  bool pad = false)
455  {
456  import (label, nrows, ncols, annz, val, rows, cols);
457 
458  // FIXME (mfh 09 Aug 2013) Specialize this on the Device type.
459  // Only use cuSPARSE for the Cuda Device.
460 #ifdef KOKKOS_USE_CUSPARSE
461  // FIXME (mfh 09 Aug 2013) This is actually static initialization
462  // of the library; you should do it once for the whole program,
463  // not once per matrix. We need to protect this somehow.
464  cusparseCreate (&cusparse_handle);
465 
466  // This is a per-matrix attribute. It encapsulates things like
467  // whether the matrix is lower or upper triangular, etc. Ditto
468  // for other TPLs like MKL.
469  cusparseCreateMatDescr (&cusparse_descr);
470 #endif // KOKKOS_USE_CUSPARSE
471  }
472 
486  CrsMatrix (const std::string &label,
487  OrdinalType nrows,
488  OrdinalType ncols,
489  OrdinalType annz,
490  values_type vals,
491  row_map_type rows,
492  index_type cols) :
493  _numRows (nrows),
494  _numCols (ncols),
495  _nnz (annz)
496  {
497  graph.row_map = rows;
498  graph.entries = cols;
499  values = vals;
500 #ifdef KOKKOS_USE_CUSPARSE
501  cusparseCreate(&cusparse_handle);
502  cusparseCreateMatDescr(&cusparse_descr);
503 #endif // KOKKOS_USE_CUSPARSE
504  }
505 
519  CrsMatrix (const std::string &label,
520  OrdinalType ncols,
521  values_type vals,
522  StaticCrsGraphType graph_) :
523  graph(graph_),
524  values(vals),
525  _numRows (graph_.row_map.dimension_0()-1),
526  _numCols (ncols),
527  _nnz (graph_.entries.dimension_0())
528  {
529 #ifdef KOKKOS_USE_CUSPARSE
530  cusparseCreate(&cusparse_handle);
531  cusparseCreateMatDescr(&cusparse_descr);
532 #endif // KOKKOS_USE_CUSPARSE
533  }
534 
535  void
536  import (const std::string &label,
537  OrdinalType nrows,
538  OrdinalType ncols,
539  OrdinalType annz,
540  ScalarType* val,
541  OrdinalType* rows,
542  OrdinalType* cols);
543 
545  void
546  generate (const std::string &label,
547  OrdinalType nrows,
548  OrdinalType ncols,
549  OrdinalType target_nnz,
550  OrdinalType varianz_nel_row,
551  OrdinalType width_row);
552 
553  void
554  generate (const std::string &label,
555  OrdinalType nrows,
556  OrdinalType ncols,
557  OrdinalType cols_per_row);
558 
559  void
560  generate (const std::string &label);
561 
562  void
563  generateHostGraph (OrdinalType nrows,
564  OrdinalType ncols,
565  OrdinalType cols_per_row);
566 
567  // FIXME (mfh 29 Sep 2013) See notes on the three-argument version
568  // of this method below.
569  void
570  insertInGraph(OrdinalType rowi, OrdinalType col)
571  {
572  insertInGraph(rowi, &col, 1);
573  }
574 
575  // FIXME (mfh 29 Sep 2013) We need a way to disable atomic updates
576  // for ScalarType types that do not support them. We're pretty much
577  // limited to ScalarType = float, double, and {u}int{32,64}_t. It
578  // could make sense to do atomic add updates elementwise for complex
579  // numbers, but that's about it unless we have transactional memory
580  // extensions. Dan Sunderland explained to me that the "array of
581  // atomic int 'locks'" approach (for ScalarType that don't directly
582  // support atomic updates) won't work on GPUs.
583  KOKKOS_INLINE_FUNCTION
584  void
585  sumIntoValues (const OrdinalType rowi,
586  const OrdinalType cols[],
587  const size_t ncol,
588  ScalarType vals[],
589  const bool force_atomic = false) const
590  {
591  SparseRowView<CrsMatrix> row_view = this->row (rowi);
592  const int length = row_view.length;
593  for (size_t i = 0; i < ncol; ++i) {
594  for (int j = 0; j < length; ++j) {
595  if (row_view.colidx(j) == cols[i]) {
596  if (force_atomic) {
597  atomic_add(&row_view.value(j), vals[i]);
598  } else {
599  row_view.value(j) += vals[i];
600  }
601  }
602  }
603  }
604  }
605 
606  // FIXME (mfh 29 Sep 2013) See above notes on sumIntoValues.
607  KOKKOS_INLINE_FUNCTION
608  void
609  replaceValues (const OrdinalType rowi,
610  const OrdinalType cols[],
611  const size_t ncol,
612  ScalarType vals[],
613  const bool force_atomic = false) const
614  {
615  SparseRowView<CrsMatrix> row_view = this->row (rowi);
616  const int length = row_view.length;
617  for (size_t i = 0; i < ncol; ++i) {
618  for (int j = 0; j < length; ++j) {
619  if (row_view.colidx(j) == cols[i]) {
620  if (force_atomic) {
621  atomic_assign(&row_view.value(j), vals[i]);
622  } else {
623  row_view.value(j) = vals[i];
624  }
625  }
626  }
627  }
628  }
629 
630  // FIXME (mfh 29 Sep 2013) It doesn't really make sense to template
631  // on the scalar or ordinal types of the input, since direct
632  // assignment of the underlying Views (which is how this operator
633  // works) won't work if the types aren't compatible. It would make
634  // more sense to template on things like the Device and
635  // MemoryTraits.
636  //
637  // COMMENT: (CRT 1 Oct 2013) the alternative it to template on the incoming type,
638  // But that way it still matches this function even if you try to assign a vector
639  // to a matrix. Using the same scalar type and ordinal type as the 'this' matrix does
640  // not necessaryily work because of things like const / non-const
641 
642  template<typename aScalarType, typename aOrdinalType, class aDevice, class aMemoryTraits,typename aSizeType>
643  CrsMatrix&
644  operator= (const CrsMatrix<aScalarType,aOrdinalType,aDevice,aMemoryTraits, aSizeType>& mtx)
645  {
646  _numRows = mtx.numRows();
647  _numCols = mtx.numCols();
648  _nnz = mtx.nnz();
649  graph = mtx.graph;
650  values = mtx.values;
651  dev_config = mtx.dev_config;
652  return *this;
653  }
654 
656  KOKKOS_INLINE_FUNCTION
657  ordinal_type numRows() const {
658  return _numRows;
659  }
661  KOKKOS_INLINE_FUNCTION
662  ordinal_type numCols() const {
663  return _numCols;
664  }
666  KOKKOS_INLINE_FUNCTION
667  ordinal_type nnz() const {
668  return _nnz;
669  }
670 
671  friend struct SparseRowView<CrsMatrix>;
672 
674  KOKKOS_INLINE_FUNCTION
676  const int start = graph.row_map(i);
677  const int count = graph.row_map(i+1) - start;
678 
679 
680  // If the last row in a matrix has zero entries the SparseRowView constructor
681  // taking views will not pass the bounds check. The violation is only done
682  // in an address calculation [ &values(start) ] and is not used since count is
683  // zero, but the bounds check will fail. This will most likely also trigger a
684  // invalid read message with valgrind.
685 #if defined ( KOKKOS_EXPRESSION_CHECK )
686  if(count==0) return SparseRowView<CrsMatrix> (NULL,NULL,1,0);
687 #endif
688  return SparseRowView<CrsMatrix> (values, graph.entries, 1, count,start);
689  }
690 
692  KOKKOS_INLINE_FUNCTION
694  const int start = graph.row_map(i);
695  const int count = graph.row_map(i+1) - start;
696 
697 #if defined ( KOKKOS_EXPRESSION_CHECK )
698  if(count==0) return SparseRowViewConst<CrsMatrix> (NULL,NULL,1,0);
699 #endif
700  return SparseRowViewConst<CrsMatrix> (values, graph.entries, 1, count,start);
701  }
702 
703 private:
704 
705  ordinal_type _numRows;
706  ordinal_type _numCols;
707  ordinal_type _nnz;
708 
709 public:
710 
711  // FIXME: [HCE 2013-12-03] The following members will be removed soon.
712 
713  // FIXME (mfh 28 Sep 2013) std::vector should never appear in this
714  // class, except perhaps as an input format for compatibility.
715 
716  std::vector<OrdinalType> h_entries_;
717  std::vector<OrdinalType> rows_;
718 
719  // FIXME (mfh 29 Sep 2013) There should not be an "insertInGraph"
720  // method. If you want to insert into the graph, you should get the
721  // graph and insert into it. If you want to change the structure of
722  // the matrix, you should be required to specify a value to put in
723  // the new spot.
724  //
725  // Furthermore, this should be a device function, by analogy with
726  // UnorderedMap.
727  void
728  insertInGraph (const OrdinalType rowi, OrdinalType *cols, const size_t ncol)
729  {
730  OrdinalType* const start = &h_entries_[rows_[rowi]];
731  OrdinalType* const end = &h_entries_[rows_[rowi+1]];
732  for (size_t i = 0; i < ncol; ++i) {
733  OrdinalType *iter = start;
734  while (iter < end && *iter != -1 && *iter != cols[i]) {
735  ++iter;
736  }
737 
738  // FIXME (mfh 29 Sep 2013) Use of assert() statements is only
739  // acceptable for debugging. It's legitimate for insertions to
740  // fail. We should use the techniques that Dan Sunderland uses
741  // in UnorderedMap; for example:
742  //
743  // 1. Insertion should return an indication of success or failure
744  // 2. The graph should keep track of the number of failed insertions
745 
746  assert (iter != end );
747  *iter = cols[i];
748  }
749  }
750 
751 };
752 
753 //----------------------------------------------------------------------------
754 //----------------------------------------------------------------------------
755 
756 template< typename ScalarType , typename OrdinalType, class Device, class MemoryTraits, typename SizeType >
757 void
758 CrsMatrix<ScalarType , OrdinalType, Device, MemoryTraits, SizeType >::
759 import (const std::string &label,
760  OrdinalType nrows,
761  OrdinalType ncols,
762  OrdinalType annz,
763  ScalarType* val,
764  OrdinalType* rows,
765  OrdinalType* cols)
766 {
767  std::string str = label;
768  values = values_type (str.append (".values"), annz);
769 
770  _numRows = nrows;
771  _numCols = ncols;
772  _nnz = annz;
773 
774  // FIXME (09 Aug 2013) CrsArray only takes std::vector for now.
775  // We'll need to fix that.
776  std::vector<int> row_lengths (_numRows, 0);
777 
778  // FIXME (mfh 21 Jun 2013) This calls for a parallel_for kernel.
779  for (int i = 0; i < _numRows; ++i) {
780  row_lengths[i] = rows[i + 1] - rows[i];
781  }
782 
783  str = label;
784  graph = Kokkos::create_staticcrsgraph<StaticCrsGraphType> (str.append (".graph"), row_lengths);
785  typename values_type::HostMirror h_values = Kokkos::create_mirror_view (values);
786  typename index_type::HostMirror h_entries = Kokkos::create_mirror_view (graph.entries);
787 
788  // FIXME (mfh 21 Jun 2013) This needs to be a parallel copy.
789  // Furthermore, why are the arrays copied twice? -- once here, to a
790  // host view, and once below, in the deep copy?
791  for (OrdinalType i = 0; i < _nnz; ++i) {
792  if (val) {
793  h_values(i) = val[i];
794  }
795  h_entries(i) = cols[i];
796  }
797 
798  Kokkos::deep_copy (values, h_values);
799  Kokkos::deep_copy (graph.entries, h_entries);
800 }
801 
802 template<typename ScalarType, typename OrdinalType, class Device, class MemoryTraits, typename SizeType>
803 void
805 generate (const std::string &label,
806  OrdinalType nrows,
807  OrdinalType ncols,
808  OrdinalType target_nnz,
809  OrdinalType varianz_nel_row,
810  OrdinalType width_row)
811 {
812  _numRows = nrows;
813  _numCols = ncols;
814 
815  graph.row_map = row_map_type ("CrsMatrix::rowPtr", nrows + 1);
816  typename row_map_type::HostMirror h_row_map = Kokkos::create_mirror_view (graph.row_map);
817 
818  // FIXME (mfh 21 Jun 2013) What is this method actualy doing? It
819  // looks like it's not setting the structure or values of the matrix
820  // at all.
821 
822  OrdinalType elements_per_row = target_nnz / nrows;
823  srand(13721);
824  h_row_map(0) = 0;
825 
826  for (int rowi = 0; rowi < nrows; ++rowi) {
827  // int varianz = (1.0 * rand() / INT_MAX - 0.5) * varianz_nel_row;
828  // h_row_map(row + 1) = h_row_map(row) + elements_per_row + varianz;
829  }
830 
831  _nnz = h_row_map(nrows);
832  values = values_type("CrsMatrix::values", _nnz);
833  graph.entries = index_type("CrsMatrix::colInd", _nnz);
834  typename values_type::HostMirror h_values = Kokkos::create_mirror_view(values);
835  typename index_type::HostMirror h_entries = Kokkos::create_mirror_view(graph.entries);
836 
837  for(int rowi = 0; rowi < nrows; rowi++) {
838  for(int k = h_row_map(rowi); k < h_row_map(rowi + 1); k++) {
839  //int pos = (1.0 * rand() / INT_MAX - 0.5) * width_row;
840 
841  //if(pos < 0) pos += ncols;
842 
843  // if(pos >= ncols) pos -= ncols;
844 
845  // h_entries(k) = pos;
846  // h_values(k) = 100.0 * rand() / INT_MAX - 50.0;
847  }
848  }
849 
850  Kokkos::deep_copy(values, h_values);
851  Kokkos::deep_copy(graph.entries, h_entries);
852  Kokkos::deep_copy(graph.row_map, h_row_map);
853 
854 }
855 
856 template<typename ScalarType, typename OrdinalType, class Device, class MemoryTraits, typename SizeType>
857 void
859 generate (const std::string &label,
860  OrdinalType nrows,
861  OrdinalType ncols,
862  OrdinalType cols_per_row)
863 {
864  _numRows = nrows;
865  _numCols = ncols;
866  _nnz = nrows*cols_per_row;
867 
868  std::string str = label;
869  values = values_type (str.append (".values"), _nnz);
870 
871 
872  std::vector<int> row_lengths (_numRows, 0);
873 
874  // FIXME (mfh 21 Jun 2013) This calls for a parallel_for kernel.
875  for (int i = 0; i < _numRows; ++i) {
876  row_lengths[i] = cols_per_row;
877  }
878 
879  str = label;
880  graph = Kokkos::create_staticcrsgraph<StaticCrsGraphType> (str.append (".graph"), row_lengths);
881  typename values_type::HostMirror h_values = Kokkos::create_mirror_view (values);
882  typename index_type::HostMirror h_entries = Kokkos::create_mirror_view (graph.entries);
883 
884  // FIXME (mfh 21 Jun 2013) Why is this copy not a parallel copy?
885  // Furthermore, why are the arrays copied twice? -- once here, to a
886  // host view, and once below, in the deep copy?
887  for (OrdinalType i = 0; i < _nnz; ++i) {
888  h_values(i) = ScalarType();
889  h_entries(i) = OrdinalType();
890  }
891 
892  Kokkos::deep_copy (values, h_values);
893  Kokkos::deep_copy (graph.entries, h_entries);
894 }
895 template<typename ScalarType, typename OrdinalType, class Device, class MemoryTraits, typename SizeType>
896 void
898 generate (const std::string &label)
899 {
900  // Compress the entries
901  size_t ptr_from= 0, ptr_to = 0;
902  int cur_row = 0;
903  while ( ptr_from < h_entries_.size()) {
904  size_t row_stop = rows_[cur_row+1];
905  while (ptr_from < row_stop) {
906  if ( h_entries_[ptr_from] == OrdinalType(-1)) {
907  ptr_from = row_stop;
908  } else {
909  h_entries_[ptr_to++] = h_entries_[ptr_from++];
910  }
911  }
912  rows_[++cur_row] = ptr_to;
913  }
914  OrdinalType nrows = rows_.size()-1;
915  OrdinalType nnz_ = ptr_to;
916 
917  h_entries_.resize(nnz_);
918 
919  //sort the rows
920  for (OrdinalType i=0; i<nrows; ++i )
921  std::sort(&h_entries_[i], &h_entries_[i+1]);
922 
923  // generate the matrix
924  import(label, nrows, nrows, nnz_, NULL, &rows_[0], &h_entries_[0]);
925 
926 }
927 
928 
929 template<typename ScalarType, typename OrdinalType, class Device, class MemoryTraits, typename SizeType>
930 void
931 CrsMatrix<ScalarType, OrdinalType, Device, MemoryTraits, SizeType >::
932 generateHostGraph ( OrdinalType nrows,
933  OrdinalType ncols,
934  OrdinalType cols_per_row)
935 {
936  _numRows = nrows;
937  _numCols = ncols;
938  _nnz = nrows*cols_per_row;
939 
940  h_entries_.resize(_nnz, OrdinalType(-1));
941  rows_.resize(_numRows+1);
942  rows_[0] = 0;
943  for (OrdinalType i = 0; i < _numRows; ++i)
944  rows_[i+1] = rows_[i]+cols_per_row;
945 
946 }
947 
948 template<class DeviceType>
949 inline int RowsPerThread(const int NNZPerRow) {
950  if(NNZPerRow == 0) return 1;
951  int result = 2;
952  while(result*NNZPerRow <= 2048) {
953  result*=2;
954  }
955  return result/2;
956 }
957 #ifdef KOKKOS_HAVE_CUDA
958 template<>
959 inline int RowsPerThread<Kokkos::Cuda>(const int NNZPerRow) {
960  return 1;
961 }
962 #endif
963 
964 //----------------------------------------------------------------------------
965 
966 template< class DeviceType , typename ScalarType , int NNZPerRow=27>
967 struct MV_MultiplyShflThreadsPerRow {
968 private:
969 
970  typedef typename Kokkos::Impl::remove_const< ScalarType >::type value_type ;
971 
972 #ifdef KOKKOS_HAVE_CUDA
973  enum { shfl_possible =
974  Kokkos::Impl::is_same< DeviceType , Kokkos::Cuda >::value &&
975  (
976  Kokkos::Impl::is_same< value_type , unsigned int >::value ||
977  Kokkos::Impl::is_same< value_type , int >::value ||
978  Kokkos::Impl::is_same< value_type , float >::value ||
979  Kokkos::Impl::is_same< value_type , double >::value
980  )};
981 #else // NOT KOKKOS_HAVE_CUDA
982  enum { shfl_possible = 0 };
983 #endif // KOKKOS_HAVE_CUDA
984 
985 public:
986 
987 #if defined( __CUDA_ARCH__ )
988  enum { device_value = shfl_possible && ( 300 <= __CUDA_ARCH__ ) ?
989  (NNZPerRow<8?2:
990  (NNZPerRow<16?4:
991  (NNZPerRow<32?8:
992  (NNZPerRow<64?16:
993  32))))
994  :1 };
995 #else
996  enum { device_value = 1 };
997 #endif
998 
999 #ifdef KOKKOS_HAVE_CUDA
1000  inline static int host_value()
1001  { return shfl_possible && ( 300 <= Kokkos::Cuda::device_arch() ) ?
1002  (NNZPerRow<8?2:
1003  (NNZPerRow<16?4:
1004  (NNZPerRow<32?8:
1005  (NNZPerRow<64?16:
1006  32))))
1007  :1; }
1008 #else // NOT KOKKOS_HAVE_CUDA
1009  inline static int host_value() { return 1; }
1010 #endif // KOKKOS_HAVE_CUDA
1011 };
1012 
1013 //----------------------------------------------------------------------------
1014 
1015 template<class RangeVector,
1016  class CrsMatrix,
1017  class DomainVector,
1018  class CoeffVector1,
1019  class CoeffVector2,
1020  int doalpha,
1021  int dobeta,
1022  int ExplicitVectorLength = 8>
1023 struct MV_MultiplyFunctor {
1024  typedef typename CrsMatrix::device_type device_type ;
1025  typedef typename CrsMatrix::ordinal_type size_type ;
1026  typedef typename CrsMatrix::non_const_value_type value_type ;
1027  typedef typename Kokkos::View<value_type*, device_type> range_values;
1028  typedef typename Kokkos::TeamPolicy< device_type > team_policy ;
1029  typedef typename team_policy::member_type team_member ;
1030 
1031  typedef Vectorization<device_type,ExplicitVectorLength> vectorization;
1032 
1033  CoeffVector1 beta;
1034  CoeffVector2 alpha;
1035  CrsMatrix m_A ;
1036  DomainVector m_x ;
1037  RangeVector m_y ;
1038  size_type n;
1039  int rows_per_thread;
1040 
1041  MV_MultiplyFunctor(const CoeffVector1 beta_,
1042  const CoeffVector2 alpha_,
1043  const CrsMatrix m_A_,
1044  const DomainVector m_x_,
1045  const RangeVector m_y_,
1046  const size_type n_,
1047  const int rows_per_thread_):
1048  beta(beta_), alpha(alpha_), m_A(m_A_), m_x(m_x_), m_y(m_y_), n(n_), rows_per_thread(rows_per_thread_) {}
1049 
1050  //--------------------------------------------------------------------------
1051 
1052  template<int UNROLL>
1053  KOKKOS_INLINE_FUNCTION
1054  void strip_mine (const team_member & dev, const size_type & iRow, const size_type& kk) const {
1055 
1056  value_type sum[UNROLL];
1057 
1058 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1059 #pragma ivdep
1060 #endif
1061 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1062 #pragma unroll
1063 #endif
1064  for (size_type k = 0 ; k < UNROLL ; ++k) {
1065  // NOTE (mfh 09 Aug 2013) This requires that assignment from int
1066  // (in this case, 0) to value_type be defined. It's not for
1067  // types like arprec and dd_real.
1068  //
1069  // mfh 29 Sep 2013: On the other hand, arprec and dd_real won't
1070  // work on CUDA devices anyway, since their methods aren't
1071  // device functions. arprec has other issues (e.g., dynamic
1072  // memory allocation, and the array-of-structs memory layout
1073  // which is unfavorable to GPUs), but could be dealt with in the
1074  // same way as Sacado's AD types.
1075  sum[k] = 0;
1076  }
1077 
1078  const SparseRowView<CrsMatrix> row = m_A.row(iRow);
1079 
1080 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1081 #pragma ivdep
1082 #endif
1083 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1084 #pragma unroll
1085 #endif
1086 #ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
1087 #pragma loop count (15)
1088 #endif
1089  for (size_type iEntry = vectorization::begin(); iEntry < row.length; iEntry += vectorization::increment ) {
1090  const value_type val = row.value(iEntry);
1091  const size_type ind = row.colidx(iEntry);
1092 
1093 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1094 #pragma unroll
1095 #endif
1096  for (size_type k = 0; k < UNROLL; ++k) {
1097  sum[k] += val * m_x(ind, kk + k);
1098  }
1099  }
1100 
1101  if(doalpha == -1)
1102  for (int ii=0; ii < UNROLL; ++ii) {
1103  sum[ii] = -vectorization::reduce(sum[ii]);
1104  }
1105  else
1106  for (int ii=0; ii < UNROLL; ++ii) {
1107  sum[ii] = vectorization::reduce(sum[ii]);
1108  }
1109 
1110  if (vectorization::is_lane_0(dev)) {
1111  if(doalpha * doalpha != 1) {
1112 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1113 #pragma ivdep
1114 #endif
1115 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1116 #pragma unroll
1117 #endif
1118  for (size_type k = 0; k < UNROLL; ++k) {
1119  sum[k] *= alpha(kk + k);
1120  }
1121  }
1122 
1123  if (dobeta == 0) {
1124 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1125 #pragma ivdep
1126 #endif
1127 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1128 #pragma unroll
1129 #endif
1130  for (size_type k = 0; k < UNROLL; ++k) {
1131  m_y(iRow, kk + k) = sum[k];
1132  }
1133  } else if(dobeta == 1) {
1134 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1135 #pragma ivdep
1136 #endif
1137 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1138 #pragma unroll
1139 #endif
1140  for (size_type k = 0; k < UNROLL; ++k) {
1141  m_y(iRow, kk + k) += sum[k];
1142  }
1143  } else if (dobeta == -1) {
1144 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1145 #pragma ivdep
1146 #endif
1147 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1148 #pragma unroll
1149 #endif
1150  for (size_type k = 0; k < UNROLL; ++k) {
1151  m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k];
1152  }
1153  } else {
1154 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1155 #pragma ivdep
1156 #endif
1157 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1158 #pragma unroll
1159 #endif
1160  for (size_type k = 0; k < UNROLL; ++k) {
1161  m_y(iRow, kk + k) = beta(kk + k) * m_y(iRow, kk + k) + sum[k] ;
1162  }
1163  }
1164  }
1165  }
1166 
1167  KOKKOS_INLINE_FUNCTION
1168  void strip_mine_1 (const team_member & dev, const size_type& iRow) const {
1169  value_type sum = 0;
1170 
1171  const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow);
1172 
1173 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1174 #pragma ivdep
1175 #endif
1176 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1177 #pragma unroll
1178 #endif
1179 #ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
1180 #pragma loop count (15)
1181 #endif
1182  for(size_type iEntry = vectorization::begin(); iEntry < row.length; iEntry += vectorization::increment) {
1183  sum += row.value(iEntry) * m_x(row.colidx(iEntry),0);
1184  }
1185 
1186  sum = vectorization::reduce(sum);
1187 
1188  if (vectorization::is_lane_0(dev)) {
1189 
1190  if(doalpha == -1)
1191  sum *= value_type(-1);
1192  else if(doalpha * doalpha != 1) {
1193  sum *= alpha(0);
1194  }
1195 
1196  if (dobeta == 0) {
1197  m_y(iRow, 0) = sum ;
1198  } else if (dobeta == 1) {
1199  m_y(iRow, 0) += sum ;
1200  } else if (dobeta == -1) {
1201  m_y(iRow, 0) = -m_y(iRow, 0) + sum;
1202  } else {
1203  m_y(iRow, 0) = beta(0) * m_y(iRow, 0) + sum;
1204  }
1205  }
1206  }
1207 
1208 
1209  KOKKOS_INLINE_FUNCTION
1210  void operator()(const team_member & dev) const {
1211  for(int loop = 0; loop < rows_per_thread; loop++) {
1212  const size_type iRow = vectorization::global_thread_rank(dev) * rows_per_thread + loop;
1213  if(iRow>=m_A.numRows()) return;
1214 
1215  size_type kk = 0;
1216 
1217 #ifdef KOKKOS_FAST_COMPILE
1218  for (; kk + 4 <= n; kk += 4) {
1219  strip_mine<4>(dev, iRow, kk);
1220  }
1221  for( ; kk < n; ++kk) {
1222  strip_mine<1>(dev, iRow, kk);
1223  }
1224 #else
1225 # ifdef __CUDA_ARCH__
1226  if ((n > 8) && (n % 8 == 1)) {
1227  strip_mine<9>(dev, iRow, kk);
1228  kk += 9;
1229  }
1230  for(; kk + 8 <= n; kk += 8)
1231  strip_mine<8>(dev, iRow, kk);
1232  if(kk < n)
1233  switch(n - kk) {
1234 # else // NOT a CUDA device
1235  if ((n > 16) && (n % 16 == 1)) {
1236  strip_mine<17>(dev, iRow, kk);
1237  kk += 17;
1238  }
1239 
1240  for (; kk + 16 <= n; kk += 16) {
1241  strip_mine<16>(dev, iRow, kk);
1242  }
1243 
1244  if(kk < n)
1245  switch(n - kk) {
1246  case 15:
1247  strip_mine<15>(dev, iRow, kk);
1248  break;
1249 
1250  case 14:
1251  strip_mine<14>(dev, iRow, kk);
1252  break;
1253 
1254  case 13:
1255  strip_mine<13>(dev, iRow, kk);
1256  break;
1257 
1258  case 12:
1259  strip_mine<12>(dev, iRow, kk);
1260  break;
1261 
1262  case 11:
1263  strip_mine<11>(dev, iRow, kk);
1264  break;
1265 
1266  case 10:
1267  strip_mine<10>(dev, iRow, kk);
1268  break;
1269 
1270  case 9:
1271  strip_mine<9>(dev, iRow, kk);
1272  break;
1273 
1274  case 8:
1275  strip_mine<8>(dev, iRow, kk);
1276  break;
1277 # endif // __CUDA_ARCH__
1278  case 7:
1279  strip_mine<7>(dev, iRow, kk);
1280  break;
1281 
1282  case 6:
1283  strip_mine<6>(dev, iRow, kk);
1284  break;
1285 
1286  case 5:
1287  strip_mine<5>(dev, iRow, kk);
1288  break;
1289 
1290  case 4:
1291  strip_mine<4>(dev, iRow, kk);
1292  break;
1293 
1294  case 3:
1295  strip_mine<3>(dev, iRow, kk);
1296  break;
1297 
1298  case 2:
1299  strip_mine<2>(dev, iRow, kk);
1300  break;
1301 
1302  case 1:
1303  strip_mine_1(dev, iRow);
1304  break;
1305  }
1306 #endif // KOKKOS_FAST_COMPILE
1307  }
1308  }
1309  };
1310 
1311  template<class RangeVector,
1312  class CrsMatrix,
1313  class DomainVector,
1314  class CoeffVector1,
1315  class CoeffVector2,
1316  int doalpha,
1317  int dobeta,
1318  int ExplicitVectorLength = 8>
1319  struct MV_MultiplySingleFunctor {
1320  typedef typename CrsMatrix::device_type device_type ;
1321  typedef typename CrsMatrix::ordinal_type size_type ;
1322  typedef typename CrsMatrix::non_const_value_type value_type ;
1323  typedef typename Kokkos::View<value_type*, typename CrsMatrix::device_type> range_values;
1324  typedef typename Kokkos::TeamPolicy< device_type > team_policy ;
1325  typedef typename team_policy::member_type team_member ;
1326 
1327  //typedef MV_MultiplyShflThreadsPerRow< device_type , value_type , NNZPerRow > ShflThreadsPerRow ;
1328  typedef Vectorization<device_type,ExplicitVectorLength> vectorization;
1329 
1330  CoeffVector1 beta;
1331  CoeffVector2 alpha;
1332  CrsMatrix m_A ;
1333  DomainVector m_x ;
1334  RangeVector m_y ;
1335  int rows_per_thread;
1336 
1337  MV_MultiplySingleFunctor(
1338  const CoeffVector1 beta_,
1339  const CoeffVector2 alpha_,
1340  const CrsMatrix m_A_,
1341  const DomainVector m_x_,
1342  const RangeVector m_y_,
1343  const int rows_per_thread_):
1344  beta(beta_), alpha(alpha_),
1345  m_A(m_A_), m_x(m_x_), m_y(m_y_),
1346  rows_per_thread(rows_per_thread_) {}
1347 
1348  KOKKOS_INLINE_FUNCTION
1349  void operator()(const team_member & dev) const {
1350 
1351  for(int loop = 0; loop < rows_per_thread; loop++) {
1352  const size_type iRow = vectorization::global_thread_rank(dev)*rows_per_thread + loop;
1353  if(iRow>=m_A.numRows()) return;
1354  const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow);
1355  const size_type row_length = row.length ;
1356  value_type sum = 0;
1357 
1358  #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
1359  #pragma ivdep
1360  #endif
1361  #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1362  #pragma unroll
1363  #endif
1364  #ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
1365  #pragma loop count (15)
1366  #endif
1367  for (size_type iEntry = vectorization::begin(); iEntry < row_length; iEntry += vectorization::increment) {
1368  sum += row.value(iEntry) * m_x(row.colidx(iEntry));
1369  }
1370 
1371  sum = vectorization::reduce(sum);
1372 
1373 
1374  if (vectorization::is_lane_0(dev)) {
1375  if (doalpha == -1) sum *= value_type(-1);
1376  else if (doalpha * doalpha != 1) {
1377  sum *= alpha(0);
1378  }
1379 
1380  if (dobeta == 0) {
1381  m_y(iRow) = sum ;
1382  } else if (dobeta == 1) {
1383  m_y(iRow) += sum ;
1384  } else if (dobeta == -1) {
1385  m_y(iRow) = -m_y(iRow) + sum;
1386  } else {
1387  m_y(iRow) = beta(0) * m_y(iRow) + sum;
1388  }
1389  }
1390  }
1391  }
1392  };
1393 
1394  namespace Impl {
1395 
1396  template <class RangeVector,
1397  class CrsMatrix,
1398  class DomainVector,
1399  class CoeffVector1,
1400  class CoeffVector2>
1401  void
1402  MV_Multiply_Check_Compatibility (const CoeffVector1 &betav,
1403  const RangeVector &y,
1404  const CoeffVector2 &alphav,
1405  const CrsMatrix &A,
1406  const DomainVector &x,
1407  const int& doalpha,
1408  const int& dobeta)
1409  {
1410  typename DomainVector::size_type numVecs = x.dimension_1();
1411  typename DomainVector::size_type numRows = A.numRows();
1412  typename DomainVector::size_type numCols = A.numCols();
1413 
1414  if (y.dimension_1() != numVecs) {
1415  std::ostringstream msg;
1416  msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of y and x do not match\n";
1417  msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b("
1418  << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a("
1419  << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x("
1420  << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x("
1421  << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n";
1422  msg << "\t Dimensions are: y(" << y.dimension_0() << "," << y.dimension_1() << ") x(" << x.dimension_0() << "," << x.dimension_1() << ")\n";
1423  Impl::throw_runtime_exception( msg.str() );
1424  }
1425  if (numRows > y.dimension_0()) {
1426  std::ostringstream msg;
1427  msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): dimensions of y and A do not match\n";
1428  msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b("
1429  << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a("
1430  << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x("
1431  << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x("
1432  << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n";
1433  msg << "\t Dimensions are: y(" << y.dimension_0() << "," << y.dimension_1() << ") A(" << A.numRows() << "," << A.numCols() << ")\n";
1434  Impl::throw_runtime_exception( msg.str() );
1435  }
1436  if (numCols > x.dimension_0()) {
1437  std::ostringstream msg;
1438  msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): dimensions of x and A do not match\n";
1439  msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b("
1440  << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a("
1441  << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x("
1442  << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x("
1443  << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n";
1444  msg << "\t Dimensions are: x(" << x.dimension_0() << "," << x.dimension_1() << ") A(" << A.numRows() << "," << A.numCols() << ")\n";
1445  Impl::throw_runtime_exception( msg.str() );
1446  }
1447  if (dobeta==2) {
1448  if (betav.dimension_0()!=numVecs) {
1449  std::ostringstream msg;
1450  msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of y and b do not match\n";
1451  msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b("
1452  << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a("
1453  << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x("
1454  << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x("
1455  << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n";
1456  msg << "\t Dimensions are: y(" << y.dimension_0() << "," << y.dimension_1() << ") b(" << betav.dimension_0() << ")\n";
1457  Impl::throw_runtime_exception( msg.str() );
1458  }
1459  }
1460  if(doalpha==2) {
1461  if(alphav.dimension_0()!=numVecs) {
1462  std::ostringstream msg;
1463  msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of x and b do not match\n";
1464  msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b("
1465  << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a("
1466  << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x("
1467  << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x("
1468  << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n";
1469  msg << "\t Dimensions are: x(" << x.dimension_0() << "," << x.dimension_1() << ") b(" << betav.dimension_0() << ")\n";
1470  Impl::throw_runtime_exception( msg.str() );
1471  }
1472  }
1473  }
1474  } // namespace Impl
1475 
1476  // This TansposeFunctor is functional, but not necessarily performant.
1477  template<class RangeVector,
1478  class CrsMatrix,
1479  class DomainVector,
1480  class CoeffVector1,
1481  class CoeffVector2,
1482  int doalpha,
1483  int dobeta,
1484  int NNZPerRow = 27 >
1485  struct MV_MultiplyTransposeFunctor {
1486  typedef typename CrsMatrix::device_type device_type ;
1487  typedef typename CrsMatrix::ordinal_type size_type ;
1488  typedef typename CrsMatrix::non_const_value_type value_type ;
1489  typedef typename Kokkos::View<value_type*, device_type> range_values;
1490 
1491  typedef MV_MultiplyShflThreadsPerRow< device_type , value_type , NNZPerRow > ShflThreadsPerRow ;
1492 
1493  CoeffVector1 beta;
1494  CoeffVector2 alpha;
1495  CrsMatrix m_A ;
1496  DomainVector m_x ;
1497  RangeVector m_y ;
1498  size_type n;
1499 
1500  KOKKOS_INLINE_FUNCTION
1501  void operator()(int i) const {
1502  const size_type iRow = i/ShflThreadsPerRow::device_value;
1503  const int lane = i%ShflThreadsPerRow::device_value;
1504 
1505  const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow);
1506 
1507  for (size_type iEntry = lane; iEntry < row.length; iEntry += ShflThreadsPerRow::device_value ) {
1508  const value_type val = row.value(iEntry);
1509  const size_type ind = row.colidx(iEntry);
1510 
1511  if(doalpha!=1) {
1512 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1513 #pragma unroll
1514 #endif
1515  for (size_type k = 0; k < n; ++k) {
1516  atomic_add(&m_y(ind,k), value_type(alpha(k) * val * m_x(iRow, k)));
1517  }
1518  } else {
1519 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
1520 #pragma unroll
1521 #endif
1522  for (size_type k = 0; k < n; ++k) {
1523  atomic_add(&m_y(ind,k), value_type(val * m_x(iRow, k)));
1524  }
1525  }
1526  }
1527  }
1528  };
1529 
1530  // This TansposeFunctor is functional, but not necessarily performant.
1531  template<class RangeVector,
1532  class CrsMatrix,
1533  class DomainVector,
1534  class CoeffVector1,
1535  class CoeffVector2,
1536  int doalpha,
1537  int dobeta,
1538  int NNZPerRow = 27 >
1539  struct MV_MultiplyTransposeSingleFunctor {
1540  typedef typename CrsMatrix::device_type device_type ;
1541  typedef typename CrsMatrix::ordinal_type size_type ;
1542  typedef typename CrsMatrix::non_const_value_type value_type ;
1543  typedef typename Kokkos::View<value_type*, device_type> range_values;
1544 
1545  typedef MV_MultiplyShflThreadsPerRow< device_type , value_type , NNZPerRow > ShflThreadsPerRow ;
1546 
1547  CoeffVector1 beta;
1548  CoeffVector2 alpha;
1549  CrsMatrix m_A ;
1550  DomainVector m_x ;
1551  RangeVector m_y ;
1552  size_type n;
1553 
1554  KOKKOS_INLINE_FUNCTION
1555  void operator()(int i) const {
1556  const size_type iRow = i/ShflThreadsPerRow::device_value;
1557  const int lane = i%ShflThreadsPerRow::device_value;
1558 
1559  const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow);
1560 
1561  for (size_type iEntry = lane; iEntry < row.length; iEntry += ShflThreadsPerRow::device_value ) {
1562  const value_type val = row.value(iEntry);
1563  const size_type ind = row.colidx(iEntry);
1564 
1565  if(doalpha!=1) {
1566  atomic_add(&m_y(ind), value_type(alpha(0) * val * m_x(iRow)));
1567  } else {
1568  atomic_add(&m_y(ind), value_type(val * m_x(iRow)));
1569  }
1570  }
1571  }
1572  };
1573 
1574 template <class RangeVector,
1575  class TCrsMatrix,
1576  class DomainVector,
1577  class CoeffVector1,
1578  class CoeffVector2,
1579  int doalpha,
1580  int dobeta>
1581 void
1582 MV_MultiplyTranspose (typename Kokkos::Impl::enable_if<DomainVector::Rank == 2, const CoeffVector1>::type& betav,
1583  const RangeVector &y,
1584  const CoeffVector2 &alphav,
1585  const TCrsMatrix &A,
1586  const DomainVector &x)
1587 {
1588  //Special case for zero Rows RowMap
1589  if(A.numRows() == -1) return;
1590 
1591  if (doalpha == 0) {
1592  if (dobeta==2) {
1593  MV_MulScalar(y,betav,y);
1594  } else {
1595  MV_MulScalar(y,static_cast<typename RangeVector::const_value_type> (dobeta),y);
1596  }
1597  return;
1598  } else {
1599  typedef View< typename RangeVector::non_const_data_type ,
1600  typename RangeVector::array_layout ,
1601  typename RangeVector::device_type ,
1602  typename RangeVector::memory_traits >
1603  RangeVectorType;
1604 
1605  typedef View< typename DomainVector::const_data_type ,
1606  typename DomainVector::array_layout ,
1607  typename DomainVector::device_type ,
1608  Kokkos::MemoryRandomAccess >
1609  DomainVectorType;
1610 
1611  typedef View< typename CoeffVector1::const_data_type ,
1612  typename CoeffVector1::array_layout ,
1613  typename CoeffVector1::device_type ,
1614  Kokkos::MemoryRandomAccess >
1615  CoeffVector1Type;
1616 
1617  typedef View< typename CoeffVector2::const_data_type ,
1618  typename CoeffVector2::array_layout ,
1619  typename CoeffVector2::device_type ,
1620  Kokkos::MemoryRandomAccess >
1621  CoeffVector2Type;
1622 
1623  typedef CrsMatrix<typename TCrsMatrix::const_value_type,
1624  typename TCrsMatrix::ordinal_type,
1625  typename TCrsMatrix::device_type,
1626  typename TCrsMatrix::memory_traits,
1627  typename TCrsMatrix::size_type> CrsMatrixType;
1628 
1629  //Impl::MV_Multiply_Check_Compatibility(betav,y,alphav,A,x,doalpha,dobeta);
1630 /*
1631 #ifndef KOKKOS_FAST_COMPILE
1632 
1633  if(x.dimension_1()==1) {
1634  typedef View<typename DomainVectorType::const_value_type*,typename DomainVector::array_layout ,typename DomainVectorType::device_type,Kokkos::MemoryRandomAccess> DomainVector1D;
1635  typedef View<typename DomainVectorType::const_value_type*,typename DomainVector::array_layout ,typename DomainVectorType::device_type> DomainVector1DPlain;
1636  typedef View<typename RangeVectorType::value_type*,typename RangeVector::array_layout ,typename RangeVectorType::device_type,typename RangeVector::memory_traits> RangeVector1D;
1637 
1638  typedef MV_MultiplySingleFunctor<RangeVector1D, CrsMatrixType, DomainVector1D,
1639  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta> OpType ;
1640 
1641  Kokkos::subview< RangeVector1D >( y , ALL(),0 );
1642 
1643  OpType op ;
1644  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
1645  op.m_A = A ;
1646  op.m_x = Kokkos::subview< DomainVector1DPlain >( x , ALL(),0 ) ;
1647  op.m_y = Kokkos::subview< RangeVector1D >( y , ALL(),0 ) ;
1648  op.beta = betav;
1649  op.alpha = alphav;
1650  op.n = x.dimension(1);
1651  Kokkos::parallel_for (nrow * OpType::ShflThreadsPerRow::host_value(), op);
1652 
1653  } else {
1654  typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
1655  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta> OpType ;
1656  OpType op ;
1657  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
1658  op.m_A = A ;
1659  op.m_x = x ;
1660  op.m_y = y ;
1661  op.beta = betav;
1662  op.alpha = alphav;
1663  op.n = x.dimension(1);
1664  Kokkos::parallel_for(nrow*OpType::ShflThreadsPerRow::host_value() , op);
1665  }
1666 
1667 #else // NOT KOKKOS_FAST_COMPILE
1668 */
1669  typedef MV_MultiplyTransposeFunctor<RangeVectorType, CrsMatrixType, DomainVectorType, CoeffVector1Type, CoeffVector2Type, 2, 2 >
1670  OpType ;
1671 
1672  OpType op ;
1673 
1674  int numVecs = x.dimension_1();
1675  CoeffVector1 beta = betav;
1676  CoeffVector2 alpha = alphav;
1677 
1678  if (doalpha != 2) {
1679  alpha = CoeffVector2("CrsMatrix::auto_a", numVecs);
1680  typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha);
1681  typename CoeffVector2::value_type s_a = (typename CoeffVector2::value_type) doalpha;
1682 
1683  for (int i = 0; i < numVecs; ++i)
1684  h_a(i) = s_a;
1685 
1686  Kokkos::deep_copy(alpha, h_a);
1687  }
1688 
1689  if (dobeta != 2) {
1690  beta = CoeffVector1("CrsMatrix::auto_b", numVecs);
1691  typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta);
1692  typename CoeffVector1::value_type s_b = (typename CoeffVector1::value_type) dobeta;
1693 
1694  for(int i = 0; i < numVecs; i++)
1695  h_b(i) = s_b;
1696 
1697  Kokkos::deep_copy(beta, h_b);
1698  }
1699 
1700  if (dobeta==2) {
1701  MV_MulScalar(y,betav,y);
1702  } else {
1703  if (dobeta!=1)
1704  MV_MulScalar(y,static_cast<typename RangeVector::const_value_type> (dobeta),y);
1705  }
1706 
1707  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
1708  op.m_A = A;
1709  op.m_x = x;
1710  op.m_y = y;
1711  op.beta = beta;
1712  op.alpha = alpha;
1713  op.n = x.dimension_1();
1714  Kokkos::parallel_for (nrow * OpType::ShflThreadsPerRow::host_value() , op );
1715 
1716 //#endif // KOKKOS_FAST_COMPILE
1717  }
1718 }
1719 
1720 template<class RangeVector, class CrsMatrix, class DomainVector, class CoeffVector1, class CoeffVector2>
1721 void
1722 MV_MultiplyTranspose (const CoeffVector1& betav,
1723  const RangeVector& y,
1724  const CoeffVector2& alphav,
1725  const CrsMatrix& A,
1726  const DomainVector& x,
1727  int beta,
1728  int alpha)
1729 {
1730  if (beta == 0) {
1731  if(alpha == 0)
1732  MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, 0>(betav, y, alphav, A , x);
1733  else if(alpha == 1)
1734  MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 0>(betav, y, alphav, A , x);
1735  else if(alpha == -1)
1736  MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 0 > (betav, y, alphav, A , x);
1737  else
1738  MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 0>(betav, y, alphav, A , x);
1739  } else if(beta == 1) {
1740  if(alpha == 0)
1741  return;
1742  else if(alpha == 1)
1743  MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 1>(betav, y, alphav, A , x);
1744  else if(alpha == -1)
1745  MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 1 > (betav, y, alphav, A , x);
1746  else
1747  MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 1>(betav, y, alphav, A , x);
1748  } else if(beta == -1) {
1749  if(alpha == 0)
1750  MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, -1>(betav, y, alphav, A , x);
1751  else if(alpha == 1)
1752  MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, -1 > (betav, y, alphav, A , x);
1753  else if(alpha == -1)
1754  MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, -1 > (betav, y, alphav, A , x);
1755  else
1756  MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, -1 > (betav, y, alphav, A , x);
1757  } else {
1758  if(alpha == 0)
1759  MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, 2>(betav, y, alphav, A , x);
1760  else if(alpha == 1)
1761  MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 2>(betav, y, alphav, A , x);
1762  else if(alpha == -1)
1763  MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 2 > (betav, y, alphav, A , x);
1764  else
1765  MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 2>(betav, y, alphav, A , x);
1766  }
1767 }
1768 
1769 template<class RangeVector, class CrsMatrix, class DomainVector>
1770 void
1771 MV_MultiplyTranspose (typename RangeVector::const_value_type s_b,
1772  const RangeVector& y,
1773  typename DomainVector::const_value_type s_a,
1774  const CrsMatrix& A,
1775  const DomainVector& x)
1776 {
1777 /*#ifdef KOKKOS_USE_CUSPARSE
1778  if (MV_Multiply_Try_CuSparse (s_b, y, s_a, A, x)) {
1779  return;
1780  }
1781 #endif // KOKKOSE_USE_CUSPARSE
1782 #ifdef KOKKOS_USE_MKL
1783  if (MV_Multiply_Try_MKL (s_b, y, s_a, A, x)) {
1784  return;
1785  }
1786 #endif // KOKKOS_USE_MKL*/
1788  aVector a;
1789  aVector b;
1790  int numVecs = x.dimension_1();
1791 
1792  if (s_b == 0) {
1793  if (s_a == 0)
1794  return MV_MultiplyTranspose (a, y, a, A, x, 0, 0);
1795  else if (s_a == 1)
1796  return MV_MultiplyTranspose (a, y, a, A, x, 0, 1);
1797  else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
1798  return MV_MultiplyTranspose (a, y, a, A, x, 0, -1);
1799  else {
1800  a = aVector("a", numVecs);
1801  typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
1802  for (int i = 0; i < numVecs; ++i) {
1803  h_a(i) = s_a;
1804  }
1805  Kokkos::deep_copy (a, h_a);
1806  return MV_MultiplyTranspose (a, y, a, A, x, 0, 2);
1807  }
1808  } else if (s_b == 1) {
1809  if (s_a == 0)
1810  return MV_MultiplyTranspose (a, y, a, A, x, 1, 0);
1811  else if (s_a == 1)
1812  return MV_MultiplyTranspose (a, y, a, A, x, 1, 1);
1813  else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
1814  return MV_MultiplyTranspose (a, y, a, A, x, 1, -1);
1815  else {
1816  a = aVector("a", numVecs);
1817  typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
1818  for (int i = 0; i < numVecs; ++i) {
1819  h_a(i) = s_a;
1820  }
1821  Kokkos::deep_copy (a, h_a);
1822  return MV_MultiplyTranspose (a, y, a, A, x, 1, 2);
1823  }
1824  } else if (s_b == static_cast<typename RangeVector::const_value_type> (-1)) {
1825  if (s_a == 0)
1826  return MV_MultiplyTranspose (a, y, a, A, x, -1, 0);
1827  else if (s_a == 1)
1828  return MV_MultiplyTranspose (a, y, a, A, x, -1, 1);
1829  else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
1830  return MV_MultiplyTranspose (a, y, a, A, x, -1, -1);
1831  else {
1832  a = aVector("a", numVecs);
1833  typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
1834  for (int i = 0; i < numVecs; ++i) {
1835  h_a(i) = s_a;
1836  }
1837  Kokkos::deep_copy (a, h_a);
1838  return MV_MultiplyTranspose (a, y, a, A, x, -1, 2);
1839  }
1840  } else {
1841  b = aVector("b", numVecs);
1842  typename aVector::HostMirror h_b = Kokkos::create_mirror_view(b);
1843  for (int i = 0; i < numVecs; ++i) {
1844  h_b(i) = s_b;
1845  }
1846  Kokkos::deep_copy(b, h_b);
1847 
1848  if (s_a == 0)
1849  return MV_MultiplyTranspose (b, y, a, A, x, 2, 0);
1850  else if (s_a == 1)
1851  return MV_MultiplyTranspose (b, y, a, A, x, 2, 1);
1852  else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
1853  return MV_MultiplyTranspose (b, y, a, A, x, 2, -1);
1854  else {
1855  a = aVector("a", numVecs);
1856  typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
1857  for (int i = 0; i < numVecs; ++i) {
1858  h_a(i) = s_a;
1859  }
1860  Kokkos::deep_copy (a, h_a);
1861  return MV_MultiplyTranspose (b, y, a, A, x, 2, 2);
1862  }
1863  }
1864 }
1865 
1866 template< class DeviceType >
1868 inline
1869 mv_multiply_team_policy( const int nrow , const int rows_per_thread, const int increment )
1870 {
1871 #ifdef KOKKOS_HAVE_CUDA
1872  const int teamsize = Impl::is_same< DeviceType , Kokkos::Cuda>::value ? 256 : 1;//hwloc::get_available_threads_per_core() ;
1873 #else
1874  const int teamsize = 1;//hwloc::get_available_threads_per_core();
1875 #endif
1876  const int nteams = (((nrow+rows_per_thread-1)/rows_per_thread)
1877  *increment+teamsize-1)/teamsize;
1878  return Kokkos::TeamPolicy< DeviceType >( nteams , teamsize );
1879 }
1880 
1881 
1882  template<class RangeVector,
1883  class TCrsMatrix,
1884  class DomainVector,
1885  class CoeffVector1,
1886  class CoeffVector2,
1887  int doalpha,
1888  int dobeta>
1889  void
1890  MV_MultiplySingle (typename Kokkos::Impl::enable_if<DomainVector::Rank == 1, const CoeffVector1>::type& betav,
1891  const RangeVector &y,
1892  const CoeffVector2 &alphav,
1893  const TCrsMatrix& A,
1894  const DomainVector& x)
1895  {
1896  if(A.numRows()<=0) return;
1897  if (doalpha == 0) {
1898  if (dobeta==2) {
1899  V_MulScalar(y,betav,y);
1900  }
1901  else {
1902  V_MulScalar(y,typename RangeVector::value_type(dobeta),y);
1903  }
1904  return;
1905  } else {
1906  typedef View< typename RangeVector::non_const_data_type ,
1907  typename RangeVector::array_layout ,
1908  typename RangeVector::device_type ,
1909  typename RangeVector::memory_traits >
1910  RangeVectorType;
1911 
1912  typedef View< typename DomainVector::const_data_type ,
1913  typename DomainVector::array_layout ,
1914  typename DomainVector::device_type ,
1915  //typename DomainVector::memory_traits >
1916  Kokkos::MemoryRandomAccess >
1917  DomainVectorType;
1918 
1919  typedef View< typename CoeffVector1::const_data_type ,
1920  typename CoeffVector1::array_layout ,
1921  typename CoeffVector1::device_type ,
1922  Kokkos::MemoryRandomAccess >
1923  CoeffVector1Type;
1924 
1925  typedef View< typename CoeffVector2::const_data_type ,
1926  typename CoeffVector2::array_layout ,
1927  typename CoeffVector2::device_type ,
1928  Kokkos::MemoryRandomAccess >
1929  CoeffVector2Type;
1930 
1931  typedef CrsMatrix<typename TCrsMatrix::const_value_type,
1932  typename TCrsMatrix::ordinal_type,
1933  typename TCrsMatrix::device_type,
1934  typename TCrsMatrix::memory_traits,
1935  typename TCrsMatrix::size_type>
1936  CrsMatrixType;
1937 
1938  Impl::MV_Multiply_Check_Compatibility(betav,y,alphav,A,x,doalpha,dobeta);
1939 
1940  const int NNZPerRow = A.nnz()/A.numRows();
1941 
1942 #ifndef KOKKOS_FAST_COMPILE
1943 
1944  if(NNZPerRow>=96) {
1945  typedef Vectorization<typename RangeVector::device_type,32> vec_type;
1946  typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
1947  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
1948 
1949  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
1950 
1951  OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
1952  Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
1953  ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
1954 
1955  } else if(NNZPerRow>=48) {
1956  typedef Vectorization<typename RangeVector::device_type,16> vec_type;
1957  typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
1958  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
1959 
1960  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
1961 
1962  OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
1963  Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
1964  ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
1965 
1966  } else if(NNZPerRow>=24) {
1967  typedef Vectorization<typename RangeVector::device_type,8> vec_type;
1968  typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
1969  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
1970 
1971  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
1972 
1973  OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
1974  Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
1975  ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
1976 
1977  } else if(NNZPerRow>=12) {
1978  typedef Vectorization<typename RangeVector::device_type,4> vec_type;
1979  typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
1980  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
1981 
1982  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
1983 
1984  OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
1985  Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
1986  ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
1987 
1988  } else if(NNZPerRow>=4) {
1989  typedef Vectorization<typename RangeVector::device_type,2> vec_type;
1990  typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
1991  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
1992 
1993  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
1994 
1995  OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
1996  Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
1997  ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
1998 
1999  } else {
2000  typedef Vectorization<typename RangeVector::device_type,1> vec_type;
2001  typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2002  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
2003 
2004  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2005 
2006  OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2007  Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
2008  ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2009 
2010  }
2011 #else // NOT KOKKOS_FAST_COMPILE
2012  typedef Vectorization<typename RangeVector::device_type,8> vec_type;
2013  typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2014  CoeffVector1Type, CoeffVector2Type, 2, 2, vec_type::increment > OpType ;
2015 
2016  int numVecs = x.dimension_1(); // == 1
2017  CoeffVector1 beta = betav;
2018  CoeffVector2 alpha = alphav;
2019 
2020  if(doalpha!=2) {
2021  alpha = CoeffVector2("CrsMatrix::auto_a", numVecs);
2022  typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha);
2023  typename CoeffVector2::value_type s_a = (typename CoeffVector2::value_type) doalpha;
2024 
2025  for(int i = 0; i < numVecs; i++)
2026  h_a(i) = s_a;
2027 
2028  Kokkos::deep_copy(alpha, h_a);
2029  }
2030  if(dobeta!=2) {
2031  beta = CoeffVector1("CrsMatrix::auto_b", numVecs);
2032  typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta);
2033  typename CoeffVector1::value_type s_b = (typename CoeffVector1::value_type) dobeta;
2034 
2035  for(int i = 0; i < numVecs; i++)
2036  h_b(i) = s_b;
2037 
2038  Kokkos::deep_copy(beta, h_b);
2039  }
2040 
2041  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2042 
2043  OpType op(beta,alpha,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2044  Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
2045  ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2046 
2047 
2048 #endif // KOKKOS_FAST_COMPILE
2049  }
2050  }
2051 
2052 template <class RangeVector,
2053  class TCrsMatrix,
2054  class DomainVector,
2055  class CoeffVector1,
2056  class CoeffVector2,
2057  int doalpha,
2058  int dobeta>
2059  void
2060  MV_Multiply (typename Kokkos::Impl::enable_if<DomainVector::Rank == 2, const CoeffVector1>::type& betav,
2061  const RangeVector &y,
2062  const CoeffVector2 &alphav,
2063  const TCrsMatrix &A,
2064  const DomainVector &x)
2065  {
2066  //Special case for zero Rows RowMap
2067  if(A.numRows() <= 0) return;
2068 
2069  if (doalpha == 0) {
2070  if (dobeta==2) {
2071  MV_MulScalar(y,betav,y);
2072  } else {
2073  MV_MulScalar(y,static_cast<typename RangeVector::const_value_type> (dobeta),y);
2074  }
2075  return;
2076  } else {
2077  typedef View< typename RangeVector::non_const_data_type ,
2078  typename RangeVector::array_layout ,
2079  typename RangeVector::device_type ,
2080  typename RangeVector::memory_traits >
2081  RangeVectorType;
2082 
2083  typedef View< typename DomainVector::const_data_type ,
2084  typename DomainVector::array_layout ,
2085  typename DomainVector::device_type ,
2086  Kokkos::MemoryRandomAccess >
2087  DomainVectorType;
2088 
2089  typedef View< typename CoeffVector1::const_data_type ,
2090  typename CoeffVector1::array_layout ,
2091  typename CoeffVector1::device_type ,
2092  Kokkos::MemoryRandomAccess >
2093  CoeffVector1Type;
2094 
2095  typedef View< typename CoeffVector2::const_data_type ,
2096  typename CoeffVector2::array_layout ,
2097  typename CoeffVector2::device_type ,
2098  Kokkos::MemoryRandomAccess >
2099  CoeffVector2Type;
2100 
2101  typedef CrsMatrix<typename TCrsMatrix::const_value_type,
2102  typename TCrsMatrix::ordinal_type,
2103  typename TCrsMatrix::device_type,
2104  typename TCrsMatrix::memory_traits,
2105  typename TCrsMatrix::size_type> CrsMatrixType;
2106 
2107  Impl::MV_Multiply_Check_Compatibility(betav,y,alphav,A,x,doalpha,dobeta);
2108 
2109  const int NNZPerRow = A.nnz()/A.numRows();
2110 
2111 #ifndef KOKKOS_FAST_COMPILE
2112 
2113  if(x.dimension_1()==1) {
2114  typedef View<typename DomainVectorType::const_value_type*,typename DomainVector::array_layout ,typename DomainVectorType::device_type> DomainVector1D;
2115  typedef View<typename RangeVectorType::value_type*,typename RangeVector::array_layout ,typename RangeVectorType::device_type,typename RangeVector::memory_traits> RangeVector1D;
2116  RangeVector1D y_sub = RangeVector1D(y.ptr_on_device(),y.dimension_0());
2117  DomainVector1D x_sub = DomainVector1D(x.ptr_on_device(),x.dimension_0());
2118 
2119  return MV_MultiplySingle<RangeVector1D,TCrsMatrix,DomainVector1D,CoeffVector1,CoeffVector2,doalpha,dobeta>
2120  (betav,y_sub,alphav,A,x_sub);
2121 
2122  } else {
2123 
2124  //Currently for multiple right hand sides its not worth it to use more than 8 threads per row on GPUs
2125  if(NNZPerRow>=96) {
2126  typedef Vectorization<typename RangeVector::device_type,8> vec_type;
2127  typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2128  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
2129 
2130  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2131 
2132  OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2133  Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
2134  ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2135 
2136  } else if(NNZPerRow>=48) {
2137  typedef Vectorization<typename RangeVector::device_type,8> vec_type;
2138  typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2139  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
2140 
2141  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2142 
2143  OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2144  Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
2145  ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2146 
2147  } else if(NNZPerRow>=16) {
2148  typedef Vectorization<typename RangeVector::device_type,8> vec_type;
2149  typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2150  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
2151 
2152  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2153 
2154  OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2155  Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
2156  ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2157 
2158  } else if(NNZPerRow>=12) {
2159  typedef Vectorization<typename RangeVector::device_type,4> vec_type;
2160  typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2161  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
2162 
2163  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2164 
2165  OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2166  Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
2167  ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2168 
2169  } else if(NNZPerRow>=4) {
2170  typedef Vectorization<typename RangeVector::device_type,2> vec_type;
2171  typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2172  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
2173 
2174  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2175 
2176  OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2177  Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
2178  ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2179 
2180  } else {
2181  typedef Vectorization<typename RangeVector::device_type,1> vec_type;
2182  typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2183  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
2184 
2185  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2186 
2187  OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2188  Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
2189  ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2190 
2191  }
2192  }
2193 
2194 #else // NOT KOKKOS_FAST_COMPILE
2195 
2196  typedef Vectorization<typename RangeVector::device_type,8> vec_type;
2197  typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
2198  CoeffVector1Type, CoeffVector2Type, 2, 2, vec_type::increment >
2199  OpType ;
2200 
2201  int numVecs = x.dimension_1();
2202  CoeffVector1 beta = betav;
2203  CoeffVector2 alpha = alphav;
2204 
2205  if (doalpha != 2) {
2206  alpha = CoeffVector2("CrsMatrix::auto_a", numVecs);
2207  typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha);
2208  typename CoeffVector2::value_type s_a = (typename CoeffVector2::value_type) doalpha;
2209 
2210  for (int i = 0; i < numVecs; ++i)
2211  h_a(i) = s_a;
2212 
2213  Kokkos::deep_copy(alpha, h_a);
2214  }
2215 
2216  if (dobeta != 2) {
2217  beta = CoeffVector1("CrsMatrix::auto_b", numVecs);
2218  typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta);
2219  typename CoeffVector1::value_type s_b = (typename CoeffVector1::value_type) dobeta;
2220 
2221  for(int i = 0; i < numVecs; i++)
2222  h_b(i) = s_b;
2223 
2224  Kokkos::deep_copy(beta, h_b);
2225  }
2226 
2227  const typename CrsMatrixType::ordinal_type nrow = A.numRows();
2228 
2229  OpType op(beta,alpha,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
2230 
2231  Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
2232  ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
2233 
2234 
2235 #endif // KOKKOS_FAST_COMPILE
2236  }
2237  }
2238 
2239  template<class RangeVector,
2240  class TCrsMatrix,
2241  class DomainVector,
2242  class CoeffVector1,
2243  class CoeffVector2,
2244  int doalpha,
2245  int dobeta>
2246  void
2247  MV_Multiply (typename Kokkos::Impl::enable_if<DomainVector::Rank == 1, const CoeffVector1>::type& betav,
2248  const RangeVector &y,
2249  const CoeffVector2 &alphav,
2250  const TCrsMatrix& A,
2251  const DomainVector& x) {
2252  return MV_MultiplySingle<RangeVector,TCrsMatrix,DomainVector,CoeffVector1,CoeffVector2,doalpha,dobeta>
2253  (betav,y,alphav,A,x);
2254  }
2255 
2256  template<class RangeVector, class CrsMatrix, class DomainVector, class CoeffVector1, class CoeffVector2>
2257  void
2258  MV_Multiply (const CoeffVector1& betav,
2259  const RangeVector& y,
2260  const CoeffVector2& alphav,
2261  const CrsMatrix& A,
2262  const DomainVector& x,
2263  int beta,
2264  int alpha)
2265  {
2266  if (beta == 0) {
2267  if(alpha == 0)
2268  MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, 0>(betav, y, alphav, A , x);
2269  else if(alpha == 1)
2270  MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 0>(betav, y, alphav, A , x);
2271  else if(alpha == -1)
2272  MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 0 > (betav, y, alphav, A , x);
2273  else
2274  MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 0>(betav, y, alphav, A , x);
2275  } else if(beta == 1) {
2276  if(alpha == 0)
2277  return;
2278  else if(alpha == 1)
2279  MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 1>(betav, y, alphav, A , x);
2280  else if(alpha == -1)
2281  MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 1 > (betav, y, alphav, A , x);
2282  else
2283  MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 1>(betav, y, alphav, A , x);
2284  } else if(beta == -1) {
2285  if(alpha == 0)
2286  MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, -1>(betav, y, alphav, A , x);
2287  else if(alpha == 1)
2288  MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, -1 > (betav, y, alphav, A , x);
2289  else if(alpha == -1)
2290  MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, -1 > (betav, y, alphav, A , x);
2291  else
2292  MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, -1 > (betav, y, alphav, A , x);
2293  } else {
2294  if(alpha == 0)
2295  MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, 2>(betav, y, alphav, A , x);
2296  else if(alpha == 1)
2297  MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 2>(betav, y, alphav, A , x);
2298  else if(alpha == -1)
2299  MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 2 > (betav, y, alphav, A , x);
2300  else
2301  MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 2>(betav, y, alphav, A , x);
2302  }
2303  }
2304 
2305  template <class RangeVector, class CrsMatrix, class DomainVector,
2306  class Value1, class Layout1, class Device1, class MemoryManagement1,
2307  class Value2, class Layout2, class Device2, class MemoryManagement2>
2308  void
2310  const RangeVector& y,
2312  const CrsMatrix& A,
2313  const DomainVector& x)
2314  {
2315  return MV_Multiply (betav, y, alphav, A, x, 2, 2);
2316  }
2317 
2318  template <class RangeVector, class CrsMatrix, class DomainVector,
2319  class Value1, class Layout1, class Device1, class MemoryManagement1>
2320  void
2321  MV_Multiply (const RangeVector& y,
2323  const CrsMatrix& A,
2324  const DomainVector& x)
2325  {
2326  return MV_Multiply (alphav, y, alphav, A, x, 0, 2);
2327  }
2328 
2329  template<class RangeVector, class CrsMatrix, class DomainVector>
2330  void
2331  MV_Multiply (const RangeVector& y,
2332  const CrsMatrix& A,
2333  const DomainVector& x)
2334  {
2335  // FIXME (mfh 21 Jun 2013) The way this code is supposed to work, is
2336  // that it tests at run time for each TPL in turn. Shouldn't it
2337  // rather dispatch on the Device type? But I suppose the "try"
2338  // functions do that.
2339  //
2340  // We want to condense this a bit: "Try TPLs" function that tests
2341  // all the suitable TPLs at run time. This would be a run-time test
2342  // that compares the Scalar and Device types to those accepted by
2343  // the TPL(s).
2344 #ifdef KOKKOS_USE_CUSPARSE
2345  if (CuSparse::MV_Multiply_Try_CuSparse (0.0, y, 1.0, A, x)) {
2346  return;
2347  }
2348 #endif // KOKKOS_USE_CUSPARSE
2349 #ifdef KOKKOS_USE_MKL
2350  if (MV_Multiply_Try_MKL (0.0, y, 1.0, A, x)) {
2351  return;
2352  }
2353 #endif // KOKKOS_USE_MKL
2355  aVector a;
2356 
2357  return MV_Multiply (a, y, a, A, x, 0, 1);
2358  }
2359 
2360  template<class RangeVector, class CrsMatrix, class DomainVector>
2361  void
2362  MV_Multiply (const RangeVector& y,
2363  typename DomainVector::const_value_type s_a,
2364  const CrsMatrix& A,
2365  const DomainVector& x)
2366  {
2367 #ifdef KOKKOS_USE_CUSPARSE
2368  if (CuSparse::MV_Multiply_Try_CuSparse (0.0, y, s_a, A, x)) {
2369  return;
2370  }
2371 #endif // KOKKOS_USE_CUSPARSE
2372 #ifdef KOKKOS_USE_MKL
2373  if (MV_Multiply_Try_MKL (0.0, y, s_a, A, x)) {
2374  return;
2375  }
2376 #endif // KOKKOS_USE_MKL
2378  aVector a;
2379  const int numVecs = x.dimension_1();
2380 
2381  if ((s_a < 1) && (s_a != 0)) {
2382  return MV_Multiply (a, y, a, A, x, 0, -1);
2383  } else if (s_a == 1) {
2384  return MV_Multiply (a, y, a, A, x, 0, 1);
2385  }
2386 
2387  if (s_a != 0) {
2388  a = aVector("a", numVecs);
2389  typename aVector::HostMirror h_a = Kokkos::create_mirror_view (a);
2390  for (int i = 0; i < numVecs; ++i) {
2391  h_a(i) = s_a;
2392  }
2393  Kokkos::deep_copy(a, h_a);
2394  return MV_Multiply (a, y, a, A, x, 0, 2);
2395  }
2396  }
2397 
2398  template<class RangeVector, class CrsMatrix, class DomainVector>
2399  void
2400  MV_Multiply (typename RangeVector::const_value_type s_b,
2401  const RangeVector& y,
2402  typename DomainVector::const_value_type s_a,
2403  const CrsMatrix& A,
2404  const DomainVector& x)
2405  {
2406 #ifdef KOKKOS_USE_CUSPARSE
2407  if (CuSparse::MV_Multiply_Try_CuSparse (s_b, y, s_a, A, x)) {
2408  return;
2409  }
2410 #endif // KOKKOSE_USE_CUSPARSE
2411 #ifdef KOKKOS_USE_MKL
2412  if (MV_Multiply_Try_MKL (s_b, y, s_a, A, x)) {
2413  return;
2414  }
2415 #endif // KOKKOS_USE_MKL
2417  aVector a;
2418  aVector b;
2419  int numVecs = x.dimension_1();
2420 
2421  // [HCE 2013/12/09] Following 'if' appears to be a mistake and has been commented out
2422  // if(numVecs==1)
2423  if (s_b == 0) {
2424  if (s_a == 0)
2425  return MV_Multiply (a, y, a, A, x, 0, 0);
2426  else if (s_a == 1)
2427  return MV_Multiply (a, y, a, A, x, 0, 1);
2428  else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
2429  return MV_Multiply (a, y, a, A, x, 0, -1);
2430  else {
2431  a = aVector("a", numVecs);
2432  typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
2433  for (int i = 0; i < numVecs; ++i) {
2434  h_a(i) = s_a;
2435  }
2436  Kokkos::deep_copy (a, h_a);
2437  return MV_Multiply (a, y, a, A, x, 0, 2);
2438  }
2439  } else if (s_b == 1) {
2440  if (s_a == 0)
2441  return MV_Multiply (a, y, a, A, x, 1, 0);
2442  else if (s_a == 1)
2443  return MV_Multiply (a, y, a, A, x, 1, 1);
2444  else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
2445  return MV_Multiply (a, y, a, A, x, 1, -1);
2446  else {
2447  a = aVector("a", numVecs);
2448  typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
2449  for (int i = 0; i < numVecs; ++i) {
2450  h_a(i) = s_a;
2451  }
2452  Kokkos::deep_copy (a, h_a);
2453  return MV_Multiply (a, y, a, A, x, 1, 2);
2454  }
2455  } else if (s_b == static_cast<typename RangeVector::const_value_type> (-1)) {
2456  if (s_a == 0)
2457  return MV_Multiply (a, y, a, A, x, -1, 0);
2458  else if (s_a == 1)
2459  return MV_Multiply (a, y, a, A, x, -1, 1);
2460  else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
2461  return MV_Multiply (a, y, a, A, x, -1, -1);
2462  else {
2463  a = aVector("a", numVecs);
2464  typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
2465  for (int i = 0; i < numVecs; ++i) {
2466  h_a(i) = s_a;
2467  }
2468  Kokkos::deep_copy (a, h_a);
2469  return MV_Multiply (a, y, a, A, x, -1, 2);
2470  }
2471  } else {
2472  b = aVector("b", numVecs);
2473  typename aVector::HostMirror h_b = Kokkos::create_mirror_view(b);
2474  for (int i = 0; i < numVecs; ++i) {
2475  h_b(i) = s_b;
2476  }
2477  Kokkos::deep_copy(b, h_b);
2478 
2479  if (s_a == 0)
2480  return MV_Multiply (b, y, a, A, x, 2, 0);
2481  else if (s_a == 1)
2482  return MV_Multiply (b, y, a, A, x, 2, 1);
2483  else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
2484  return MV_Multiply (b, y, a, A, x, 2, -1);
2485  else {
2486  a = aVector("a", numVecs);
2487  typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
2488  for (int i = 0; i < numVecs; ++i) {
2489  h_a(i) = s_a;
2490  }
2491  Kokkos::deep_copy (a, h_a);
2492  return MV_Multiply (b, y, a, A, x, 2, 2);
2493  }
2494  }
2495  }
2496 
2497  namespace KokkosCrsMatrix {
2511  template <class CrsMatrixDst, class CrsMatrixSrc>
2512  void deep_copy (CrsMatrixDst A, CrsMatrixSrc B) {
2513  Kokkos::deep_copy(A.graph.entries, B.graph.entries);
2514  // FIXME (mfh 09 Aug 2013) This _should_ copy the row map. We
2515  // couldn't do it before because the row map was const, forbidding
2516  // deep_copy.
2517  //
2518  //Kokkos::deep_copy(A.graph.row_map,B.graph.row_map);
2519  Kokkos::deep_copy(A.values, B.values);
2520 
2521  // FIXME (mfh 09 Aug 2013) Be sure to copy numRows, numCols, and
2522  // nnz as well.
2523  // (CRT 25 Sep 2013) don't copy rather check that they match.
2524  // Deep_copy in Kokkos is intended for copy between compatible objects.
2525  }
2526  } // namespace KokkosCrsMatrix
2527 
2528 } // namespace Kokkos
2529 
2530 #endif /* KOKKOS_CRSMATRIX_H_ */
values_type::non_const_value_type non_const_value_type
Nonconst version of the type of the entries in the sparse matrix.
KOKKOS_INLINE_FUNCTION SparseRowViewConst< CrsMatrix > rowConst(int i) const
Return a const view of row i of the matrix.
void deep_copy(const View< DT, DL, DD, DM, DS > &dst, typename Impl::enable_if<(Impl::is_same< typename ViewTraits< DT, DL, DD, DM >::non_const_value_type, typename ViewTraits< DT, DL, DD, DM >::value_type >::value), typename ViewTraits< DT, DL, DD, DM >::const_value_type >::type &value)
Deep copy a value into a view.
KOKKOS_INLINE_FUNCTION value_type & value(const int &i) const
(Const) reference to the value of entry i in this row of the sparse matrix.
KOKKOS_INLINE_FUNCTION SparseRowView(const typename MatrixType::values_type &values, const typename MatrixType::index_type &colidx__, const int &stride, const int &count, const int &idx)
Constructor.
Const view of a row of a sparse matrix.
View of a row of a sparse matrix.
const int length
Number of entries in the row.
KOKKOS_INLINE_FUNCTION ordinal_type numCols() const
The number of columns in the sparse matrix.
KOKKOS_INLINE_FUNCTION ordinal_type & colidx(const int &i) const
Reference to the column index of entry i in this row of the sparse matrix.
CrsMatrix(const std::string &label, OrdinalType nrows, OrdinalType ncols, OrdinalType annz, ScalarType *val, OrdinalType *rows, OrdinalType *cols, bool pad=false)
Constructor that copies raw arrays of host data in coordinate format.
Kokkos::StaticCrsGraph< OrdinalType, Kokkos::LayoutLeft, Device, SizeType > StaticCrsGraphType
Type of the graph structure of the sparse matrix.
KOKKOS_INLINE_FUNCTION SparseRowViewConst(value_type *const values, ordinal_type *const colidx__, const int stride, const int count)
Constructor.
values_type::const_value_type const_value_type
Const version of the type of the entries in the sparse matrix.
KOKKOS_INLINE_FUNCTION ordinal_type & colidx(const int &i) const
(Const) reference to the column index of entry i in this row of the sparse matrix.
KOKKOS_INLINE_FUNCTION ordinal_type nnz() const
The number of stored entries in the sparse matrix.
CrsMatrix(const CrsMatrix< SType, OType, DType, MTType, IType > &B)
Copy Constructor.
KOKKOS_INLINE_FUNCTION SparseRowViewConst(const typename MatrixType::values_type &values, const typename MatrixType::index_type &colidx__, const int &stride, const int &count, const int &idx)
Constructor.
CrsMatrix(const std::string &arg_label, const StaticCrsGraphType &arg_graph)
Construct with a graph that will be shared.
KOKKOS_INLINE_FUNCTION ordinal_type numRows() const
The number of rows in the sparse matrix.
Compressed sparse row implementation of a sparse matrix.
CrsMatrix(const std::string &label, OrdinalType nrows, OrdinalType ncols, OrdinalType annz, values_type vals, row_map_type rows, index_type cols)
Constructor that accepts a row map, column indices, and values.
const MatrixType::non_const_value_type value_type
The type of the values in the row.
StaticCrsGraphType::entries_type index_type
Type of column indices in the sparse matrix.
Kokkos::View< value_type *, Kokkos::LayoutRight, device_type, MemoryTraits > values_type
Kokkos Array type of the entries (values) in the sparse matrix.
cuSPARSE implementation of Kokkos::CrsMatrix kernels.
MatrixType::value_type value_type
The type of the values in the row.
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, typename Impl::enable_if< !Impl::is_integral< ExecPolicy >::value >::type *=0)
Execute functor in parallel according to the execution policy.
CrsMatrix(const std::string &label, OrdinalType ncols, values_type vals, StaticCrsGraphType graph_)
Constructor that accepts a a static graph, and values.
const MatrixType::non_const_ordinal_type ordinal_type
The type of the column indices in the row.
StaticCrsGraphType::row_map_type row_map_type
Type of the "row map" (which contains the offset for each row's data).
Intel MKL version of Kokkos' sparse matrix interface.
KOKKOS_INLINE_FUNCTION SparseRowView(value_type *const values, ordinal_type *const colidx__, const int stride, const int count)
Constructor.
CrsMatrix()
Default constructor; constructs an empty sparse matrix.
const int length
Number of entries in the row.
KOKKOS_INLINE_FUNCTION SparseRowView< CrsMatrix > row(int i) const
Return a view of row i of the matrix.
Execution policy for parallel work over a league of teams of threads.
KOKKOS_INLINE_FUNCTION value_type & value(const int &i) const
Reference to the value of entry i in this row of the sparse matrix.
void generate(const std::string &label, OrdinalType nrows, OrdinalType ncols, OrdinalType target_nnz, OrdinalType varianz_nel_row, OrdinalType width_row)
This is a method only for testing that creates a random sparse matrix.
MatrixType::ordinal_type ordinal_type
The type of the column indices in the row.
CrsMatrix< ScalarType, OrdinalType, host_mirror_space, MemoryTraits > HostMirror
Type of a host-memory mirror of the sparse matrix.