Kokkos_CrsMatrix_UQ_PCE.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #ifndef KOKKOS_CRSMATRIX_UQ_PCE_HPP
43 #define KOKKOS_CRSMATRIX_UQ_PCE_HPP
44 
45 #include "Sacado_UQ_PCE.hpp"
46 #include "Kokkos_View_UQ_PCE.hpp"
48 #include "KokkosSparse_CrsMatrix.hpp"
49 #include "KokkosSparse_spmv.hpp"
50 
51 #include "Kokkos_Blas1_UQ_PCE.hpp" // for some utilities
52 
53 #include "Stokhos_Multiply.hpp"
55 
56 namespace Stokhos {
57 
58 //----------------------------------------------------------------------------
59 // Matrix-vector product kernels for KokkosSparse::CrsMatrix with Sacado::UQ::PCE scalar type
60 //----------------------------------------------------------------------------
61 
62 // Kernel implementing y = A * x where
63 // A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
64 // x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
65 // x and y are rank 1
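// For orientation (an informal restatement of what the functor computes): for a
// block row "row" and PCE coefficient index iy,
//
//   y_row(iy) = b*y_row(iy)
//             + a * sum_{columns j of row} sum_{tensor entries (j1,j2) of iy}
//                   c(iy,j1,j2) * ( A_row,j(j1)*x_j(j2) + A_row,j(j2)*x_j(j1) )
//
// where c is the sparse Cijk triple-product tensor attached to A.values
// (retrieved via Kokkos::cijk in the functor constructor).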
66 template <typename MatrixDevice,
67  typename MatrixStorage,
68  typename MatrixOrdinal,
69  typename MatrixMemory,
70  typename MatrixSize,
71  typename InputStorage,
72  typename ... InputP,
73  typename OutputStorage,
74  typename ... OutputP>
75 class Multiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
76  MatrixOrdinal,
77  MatrixDevice,
78  MatrixMemory,
79  MatrixSize>,
80  Kokkos::View< Sacado::UQ::PCE<InputStorage>*,
81  InputP... >,
82  Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
83  OutputP... >
84  >
85 {
86 public:
87  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
88  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
89  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
90 
91  typedef MatrixDevice execution_space;
92 
93  typedef KokkosSparse::CrsMatrix< MatrixValue,
94  MatrixOrdinal,
95  MatrixDevice,
96  MatrixMemory,
97  MatrixSize> matrix_type;
98  typedef typename matrix_type::values_type matrix_values_type;
99  typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
100  typedef typename tensor_type::size_type size_type;
101  typedef Kokkos::View< InputVectorValue*,
102  InputP... > input_vector_type;
103  typedef Kokkos::View< OutputVectorValue*,
104  OutputP... > output_vector_type;
105 
106 private:
107 
108  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
109  typedef typename matrix_values_type::array_type matrix_array_type;
110  typedef typename input_vector_type::array_type input_array_type;
111  typedef typename output_vector_type::array_type output_array_type;
112 
113  typedef typename MatrixValue::value_type matrix_scalar;
114  typedef typename InputVectorValue::value_type input_scalar;
115  typedef typename OutputVectorValue::value_type output_scalar;
116  typedef typename tensor_type::value_type tensor_scalar;
117 
118  const matrix_array_type m_A_values ;
119  const matrix_graph_type m_A_graph ;
120  const input_array_type m_x ;
121  const output_array_type m_y ;
122  const tensor_type m_tensor ;
123  const input_scalar m_a ;
124  const output_scalar m_b ;
125 
126  Multiply( const matrix_type & A ,
127  const input_vector_type & x ,
128  const output_vector_type & y ,
129  const input_scalar & a ,
130  const output_scalar & b )
131  : m_A_values( A.values )
132  , m_A_graph( A.graph )
133  , m_x( x )
134  , m_y( y )
135  , m_tensor( Kokkos::cijk(A.values) )
136  , m_a( a )
137  , m_b( b )
138  {}
139 
140 public:
141 
142  //
143  // Non-team functor interface -- no threads within PCE multiply
144  //
145  // Note: Remember that the matrix is currently always LayoutRight!
146  //
147  KOKKOS_INLINE_FUNCTION
148  void operator()( const size_type iBlockRow ) const
149  {
150  // Prefer that y[ m_tensor.dimension() ] be scratch space
151  // on the local thread, but cannot dynamically allocate
152  output_scalar * const y = & m_y(0,iBlockRow);
153 
154  const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
155  const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
156 
157  // Leading dimension guaranteed contiguous for LayoutLeft
158  if ( m_b == output_scalar(0) )
159  for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
160  y[j] = 0 ;
161  else
162  for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
163  y[j] = m_b * y[j] ;
164  //loop over cols of A
165  for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
166  const input_scalar * const x = & m_x( 0 , m_A_graph.entries(iEntry) );
167  const matrix_scalar * const A = & m_A_values( iEntry , 0 );
168 
169  BlockMultiply< tensor_type >::apply(
170  m_tensor , A , x , y , m_a );
171  }
172 
173  }
174 
175 #if defined(__MIC__)
176 
177  //
178  // Team functor interface with threading within PCE multiply
179  //
180  // Note: Remember that the matrix is currently always LayoutRight!
181  //
182  // This is a MIC-specific version that processes multiple FEM columns
183  // at a time to reduce tensor reads
184  //
185  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
186  KOKKOS_INLINE_FUNCTION
187  void operator()( const team_member & device ) const
188  {
189  const size_type iBlockRow = device.league_rank();
190 
191  // Check for valid row
192  const size_type row_count = m_A_graph.row_map.extent(0)-1;
193  if (iBlockRow >= row_count)
194  return;
195 
196  const size_type num_thread = device.team_size();
197  const size_type thread_idx = device.team_rank();
198  const Kokkos::pair<size_type,size_type> work_range =
199  details::compute_work_range<output_scalar>(
200  device, m_tensor.dimension(), num_thread, thread_idx);
201 
202  // Prefer that y[ m_tensor.dimension() ] be scratch space
203  // on the local thread, but cannot dynamically allocate
204  output_scalar * const y = & m_y(0,iBlockRow);
205 
206  // Leading dimension guaranteed contiguous for LayoutLeft
207  if ( m_b == output_scalar(0) )
208  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
209  y[j] = 0 ;
210  else
211  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
212  y[j] = m_b * y[j] ;
213 
214  const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
215  const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
216  const size_type BlockSize = 9;
217  const size_type numBlock =
218  (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
219 
220  const matrix_scalar* sh_A[BlockSize];
221  const input_scalar* sh_x[BlockSize];
222 
223  size_type iBlockEntry = iBlockEntryBeg;
224  for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
225  const size_type block_size =
226  block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
227 
228  for ( size_type col = 0; col < block_size; ++col ) {
229  const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
230  sh_x[col] = & m_x( 0 , iBlockColumn );
231  sh_A[col] = & m_A_values( iBlockEntry + col , 0);
232  }
233 
234  for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
235 
236  const size_type nEntry = m_tensor.num_entry(iy);
237  const size_type iEntryBeg = m_tensor.entry_begin(iy);
238  const size_type iEntryEnd = iEntryBeg + nEntry;
239  size_type iEntry = iEntryBeg;
240 
241  output_scalar ytmp = 0 ;
242 
243  // Do entries with a blocked loop of size blocksize
244  const size_type nBlock = nEntry / tensor_type::vectorsize;
245  const size_type nEntryB = nBlock * tensor_type::vectorsize;
246  const size_type iEnd = iEntryBeg + nEntryB;
247 
248  typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
249  typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
250  typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
251  VecTV vy;
252  vy.zero();
253  for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
254  const size_type *j = &m_tensor.coord(iEntry,0);
255  const size_type *k = &m_tensor.coord(iEntry,1);
256  ValTV c(&(m_tensor.value(iEntry)));
257 
258  for ( size_type col = 0; col < block_size; ++col ) {
259  MatTV aj(sh_A[col], j), ak(sh_A[col], k);
260  VecTV xj(sh_x[col], j), xk(sh_x[col], k);
261 
262  // vy += c * ( aj * xk + ak * xj)
263  aj.times_equal(xk);
264  aj.multiply_add(ak, xj);
265  vy.multiply_add(c, aj);
266  }
267  }
268  ytmp += vy.sum();
269 
270  // The number of nonzeros is always constrained to be a multiple of 8
271 
272  const size_type rem = iEntryEnd-iEntry;
273  if (rem >= 8) {
274  typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
275  typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
276  typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
277  const size_type *j = &m_tensor.coord(iEntry,0);
278  const size_type *k = &m_tensor.coord(iEntry,1);
279  ValTV2 c(&(m_tensor.value(iEntry)));
280 
281  for ( size_type col = 0; col < block_size; ++col ) {
282  MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
283  VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);
284 
285  // vy += c * ( aj * xk + ak * xj)
286  aj.times_equal(xk);
287  aj.multiply_add(ak, xj);
288  aj.times_equal(c);
289  ytmp += aj.sum();
290  }
291  }
292 
293  y[iy] += m_a * ytmp ;
294  }
295 
296  // Add a team barrier to keep the thread team in-sync before going on
297  // to the next block
298  device.team_barrier();
299  }
300 
301  }
302 
303 #else
304 
305  //
306  // Team functor interface with threading within PCE multiply
307  //
308  // Note: Remember that the matrix is currently always LayoutRight!
309  //
310  // This is a general, hand-vectorized version that processes multiple FEM
311  // columns at a time to reduce tensor reads. Note that auto-vectorization
312  // doesn't work here because of the inner-loop over FEM columns.
313  //
314  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
315  KOKKOS_INLINE_FUNCTION
316  void operator()( const team_member & device ) const
317  {
318  const size_type iBlockRow = device.league_rank();
319 
320  // Check for valid row
321  const size_type row_count = m_A_graph.row_map.extent(0)-1;
322  if (iBlockRow >= row_count)
323  return;
324 
325  const size_type num_thread = device.team_size();
326  const size_type thread_idx = device.team_rank();
327  const Kokkos::pair<size_type,size_type> work_range =
328  details::compute_work_range<output_scalar>(
329  device, m_tensor.dimension(), num_thread, thread_idx);
330 
331  // Prefer that y[ m_tensor.dimension() ] be scratch space
332  // on the local thread, but cannot dynamically allocate
333  output_scalar * const y = & m_y(0,iBlockRow);
334 
335  // Leading dimension guaranteed contiguous for LayoutLeft
336  if ( m_b == output_scalar(0) )
337  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
338  y[j] = 0 ;
339  else
340  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
341  y[j] = m_b * y[j] ;
342 
343  const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
344  const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
345  const size_type BlockSize = 14;
346  const size_type numBlock =
347  (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
348 
349  const matrix_scalar* sh_A[BlockSize];
350  const input_scalar* sh_x[BlockSize];
351 
352  size_type iBlockEntry = iBlockEntryBeg;
353  for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
354  const size_type block_size =
355  block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
356 
357  for ( size_type col = 0; col < block_size; ++col ) {
358  const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
359  sh_x[col] = & m_x( 0 , iBlockColumn );
360  sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
361  }
362 
363  for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
364 
365  const size_type nEntry = m_tensor.num_entry(iy);
366  const size_type iEntryBeg = m_tensor.entry_begin(iy);
367  const size_type iEntryEnd = iEntryBeg + nEntry;
368  size_type iEntry = iEntryBeg;
369 
370  output_scalar ytmp = 0 ;
371 
372  // Do entries with a blocked loop of size blocksize
373  if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
374  const size_type nBlock = nEntry / tensor_type::vectorsize;
375  const size_type nEntryB = nBlock * tensor_type::vectorsize;
376  const size_type iEnd = iEntryBeg + nEntryB;
377 
378  typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
379  typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
380  typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
381  VecTV vy;
382  vy.zero();
383  for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
384  const size_type *j = &m_tensor.coord(iEntry,0);
385  const size_type *k = &m_tensor.coord(iEntry,1);
386  ValTV c(&(m_tensor.value(iEntry)));
387 
388  for ( size_type col = 0; col < block_size; ++col ) {
389  MatTV aj(sh_A[col], j), ak(sh_A[col], k);
390  VecTV xj(sh_x[col], j), xk(sh_x[col], k);
391 
392  // vy += c * ( aj * xk + ak * xj)
393  aj.times_equal(xk);
394  aj.multiply_add(ak, xj);
395  vy.multiply_add(c, aj);
396  }
397  }
398  ytmp += vy.sum();
399  }
400 
401  // Do remaining entries with a scalar loop
402  for ( ; iEntry<iEntryEnd; ++iEntry) {
403  const size_type j = m_tensor.coord(iEntry,0);
404  const size_type k = m_tensor.coord(iEntry,1);
405  tensor_scalar cijk = m_tensor.value(iEntry);
406 
407  for ( size_type col = 0; col < block_size; ++col ) {
408  ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
409  sh_A[col][k] * sh_x[col][j] );
410  }
411 
412  }
413 
414  y[iy] += m_a * ytmp ;
415  }
416 
417  // Add a team barrier to keep the thread team in-sync before going on
418  // to the next block
419  device.team_barrier();
420  }
421 
422  }
423 
424 #endif
425 
426  static void apply( const matrix_type & A ,
427  const input_vector_type & x ,
428  const output_vector_type & y ,
429  const input_scalar & a = input_scalar(1) ,
430  const output_scalar & b = output_scalar(0) )
431  {
432  // Generally the block algorithm seems to perform better on the MIC,
433  // as long as the stochastic size isn't too big, but doesn't perform
434  // any better on the CPU (probably because the CPU has a fat L3 cache
435  // to store the sparse 3 tensor).
436 #ifdef __MIC__
437  const bool use_block_algorithm = true;
438 #else
439  const bool use_block_algorithm = false;
440 #endif
441 
442  const size_t row_count = A.graph.row_map.extent(0) - 1 ;
443  if (use_block_algorithm) {
444 #ifdef __MIC__
445  const size_t team_size = 4; // 4 hyperthreads for MIC
446 #else
447  const size_t team_size = 2; // 2 for everything else
448 #endif
449  const size_t league_size = row_count;
450  Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
451  Kokkos::parallel_for( config , Multiply(A,x,y,a,b) );
452  }
453  else {
454  Kokkos::parallel_for( row_count , Multiply(A,x,y,a,b) );
455  }
456  }
457 };
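// A minimal usage sketch for the rank-1 specialization above (illustrative
// only; the storage choice and the names exec_space, storage_type, pce_type,
// pce_matrix, pce_vector and example_rank1_spmv are assumptions for the
// example).  A, x and y must carry the same Cijk triple-product tensor so
// that Kokkos::cijk(A.values), used in the functor constructor, is valid.
#if 0
typedef Kokkos::DefaultExecutionSpace exec_space;
typedef Stokhos::DynamicStorage<int,double,exec_space> storage_type; // one possible PCE storage
typedef Sacado::UQ::PCE<storage_type> pce_type;
typedef KokkosSparse::CrsMatrix<pce_type,int,exec_space> pce_matrix;
typedef Kokkos::View<pce_type*,exec_space> pce_vector;

void example_rank1_spmv(const pce_matrix& A,
                        const pce_vector& x,
                        const pce_vector& y)
{
  // y = 1*A*x + 0*y, dispatched to the Multiply specialization defined above
  Stokhos::Multiply<pce_matrix,pce_vector,pce_vector>::apply(A, x, y, 1.0, 0.0);
}
#endif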
458 
459 // Kernel implementing y = A * x where
460 // A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
461 // x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
462 // x and y are rank 2
463 template <typename MatrixDevice,
464  typename MatrixStorage,
465  typename MatrixOrdinal,
466  typename MatrixMemory,
467  typename MatrixSize,
468  typename InputStorage,
469  typename ... InputP,
470  typename OutputStorage,
471  typename ... OutputP>
472 class Multiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
473  MatrixOrdinal,
474  MatrixDevice,
475  MatrixMemory,
476  MatrixSize>,
477  Kokkos::View< Sacado::UQ::PCE<InputStorage>**,
478  InputP... >,
479  Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
480  OutputP... >
481  >
482 {
483 public:
484  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
485  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
486  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
487 
488  typedef MatrixDevice execution_space;
489 
490  typedef KokkosSparse::CrsMatrix< MatrixValue,
491  MatrixOrdinal,
492  MatrixDevice,
493  MatrixMemory,
494  MatrixSize> matrix_type;
495  typedef typename matrix_type::values_type matrix_values_type;
496  typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
497  typedef typename tensor_type::size_type size_type;
498  typedef Kokkos::View< InputVectorValue**,
499  InputP... > input_vector_type;
500  typedef Kokkos::View< OutputVectorValue**,
501  OutputP... > output_vector_type;
502 
503 private:
504 
505  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
506  typedef typename matrix_values_type::array_type matrix_array_type;
507  typedef typename input_vector_type::array_type input_array_type;
508  typedef typename output_vector_type::array_type output_array_type;
509 
510  typedef typename MatrixValue::value_type matrix_scalar;
511  typedef typename InputVectorValue::value_type input_scalar;
512  typedef typename OutputVectorValue::value_type output_scalar;
513  typedef typename tensor_type::value_type tensor_scalar;
514 
515  const matrix_array_type m_A_values ;
516  const matrix_graph_type m_A_graph ;
517  const input_array_type m_x ;
518  const output_array_type m_y ;
519  const tensor_type m_tensor ;
520  const input_scalar m_a ;
521  const output_scalar m_b ;
522 
523  Multiply( const matrix_type & A ,
524  const input_vector_type & x ,
525  const output_vector_type & y ,
526  const input_scalar & a ,
527  const output_scalar & b )
528  : m_A_values( A.values )
529  , m_A_graph( A.graph )
530  , m_x( x )
531  , m_y( y )
532  , m_tensor( Kokkos::cijk(A.values) )
533  , m_a( a )
534  , m_b( b )
535  {}
536 
537 public:
538 
539  //
540  // Non-team functor interface -- no threads within PCE multiply
541  //
542  // Note: Remember that the matrix is currently always LayoutRight!
543  //
544  KOKKOS_INLINE_FUNCTION
545  void operator()( const size_type iBlockRow ) const
546  {
547  const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
548  const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
549 
550  const size_type num_col = m_y.extent(2);
551 
552  // Leading dimension guaranteed contiguous for LayoutLeft
553  if ( m_b == output_scalar(0) )
554  for (size_type col=0; col<num_col; ++col)
555  for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
556  m_y(j, iBlockRow, col) = 0 ;
557  else
558  for (size_type col=0; col<num_col; ++col)
559  for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
560  m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
561 
562  // Put the x-column loop inside the A-column loop to reuse entries in A.
563  // This way all of the entries for that particular column of A should stay
564  // in L1 cache for all of the columns of x.
565 
566  for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
567  const matrix_scalar * const A = &m_A_values( iEntry, 0 );
568  const size_type iBlockCol = m_A_graph.entries(iEntry);
569 
570  for (size_type col=0; col<num_col; ++col) {
571  output_scalar * const y = &m_y( 0, iBlockRow, col );
572  const input_scalar * const x = &m_x( 0, iBlockCol, col );
573  BlockMultiply< tensor_type >::apply( m_tensor , A , x , y , m_a );
574  }
575 
576  }
577 
578  }
579 
580 #if defined(__MIC__)
581 
582  //
583  // Team functor interface with threading within PCE multiply
584  //
585  // Note: Remember that the matrix is currently always LayoutRight!
586  //
587  // This is a MIC-specific version that processes multiple FEM columns
588  // at a time to reduce tensor reads
589  //
590  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
591 
592  KOKKOS_INLINE_FUNCTION
593  void operator()( const team_member & device ) const
594  {
595  const size_type iBlockRow = device.league_rank();
596 
597  // Check for valid row
598  const size_type row_count = m_A_graph.row_map.extent(0)-1;
599  if (iBlockRow >= row_count)
600  return;
601 
602  const size_type num_thread = device.team_size();
603  const size_type thread_idx = device.team_rank();
604  const Kokkos::pair<size_type,size_type> work_range =
605  details::compute_work_range<output_scalar>(
606  device, m_tensor.dimension(), num_thread, thread_idx);
607 
608  const size_type num_col = m_y.extent(2);
609 
610  // Leading dimension guaranteed contiguous for LayoutLeft
611  if ( m_b == output_scalar(0) )
612  for (size_type col=0; col<num_col; ++col)
613  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
614  m_y(j, iBlockRow, col) = 0 ;
615  else
616  for (size_type col=0; col<num_col; ++col)
617  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
618  m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
619 
620  const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
621  const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
622  const size_type BlockSize = 9;
623  const size_type numBlock =
624  (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
625 
626  const matrix_scalar* sh_A[BlockSize];
627  const input_scalar* sh_x[BlockSize];
628 
629  size_type iBlockEntry = iBlockEntryBeg;
630  for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
631  const size_type block_size =
632  block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
633 
634  // Loop over columns of x, y
635  for (size_type vec_col=0; vec_col<num_col; ++vec_col) {
636 
637  output_scalar * const y = & m_y( 0 , iBlockRow , vec_col );
638 
639  for ( size_type col = 0; col < block_size; ++col ) {
640  const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
641  sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
642  sh_A[col] = & m_A_values( iBlockEntry + col , 0);
643  }
644 
645  for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ){
646 
647  const size_type nEntry = m_tensor.num_entry(iy);
648  const size_type iEntryBeg = m_tensor.entry_begin(iy);
649  const size_type iEntryEnd = iEntryBeg + nEntry;
650  size_type iEntry = iEntryBeg;
651 
652  output_scalar ytmp = 0 ;
653 
654  // Do entries with a blocked loop of size blocksize
655  const size_type nBlock = nEntry / tensor_type::vectorsize;
656  const size_type nEntryB = nBlock * tensor_type::vectorsize;
657  const size_type iEnd = iEntryBeg + nEntryB;
658 
659  typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
660  typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
661  typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
662  VecTV vy;
663  vy.zero();
664  for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
665  const size_type *j = &m_tensor.coord(iEntry,0);
666  const size_type *k = &m_tensor.coord(iEntry,1);
667  ValTV c(&(m_tensor.value(iEntry)));
668 
669  for ( size_type col = 0; col < block_size; ++col ) {
670  MatTV aj(sh_A[col], j), ak(sh_A[col], k);
671  VecTV xj(sh_x[col], j), xk(sh_x[col], k);
672 
673  // vy += c * ( aj * xk + ak * xj)
674  aj.times_equal(xk);
675  aj.multiply_add(ak, xj);
676  vy.multiply_add(c, aj);
677  }
678  }
679  ytmp += vy.sum();
680 
681  // The number of nonzeros is always constrained to be a multiple of 8
682 
683  const size_type rem = iEntryEnd-iEntry;
684  if (rem >= 8) {
685  typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
686  typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
687  typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
688  const size_type *j = &m_tensor.coord(iEntry,0);
689  const size_type *k = &m_tensor.coord(iEntry,1);
690  ValTV2 c(&(m_tensor.value(iEntry)));
691 
692  for ( size_type col = 0; col < block_size; ++col ) {
693  MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
694  VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);
695 
696  // vy += c * ( aj * xk + ak * xj)
697  aj.times_equal(xk);
698  aj.multiply_add(ak, xj);
699  aj.times_equal(c);
700  ytmp += aj.sum();
701  }
702  }
703 
704  y[iy] += m_a * ytmp ;
705  }
706 
707  }
708 
709  // Add a team barrier to keep the thread team in-sync before going on
710  // to the next block
711  device.team_barrier();
712 
713  }
714 
715  }
716 
717 #else
718 
719  //
720  // Team functor interface with threading within PCE multiply
721  //
722  // Note: Remember that the matrix is currently always LayoutRight!
723  //
724  // This is a general, hand-vectorized version that processes multiple FEM
725  // columns at a time to reduce tensor reads. Note that auto-vectorization
726  // doesn't work here because of the inner-loop over FEM columns.
727  //
728  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
729 
730  KOKKOS_INLINE_FUNCTION
731  void operator()( const team_member & device ) const
732  {
733  const size_type iBlockRow = device.league_rank();
734 
735  // Check for valid row
736  const size_type row_count = m_A_graph.row_map.extent(0)-1;
737  if (iBlockRow >= row_count)
738  return;
739 
740  const size_type num_thread = device.team_size();
741  const size_type thread_idx = device.team_rank();
742  const Kokkos::pair<size_type,size_type> work_range =
743  details::compute_work_range<output_scalar>(
744  device, m_tensor.dimension(), num_thread, thread_idx);
745 
746  const size_type num_col = m_y.extent(2);
747 
748  // Leading dimension guaranteed contiguous for LayoutLeft
749  if ( m_b == output_scalar(0) )
750  for (size_type col=0; col<num_col; ++col)
751  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
752  m_y(j, iBlockRow, col) = 0 ;
753  else
754  for (size_type col=0; col<num_col; ++col)
755  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
756  m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
757 
758  const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
759  const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
760  const size_type BlockSize = 14;
761  const size_type numBlock =
762  (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
763 
764  const matrix_scalar* sh_A[BlockSize];
765  const input_scalar* sh_x[BlockSize];
766 
767  size_type iBlockEntry = iBlockEntryBeg;
768  for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
769  const size_type block_size =
770  block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
771 
772  // Loop over columns of x, y
773  for (size_type vec_col=0; vec_col<num_col; ++vec_col) {
774 
775  output_scalar * const y = & m_y( 0 , iBlockRow , vec_col );
776 
777  for ( size_type col = 0; col < block_size; ++col ) {
778  const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
779  sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
780  sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
781  }
782 
783  for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ){
784 
785  const size_type nEntry = m_tensor.num_entry(iy);
786  const size_type iEntryBeg = m_tensor.entry_begin(iy);
787  const size_type iEntryEnd = iEntryBeg + nEntry;
788  size_type iEntry = iEntryBeg;
789 
790  output_scalar ytmp = 0 ;
791 
792  // Do entries with a blocked loop of size blocksize
793  if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize){
794  const size_type nBlock = nEntry / tensor_type::vectorsize;
795  const size_type nEntryB = nBlock * tensor_type::vectorsize;
796  const size_type iEnd = iEntryBeg + nEntryB;
797 
798  typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
799  typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
800  typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
801  VecTV vy;
802  vy.zero();
803  for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
804  const size_type *j = &m_tensor.coord(iEntry,0);
805  const size_type *k = &m_tensor.coord(iEntry,1);
806  ValTV c(&(m_tensor.value(iEntry)));
807 
808  for ( size_type col = 0; col < block_size; ++col ) {
809  MatTV aj(sh_A[col], j), ak(sh_A[col], k);
810  VecTV xj(sh_x[col], j), xk(sh_x[col], k);
811 
812  // vy += c * ( aj * xk + ak * xj)
813  aj.times_equal(xk);
814  aj.multiply_add(ak, xj);
815  vy.multiply_add(c, aj);
816  }
817  }
818  ytmp += vy.sum();
819  }
820 
821  // Do remaining entries with a scalar loop
822  for ( ; iEntry<iEntryEnd; ++iEntry) {
823  const size_type j = m_tensor.coord(iEntry,0);
824  const size_type k = m_tensor.coord(iEntry,1);
825  tensor_scalar cijk = m_tensor.value(iEntry);
826 
827  for ( size_type col = 0; col < block_size; ++col ) {
828  ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
829  sh_A[col][k] * sh_x[col][j] );
830  }
831 
832  }
833 
834  y[iy] += m_a * ytmp ;
835  }
836 
837  }
838 
839  // Add a team barrier to keep the thread team in-sync before going on
840  // to the next block
841  device.team_barrier();
842  }
843 
844  }
845 
846 #endif
847 
848  static void apply( const matrix_type & A ,
849  const input_vector_type & x ,
850  const output_vector_type & y ,
851  const input_scalar & a = input_scalar(1) ,
852  const output_scalar & b = output_scalar(0) )
853  {
854  // Generally the block algorithm seems to perform better on the MIC,
855  // as long as the stochastic size isn't too big, but doesn't perform
856  // any better on the CPU (probably because the CPU has a fat L3 cache
857  // to store the sparse 3 tensor).
858 #ifdef __MIC__
859  const bool use_block_algorithm = true;
860 #else
861  const bool use_block_algorithm = false;
862 #endif
863 
864  const size_t row_count = A.graph.row_map.extent(0) - 1 ;
865  if (use_block_algorithm) {
866 #ifdef __MIC__
867  const size_t team_size = 4; // 4 hyperthreads for MIC
868 #else
869  const size_t team_size = 2; // 2 for everything else
870 #endif
871  const size_t league_size = row_count;
872  Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
873  Kokkos::parallel_for( config , Multiply(A,x,y,a,b) );
874  }
875  else {
876  Kokkos::parallel_for( row_count , Multiply(A,x,y,a,b) );
877  }
878  }
879 };
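// A companion sketch for the rank-2 (multi-vector) specialization above,
// reusing the illustrative aliases from the rank-1 sketch; pce_multi_vector
// and example_rank2_spmv are likewise assumptions for the example only.
#if 0
typedef Kokkos::View<pce_type**,exec_space> pce_multi_vector;

void example_rank2_spmv(const pce_matrix& A,
                        const pce_multi_vector& X,
                        const pce_multi_vector& Y)
{
  // Y = A*X with a = 1, b = 0; each column of X and Y is an independent
  // PCE-valued vector handled within the same sweep over block rows
  Stokhos::Multiply<pce_matrix,pce_multi_vector,pce_multi_vector>::apply(A, X, Y);
}
#endif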
880 
881 template <typename MatrixType, typename InputViewType, typename OutputViewType>
882 class MeanMultiply {};
883 
884 // Kernel implementing y = A * x where PCE size of A is 1
885 // A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
886 // x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
887 // x and y are rank 1
888 template <typename MatrixDevice,
889  typename MatrixStorage,
890  typename MatrixOrdinal,
891  typename MatrixMemory,
892  typename MatrixSize,
893  typename InputStorage,
894  typename ... InputP,
895  typename OutputStorage,
896  typename ... OutputP>
897 class MeanMultiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
898  MatrixOrdinal,
899  MatrixDevice,
900  MatrixMemory,
901  MatrixSize>,
902  Kokkos::View< Sacado::UQ::PCE<InputStorage>*,
903  InputP... >,
904  Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
905  OutputP... >
906  >
907 {
908 public:
909  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
910  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
911  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
912 
913  typedef KokkosSparse::CrsMatrix< MatrixValue,
914  MatrixOrdinal,
915  MatrixDevice,
916  MatrixMemory,
917  MatrixSize> matrix_type;
918  typedef typename matrix_type::values_type matrix_values_type;
919  typedef typename MatrixValue::ordinal_type size_type;
920  typedef Kokkos::View< InputVectorValue*,
921  InputP... > input_vector_type;
922  typedef Kokkos::View< OutputVectorValue*,
923  OutputP... > output_vector_type;
924 
925  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
926  typedef typename MatrixValue::value_type matrix_scalar;
927  typedef typename InputVectorValue::value_type input_scalar;
928  typedef typename OutputVectorValue::value_type output_scalar;
929 
930  template <int BlockSize>
931  struct BlockKernel {
932  typedef MatrixDevice execution_space;
933  typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
934  typedef typename input_vector_type::array_type input_array_type;
935  typedef typename output_vector_type::array_type output_array_type;
936 
937  const matrix_array_type m_A_values ;
938  const matrix_graph_type m_A_graph ;
939  const output_array_type v_y ;
940  const input_array_type v_x ;
941  const input_scalar m_a ;
942  const output_scalar m_b ;
943  const size_type dim ;
944  const size_type numBlocks ;
945  const size_type rem ;
946  const size_type dim_block ;
947 
948  BlockKernel( const matrix_type & A ,
949  const input_vector_type & x ,
950  const output_vector_type & y ,
951  const input_scalar & a ,
952  const output_scalar & b )
953  : m_A_values( A.values )
954  , m_A_graph( A.graph )
955  , v_y( y )
956  , v_x( x )
957  , m_a( a )
958  , m_b( b )
959  , dim( dimension_scalar(x) )
960  , numBlocks( dim / BlockSize )
961  , rem( dim % BlockSize )
962  , dim_block( numBlocks*BlockSize )
963  {}
964 
965  KOKKOS_INLINE_FUNCTION
966  void operator()( const size_type iBlockRow ) const
967  {
968 #if defined(__INTEL_COMPILER)&& ! defined(__CUDA_ARCH__)
969  output_scalar s[BlockSize] __attribute__((aligned(64))) = {};
970 #else
971  output_scalar s[BlockSize] = {};
972 #endif
973 
974  const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
975  const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
976  size_type pce_block = 0;
977  for (; pce_block < dim_block; pce_block+=BlockSize) {
978  output_scalar * const y = &v_y(pce_block, iBlockRow);
979  if (m_b == output_scalar(0))
980 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
981 #pragma ivdep
982 //#pragma vector aligned
983 #endif
984  for (size_type k = 0; k < BlockSize; ++k)
985  s[k] = 0.0;
986  else
987 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
988 #pragma ivdep
989 //#pragma vector aligned
990 #endif
991  for (size_type k = 0; k < BlockSize; ++k)
992  s[k] = m_b*y[k];
993  for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
994  const matrix_scalar aA = m_a*m_A_values(iEntry);
995  const size_type col = m_A_graph.entries(iEntry);
996  const input_scalar * const x = &v_x(pce_block, col);
997 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
998 #pragma ivdep
999 //#pragma vector aligned
1000 #endif
1001  for (size_type k = 0; k < BlockSize; ++k)
1002  s[k] += aA*x[k];
1003  }
1004 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1005 #pragma ivdep
1006 //#pragma vector aligned
1007 #endif
1008  for (size_type k = 0; k < BlockSize; ++k) {
1009  y[k] = s[k];
1010  }
1011  }
1012 
1013  // Remaining coeffs
1014  if (rem > 0) {
1015  output_scalar * const y = &v_y(pce_block, iBlockRow);
1016  if (m_b == output_scalar(0))
1017 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1018 #pragma ivdep
1019 //#pragma vector aligned
1020 #endif
1021  for (size_type k = 0; k < rem; ++k)
1022  s[k] = 0.0;
1023  else
1024 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1025 #pragma ivdep
1026 //#pragma vector aligned
1027 #endif
1028  for (size_type k = 0; k < rem; ++k)
1029  s[k] = m_b*y[k];
1030  for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
1031  const matrix_scalar aA = m_a*m_A_values(iEntry);
1032  const size_type col = m_A_graph.entries(iEntry);
1033  const input_scalar * const x = &v_x(pce_block, col);
1034 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1035 #pragma ivdep
1036 //#pragma vector aligned
1037 #endif
1038  for (size_type k = 0; k < rem; ++k)
1039  s[k] += aA*x[k];
1040  }
1041 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1042 #pragma ivdep
1043 //#pragma vector aligned
1044 #endif
1045  for (size_type k = 0; k < rem; ++k) {
1046  y[k] = s[k];
1047  }
1048  }
1049 
1050  }
1051 
1052  };
1053 
1054  struct Kernel {
1055  typedef MatrixDevice execution_space;
1056  typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
1057  typedef typename input_vector_type::array_type input_array_type;
1058  typedef typename output_vector_type::array_type output_array_type;
1059 
1060  const matrix_array_type m_A_values ;
1061  const matrix_graph_type m_A_graph ;
1062  const output_array_type v_y ;
1063  const input_array_type v_x ;
1064  const input_scalar m_a ;
1065  const output_scalar m_b ;
1066  const size_type dim ;
1067 
1068  Kernel( const matrix_type & A ,
1069  const input_vector_type & x ,
1070  const output_vector_type & y ,
1071  const input_scalar & a ,
1072  const output_scalar & b )
1073  : m_A_values( A.values )
1074  , m_A_graph( A.graph )
1075  , v_y( y )
1076  , v_x( x )
1077  , m_a( a )
1078  , m_b( b )
1079  , dim( dimension_scalar(x) )
1080  {}
1081 
1082  KOKKOS_INLINE_FUNCTION
1083  void operator()( const size_type iBlockRow ) const
1084  {
1085  const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
1086  const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
1087  output_scalar * const y = &v_y(0, iBlockRow);
1088  if (m_b == output_scalar(0))
1089 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1090 #pragma ivdep
1091 //#pragma vector aligned
1092 #endif
1093  for (size_type k = 0; k < dim; ++k)
1094  y[k] = 0.0;
1095  else
1096 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1097 #pragma ivdep
1098 //#pragma vector aligned
1099 #endif
1100  for (size_type k = 0; k < dim; ++k)
1101  y[k] = m_b*y[k];
1102  for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
1103  const matrix_scalar aA = m_a*m_A_values(iEntry);
1104  const size_type col = m_A_graph.entries(iEntry);
1105  const input_scalar * const x = &v_x(0, col);
1106 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1107 #pragma ivdep
1108 //#pragma vector aligned
1109 #endif
1110  for (size_type k = 0; k < dim; ++k)
1111  y[k] += aA*x[k];
1112  }
1113  }
1114 
1115  };
1116 
1117  static void apply( const matrix_type & A ,
1118  const input_vector_type & x ,
1119  const output_vector_type & y ,
1120  const input_scalar & a = input_scalar(1) ,
1121  const output_scalar & b = output_scalar(0) )
1122  {
1123  const size_t row_count = A.graph.row_map.extent(0) - 1 ;
1124  const size_type dim = Kokkos::dimension_scalar(x);
1125 
1126  // Choose block size appropriately for PCE dimension
1127 #if defined (__MIC__)
1128  if (dim >= 128)
1129  Kokkos::parallel_for( row_count , Kernel(A,x,y,a,b) );
1130  else if (dim >= 64)
1131  Kokkos::parallel_for( row_count , BlockKernel<64>(A,x,y,a,b) );
1132  else if (dim >= 32)
1133  Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
1134  else if (dim >= 16)
1135  Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
1136  else if (dim >= 8)
1137  Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
1138  else
1139  Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
1140 #else
1141  if (dim >= 32)
1142  Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
1143  else if (dim >= 16)
1144  Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
1145  else if (dim >= 8)
1146  Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
1147  else
1148  Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
1149 #endif
1150  }
1151 };
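// For orientation (an informal restatement): when every matrix entry has PCE
// size 1, the PCE product A_ij * x_j reduces to scaling each PCE coefficient
// of x_j by the single (mean) value of A_ij, so no Cijk tensor contraction is
// needed.  Per row i and PCE coefficient k the kernels above compute
//
//   y(k,i) = b*y(k,i) + a * sum_{entries e of row i} A_values(e) * x(k, col(e))
//
// BlockKernel tiles the loop over k in chunks of BlockSize to aid
// vectorization, while Kernel uses a single unblocked loop over k; apply()
// above chooses the block size from the PCE dimension (and, on MIC, falls
// back to Kernel for large dimensions).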
1152 
1153 // Kernel implementing y = A * x where A has PCE size = 1
1154 // A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
1155 // x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
1156 // x and y are rank 2
1157 template <typename MatrixDevice,
1158  typename MatrixStorage,
1159  typename MatrixOrdinal,
1160  typename MatrixMemory,
1161  typename MatrixSize,
1162  typename InputStorage,
1163  typename ... InputP,
1164  typename OutputStorage,
1165  typename ... OutputP>
1166 class MeanMultiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
1167  MatrixOrdinal,
1168  MatrixDevice,
1169  MatrixMemory,
1170  MatrixSize>,
1171  Kokkos::View< Sacado::UQ::PCE<InputStorage>**,
1172  InputP... >,
1173  Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
1174  OutputP... >
1175  >
1176 {
1177 public:
1178  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
1179  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
1180  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
1181 
1182  typedef KokkosSparse::CrsMatrix< MatrixValue,
1183  MatrixOrdinal,
1184  MatrixDevice,
1185  MatrixMemory,
1186  MatrixSize> matrix_type;
1187  typedef Kokkos::View< InputVectorValue**,
1188  InputP... > input_vector_type;
1189  typedef Kokkos::View< OutputVectorValue**,
1190  OutputP... > output_vector_type;
1191 
1192  typedef MatrixDevice execution_space;
1193  typedef typename MatrixValue::ordinal_type size_type;
1194  typedef typename InputVectorValue::value_type input_scalar;
1195  typedef typename OutputVectorValue::value_type output_scalar;
1196 
1197  static void apply( const matrix_type & A ,
1198  const input_vector_type & x ,
1199  const output_vector_type & y ,
1200  const input_scalar & a = input_scalar(1) ,
1201  const output_scalar & b = output_scalar(0) )
1202  {
1203  typedef Kokkos::View< InputVectorValue*,
1204  InputP... > input_vector_1d_type;
1205  typedef Kokkos::View< OutputVectorValue*,
1206  OutputP... > output_vector_1d_type;
1207  typedef MeanMultiply<matrix_type,input_vector_1d_type,
1208  output_vector_1d_type> MeanMultiply1D;
1209  const size_type num_col = x.extent(1);
1210  for (size_type i=0; i<num_col; ++i) {
1211  input_vector_1d_type x_col =
1212  Kokkos::subview(x, Kokkos::ALL(), i);
1213  output_vector_1d_type y_col =
1214  Kokkos::subview(y, Kokkos::ALL(), i);
1215  MeanMultiply1D::apply( A, x_col, y_col, a, b );
1216  }
1217  }
1218 };
1219 
1220 } // namespace Stokhos
1221 
1222 namespace KokkosSparse {
1223 
1224 template <typename AlphaType,
1225  typename BetaType,
1226  typename MatrixType,
1227  typename InputType,
1228  typename ... InputP,
1229  typename OutputType,
1230  typename ... OutputP>
1231 typename std::enable_if<
1232  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
1233  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value
1234  >::type
1235 spmv(
1236  const char mode[],
1237  const AlphaType& a,
1238  const MatrixType& A,
1239  const Kokkos::View< InputType, InputP... >& x,
1240  const BetaType& b,
1241  const Kokkos::View< OutputType, OutputP... >& y,
1242  const RANK_ONE)
1243 {
1244  typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
1245  typedef Kokkos::View< InputType, InputP... > InputVectorType;
1246  typedef Stokhos::Multiply<MatrixType,InputVectorType,
1247  OutputVectorType> multiply_type;
1248  typedef Stokhos::MeanMultiply<MatrixType,InputVectorType,
1249  OutputVectorType> mean_multiply_type;
1250 
1251  if(mode[0]!='N') {
1252  Kokkos::Impl::raise_error(
1253  "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
1254  }
1255 
1256  if (!Sacado::is_constant(a) || !Sacado::is_constant(b)) {
1257  Kokkos::Impl::raise_error(
1258  "Stokhos spmv not implemented for non-constant a or b");
1259  }
1260  if (dimension_scalar(A.values) == 1 && dimension_scalar(x) != 1) {
1261  mean_multiply_type::apply( A, x, y,
1262  Sacado::Value<AlphaType>::eval(a),
1263  Sacado::Value<BetaType>::eval(b) );
1264  }
1265  else
1266  multiply_type::apply( A, x, y,
1267  Sacado::Value<AlphaType>::eval(a),
1268  Sacado::Value<BetaType>::eval(b) );
1269 }
1270 
1271 template <typename AlphaType,
1272  typename BetaType,
1273  typename MatrixType,
1274  typename InputType,
1275  typename ... InputP,
1276  typename OutputType,
1277  typename ... OutputP>
1278 typename std::enable_if<
1279  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
1280  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value
1281  >::type
1282 spmv(
1283  const char mode[],
1284  const AlphaType& a,
1285  const MatrixType& A,
1286  const Kokkos::View< InputType, InputP... >& x,
1287  const BetaType& b,
1288  const Kokkos::View< OutputType, OutputP... >& y,
1289  const RANK_TWO)
1290 {
1291  if(mode[0]!='N') {
1292  Kokkos::Impl::raise_error(
1293  "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
1294  }
1295  if (y.extent(1) == 1) {
1296  auto y_1D = subview(y, Kokkos::ALL(), 0);
1297  auto x_1D = subview(x, Kokkos::ALL(), 0);
1298  spmv(mode, a, A, x_1D, b, y_1D, RANK_ONE());
1299  }
1300  else {
1301  typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
1302  typedef Kokkos::View< InputType, InputP... > InputVectorType;
1303  typedef Stokhos::Multiply<MatrixType,InputVectorType,
1304  OutputVectorType> multiply_type;
1305  typedef Stokhos::MeanMultiply<MatrixType,InputVectorType,
1306  OutputVectorType> mean_multiply_type;
1307 
1308  if (!Sacado::is_constant(a) || !Sacado::is_constant(b)) {
1309  Kokkos::Impl::raise_error(
1310  "Stokhos spmv not implemented for non-constant a or b");
1311  }
1312  if (dimension_scalar(A.values) == 1 && dimension_scalar(x) != 1) {
1313  mean_multiply_type::apply( A, x, y,
1314  Sacado::Value<AlphaType>::eval(a),
1315  Sacado::Value<BetaType>::eval(b));
1316  }
1317  else
1318  multiply_type::apply( A, x, y,
1319  Sacado::Value<AlphaType>::eval(a),
1320  Sacado::Value<BetaType>::eval(b));
1321  }
1322 }
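// A minimal call-site sketch for the spmv overloads above (illustrative only;
// pce_matrix, pce_vector and example_spmv mirror the earlier sketches and are
// assumptions for the example).  Only mode "N" and constant, non-PCE-varying
// alpha/beta are supported by these overloads.
#if 0
void example_spmv(const pce_matrix& A,
                  const pce_vector& x,
                  const pce_vector& y)
{
  // Dispatches to Stokhos::Multiply, or to Stokhos::MeanMultiply when the
  // matrix values have PCE size 1 while x does not.
  KokkosSparse::spmv("N", 1.0, A, x, 0.0, y);
}
#endif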
1323 
1324 }
1325 
1326 #endif /* #ifndef KOKKOS_CRSMATRIX_UQ_PCE_HPP */