Kokkos_CrsMatrix_UQ_PCE.hpp
// @HEADER
// ***********************************************************************
//
//                           Stokhos Package
//                 Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
//
// ***********************************************************************
// @HEADER

#ifndef KOKKOS_CRSMATRIX_UQ_PCE_HPP
#define KOKKOS_CRSMATRIX_UQ_PCE_HPP

#include "Sacado_UQ_PCE.hpp"
#include "Kokkos_View_UQ_PCE.hpp"
#include "KokkosSparse_CrsMatrix.hpp"
#include "KokkosSparse_spmv.hpp"

#include "Kokkos_Blas1_UQ_PCE.hpp" // for some utilities

#include "Stokhos_Multiply.hpp"
#include "Stokhos_CrsProductTensor.hpp"

namespace Stokhos {

//----------------------------------------------------------------------------
// Specialization of KokkosSparse::CrsMatrix for Sacado::UQ::PCE scalar type
//----------------------------------------------------------------------------

// Kernel implementing y = A * x where
// A == KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<...>,...>,
// x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
// x and y are rank 1
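//
// For each matrix entry the PCE coefficients of y are updated through the
// symmetric Cijk tensor contraction; schematically (an illustrative sketch
// of what BlockMultiply computes, mirroring the scalar remainder loop in
// the team functor below):
//
//   for each tensor entry (i,j,k) with value c :
//     y[i] += a * c * ( A[j] * x[k] + A[k] * x[j] )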
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal,
                                         MatrixDevice,
                                         MatrixMemory,
                                         MatrixSize>,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
                              InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
                              OutputP... >
              >
{
public:

  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef typename MatrixDevice::execution_space execution_space;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
  typedef typename tensor_type::size_type size_type;
  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;

private:

  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef typename matrix_values_type::array_type matrix_array_type;
  typedef typename input_vector_type::array_type input_array_type;
  typedef typename output_vector_type::array_type output_array_type;

  typedef typename MatrixValue::value_type matrix_scalar;
  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;
  typedef typename tensor_type::value_type tensor_scalar;

  const matrix_array_type   m_A_values ;
  const matrix_graph_type   m_A_graph ;
  const input_array_type    m_x ;
  const output_array_type   m_y ;
  const tensor_type         m_tensor ;
  const input_scalar        m_a ;
  const output_scalar       m_b ;

  Multiply( const matrix_type & A ,
            const input_vector_type & x ,
            const output_vector_type & y ,
            const input_scalar & a ,
            const output_scalar & b )
    : m_A_values( A.values )
    , m_A_graph( A.graph )
    , m_x( x )
    , m_y( y )
    , m_tensor( Kokkos::cijk(A.values) )
    , m_a( a )
    , m_b( b )
    {}

public:

  //
  // Non-team functor interface -- no threads within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type iBlockRow ) const
  {
    // Prefer that y[ m_tensor.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    output_scalar * const y = & m_y(0,iBlockRow);

    const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
    const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
        y[j] = 0 ;
    else
      for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
        y[j] = m_b * y[j] ;
    // Loop over columns of A
    for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
      const input_scalar * const x  = & m_x( 0 , m_A_graph.entries(iEntry) );
      const matrix_scalar * const A = & m_A_values( iEntry , 0 );

      BlockMultiply< tensor_type >::apply(
        m_tensor , A , x , y , m_a );
    }

  }

#if defined(__MIC__)

  //
  // Team functor interface with threading within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  // This is a MIC-specific version that processes multiple FEM columns
  // at a time to reduce tensor reads
  //
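  // Each pass caches pointers to up to BlockSize matrix and vector blocks
  // (sh_A/sh_x below) so that every tensor entry read from m_tensor is
  // applied to all cached FEM columns before the next entry is loaded.
  //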
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);

    // Prefer that y[ m_tensor.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    output_scalar * const y = & m_y(0,iBlockRow);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = 0 ;
    else
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = m_b * y[j] ;

    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 9;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const matrix_scalar* sh_A[BlockSize];
    const input_scalar* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
        sh_x[col] = & m_x( 0 , iBlockColumn );
        sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
      }

      for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

        const size_type nEntry = m_tensor.num_entry(iy);
        const size_type iEntryBeg = m_tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
        size_type iEntry = iEntryBeg;

        output_scalar ytmp = 0 ;

        // Do entries with a blocked loop of size blocksize
        const size_type nBlock = nEntry / tensor_type::vectorsize;
        const size_type nEntryB = nBlock * tensor_type::vectorsize;
        const size_type iEnd = iEntryBeg + nEntryB;

        typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
        typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
        typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
        VecTV vy;
        vy.zero();
        for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
          const size_type *j = &m_tensor.coord(iEntry,0);
          const size_type *k = &m_tensor.coord(iEntry,1);
          ValTV c(&(m_tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV xj(sh_x[col], j), xk(sh_x[col], k);

            // vy += c * ( aj * xk + ak * xj )
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            vy.multiply_add(c, aj);
          }
        }
        ytmp += vy.sum();

        // The number of nonzeros is always constrained to be a multiple of 8

        const size_type rem = iEntryEnd-iEntry;
        if (rem >= 8) {
          typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
          typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
          typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
          const size_type *j = &m_tensor.coord(iEntry,0);
          const size_type *k = &m_tensor.coord(iEntry,1);
          ValTV2 c(&(m_tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);

            // vy += c * ( aj * xk + ak * xj )
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            aj.times_equal(c);
            ytmp += aj.sum();
          }
        }

        y[iy] += m_a * ytmp ;
      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();
    }

  }

#else

  //
  // Team functor interface with threading within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  // This is a general, hand-vectorized version that processes multiple FEM
  // columns at a time to reduce tensor reads. Note that auto-vectorization
  // doesn't work here because of the inner-loop over FEM columns.
  //
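  // The TinyVec temporaries below perform explicit gathers of A and x at
  // the tensor coordinates j and k and accumulate with multiply-adds across
  // the vector lanes, supplying the vectorization the compiler cannot
  // derive on its own for this loop structure.
  //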
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);

    // Prefer that y[ m_tensor.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    output_scalar * const y = & m_y(0,iBlockRow);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = 0 ;
    else
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = m_b * y[j] ;

    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 14;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const matrix_scalar* sh_A[BlockSize];
    const input_scalar* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
        sh_x[col] = & m_x( 0 , iBlockColumn );
        sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
      }

      for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

        const size_type nEntry = m_tensor.num_entry(iy);
        const size_type iEntryBeg = m_tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
        size_type iEntry = iEntryBeg;

        output_scalar ytmp = 0 ;

        // Do entries with a blocked loop of size blocksize
        if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
          const size_type nBlock = nEntry / tensor_type::vectorsize;
          const size_type nEntryB = nBlock * tensor_type::vectorsize;
          const size_type iEnd = iEntryBeg + nEntryB;

          typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
          typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
          typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
          VecTV vy;
          vy.zero();
          for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
            const size_type *j = &m_tensor.coord(iEntry,0);
            const size_type *k = &m_tensor.coord(iEntry,1);
            ValTV c(&(m_tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj * xk + ak * xj )
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              vy.multiply_add(c, aj);
            }
          }
          ytmp += vy.sum();
        }

        // Do remaining entries with a scalar loop
        for ( ; iEntry<iEntryEnd; ++iEntry) {
          const size_type j = m_tensor.coord(iEntry,0);
          const size_type k = m_tensor.coord(iEntry,1);
          tensor_scalar cijk = m_tensor.value(iEntry);

          for ( size_type col = 0; col < block_size; ++col ) {
            ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
                             sh_A[col][k] * sh_x[col][j] );
          }

        }

        y[iy] += m_a * ytmp ;
      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();
    }

  }

#endif

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    // Generally the block algorithm seems to perform better on the MIC,
    // as long as the stochastic size isn't too big, but doesn't perform
    // any better on the CPU (probably because the CPU has a fat L3 cache
    // to store the sparse 3 tensor).
#ifdef __MIC__
    const bool use_block_algorithm = true;
#else
    const bool use_block_algorithm = false;
#endif

    const size_t row_count = A.graph.row_map.extent(0) - 1 ;
    if (use_block_algorithm) {
#ifdef __MIC__
      const size_t team_size = 4; // 4 hyperthreads for MIC
#else
      const size_t team_size = 2; // 2 for everything else
#endif
      const size_t league_size = row_count;
      Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
      Kokkos::parallel_for( config , Multiply(A,x,y,a,b) );
    }
    else {
      Kokkos::parallel_for( row_count , Multiply(A,x,y,a,b) );
    }
  }
};
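
// A minimal usage sketch for the specialization above (illustrative, not
// part of this header; A, x, y are hypothetical objects whose types match
// matrix_type, input_vector_type, and output_vector_type, built with a
// common Cijk tensor):
//
//   // y = 0.5 * A * x
//   Stokhos::Multiply< matrix_type, input_vector_type, output_vector_type
//     >::apply( A, x, y, 0.5, 0.0 );
//
// On MIC this dispatches to the team-based block algorithm; elsewhere it
// runs the flat non-team functor, one row per work item.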

// Kernel implementing y = A * x where
// A == KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<...>,...>,
// x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
// x and y are rank 2
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal,
                                         MatrixDevice,
                                         MatrixMemory,
                                         MatrixSize>,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
                              InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
                              OutputP... >
              >
{
public:

  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef typename MatrixDevice::execution_space execution_space;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
  typedef typename tensor_type::size_type size_type;
  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;

private:

  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef typename matrix_values_type::array_type matrix_array_type;
  typedef typename input_vector_type::array_type input_array_type;
  typedef typename output_vector_type::array_type output_array_type;

  typedef typename MatrixValue::value_type matrix_scalar;
  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;
  typedef typename tensor_type::value_type tensor_scalar;

  const matrix_array_type   m_A_values ;
  const matrix_graph_type   m_A_graph ;
  const input_array_type    m_x ;
  const output_array_type   m_y ;
  const tensor_type         m_tensor ;
  const input_scalar        m_a ;
  const output_scalar       m_b ;

  Multiply( const matrix_type & A ,
            const input_vector_type & x ,
            const output_vector_type & y ,
            const input_scalar & a ,
            const output_scalar & b )
    : m_A_values( A.values )
    , m_A_graph( A.graph )
    , m_x( x )
    , m_y( y )
    , m_tensor( Kokkos::cijk(A.values) )
    , m_a( a )
    , m_b( b )
    {}

public:

  //
  // Non-team functor interface -- no threads within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type iBlockRow ) const
  {
    const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
    const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];

    const size_type num_col = m_y.extent(2);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
          m_y(j, iBlockRow, col) = 0 ;
    else
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
          m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;

    // Put the x-column loop inside the A-column loop to reuse entries in A.
    // This way all of the entries for that particular column of A should stay
    // in L1 cache for all of the columns of x.

    for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
      const matrix_scalar * const A = &m_A_values( iEntry, 0 );
      const size_type iBlockCol = m_A_graph.entries(iEntry);

      for (size_type col=0; col<num_col; ++col) {
        output_scalar * const y = &m_y( 0, iBlockRow, col );
        const input_scalar * const x = &m_x( 0, iBlockCol, col );
        BlockMultiply< tensor_type >::apply( m_tensor , A , x , y , m_a );
      }

    }

  }

#if defined(__MIC__)

  //
  // Team functor interface with threading within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  // This is a MIC-specific version that processes multiple FEM columns
  // at a time to reduce tensor reads
  //
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;

  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);

    const size_type num_col = m_y.extent(2);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = 0 ;
    else
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;

    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 9;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const matrix_scalar* sh_A[BlockSize];
    const input_scalar* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      // Loop over columns of x, y
      for (size_type vec_col=0; vec_col<num_col; ++vec_col) {

        output_scalar * const y = & m_y( 0 , iBlockRow , vec_col );

        for ( size_type col = 0; col < block_size; ++col ) {
          const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
          sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
          sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
        }

        for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

          const size_type nEntry = m_tensor.num_entry(iy);
          const size_type iEntryBeg = m_tensor.entry_begin(iy);
          const size_type iEntryEnd = iEntryBeg + nEntry;
          size_type iEntry = iEntryBeg;

          output_scalar ytmp = 0 ;

          // Do entries with a blocked loop of size blocksize
          const size_type nBlock = nEntry / tensor_type::vectorsize;
          const size_type nEntryB = nBlock * tensor_type::vectorsize;
          const size_type iEnd = iEntryBeg + nEntryB;

          typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
          typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
          typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
          VecTV vy;
          vy.zero();
          for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
            const size_type *j = &m_tensor.coord(iEntry,0);
            const size_type *k = &m_tensor.coord(iEntry,1);
            ValTV c(&(m_tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj * xk + ak * xj )
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              vy.multiply_add(c, aj);
            }
          }
          ytmp += vy.sum();

          // The number of nonzeros is always constrained to be a multiple of 8

          const size_type rem = iEntryEnd-iEntry;
          if (rem >= 8) {
            typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
            typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
            typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
            const size_type *j = &m_tensor.coord(iEntry,0);
            const size_type *k = &m_tensor.coord(iEntry,1);
            ValTV2 c(&(m_tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj * xk + ak * xj )
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              aj.times_equal(c);
              ytmp += aj.sum();
            }
          }

          y[iy] += m_a * ytmp ;
        }

      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();

    }

  }

#else

  //
  // Team functor interface with threading within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  // This is a general, hand-vectorized version that processes multiple FEM
  // columns at a time to reduce tensor reads. Note that auto-vectorization
  // doesn't work here because of the inner-loop over FEM columns.
  //
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;

  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);

    const size_type num_col = m_y.extent(2);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = 0 ;
    else
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;

    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 14;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const matrix_scalar* sh_A[BlockSize];
    const input_scalar* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      // Loop over columns of x, y
      for (size_type vec_col=0; vec_col<num_col; ++vec_col) {

        output_scalar * const y = & m_y( 0 , iBlockRow , vec_col );

        for ( size_type col = 0; col < block_size; ++col ) {
          const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
          sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
          sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
        }

        for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

          const size_type nEntry = m_tensor.num_entry(iy);
          const size_type iEntryBeg = m_tensor.entry_begin(iy);
          const size_type iEntryEnd = iEntryBeg + nEntry;
          size_type iEntry = iEntryBeg;

          output_scalar ytmp = 0 ;

          // Do entries with a blocked loop of size blocksize
          if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
            const size_type nBlock = nEntry / tensor_type::vectorsize;
            const size_type nEntryB = nBlock * tensor_type::vectorsize;
            const size_type iEnd = iEntryBeg + nEntryB;

            typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
            typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
            typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
            VecTV vy;
            vy.zero();
            for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
              const size_type *j = &m_tensor.coord(iEntry,0);
              const size_type *k = &m_tensor.coord(iEntry,1);
              ValTV c(&(m_tensor.value(iEntry)));

              for ( size_type col = 0; col < block_size; ++col ) {
                MatTV aj(sh_A[col], j), ak(sh_A[col], k);
                VecTV xj(sh_x[col], j), xk(sh_x[col], k);

                // vy += c * ( aj * xk + ak * xj )
                aj.times_equal(xk);
                aj.multiply_add(ak, xj);
                vy.multiply_add(c, aj);
              }
            }
            ytmp += vy.sum();
          }

          // Do remaining entries with a scalar loop
          for ( ; iEntry<iEntryEnd; ++iEntry) {
            const size_type j = m_tensor.coord(iEntry,0);
            const size_type k = m_tensor.coord(iEntry,1);
            tensor_scalar cijk = m_tensor.value(iEntry);

            for ( size_type col = 0; col < block_size; ++col ) {
              ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
                               sh_A[col][k] * sh_x[col][j] );
            }

          }

          y[iy] += m_a * ytmp ;
        }

      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();
    }

  }

#endif

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    // Generally the block algorithm seems to perform better on the MIC,
    // as long as the stochastic size isn't too big, but doesn't perform
    // any better on the CPU (probably because the CPU has a fat L3 cache
    // to store the sparse 3 tensor).
#ifdef __MIC__
    const bool use_block_algorithm = true;
#else
    const bool use_block_algorithm = false;
#endif

    const size_t row_count = A.graph.row_map.extent(0) - 1 ;
    if (use_block_algorithm) {
#ifdef __MIC__
      const size_t team_size = 4; // 4 hyperthreads for MIC
#else
      const size_t team_size = 2; // 2 for everything else
#endif
      const size_t league_size = row_count;
      Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
      Kokkos::parallel_for( config , Multiply(A,x,y,a,b) );
    }
    else {
      Kokkos::parallel_for( row_count , Multiply(A,x,y,a,b) );
    }
  }
};

// Kernel implementing y = A * x where
// A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
// x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
// x and y are rank 1
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal,
                                         MatrixDevice,
                                         MatrixMemory,
                                         MatrixSize>,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
                              InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
                              OutputP... >
              >
{
public:

  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::const_type const_matrix_type;

  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;

  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const_matrix_type cA = A;
    Multiply< const_matrix_type, input_vector_type, output_vector_type >::apply(
      cA, x, y, a, b);
  }
};

// Kernel implementing y = A * x where
// A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
// x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
// x and y are rank 2
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal,
                                         MatrixDevice,
                                         MatrixMemory,
                                         MatrixSize>,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
                              InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
                              OutputP... >
              >
{
public:

  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::const_type const_matrix_type;

  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;

  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const_matrix_type cA = A;
    Multiply< const_matrix_type, input_vector_type, output_vector_type >::apply(
      cA, x, y, a, b);
  }
};

template <typename MatrixType, typename InputViewType, typename OutputViewType>
class MeanMultiply {};
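
// MeanMultiply handles the special case where the PCE size of A is 1,
// i.e. the matrix stores only mean (zeroth-order) coefficients.  Every PCE
// coefficient of x is then scaled by the same scalar matrix entry, so no
// Cijk tensor contraction is needed; schematically:
//
//   y(k,row) = b * y(k,row) + a * sum_entries A(entry) * x(k, col(entry))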
995 
996 // Kernel implementing y = A * x where PCE size of A is 1
997 // A == KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
998 // x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
999 // x and y are rank 1
1000 template <typename MatrixDevice,
1001  typename MatrixStorage,
1002  typename MatrixOrdinal,
1003  typename MatrixMemory,
1004  typename MatrixSize,
1005  typename InputStorage,
1006  typename ... InputP,
1007  typename OutputStorage,
1008  typename ... OutputP>
1009 class MeanMultiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
1010  MatrixOrdinal,
1011  MatrixDevice,
1012  MatrixMemory,
1013  MatrixSize>,
1014  Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
1015  InputP... >,
1016  Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
1017  OutputP... >
1018  >
1019 {
1020 public:
1024 
1025  typedef KokkosSparse::CrsMatrix< const MatrixValue,
1026  MatrixOrdinal,
1027  MatrixDevice,
1028  MatrixMemory,
1029  MatrixSize> matrix_type;
1030  typedef typename matrix_type::values_type matrix_values_type;
1032  typedef Kokkos::View< const InputVectorValue*,
1033  InputP... > input_vector_type;
1034  typedef Kokkos::View< OutputVectorValue*,
1035  OutputP... > output_vector_type;
1036 
1037  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
1041 
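  // BlockKernel processes the PCE coefficients of x and y in fixed chunks
  // of BlockSize, accumulating into the statically sized buffer s[] so the
  // compiler can keep it in registers and vectorize the coefficient loops.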
  template <int BlockSize>
  struct BlockKernel {
    typedef MatrixDevice execution_space;
    typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
    typedef typename input_vector_type::array_type input_array_type;
    typedef typename output_vector_type::array_type output_array_type;

    const matrix_array_type   m_A_values ;
    const matrix_graph_type   m_A_graph ;
    const output_array_type   v_y ;
    const input_array_type    v_x ;
    const input_scalar        m_a ;
    const output_scalar       m_b ;
    const size_type dim ;
    const size_type numBlocks ;
    const size_type rem ;
    const size_type dim_block ;

    BlockKernel( const matrix_type & A ,
                 const input_vector_type & x ,
                 const output_vector_type & y ,
                 const input_scalar & a ,
                 const output_scalar & b )
      : m_A_values( A.values )
      , m_A_graph( A.graph )
      , v_y( y )
      , v_x( x )
      , m_a( a )
      , m_b( b )
      , dim( dimension_scalar(x) )
      , numBlocks( dim / BlockSize )
      , rem( dim % BlockSize )
      , dim_block( numBlocks*BlockSize )
      {}

    KOKKOS_INLINE_FUNCTION
    void operator()( const size_type iBlockRow ) const
    {
#if defined(__INTEL_COMPILER) && !defined(__CUDA_ARCH__)
      output_scalar s[BlockSize] __attribute__((aligned(64))) = {};
#else
      output_scalar s[BlockSize] = {};
#endif

      const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
      const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];
      size_type pce_block = 0;
      for (; pce_block < dim_block; pce_block+=BlockSize) {
        output_scalar * const y = &v_y(pce_block, iBlockRow);
        if (m_b == output_scalar(0))
#if defined(__INTEL_COMPILER) && !defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < BlockSize; ++k)
            s[k] = 0.0;
        else
#if defined(__INTEL_COMPILER) && !defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < BlockSize; ++k)
            s[k] = m_b*y[k];
        for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
          const matrix_scalar aA = m_a*m_A_values(iEntry);
          const size_type col = m_A_graph.entries(iEntry);
          const input_scalar * const x = &v_x(pce_block, col);
#if defined(__INTEL_COMPILER) && !defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < BlockSize; ++k)
            s[k] += aA*x[k];
        }
#if defined(__INTEL_COMPILER) && !defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < BlockSize; ++k) {
          y[k] = s[k];
        }
      }

      // Remaining coeffs
      if (rem > 0) {
        output_scalar * const y = &v_y(pce_block, iBlockRow);
        if (m_b == output_scalar(0))
#if defined(__INTEL_COMPILER) && !defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < rem; ++k)
            s[k] = 0.0;
        else
#if defined(__INTEL_COMPILER) && !defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < rem; ++k)
            s[k] = m_b*y[k];
        for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
          const matrix_scalar aA = m_a*m_A_values(iEntry);
          const size_type col = m_A_graph.entries(iEntry);
          const input_scalar * const x = &v_x(pce_block, col);
#if defined(__INTEL_COMPILER) && !defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < rem; ++k)
            s[k] += aA*x[k];
        }
#if defined(__INTEL_COMPILER) && !defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < rem; ++k) {
          y[k] = s[k];
        }
      }

    }

  };

  struct Kernel {
    typedef MatrixDevice execution_space;
    typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
    typedef typename input_vector_type::array_type input_array_type;
    typedef typename output_vector_type::array_type output_array_type;

    const matrix_array_type   m_A_values ;
    const matrix_graph_type   m_A_graph ;
    const output_array_type   v_y ;
    const input_array_type    v_x ;
    const input_scalar        m_a ;
    const output_scalar       m_b ;
    const size_type dim ;

    Kernel( const matrix_type & A ,
            const input_vector_type & x ,
            const output_vector_type & y ,
            const input_scalar & a ,
            const output_scalar & b )
      : m_A_values( A.values )
      , m_A_graph( A.graph )
      , v_y( y )
      , v_x( x )
      , m_a( a )
      , m_b( b )
      , dim( dimension_scalar(x) )
      {}

    KOKKOS_INLINE_FUNCTION
    void operator()( const size_type iBlockRow ) const
    {
      const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
      const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];
      output_scalar * const y = &v_y(0, iBlockRow);
      if (m_b == output_scalar(0))
#if defined(__INTEL_COMPILER) && !defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < dim; ++k)
          y[k] = 0.0;
      else
#if defined(__INTEL_COMPILER) && !defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < dim; ++k)
          y[k] = m_b*y[k];
      for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
        const matrix_scalar aA = m_a*m_A_values(iEntry);
        const size_type col = m_A_graph.entries(iEntry);
        const input_scalar * const x = &v_x(0, col);
#if defined(__INTEL_COMPILER) && !defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < dim; ++k)
          y[k] += aA*x[k];
      }
    }

  };

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const size_t row_count = A.graph.row_map.extent(0) - 1 ;
    const size_type dim = Kokkos::dimension_scalar(x);

    // Choose block size appropriately for PCE dimension
#if defined(__MIC__)
    if (dim >= 128)
      Kokkos::parallel_for( row_count , Kernel(A,x,y,a,b) );
    else if (dim >= 64)
      Kokkos::parallel_for( row_count , BlockKernel<64>(A,x,y,a,b) );
    else if (dim >= 32)
      Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
    else if (dim >= 16)
      Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
    else if (dim >= 8)
      Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
    else
      Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
#else
    if (dim >= 32)
      Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
    else if (dim >= 16)
      Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
    else if (dim >= 8)
      Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
    else
      Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
#endif
  }
};

// Kernel implementing y = A * x where PCE size of A is 1
// A == KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
// x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
// x and y are rank 2
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal,
                                             MatrixDevice,
                                             MatrixMemory,
                                             MatrixSize>,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
                                  InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
                                  OutputP... >
                  >
{
public:

  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;

  typedef typename MatrixValue::ordinal_type size_type;
  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    typedef Kokkos::View< const InputVectorValue*,
                          InputP... > input_vector_1d_type;
    typedef Kokkos::View< OutputVectorValue*,
                          OutputP... > output_vector_1d_type;
    typedef MeanMultiply<matrix_type, input_vector_1d_type,
                         output_vector_1d_type> MeanMultiply1D;
    const size_type num_col = x.extent(1);
    for (size_type i=0; i<num_col; ++i) {
      input_vector_1d_type x_col =
        Kokkos::subview(x, Kokkos::ALL(), i);
      output_vector_1d_type y_col =
        Kokkos::subview(y, Kokkos::ALL(), i);
      MeanMultiply1D::apply( A, x_col, y_col, a, b );
    }
  }
};

// Kernel implementing y = A * x where PCE size of A is 1
// A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
// x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
// x and y are rank 1
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal,
                                             MatrixDevice,
                                             MatrixMemory,
                                             MatrixSize>,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
                                  InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
                                  OutputP... >
                  >
{
public:

  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::const_type const_matrix_type;

  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;

  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const_matrix_type cA = A;
    MeanMultiply< const_matrix_type, input_vector_type, output_vector_type >::apply(
      cA, x, y, a, b);
  }
};

// Kernel implementing y = A * x where PCE size of A is 1
// A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
// x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
// x and y are rank 2
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal,
                                             MatrixDevice,
                                             MatrixMemory,
                                             MatrixSize>,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
                                  InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
                                  OutputP... >
                  >
{
public:

  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::const_type const_matrix_type;

  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;

  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const_matrix_type cA = A;
    MeanMultiply< const_matrix_type, input_vector_type, output_vector_type >::apply(
      cA, x, y, a, b);
  }
};

} // namespace Stokhos

namespace KokkosSparse {

template <typename AlphaType,
          typename BetaType,
          typename MatrixType,
          typename InputType,
          typename ... InputP,
          typename OutputType,
          typename ... OutputP>
typename std::enable_if<
  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value
  >::type
spmv(
  const char mode[],
  const AlphaType& a,
  const MatrixType& A,
  const Kokkos::View< InputType, InputP... >& x,
  const BetaType& b,
  const Kokkos::View< OutputType, OutputP... >& y,
  const RANK_ONE)
{
  typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
  typedef Kokkos::View< InputType, InputP... > InputVectorType;
  typedef Stokhos::Multiply<MatrixType, typename InputVectorType::const_type,
                            OutputVectorType> multiply_type;
  typedef Stokhos::MeanMultiply<MatrixType, typename InputVectorType::const_type,
                                OutputVectorType> mean_multiply_type;

  if(mode[0]!='N') {
    Kokkos::Impl::raise_error(
      "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
  }

  if (!Sacado::is_constant(a) || !Sacado::is_constant(b)) {
    Kokkos::Impl::raise_error(
      "Stokhos spmv not implemented for non-constant a or b");
  }
  if (dimension_scalar(A.values) == 1 && dimension_scalar(x) != 1) {
    mean_multiply_type::apply( A, x, y,
                               Sacado::Value<AlphaType>::eval(a),
                               Sacado::Value<BetaType>::eval(b) );
  }
  else
    multiply_type::apply( A, x, y,
                          Sacado::Value<AlphaType>::eval(a),
                          Sacado::Value<BetaType>::eval(b) );
}

template <typename AlphaType,
          typename BetaType,
          typename MatrixType,
          typename InputType,
          typename ... InputP,
          typename OutputType,
          typename ... OutputP>
typename std::enable_if<
  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value
  >::type
spmv(
  KokkosKernels::Experimental::Controls,
  const char mode[],
  const AlphaType& a,
  const MatrixType& A,
  const Kokkos::View< InputType, InputP... >& x,
  const BetaType& b,
  const Kokkos::View< OutputType, OutputP... >& y,
  const RANK_ONE)
{
  spmv(mode, a, A, x, b, y, RANK_ONE());
}

template <typename AlphaType,
          typename BetaType,
          typename MatrixType,
          typename InputType,
          typename ... InputP,
          typename OutputType,
          typename ... OutputP>
typename std::enable_if<
  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value
  >::type
spmv(
  const char mode[],
  const AlphaType& a,
  const MatrixType& A,
  const Kokkos::View< InputType, InputP... >& x,
  const BetaType& b,
  const Kokkos::View< OutputType, OutputP... >& y,
  const RANK_TWO)
{
  if(mode[0]!='N') {
    Kokkos::Impl::raise_error(
      "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
  }
  if (y.extent(1) == 1) {
    auto y_1D = subview(y, Kokkos::ALL(), 0);
    auto x_1D = subview(x, Kokkos::ALL(), 0);
    spmv(mode, a, A, x_1D, b, y_1D, RANK_ONE());
  }
  else {
    typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
    typedef Kokkos::View< InputType, InputP... > InputVectorType;

    typedef Stokhos::Multiply<MatrixType, typename InputVectorType::const_type,
                              OutputVectorType> multiply_type;
    typedef Stokhos::MeanMultiply<MatrixType, typename InputVectorType::const_type,
                                  OutputVectorType> mean_multiply_type;

    if (!Sacado::is_constant(a) || !Sacado::is_constant(b)) {
      Kokkos::Impl::raise_error(
        "Stokhos spmv not implemented for non-constant a or b");
    }

    typename InputVectorType::const_type x_const = x;

    if (dimension_scalar(A.values) == 1 && dimension_scalar(x) != 1) {
      mean_multiply_type::apply( A, x_const, y,
                                 Sacado::Value<AlphaType>::eval(a),
                                 Sacado::Value<BetaType>::eval(b));
    }
    else
      multiply_type::apply( A, x_const, y,
                            Sacado::Value<AlphaType>::eval(a),
                            Sacado::Value<BetaType>::eval(b));
  }
}

template <typename AlphaType,
          typename BetaType,
          typename MatrixType,
          typename InputType,
          typename ... InputP,
          typename OutputType,
          typename ... OutputP>
typename std::enable_if<
  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value
  >::type
spmv(
  KokkosKernels::Experimental::Controls,
  const char mode[],
  const AlphaType& a,
  const MatrixType& A,
  const Kokkos::View< InputType, InputP... >& x,
  const BetaType& b,
  const Kokkos::View< OutputType, OutputP... >& y,
  const RANK_TWO)
{
  spmv(mode, a, A, x, b, y, RANK_TWO());
}
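
// Usage sketch for the overloads above (illustrative; A, x, y are
// hypothetical PCE-valued matrix/vector objects created with a common Cijk
// tensor attached to their views):
//
//   // y = a*A*x + b*y through the standard KokkosSparse interface,
//   // which dispatches to the RANK_ONE/RANK_TWO overloads above
//   KokkosSparse::spmv( "N", a, A, x, b, y );
//
// Only mode "N" is supported, and a and b must be constant (mean-only)
// scalars; anything else raises a runtime error.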

} // namespace KokkosSparse

#endif /* #ifndef KOKKOS_CRSMATRIX_UQ_PCE_HPP */