Kokkos_CrsMatrix_UQ_PCE.hpp
// @HEADER
// *****************************************************************************
// Stokhos Package
//
// Copyright 2009 NTESS and the Stokhos contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef KOKKOS_CRSMATRIX_UQ_PCE_HPP
#define KOKKOS_CRSMATRIX_UQ_PCE_HPP

#include "Sacado_UQ_PCE.hpp"
#include "Kokkos_View_UQ_PCE.hpp"
#include "KokkosSparse_CrsMatrix.hpp"
#include "KokkosSparse_spmv.hpp"

#include "Kokkos_Blas1_UQ_PCE.hpp" // for some utilities

#include "Stokhos_Multiply.hpp"
#include "Stokhos_CrsProductTensor.hpp"

namespace Stokhos {

namespace Impl {
  // Remove MemoryRandomAccess memory trait from a given view
  template <typename ViewType, typename Enabled = void>
  class RemoveRandomAccess {
  public:
    typedef ViewType type;
  };
  template <typename ViewType>
  class RemoveRandomAccess<
    ViewType,
    std::enable_if_t<ViewType::memory_traits::is_random_access> > {
  public:
    static constexpr unsigned M0 = ViewType::memory_traits::impl_value;
    static constexpr unsigned M1 =
      M0 & (Kokkos::Unmanaged | Kokkos::Atomic | Kokkos::Restrict | Kokkos::Aligned);
    typedef Kokkos::View<typename ViewType::data_type,
                         typename ViewType::array_layout,
                         typename ViewType::device_type,
                         Kokkos::MemoryTraits<M1> > type;
  };
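
  // Example (illustrative sketch; the aliases RAView/PlainView are ad hoc):
  // the mask M1 keeps the Unmanaged/Atomic/Restrict/Aligned bits but drops
  // RandomAccess, so
  //
  //   typedef Kokkos::View<
  //     double*, Kokkos::MemoryTraits<Kokkos::RandomAccess|Kokkos::Unmanaged> > RAView;
  //   typedef RemoveRandomAccess<RAView>::type PlainView;
  //
  // yields a view equivalent to
  //   Kokkos::View<double*, Kokkos::MemoryTraits<Kokkos::Unmanaged> >.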
}

//----------------------------------------------------------------------------
// Specialization of KokkosSparse::CrsMatrix for Sacado::UQ::PCE scalar type
//----------------------------------------------------------------------------

// Kernel implementing y = A * x where
// A == KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<...>,...>,
// x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
// x and y are rank 1
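//
// For each matrix entry, the PCE block product expands through the sparse
// symmetric triple-product tensor c(iy;j,k) (a sketch of the math the kernels
// below implement; the tensor stores each symmetric (j,k) pair once, hence
// the two-term sum):
//
//   y[iy] += a * sum_{(j,k)} c(iy;j,k) * ( A[j]*x[k] + A[k]*x[j] )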
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal,
                                         MatrixDevice,
                                         MatrixMemory,
                                         MatrixSize>,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
                              InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
                              OutputP... >
                >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef typename MatrixDevice::execution_space execution_space;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
  typedef typename tensor_type::size_type size_type;
  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;

private:

  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef typename matrix_values_type::array_type matrix_array_type;
  typedef typename input_vector_type::array_type input_array_type;
  typedef typename output_vector_type::array_type output_array_type;

  typedef typename MatrixValue::value_type matrix_scalar;
  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;
  typedef typename tensor_type::value_type tensor_scalar;

  const matrix_array_type   m_A_values ;
  const matrix_graph_type   m_A_graph ;
  const input_array_type    m_x ;
  const output_array_type   m_y ;
  const tensor_type         m_tensor ;
  const input_scalar        m_a ;
  const output_scalar       m_b ;

  Multiply( const matrix_type & A ,
            const input_vector_type & x ,
            const output_vector_type & y ,
            const input_scalar & a ,
            const output_scalar & b )
    : m_A_values( A.values )
    , m_A_graph( A.graph )
    , m_x( x )
    , m_y( y )
    , m_tensor( Kokkos::cijk(A.values) )
    , m_a( a )
    , m_b( b )
    {}

public:

  //
  // Non-team functor interface -- no threads within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type iBlockRow ) const
  {
    // Prefer that y[ m_tensor.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    output_scalar * const y = & m_y(0,iBlockRow);

    const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
    const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
        y[j] = 0 ;
    else
      for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
        y[j] = m_b * y[j] ;

    // Loop over columns of A
    for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
      const input_scalar * const x  = & m_x( 0 , m_A_graph.entries(iEntry) );
      const matrix_scalar * const A = & m_A_values( iEntry , 0 );

      BlockMultiply< tensor_type >::apply(
        m_tensor , A , x , y , m_a );
    }

  }

#if defined(__MIC__)

  //
  // Team functor interface with threading within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  // This is a MIC-specific version that processes multiple FEM columns
  // at a time to reduce tensor reads
  //
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);

    // Prefer that y[ m_tensor.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    output_scalar * const y = & m_y(0,iBlockRow);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = 0 ;
    else
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = m_b * y[j] ;

    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 9;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const matrix_scalar* sh_A[BlockSize];
    const input_scalar* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
        sh_x[col] = & m_x( 0 , iBlockColumn );
        sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
      }

      for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

        const size_type nEntry = m_tensor.num_entry(iy);
        const size_type iEntryBeg = m_tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
        size_type iEntry = iEntryBeg;

        output_scalar ytmp = 0 ;

        // Do entries with a blocked loop of size blocksize
        const size_type nBlock = nEntry / tensor_type::vectorsize;
        const size_type nEntryB = nBlock * tensor_type::vectorsize;
        const size_type iEnd = iEntryBeg + nEntryB;

        typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
        typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
        typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
        VecTV vy;
        vy.zero();
        for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
          const size_type *j = &m_tensor.coord(iEntry,0);
          const size_type *k = &m_tensor.coord(iEntry,1);
          ValTV c(&(m_tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV xj(sh_x[col], j), xk(sh_x[col], k);

            // vy += c * ( aj * xk + ak * xj)
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            vy.multiply_add(c, aj);
          }
        }
        ytmp += vy.sum();

        // The number of nonzeros is always constrained to be a multiple of 8

        const size_type rem = iEntryEnd-iEntry;
        if (rem >= 8) {
          typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
          typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
          typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
          const size_type *j = &m_tensor.coord(iEntry,0);
          const size_type *k = &m_tensor.coord(iEntry,1);
          ValTV2 c(&(m_tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);

            // vy += c * ( aj * xk + ak * xj)
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            aj.times_equal(c);
            ytmp += aj.sum();
          }
        }

        y[iy] += m_a * ytmp ;
      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();
    }

  }

#else

  //
  // Team functor interface with threading within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  // This is a general, hand-vectorized version that processes multiple FEM
  // columns at a time to reduce tensor reads.  Note that auto-vectorization
  // doesn't work here because of the inner-loop over FEM columns.
  //
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);

    // Prefer that y[ m_tensor.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    output_scalar * const y = & m_y(0,iBlockRow);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = 0 ;
    else
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = m_b * y[j] ;

    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 14;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const matrix_scalar* sh_A[BlockSize];
    const input_scalar* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
        sh_x[col] = & m_x( 0 , iBlockColumn );
        sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
      }

      for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

        const size_type nEntry = m_tensor.num_entry(iy);
        const size_type iEntryBeg = m_tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
        size_type iEntry = iEntryBeg;

        output_scalar ytmp = 0 ;

        // Do entries with a blocked loop of size blocksize
        if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
          const size_type nBlock = nEntry / tensor_type::vectorsize;
          const size_type nEntryB = nBlock * tensor_type::vectorsize;
          const size_type iEnd = iEntryBeg + nEntryB;

          typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
          typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
          typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
          VecTV vy;
          vy.zero();
          for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
            const size_type *j = &m_tensor.coord(iEntry,0);
            const size_type *k = &m_tensor.coord(iEntry,1);
            ValTV c(&(m_tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj * xk + ak * xj)
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              vy.multiply_add(c, aj);
            }
          }
          ytmp += vy.sum();
        }

        // Do remaining entries with a scalar loop
        for ( ; iEntry<iEntryEnd; ++iEntry) {
          const size_type j = m_tensor.coord(iEntry,0);
          const size_type k = m_tensor.coord(iEntry,1);
          tensor_scalar cijk = m_tensor.value(iEntry);

          for ( size_type col = 0; col < block_size; ++col ) {
            ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
                             sh_A[col][k] * sh_x[col][j] );
          }

        }

        y[iy] += m_a * ytmp ;
      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();
    }

  }

#endif

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    // Generally the block algorithm seems to perform better on the MIC,
    // as long as the stochastic size isn't too big, but doesn't perform
    // any better on the CPU (probably because the CPU has a fat L3 cache
    // to store the sparse 3 tensor).
#ifdef __MIC__
    const bool use_block_algorithm = true;
#else
    const bool use_block_algorithm = false;
#endif

    const size_t row_count = A.graph.row_map.extent(0) - 1 ;
    if (use_block_algorithm) {
#ifdef __MIC__
      const size_t team_size = 4; // 4 hyperthreads for MIC
#else
      const size_t team_size = 2; // 2 for everything else
#endif
      const size_t league_size = row_count;
      Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
      Kokkos::parallel_for( config , Multiply(A,x,y,a,b) );
    }
    else {
      Kokkos::parallel_for( row_count , Multiply(A,x,y,a,b) );
    }
  }
};

// Kernel implementing y = A * x where
// A == KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<...>,...>,
// x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
// x and y are rank 2
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal,
                                         MatrixDevice,
                                         MatrixMemory,
                                         MatrixSize>,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
                              InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
                              OutputP... >
                >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef typename MatrixDevice::execution_space execution_space;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
  typedef typename tensor_type::size_type size_type;
  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;

private:

  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef typename matrix_values_type::array_type matrix_array_type;
  typedef typename input_vector_type::array_type input_array_type;
  typedef typename output_vector_type::array_type output_array_type;

  typedef typename MatrixValue::value_type matrix_scalar;
  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;
  typedef typename tensor_type::value_type tensor_scalar;

  const matrix_array_type   m_A_values ;
  const matrix_graph_type   m_A_graph ;
  const input_array_type    m_x ;
  const output_array_type   m_y ;
  const tensor_type         m_tensor ;
  const input_scalar        m_a ;
  const output_scalar       m_b ;

  Multiply( const matrix_type & A ,
            const input_vector_type & x ,
            const output_vector_type & y ,
            const input_scalar & a ,
            const output_scalar & b )
    : m_A_values( A.values )
    , m_A_graph( A.graph )
    , m_x( x )
    , m_y( y )
    , m_tensor( Kokkos::cijk(A.values) )
    , m_a( a )
    , m_b( b )
    {}

public:

  //
  // Non-team functor interface -- no threads within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type iBlockRow ) const
  {
    const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
    const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];

    const size_type num_col = m_y.extent(2);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
          m_y(j, iBlockRow, col) = 0 ;
    else
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
          m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;

    // Put the x-column loop inside the A-column loop to reuse entries in A.
    // This way all of the entries for that particular column of A should stay
    // in L1 cache for all of the columns of x.

    for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
      const matrix_scalar * const A = &m_A_values( iEntry, 0 );
      const size_type iBlockCol = m_A_graph.entries(iEntry);

      for (size_type col=0; col<num_col; ++col) {
        output_scalar * const y      = &m_y( 0, iBlockRow, col );
        const input_scalar * const x = &m_x( 0, iBlockCol, col );
        BlockMultiply< tensor_type >::apply( m_tensor , A , x , y , m_a );
      }

    }

  }

#if defined(__MIC__)

  //
  // Team functor interface with threading within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  // This is a MIC-specific version that processes multiple FEM columns
  // at a time to reduce tensor reads
  //
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;

  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);

    const size_type num_col = m_y.extent(2);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = 0 ;
    else
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;

    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 9;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const matrix_scalar* sh_A[BlockSize];
    const input_scalar* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      // Loop over columns of x, y
      for (size_type vec_col=0; vec_col<num_col; ++vec_col) {

        output_scalar * const y = & m_y( 0 , iBlockRow , vec_col );

        for ( size_type col = 0; col < block_size; ++col ) {
          const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
          sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
          sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
        }

        for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

          const size_type nEntry = m_tensor.num_entry(iy);
          const size_type iEntryBeg = m_tensor.entry_begin(iy);
          const size_type iEntryEnd = iEntryBeg + nEntry;
          size_type iEntry = iEntryBeg;

          output_scalar ytmp = 0 ;

          // Do entries with a blocked loop of size blocksize
          const size_type nBlock = nEntry / tensor_type::vectorsize;
          const size_type nEntryB = nBlock * tensor_type::vectorsize;
          const size_type iEnd = iEntryBeg + nEntryB;

          typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
          typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
          typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
          VecTV vy;
          vy.zero();
          for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
            const size_type *j = &m_tensor.coord(iEntry,0);
            const size_type *k = &m_tensor.coord(iEntry,1);
            ValTV c(&(m_tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj * xk + ak * xj)
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              vy.multiply_add(c, aj);
            }
          }
          ytmp += vy.sum();

          // The number of nonzeros is always constrained to be a multiple of 8

          const size_type rem = iEntryEnd-iEntry;
          if (rem >= 8) {
            typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
            typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
            typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
            const size_type *j = &m_tensor.coord(iEntry,0);
            const size_type *k = &m_tensor.coord(iEntry,1);
            ValTV2 c(&(m_tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj * xk + ak * xj)
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              aj.times_equal(c);
              ytmp += aj.sum();
            }
          }

          y[iy] += m_a * ytmp ;
        }

      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();

    }

  }

#else

  //
  // Team functor interface with threading within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  // This is a general, hand-vectorized version that processes multiple FEM
  // columns at a time to reduce tensor reads.  Note that auto-vectorization
  // doesn't work here because of the inner-loop over FEM columns.
  //
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;

  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);

    const size_type num_col = m_y.extent(2);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = 0 ;
    else
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;

    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 14;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const matrix_scalar* sh_A[BlockSize];
    const input_scalar* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      // Loop over columns of x, y
      for (size_type vec_col=0; vec_col<num_col; ++vec_col) {

        output_scalar * const y = & m_y( 0 , iBlockRow , vec_col );

        for ( size_type col = 0; col < block_size; ++col ) {
          const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
          sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
          sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
        }

        for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

          const size_type nEntry = m_tensor.num_entry(iy);
          const size_type iEntryBeg = m_tensor.entry_begin(iy);
          const size_type iEntryEnd = iEntryBeg + nEntry;
          size_type iEntry = iEntryBeg;

          output_scalar ytmp = 0 ;

          // Do entries with a blocked loop of size blocksize
          if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
            const size_type nBlock = nEntry / tensor_type::vectorsize;
            const size_type nEntryB = nBlock * tensor_type::vectorsize;
            const size_type iEnd = iEntryBeg + nEntryB;

            typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
            typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
            typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
            VecTV vy;
            vy.zero();
            for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
              const size_type *j = &m_tensor.coord(iEntry,0);
              const size_type *k = &m_tensor.coord(iEntry,1);
              ValTV c(&(m_tensor.value(iEntry)));

              for ( size_type col = 0; col < block_size; ++col ) {
                MatTV aj(sh_A[col], j), ak(sh_A[col], k);
                VecTV xj(sh_x[col], j), xk(sh_x[col], k);

                // vy += c * ( aj * xk + ak * xj)
                aj.times_equal(xk);
                aj.multiply_add(ak, xj);
                vy.multiply_add(c, aj);
              }
            }
            ytmp += vy.sum();
          }

          // Do remaining entries with a scalar loop
          for ( ; iEntry<iEntryEnd; ++iEntry) {
            const size_type j = m_tensor.coord(iEntry,0);
            const size_type k = m_tensor.coord(iEntry,1);
            tensor_scalar cijk = m_tensor.value(iEntry);

            for ( size_type col = 0; col < block_size; ++col ) {
              ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
                               sh_A[col][k] * sh_x[col][j] );
            }

          }

          y[iy] += m_a * ytmp ;
        }

      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();
    }

  }

#endif

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    // Generally the block algorithm seems to perform better on the MIC,
    // as long as the stochastic size isn't too big, but doesn't perform
    // any better on the CPU (probably because the CPU has a fat L3 cache
    // to store the sparse 3 tensor).
#ifdef __MIC__
    const bool use_block_algorithm = true;
#else
    const bool use_block_algorithm = false;
#endif

    const size_t row_count = A.graph.row_map.extent(0) - 1 ;
    if (use_block_algorithm) {
#ifdef __MIC__
      const size_t team_size = 4; // 4 hyperthreads for MIC
#else
      const size_t team_size = 2; // 2 for everything else
#endif
      const size_t league_size = row_count;
      Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
      Kokkos::parallel_for( config , Multiply(A,x,y,a,b) );
    }
    else {
      Kokkos::parallel_for( row_count , Multiply(A,x,y,a,b) );
    }
  }
};

// Kernel implementing y = A * x where
// A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
// x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
// x and y are rank 1
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal,
                                         MatrixDevice,
                                         MatrixMemory,
                                         MatrixSize>,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
                              InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
                              OutputP... >
                >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::const_type const_matrix_type;

  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;

  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const_matrix_type cA = A;
    Multiply< const_matrix_type, input_vector_type, output_vector_type >::apply(
      cA, x, y, a, b);
  }
};

// Kernel implementing y = A * x where
// A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
// x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
// x and y are rank 2
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal,
                                         MatrixDevice,
                                         MatrixMemory,
                                         MatrixSize>,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
                              InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
                              OutputP... >
                >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::const_type const_matrix_type;

  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;

  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const_matrix_type cA = A;
    Multiply< const_matrix_type, input_vector_type, output_vector_type >::apply(
      cA, x, y, a, b);
  }
};


template <typename MatrixType, typename InputViewType, typename OutputViewType>
class MeanMultiply {};
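
// MeanMultiply implements y = a*A*x + b*y for the special case where the
// PCE size of A is 1, i.e., each matrix entry stores only its mean value.
// The spmv() overloads at the bottom of this file dispatch to these kernels
// when dimension_scalar(A.values) == 1, and to the full Multiply kernels
// otherwise.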

// Kernel implementing y = A * x where PCE size of A is 1
// A == KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
// x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
// x and y are rank 1
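//
// With a single PCE coefficient per matrix entry the triple-product tensor
// drops out, and each PCE coefficient k of y is an ordinary sparse
// matrix-vector product (a sketch of what BlockKernel/Kernel below compute):
//
//   y(k,row) = b*y(k,row) + a * sum_entries A_mean(entry) * x(k, col(entry))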
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal,
                                             MatrixDevice,
                                             MatrixMemory,
                                             MatrixSize>,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
                                  InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
                                  OutputP... >
                    >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename MatrixValue::ordinal_type size_type;
  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;

  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef typename MatrixValue::value_type matrix_scalar;
  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  template <int BlockSize>
  struct BlockKernel {
    typedef typename MatrixDevice::execution_space execution_space;
    typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
    typedef typename input_vector_type::array_type input_array_type;
    typedef typename output_vector_type::array_type output_array_type;

    const matrix_array_type   m_A_values ;
    const matrix_graph_type   m_A_graph ;
    const output_array_type   v_y ;
    const input_array_type    v_x ;
    const input_scalar        m_a ;
    const output_scalar       m_b ;
    const size_type dim ;
    const size_type numBlocks ;
    const size_type rem ;
    const size_type dim_block ;

    BlockKernel( const matrix_type & A ,
                 const input_vector_type & x ,
                 const output_vector_type & y ,
                 const input_scalar & a ,
                 const output_scalar & b )
      : m_A_values( A.values )
      , m_A_graph( A.graph )
      , v_y( y )
      , v_x( x )
      , m_a( a )
      , m_b( b )
      , dim( dimension_scalar(x) )
      , numBlocks( dim / BlockSize )
      , rem( dim % BlockSize )
      , dim_block( numBlocks*BlockSize )
      {}

    KOKKOS_INLINE_FUNCTION
    void operator()( const size_type iBlockRow ) const
    {
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
      output_scalar s[BlockSize] __attribute__((aligned(64))) = {};
#else
      output_scalar s[BlockSize] = {};
#endif

      const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
      const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];
      size_type pce_block = 0;
      for (; pce_block < dim_block; pce_block+=BlockSize) {
        output_scalar * const y = &v_y(pce_block, iBlockRow);
        if (m_b == output_scalar(0))
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < BlockSize; ++k)
            s[k] = 0.0;
        else
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < BlockSize; ++k)
            s[k] = m_b*y[k];
        for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
          const matrix_scalar aA = m_a*m_A_values(iEntry);
          const size_type col = m_A_graph.entries(iEntry);
          const input_scalar * const x = &v_x(pce_block, col);
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < BlockSize; ++k)
            s[k] += aA*x[k];
        }
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < BlockSize; ++k) {
          y[k] = s[k];
        }
      }

      // Remaining coeffs
      if (rem > 0) {
        output_scalar * const y = &v_y(pce_block, iBlockRow);
        if (m_b == output_scalar(0))
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < rem; ++k)
            s[k] = 0.0;
        else
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < rem; ++k)
            s[k] = m_b*y[k];
        for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
          const matrix_scalar aA = m_a*m_A_values(iEntry);
          const size_type col = m_A_graph.entries(iEntry);
          const input_scalar * const x = &v_x(pce_block, col);
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < rem; ++k)
            s[k] += aA*x[k];
        }
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < rem; ++k) {
          y[k] = s[k];
        }
      }

    }

  };

  struct Kernel {
    typedef typename MatrixDevice::execution_space execution_space;
    typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
    typedef typename input_vector_type::array_type input_array_type;
    typedef typename output_vector_type::array_type output_array_type;

    const matrix_array_type   m_A_values ;
    const matrix_graph_type   m_A_graph ;
    const output_array_type   v_y ;
    const input_array_type    v_x ;
    const input_scalar        m_a ;
    const output_scalar       m_b ;
    const size_type dim ;

    Kernel( const matrix_type & A ,
            const input_vector_type & x ,
            const output_vector_type & y ,
            const input_scalar & a ,
            const output_scalar & b )
      : m_A_values( A.values )
      , m_A_graph( A.graph )
      , v_y( y )
      , v_x( x )
      , m_a( a )
      , m_b( b )
      , dim( dimension_scalar(x) )
      {}

    KOKKOS_INLINE_FUNCTION
    void operator()( const size_type iBlockRow ) const
    {
      const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
      const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];
      output_scalar * const y = &v_y(0, iBlockRow);
      if (m_b == output_scalar(0))
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < dim; ++k)
          y[k] = 0.0;
      else
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < dim; ++k)
          y[k] = m_b*y[k];
      for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
        const matrix_scalar aA = m_a*m_A_values(iEntry);
        const size_type col = m_A_graph.entries(iEntry);
        const input_scalar * const x = &v_x(0, col);
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < dim; ++k)
          y[k] += aA*x[k];
      }
    }

  };

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const size_t row_count = A.graph.row_map.extent(0) - 1 ;
    const size_type dim = Kokkos::dimension_scalar(x);

    // Choose block size appropriately for PCE dimension
#if defined (__MIC__)
    if (dim >= 128)
      Kokkos::parallel_for( row_count , Kernel(A,x,y,a,b) );
    else if (dim >= 64)
      Kokkos::parallel_for( row_count , BlockKernel<64>(A,x,y,a,b) );
    else if (dim >= 32)
      Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
    else if (dim >= 16)
      Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
    else if (dim >= 8)
      Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
    else
      Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
#else
    if (dim >= 32)
      Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
    else if (dim >= 16)
      Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
    else if (dim >= 8)
      Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
    else
      Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
#endif
  }
};

// Kernel implementing y = A * x where A has PCE size = 1
// A == KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<...>,...>,
// x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
// x and y are rank 2
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal,
                                             MatrixDevice,
                                             MatrixMemory,
                                             MatrixSize>,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
                                  InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
                                  OutputP... >
                    >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;

  typedef typename MatrixValue::ordinal_type size_type;
  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    typedef Kokkos::View< const InputVectorValue*,
                          InputP... > input_vector_1d_type;
    typedef Kokkos::View< OutputVectorValue*,
                          OutputP... > output_vector_1d_type;
    typedef MeanMultiply<matrix_type, input_vector_1d_type,
                         output_vector_1d_type> MeanMultiply1D;
    const size_type num_col = x.extent(1);
    for (size_type i=0; i<num_col; ++i) {
      input_vector_1d_type x_col =
        Kokkos::subview(x, Kokkos::ALL(), i);
      output_vector_1d_type y_col =
        Kokkos::subview(y, Kokkos::ALL(), i);
      MeanMultiply1D::apply( A, x_col, y_col, a, b );
    }
  }
};

// Kernel implementing y = A * x where PCE size of A is 1
// A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
// x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
// x and y are rank 1
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal,
                                             MatrixDevice,
                                             MatrixMemory,
                                             MatrixSize>,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
                                  InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
                                  OutputP... >
                    >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::const_type const_matrix_type;

  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;

  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const_matrix_type cA = A;
    MeanMultiply< const_matrix_type, input_vector_type, output_vector_type >::apply(
      cA, x, y, a, b);
  }
};

// Kernel implementing y = A * x where PCE size of A is 1
// A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
// x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
// x and y are rank 2
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal,
                                             MatrixDevice,
                                             MatrixMemory,
                                             MatrixSize>,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
                                  InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
                                  OutputP... >
                    >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::const_type const_matrix_type;

  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;

  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const_matrix_type cA = A;
    MeanMultiply< const_matrix_type, input_vector_type, output_vector_type >::apply(
      cA, x, y, a, b);
  }
};

} // namespace Stokhos

namespace KokkosSparse {

template <
#if KOKKOSKERNELS_VERSION >= 40199
          typename ExecutionSpace,
#endif
#if KOKKOSKERNELS_VERSION >= 40299
          typename Handle,
#endif
          typename AlphaType,
          typename BetaType,
          typename MatrixType,
          typename InputType,
          typename ... InputP,
          typename OutputType,
          typename ... OutputP>
typename std::enable_if<
  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value
#if KOKKOSKERNELS_VERSION >= 40299
  && KokkosSparse::is_crs_matrix_v<MatrixType>
  && (Kokkos::View< OutputType, OutputP... >::rank() == 1)
#endif
  >::type
spmv(
#if KOKKOSKERNELS_VERSION >= 40199
  const ExecutionSpace& space,
#endif
#if KOKKOSKERNELS_VERSION < 40299
  KokkosKernels::Experimental::Controls,
#else
  Handle* handle,
#endif
  const char mode[],
  const AlphaType& a,
  const MatrixType& A,
  const Kokkos::View< InputType, InputP... >& x,
  const BetaType& b,
  const Kokkos::View< OutputType, OutputP... >& y
#if KOKKOSKERNELS_VERSION < 40299
  , const RANK_ONE
#endif
  )
{
  typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
  typedef Kokkos::View< InputType, InputP... > InputVectorType;
  typedef Stokhos::Multiply<MatrixType, typename InputVectorType::const_type,
                            OutputVectorType> multiply_type;
  typedef Stokhos::MeanMultiply<MatrixType, typename InputVectorType::const_type,
                                OutputVectorType> mean_multiply_type;

#if KOKKOSKERNELS_VERSION >= 40199
  if(space != ExecutionSpace()) {
    Kokkos::Impl::raise_error(
      "Stokhos spmv not implemented for non-default execution space instance");
  }
#endif
  if(mode[0]!='N') {
    Kokkos::Impl::raise_error(
      "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
  }

  if (!Sacado::is_constant(a) || !Sacado::is_constant(b)) {
    Kokkos::Impl::raise_error(
      "Stokhos spmv not implemented for non-constant a or b");
  }
  if (dimension_scalar(A.values) == 1 && dimension_scalar(x) != 1) {
    mean_multiply_type::apply( A, x, y,
                               Sacado::Value<AlphaType>::eval(a),
                               Sacado::Value<BetaType>::eval(b) );
  }
  else
    multiply_type::apply( A, x, y,
                          Sacado::Value<AlphaType>::eval(a),
                          Sacado::Value<BetaType>::eval(b) );
}

template <
#if KOKKOSKERNELS_VERSION >= 40199
          typename ExecutionSpace,
#endif
#if KOKKOSKERNELS_VERSION >= 40299
          typename Handle,
#endif
          typename AlphaType,
          typename BetaType,
          typename MatrixType,
          typename InputType,
          typename ... InputP,
          typename OutputType,
          typename ... OutputP>
typename std::enable_if<
  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value
#if KOKKOSKERNELS_VERSION >= 40299
  && KokkosSparse::is_crs_matrix_v<MatrixType>
  && (Kokkos::View< OutputType, OutputP... >::rank() == 2)
#endif
  >::type
spmv(
#if KOKKOSKERNELS_VERSION >= 40199
  const ExecutionSpace& space,
#endif
#if KOKKOSKERNELS_VERSION < 40299
  KokkosKernels::Experimental::Controls,
#else
  Handle* handle,
#endif
  const char mode[],
  const AlphaType& a,
  const MatrixType& A,
  const Kokkos::View< InputType, InputP... >& x,
  const BetaType& b,
  const Kokkos::View< OutputType, OutputP... >& y
#if KOKKOSKERNELS_VERSION < 40299
  , const RANK_TWO
#endif
  )
{
#if KOKKOSKERNELS_VERSION >= 40199
  if(space != ExecutionSpace()) {
    Kokkos::Impl::raise_error(
      "Stokhos spmv not implemented for non-default execution space instance");
  }
#endif
  if(mode[0]!='N') {
    Kokkos::Impl::raise_error(
      "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
  }
  if (y.extent(1) == 1) {
    auto y_1D = subview(y, Kokkos::ALL(), 0);
    auto x_1D = subview(x, Kokkos::ALL(), 0);
#if KOKKOSKERNELS_VERSION >= 40299
    spmv(space, handle, mode, a, A, x_1D, b, y_1D);
#elif (KOKKOSKERNELS_VERSION < 40299) && (KOKKOSKERNELS_VERSION >= 40199)
    spmv(space, KokkosKernels::Experimental::Controls(), mode, a, A, x_1D, b, y_1D, RANK_ONE());
#else
    spmv(KokkosKernels::Experimental::Controls(), mode, a, A, x_1D, b, y_1D, RANK_ONE());
#endif
  }
  else {
    typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
    typedef Kokkos::View< InputType, InputP... > InputVectorType;

    typedef Stokhos::Multiply<MatrixType, typename InputVectorType::const_type,
                              OutputVectorType> multiply_type;
    typedef Stokhos::MeanMultiply<MatrixType, typename InputVectorType::const_type,
                                  OutputVectorType> mean_multiply_type;

    if (!Sacado::is_constant(a) || !Sacado::is_constant(b)) {
      Kokkos::Impl::raise_error(
        "Stokhos spmv not implemented for non-constant a or b");
    }

    typename InputVectorType::const_type x_const = x;

    if (dimension_scalar(A.values) == 1 && dimension_scalar(x) != 1) {
      mean_multiply_type::apply( A, x_const, y,
                                 Sacado::Value<AlphaType>::eval(a),
                                 Sacado::Value<BetaType>::eval(b));
    }
    else
      multiply_type::apply( A, x_const, y,
                            Sacado::Value<AlphaType>::eval(a),
                            Sacado::Value<BetaType>::eval(b));
  }
}
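
// Usage sketch (illustrative assumption, not taken from this header): with a
// PCE scalar built on Stokhos::DynamicStorage, these overloads let the
// standard KokkosSparse::spmv entry point operate directly on UQ::PCE views.
// Real code must also construct the matrix and vectors with a Cijk product
// tensor (setup omitted here):
//
//   typedef Stokhos::DynamicStorage<int,double,Kokkos::DefaultExecutionSpace> storage_type;
//   typedef Sacado::UQ::PCE<storage_type> pce_type;
//   KokkosSparse::CrsMatrix<pce_type,int,Kokkos::DefaultExecutionSpace> A /* = ... */;
//   Kokkos::View<pce_type*> x /* = ... */, y /* = ... */;
//   KokkosSparse::spmv("N", 1.0, A, x, 0.0, y);  // y = A*x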

}

#endif /* #ifndef KOKKOS_CRSMATRIX_UQ_PCE_HPP */