Stokhos_CrsProductTensor.hpp
// @HEADER
// ***********************************************************************
//
// Stokhos Package
// Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
//
// ***********************************************************************
// @HEADER

#ifndef STOKHOS_CRSPRODUCTTENSOR_HPP
#define STOKHOS_CRSPRODUCTTENSOR_HPP

//#include "Kokkos_View.hpp"
//#include "Kokkos_Layout.hpp"
#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_ProductBasis.hpp"
#include "Stokhos_Sparse3Tensor.hpp"
#include "Stokhos_TinyVec.hpp"

#include "Teuchos_ParameterList.hpp"

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Stokhos {

template< typename ValueType, class ExecutionSpace, class Memory = void >
class CrsProductTensor {
public:

  typedef ExecutionSpace execution_space;
  typedef int size_type;
  typedef ValueType value_type;
  typedef Memory memory_type;

  typedef typename Kokkos::ViewTraits< size_type*, execution_space,void,void >::host_mirror_space host_mirror_space ;
  typedef CrsProductTensor< value_type, host_mirror_space > HostMirror;

// Vectorsize used in multiply algorithm
#if defined(__AVX__)
  static const size_type host_vectorsize = 32/sizeof(value_type);
  static const bool use_intrinsics = true;
  static const size_type num_entry_align = 1;
#elif defined(__MIC__)
  static const size_type host_vectorsize = 16;
  static const bool use_intrinsics = true;
  static const size_type num_entry_align = 8; // avoid use of mask instructions
#else
  static const size_type host_vectorsize = 2;
  static const bool use_intrinsics = false;
  static const size_type num_entry_align = 1;
#endif
  static const size_type cuda_vectorsize = 32;
  static const bool is_cuda =
#if defined( KOKKOS_ENABLE_CUDA )
    Kokkos::Impl::is_same<ExecutionSpace,Kokkos::Cuda>::value;
#else
    false ;
#endif
  static const size_type vectorsize = is_cuda ? cuda_vectorsize : host_vectorsize;

  // Alignment in terms of number of entries of CRS rows
  static const size_type tensor_align = vectorsize;
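  // For example, with double-precision values host_vectorsize evaluates to
  // 32/sizeof(double) = 4 on AVX hosts, 16 on MIC, and 2 otherwise, while
  // cuda_vectorsize is 32; tensor_align equals the active vectorsize, so
  // every CRS row built below is padded to a multiple of that many entries.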

private:

  template <class, class, class> friend class CrsProductTensor;

  typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space, memory_type > vec_type;
  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > coord_array_type;
  typedef Kokkos::View< size_type*[2], Kokkos::LayoutLeft, execution_space, memory_type > coord2_array_type;
  typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space, memory_type > value_array_type;
  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > entry_array_type;
  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > row_map_array_type;

  coord_array_type   m_coord;
  coord2_array_type  m_coord2;
  value_array_type   m_value;
  entry_array_type   m_num_entry;
  row_map_array_type m_row_map;
  size_type          m_dim;
  size_type          m_entry_max;
  size_type          m_avg_entries_per_row;
  size_type          m_nnz;
  size_type          m_flops;

  struct CijkRowCount {
    unsigned count;
    unsigned basis;

    CijkRowCount()
      : count(0)
      , basis(0)
      {}
  };

  struct CompareCijkRowCount {
    bool operator() (const CijkRowCount& a, const CijkRowCount& b) const {
      return a.count < b.count;
    }
  };

public:

  KOKKOS_INLINE_FUNCTION
  ~CrsProductTensor() {}

  KOKKOS_INLINE_FUNCTION
  CrsProductTensor() :
    m_coord(),
    m_coord2(),
    m_value(),
    m_num_entry(),
    m_row_map(),
    m_dim(0),
    m_entry_max(0),
    m_nnz(0),
    m_flops(0),
    m_avg_entries_per_row(0) {}

  template <class M>
  KOKKOS_INLINE_FUNCTION
  CrsProductTensor( const CrsProductTensor<value_type,execution_space,M> & rhs ) :
    m_coord( rhs.m_coord ),
    m_coord2( rhs.m_coord2 ),
    m_value( rhs.m_value ),
    m_num_entry( rhs.m_num_entry ),
    m_row_map( rhs.m_row_map ),
    m_dim( rhs.m_dim ),
    m_entry_max( rhs.m_entry_max ),
    m_nnz( rhs.m_nnz ),
    m_flops( rhs.m_flops ),
    m_avg_entries_per_row( rhs.m_avg_entries_per_row ) {}

  template <class M>
  KOKKOS_INLINE_FUNCTION
  CrsProductTensor & operator=( const CrsProductTensor<value_type,execution_space,M> & rhs )
  {
    m_coord = rhs.m_coord;
    m_coord2 = rhs.m_coord2;
    m_value = rhs.m_value;
    m_num_entry = rhs.m_num_entry;
    m_row_map = rhs.m_row_map;
    m_dim = rhs.m_dim;
    m_entry_max = rhs.m_entry_max;
    m_nnz = rhs.m_nnz;
    m_flops = rhs.m_flops;
    m_avg_entries_per_row = rhs.m_avg_entries_per_row;
    return *this;
  }

  //! Dimension of the tensor
  KOKKOS_INLINE_FUNCTION
  size_type dimension() const { return m_dim; }

  //! Is the tensor empty
  KOKKOS_INLINE_FUNCTION
  bool is_empty() const { return m_dim == 0; }

  //! Number of sparse entries
  KOKKOS_INLINE_FUNCTION
  size_type entry_count() const
  { return m_coord.extent(0); }

  //! Maximum sparse entries for any coordinate
  KOKKOS_INLINE_FUNCTION
  size_type entry_maximum() const
  { return m_entry_max; }

  //! Begin entries with a coordinate 'i'
  KOKKOS_INLINE_FUNCTION
  size_type entry_begin( size_type i ) const
  { return m_row_map[i]; }

  //! End entries with a coordinate 'i'
  KOKKOS_INLINE_FUNCTION
  size_type entry_end( size_type i ) const
  { return m_row_map[i] + m_num_entry(i); }

  //! Number of entries with a coordinate 'i'
  KOKKOS_INLINE_FUNCTION
  size_type num_entry( size_type i ) const
  { return m_num_entry(i); }

  //! Coordinates of an entry
  KOKKOS_INLINE_FUNCTION
  const size_type& coord( const size_type entry, const size_type c ) const
  { return m_coord2( entry, c ); }

  //! Coordinates of an entry (packed form)
  KOKKOS_INLINE_FUNCTION
  const size_type& coord( const size_type entry ) const
  { return m_coord( entry ); }

  //! Value of an entry
  KOKKOS_INLINE_FUNCTION
  const value_type & value( const size_type entry ) const
  { return m_value( entry ); }

  //! Number of non-zeros
  KOKKOS_INLINE_FUNCTION
  size_type num_non_zeros() const
  { return m_nnz; }

  //! Number of flops per multiply-add
  KOKKOS_INLINE_FUNCTION
  size_type num_flops() const
  { return m_flops; }

  //! Average number of entries per row
  KOKKOS_INLINE_FUNCTION
  size_type avg_entries_per_row() const
  { return m_avg_entries_per_row; }

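  // Example (illustrative sketch, not part of the original header): traversing
  // one tensor row with the accessors above, where t is a CrsProductTensor.
  // The single-argument coord(entry) packs both indices as (k << 16) | j, as
  // written in create() below, and values with j == k are stored halved.
  //
  //   for (size_type e = t.entry_begin(i); e < t.entry_end(i); ++e) {
  //     const size_type j = t.coord(e,0);        // == t.coord(e) & 0xffff
  //     const size_type k = t.coord(e,1);        // == t.coord(e) >> 16
  //     const value_type c = t.value(e);         // C(i,j,k), halved if j == k
  //     y[i] += c * ( a[j]*x[k] + a[k]*x[j] );   // symmetric use of (j,k)
  //   }
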
  template <typename OrdinalType>
  static CrsProductTensor
  create( const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
          const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
          const Teuchos::ParameterList& params = Teuchos::ParameterList() )
  {
    typedef Stokhos::Sparse3Tensor<OrdinalType,ValueType> Cijk_type;

    // Note (etp 01/08/15): Commenting out the sorting as it causes a really
    // weird compiler error when compiling with NVCC. It seems to think the
    // < in CompareCijkRowCount() is part of a template parameter. We don't
    // seem to use this option, so I am just commenting it out.

    // bool sort_nnz = false;
    // if (params.isParameter("Sort Nonzeros"))
    //   sort_nnz = params.get<bool>("Sort Nonzeros");

    // Compute number of non-zeros for each i
    const size_type dimension = basis.size();
    std::vector< size_t > coord_work( dimension, (size_t) 0 );
    size_type entry_count = 0;
    for (typename Cijk_type::i_iterator i_it=Cijk.i_begin();
         i_it!=Cijk.i_end(); ++i_it) {
      OrdinalType i = index(i_it);
      for (typename Cijk_type::ik_iterator k_it = Cijk.k_begin(i_it);
           k_it != Cijk.k_end(i_it); ++k_it) {
        OrdinalType k = index(k_it);
        for (typename Cijk_type::ikj_iterator j_it = Cijk.j_begin(k_it);
             j_it != Cijk.j_end(k_it); ++j_it) {
          OrdinalType j = index(j_it);
          if (j >= k) {
            ++coord_work[i];
            ++entry_count;
          }
        }
      }
    }

    // Compute average nonzeros per row (must be before padding)
    size_type avg_entries_per_row = entry_count / dimension;

    // Pad each row to have size divisible by alignment size
    for ( size_type i = 0; i < dimension; ++i ) {
      const size_t rem = coord_work[i] % tensor_align;
      if (rem > 0) {
        const size_t pad = tensor_align - rem;
        coord_work[i] += pad;
        entry_count += pad;
      }
    }
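    // Worked example of the padding above: with tensor_align = 4, a row with
    // 13 retained (j >= k) entries has rem = 1, so pad = 3 zero entries are
    // appended and the row occupies 16 slots; entry_count grows the same way.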

    // Sort based on number of non-zeros
    std::vector< CijkRowCount > row_count( dimension );
    for ( size_type i = 0; i < dimension; ++i ) {
      row_count[i].count = coord_work[i];
      row_count[i].basis = i;
    }

    // Note (etp 01/08/15): See above.

    // if (sort_nnz)
    //   std::sort( row_count.begin(), row_count.end(), CompareCijkRowCount() );
    std::vector<size_type> sorted_row_map( dimension );
    for ( size_type i = 0; i < dimension; ++i ) {
      coord_work[i] = row_count[i].count;
      sorted_row_map[ row_count[i].basis ] = i;
    }

    // Allocate tensor data
    // coord and coord2 are initialized to zero because otherwise we get
    // seg faults in the MIC algorithm when processing the tail of each
    // tensor row. Not quite sure why as the coord loads are padded to
    // length 16 and are masked for the remainder (unless it does load x[j]
    // anyway and masks off the result, so j needs to be valid).
    CrsProductTensor tensor;
    tensor.m_coord = coord_array_type( "tensor_coord", entry_count );
    tensor.m_coord2 = coord2_array_type( "tensor_coord2", entry_count );
    tensor.m_value = value_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_value"), entry_count );
    tensor.m_num_entry = entry_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_num_entry"), dimension );
    tensor.m_row_map = row_map_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_row_map"), dimension+1 );
    tensor.m_dim = dimension;
    tensor.m_entry_max = 0;
    tensor.m_avg_entries_per_row = avg_entries_per_row;

    // Create host mirrors; each is a view of the device data when the
    // memory is host-accessible
    typename coord_array_type::HostMirror
      host_coord = Kokkos::create_mirror_view( tensor.m_coord );
    typename coord2_array_type::HostMirror
      host_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
    typename value_array_type::HostMirror
      host_value = Kokkos::create_mirror_view( tensor.m_value );
    typename entry_array_type::HostMirror
      host_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
    typename entry_array_type::HostMirror
      host_row_map = Kokkos::create_mirror_view( tensor.m_row_map );

    // Compute row map
    size_type sum = 0;
    host_row_map(0) = 0;
    for ( size_type i = 0; i < dimension; ++i ) {
      sum += coord_work[i];
      host_row_map(i+1) = sum;
      host_num_entry(i) = 0;
    }

    for ( size_type iCoord = 0; iCoord < dimension; ++iCoord ) {
      coord_work[iCoord] = host_row_map[iCoord];
    }

    // Initialize values and coordinates to zero since we will have extra
    // ones for alignment
    Kokkos::deep_copy( host_value, 0.0 );
    Kokkos::deep_copy( host_coord, 0 );
    Kokkos::deep_copy( host_coord2, 0 );

    for (typename Cijk_type::i_iterator i_it=Cijk.i_begin();
         i_it!=Cijk.i_end(); ++i_it) {
      OrdinalType i = index(i_it);
      const size_type row = sorted_row_map[i];
      for (typename Cijk_type::ik_iterator k_it = Cijk.k_begin(i_it);
           k_it != Cijk.k_end(i_it); ++k_it) {
        OrdinalType k = index(k_it);
        for (typename Cijk_type::ikj_iterator j_it = Cijk.j_begin(k_it);
             j_it != Cijk.j_end(k_it); ++j_it) {
          OrdinalType j = index(j_it);
          ValueType c = Stokhos::value(j_it);
          if (j >= k) {
            const size_type n = coord_work[row]; ++coord_work[row];
            host_value(n) = (j != k) ? c : 0.5*c;
            host_coord2(n,0) = j;
            host_coord2(n,1) = k;
            host_coord(n) = ( k << 16 ) | j;
            ++host_num_entry(row);
            ++tensor.m_nnz;
          }
        }
      }
      // Align num_entry
      host_num_entry(row) =
        (host_num_entry(row) + num_entry_align-1) & ~(num_entry_align-1);
    }

    // Copy data to device if necessary
    Kokkos::deep_copy( tensor.m_coord, host_coord );
    Kokkos::deep_copy( tensor.m_coord2, host_coord2 );
    Kokkos::deep_copy( tensor.m_value, host_value );
    Kokkos::deep_copy( tensor.m_num_entry, host_num_entry );
    Kokkos::deep_copy( tensor.m_row_map, host_row_map );

    for ( size_type i = 0; i < dimension; ++i ) {
      tensor.m_entry_max = std::max( tensor.m_entry_max, host_num_entry(i) );
    }

    tensor.m_flops = 5*tensor.m_nnz + dimension;
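    // Flop model (as read from the multiply kernels below): roughly 5
    // operations per stored nonzero (the products a[j]*x[k] and a[k]*x[j],
    // their sum, the multiply by the tensor value, and the accumulation into
    // ytmp), plus one per row for folding ytmp into y.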

    return tensor;
  }

  static CrsProductTensor createMeanBased()
  {
    const size_type dimension = 1;
    const size_type entry_count = 1;

    // Allocate tensor data
    // coord and coord2 are initialized to zero because otherwise we get
    // seg faults in the MIC algorithm when processing the tail of each
    // tensor row. Not quite sure why as the coord loads are padded to
    // length 16 and are masked for the remainder (unless it does load x[j]
    // anyway and masks off the result, so j needs to be valid).
    CrsProductTensor tensor;
    tensor.m_coord = coord_array_type( "tensor_coord", entry_count );
    tensor.m_coord2 = coord2_array_type( "tensor_coord2", entry_count );
    tensor.m_value = value_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_value"), entry_count );
    tensor.m_num_entry = entry_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_num_entry"), dimension );
    tensor.m_row_map = row_map_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_row_map"), dimension+1 );
    tensor.m_dim = dimension;
    tensor.m_entry_max = 1;
    tensor.m_avg_entries_per_row = 1;
    tensor.m_nnz = 1;
    tensor.m_flops = 5*tensor.m_nnz + dimension;

    // Create host mirrors; each is a view of the device data when the
    // memory is host-accessible
    typename coord_array_type::HostMirror
      host_coord = Kokkos::create_mirror_view( tensor.m_coord );
    typename coord2_array_type::HostMirror
      host_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
    typename value_array_type::HostMirror
      host_value = Kokkos::create_mirror_view( tensor.m_value );
    typename entry_array_type::HostMirror
      host_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
    typename entry_array_type::HostMirror
      host_row_map = Kokkos::create_mirror_view( tensor.m_row_map );

    // Compute row map
    host_row_map(0) = 0;
    host_row_map(1) = 1;
    host_num_entry(0) = 1;

    // Compute tensor values
    host_value(0) = 0.5;
    host_coord2(0,0) = 0;
    host_coord2(0,1) = 0;
    host_coord(0) = 0;

    // Copy data to device if necessary
    Kokkos::deep_copy( tensor.m_coord, host_coord );
    Kokkos::deep_copy( tensor.m_coord2, host_coord2 );
    Kokkos::deep_copy( tensor.m_value, host_value );
    Kokkos::deep_copy( tensor.m_num_entry, host_num_entry );
    Kokkos::deep_copy( tensor.m_row_map, host_row_map );

    return tensor;
  }

  static HostMirror
  create_mirror_view( const CrsProductTensor & tensor ) {
    HostMirror host_tensor;

    host_tensor.m_coord = Kokkos::create_mirror_view( tensor.m_coord );
    host_tensor.m_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
    host_tensor.m_value = Kokkos::create_mirror_view( tensor.m_value );
    host_tensor.m_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
    host_tensor.m_row_map = Kokkos::create_mirror_view( tensor.m_row_map );

    host_tensor.m_dim = tensor.m_dim;
    host_tensor.m_entry_max = tensor.m_entry_max;
    host_tensor.m_avg_entries_per_row = tensor.m_avg_entries_per_row;
    host_tensor.m_nnz = tensor.m_nnz;
    host_tensor.m_flops = tensor.m_flops;

    return host_tensor;
  }

  template < class DstDevice, class DstMemory >
  static void
  deep_copy( const CrsProductTensor<ValueType,DstDevice,DstMemory>& dst ,
             const CrsProductTensor& src ) {
    Kokkos::deep_copy( dst.m_coord, src.m_coord );
    Kokkos::deep_copy( dst.m_coord2, src.m_coord2 );
    Kokkos::deep_copy( dst.m_value, src.m_value );
    Kokkos::deep_copy( dst.m_num_entry, src.m_num_entry );
    Kokkos::deep_copy( dst.m_row_map, src.m_row_map );
  }

};

template< class Device, typename OrdinalType, typename ValueType>
CrsProductTensor<ValueType, Device>
create_product_tensor(
  const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
  const Teuchos::ParameterList& params = Teuchos::ParameterList() )
{
  return CrsProductTensor<ValueType, Device>::create( basis, Cijk, params );
}

template< class Device, typename OrdinalType, typename ValueType,
          class Memory >
CrsProductTensor<ValueType, Device, Memory>
create_product_tensor(
  const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
  const Teuchos::ParameterList& params = Teuchos::ParameterList() )
{
  return CrsProductTensor<ValueType, Device, Memory>::create(
    basis, Cijk, params );
}

template< class Device, typename OrdinalType, typename ValueType>
CrsProductTensor<ValueType, Device>
create_mean_based_product_tensor()
{
  return CrsProductTensor<ValueType, Device>::createMeanBased();
}

template< class Device, typename OrdinalType, typename ValueType,
          class Memory >
CrsProductTensor<ValueType, Device, Memory>
create_mean_based_product_tensor()
{
  return CrsProductTensor<ValueType, Device, Memory>::createMeanBased();
}

template < class ValueType, class Device, class Memory >
inline
typename CrsProductTensor<ValueType,Device,Memory>::HostMirror
create_mirror_view( const CrsProductTensor<ValueType,Device,Memory> & src )
{
  return CrsProductTensor<ValueType,Device,Memory>::create_mirror_view( src );
}

template < class ValueType,
           class DstDevice, class DstMemory,
           class SrcDevice, class SrcMemory >
void
deep_copy( const CrsProductTensor<ValueType,DstDevice,DstMemory> & dst ,
           const CrsProductTensor<ValueType,SrcDevice,SrcMemory> & src )
{
  CrsProductTensor<ValueType,DstDevice,DstMemory>::deep_copy( dst, src );
}

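// Example (illustrative sketch, not part of the original header): building a
// tensor with the helpers above and mirroring it to host memory. How the
// basis and Cijk objects are constructed is elided here.
//
//   typedef Kokkos::DefaultExecutionSpace Device;
//   Teuchos::RCP< const Stokhos::ProductBasis<int,double> > basis = /* ... */;
//   Teuchos::RCP< const Stokhos::Sparse3Tensor<int,double> > Cijk = /* ... */;
//
//   Stokhos::CrsProductTensor<double,Device> tensor =
//     Stokhos::create_product_tensor<Device>( *basis, *Cijk );
//
//   typename Stokhos::CrsProductTensor<double,Device>::HostMirror host_tensor =
//     Stokhos::create_mirror_view( tensor );
//   Stokhos::deep_copy( host_tensor, tensor );
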
template < typename ValueType , typename Device >
class BlockMultiply< CrsProductTensor< ValueType , Device > >
{
public:

  typedef Device execution_space;
  typedef CrsProductTensor< ValueType , execution_space > tensor_type ;
  typedef typename tensor_type::size_type size_type ;

// Whether to use manual or auto-vectorization
#ifdef __MIC__
#define USE_AUTO_VECTORIZATION 1
#else
#define USE_AUTO_VECTORIZATION 0
#endif

#if defined(__INTEL_COMPILER) && USE_AUTO_VECTORIZATION

  // Version leveraging intel vectorization
  template< typename MatrixValue , typename VectorValue >
  KOKKOS_INLINE_FUNCTION
  static void apply( const tensor_type & tensor ,
                     const MatrixValue * const a ,
                     const VectorValue * const x ,
                     VectorValue * const y ,
                     const VectorValue & alpha = VectorValue(1) )
  {
    // The intel compiler doesn't seem to be able to vectorize through
    // the coord() calls, so extract pointers
    const size_type * cj = &tensor.coord(0,0);
    const size_type * ck = &tensor.coord(0,1);
    const size_type nDim = tensor.dimension();

    for ( size_type iy = 0 ; iy < nDim ; ++iy ) {
      const size_type nEntry = tensor.num_entry(iy);
      const size_type iEntryBeg = tensor.entry_begin(iy);
      const size_type iEntryEnd = iEntryBeg + nEntry;
      VectorValue ytmp = 0;

#pragma simd vectorlength(tensor_type::vectorsize)
#pragma ivdep
#pragma vector aligned
      for (size_type iEntry = iEntryBeg; iEntry<iEntryEnd; ++iEntry) {
        const size_type j = cj[iEntry]; //tensor.coord(iEntry,0);
        const size_type k = ck[iEntry]; //tensor.coord(iEntry,1);
        ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
      }

      y[iy] += alpha * ytmp ;
    }
  }

#elif defined(__MIC__)

  // Version specific to MIC architecture using manual vectorization
  template< typename MatrixValue , typename VectorValue >
  KOKKOS_INLINE_FUNCTION
  static void apply( const tensor_type & tensor ,
                     const MatrixValue * const a ,
                     const VectorValue * const x ,
                     VectorValue * const y ,
                     const VectorValue & alpha = VectorValue(1) )
  {
    const size_type nDim = tensor.dimension();
    for ( size_type iy = 0 ; iy < nDim ; ++iy ) {

      const size_type nEntry = tensor.num_entry(iy);
      const size_type iEntryBeg = tensor.entry_begin(iy);
      const size_type iEntryEnd = iEntryBeg + nEntry;
      size_type iEntry = iEntryBeg;

      VectorValue ytmp = 0 ;

      const size_type nBlock = nEntry / tensor_type::vectorsize;
      const size_type nEntryB = nBlock * tensor_type::vectorsize;
      const size_type iEnd = iEntryBeg + nEntryB;

      typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
      TV vy;
      vy.zero();
      for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
        const size_type *j = &tensor.coord(iEntry,0);
        const size_type *k = &tensor.coord(iEntry,1);
        TV aj(a, j), ak(a, k), xj(x, j), xk(x, k),
          c(&(tensor.value(iEntry)));

        // vy += c * ( aj * xk + ak * xj)
        aj.times_equal(xk);
        aj.multiply_add(ak, xj);
        vy.multiply_add(c, aj);

      }
      ytmp += vy.sum();

      // The number of nonzeros is always constrained to be a multiple of 8

      const size_type rem = iEntryEnd-iEntry;
      if (rem >= 8) {
        typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> TV2;
        const size_type *j = &tensor.coord(iEntry,0);
        const size_type *k = &tensor.coord(iEntry,1);
        TV2 aj(a, j), ak(a, k), xj(x, j), xk(x, k),
          c(&(tensor.value(iEntry)));

        // vy += c * ( aj * xk + ak * xj)
        aj.times_equal(xk);
        aj.multiply_add(ak, xj);
        aj.times_equal(c);
        ytmp += aj.sum();
      }

      y[iy] += alpha * ytmp ;
    }
  }

#else

  // General version
  template< typename MatrixValue , typename VectorValue >
  KOKKOS_INLINE_FUNCTION
  static void apply( const tensor_type & tensor ,
                     const MatrixValue * const a ,
                     const VectorValue * const x ,
                     VectorValue * const y ,
                     const VectorValue & alpha = VectorValue(1) )
  {
    const size_type nDim = tensor.dimension();
    for ( size_type iy = 0 ; iy < nDim ; ++iy ) {

      const size_type nEntry = tensor.num_entry(iy);
      const size_type iEntryBeg = tensor.entry_begin(iy);
      const size_type iEntryEnd = iEntryBeg + nEntry;
      size_type iEntry = iEntryBeg;

      VectorValue ytmp = 0 ;

      // Do entries with a blocked loop of size vectorsize
      if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
        const size_type nBlock = nEntry / tensor_type::vectorsize;
        const size_type nEntryB = nBlock * tensor_type::vectorsize;
        const size_type iEnd = iEntryBeg + nEntryB;

        typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
        TV vy;
        vy.zero();
        for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
          const size_type *j = &tensor.coord(iEntry,0);
          const size_type *k = &tensor.coord(iEntry,1);
          TV aj(a, j), ak(a, k), xj(x, j), xk(x, k), c(&(tensor.value(iEntry)));

          // vy += c * ( aj * xk + ak * xj)
          aj.times_equal(xk);
          aj.multiply_add(ak, xj);
          vy.multiply_add(c, aj);
        }
        ytmp += vy.sum();
      }

      // Do remaining entries with a scalar loop
      for ( ; iEntry<iEntryEnd; ++iEntry) {
        const size_type j = tensor.coord(iEntry,0);
        const size_type k = tensor.coord(iEntry,1);

        ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
      }

      y[iy] += alpha * ytmp ;
    }
  }
#endif

  KOKKOS_INLINE_FUNCTION
  static size_type matrix_size( const tensor_type & tensor )
  { return tensor.dimension(); }

  KOKKOS_INLINE_FUNCTION
  static size_type vector_size( const tensor_type & tensor )
  { return tensor.dimension(); }
};
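
// In all three apply() variants above the computation is, for each
// stochastic row i,
//   y[i] += alpha * sum_e c(e) * ( a[j(e)]*x[k(e)] + a[k(e)]*x[j(e)] )
// over the entries e of tensor row i. This is the symmetric expansion of
// sum_{j,k} C(i,j,k) a[j] x[k], given the j >= k storage and the halving of
// diagonal (j == k) values in create().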

// Specialization of Multiply< BlockCrsMatrix< BlockSpec, ... > > for
// CrsProductTensor, which provides a version that processes blocks of FEM
// columns together to reduce the number of global reads of the sparse 3 tensor

// Even though this isn't specific to Threads, templating on Device creates a
// duplicate specialization error for Cuda. Need to see if we can fix this,
// or put the implementation in another class easily specialized for Threads,
// OpenMP, ...
template< typename ValueType , typename MatrixValue , typename VectorValue ,
          typename Device >
class MultiplyImpl {
public:

  typedef Device execution_space ;
  typedef CrsProductTensor< ValueType , execution_space > tensor_type ;
  typedef StochasticProductTensor< ValueType , tensor_type , execution_space > BlockSpec ;
  typedef typename BlockSpec::size_type size_type ;
  typedef Kokkos::View< VectorValue** , Kokkos::LayoutLeft , execution_space > block_vector_type ;
  typedef BlockCrsMatrix< BlockSpec , MatrixValue , execution_space > matrix_type ;

  const matrix_type m_A ;
  const block_vector_type m_x ;
  const block_vector_type m_y ;

  MultiplyImpl( const matrix_type & A ,
                const block_vector_type & x ,
                const block_vector_type & y )
    : m_A( A )
    , m_x( x )
    , m_y( y )
    {}

  //--------------------------------------------------------------------------
  //  A( storage_size( m_A.block.size() ) , m_A.graph.row_map.size() );
  //  x( m_A.block.dimension() , m_A.graph.row_map.first_count() );
  //  y( m_A.block.dimension() , m_A.graph.row_map.first_count() );
  //

  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type iBlockRow ) const
  {
    // Prefer that y[ m_A.block.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    VectorValue * const y = & m_y(0,iBlockRow);

    const size_type iEntryBegin = m_A.graph.row_map[ iBlockRow ];
    const size_type iEntryEnd   = m_A.graph.row_map[ iBlockRow + 1 ];

    // Leading dimension guaranteed contiguous for LayoutLeft
    for ( size_type j = 0 ; j < m_A.block.dimension() ; ++j ) { y[j] = 0 ; }

    for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
      const VectorValue * const x = & m_x( 0 , m_A.graph.entries(iEntry) );
      const MatrixValue * const a = & m_A.values( 0 , iEntry );

      BlockMultiply< BlockSpec >::apply( m_A.block , a , x , y );
    }

  }

  /*
   * Compute work range = (begin, end) such that adjacent threads write to
   * separate cache lines
   */
  KOKKOS_INLINE_FUNCTION
  std::pair< size_type , size_type >
  compute_work_range( const size_type work_count ,
                      const size_type thread_count ,
                      const size_type thread_rank ) const
  {
    enum { work_align = 64 / sizeof(VectorValue) };
    enum { work_shift = Kokkos::Impl::power_of_two< work_align >::value };
    enum { work_mask  = work_align - 1 };

    const size_type work_per_thread =
      ( ( ( ( work_count + work_mask ) >> work_shift ) + thread_count - 1 ) /
        thread_count ) << work_shift ;

    const size_type work_begin =
      std::min( thread_rank * work_per_thread , work_count );
    const size_type work_end =
      std::min( work_begin + work_per_thread , work_count );

    return std::make_pair( work_begin , work_end );
  }
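  // Worked example: for VectorValue = double, work_align = 8 and
  // work_shift = 3. With work_count = 100 and thread_count = 4,
  // work_per_thread = ((13 + 3) / 4) << 3 = 32, so the threads get the
  // ranges [0,32), [32,64), [64,96), and [96,100), each starting on a
  // 64-byte cache-line boundary.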

#if defined(__MIC__)

  // A MIC-specific version of the block-multiply algorithm, where block here
  // means processing multiple FEM columns at a time
  KOKKOS_INLINE_FUNCTION
  void operator()( const typename Kokkos::TeamPolicy< execution_space >::member_type & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A.graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    std::pair<size_type,size_type> work_range =
      compute_work_range(m_A.block.dimension(), num_thread, thread_idx);

    // Prefer that y[ m_A.block.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    VectorValue * const y = & m_y(0,iBlockRow);

    // Leading dimension guaranteed contiguous for LayoutLeft
    for ( size_type j = work_range.first ; j < work_range.second ; ++j )
      y[j] = 0 ;

    const tensor_type& tensor = m_A.block.tensor();

    const size_type iBlockEntryBeg = m_A.graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 9;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const MatrixValue* sh_A[BlockSize];
    const VectorValue* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
        sh_x[col] = & m_x( 0 , iBlockColumn );
        sh_A[col] = & m_A.values( 0 , iBlockEntry + col );
      }

      for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

        const size_type nEntry = tensor.num_entry(iy);
        const size_type iEntryBeg = tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
        size_type iEntry = iEntryBeg;

        VectorValue ytmp = 0 ;

        // Do entries with a blocked loop of size blocksize
        const size_type nBlock = nEntry / tensor_type::vectorsize;
        const size_type nEntryB = nBlock * tensor_type::vectorsize;
        const size_type iEnd = iEntryBeg + nEntryB;

        typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
        typedef TinyVec<MatrixValue,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
        typedef TinyVec<VectorValue,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
        VecTV vy;
        vy.zero();
        for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
          const size_type *j = &tensor.coord(iEntry,0);
          const size_type *k = &tensor.coord(iEntry,1);
          ValTV c(&(tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV xj(sh_x[col], j), xk(sh_x[col], k);

            // vy += c * ( aj * xk + ak * xj)
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            vy.multiply_add(c, aj);
          }
        }
        ytmp += vy.sum();

        // The number of nonzeros is always constrained to be a multiple of 8

        const size_type rem = iEntryEnd-iEntry;
        if (rem >= 8) {
          typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> ValTV2;
          typedef TinyVec<MatrixValue,8,tensor_type::use_intrinsics> MatTV2;
          typedef TinyVec<VectorValue,8,tensor_type::use_intrinsics> VecTV2;
          const size_type *j = &tensor.coord(iEntry,0);
          const size_type *k = &tensor.coord(iEntry,1);
          ValTV2 c(&(tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);

            // vy += c * ( aj * xk + ak * xj)
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            aj.times_equal(c);
            ytmp += aj.sum();
          }
        }

        y[iy] += ytmp ;
      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();
    }

  }

#else

  // A general hand-vectorized version of the block multiply algorithm, where
  // block here means processing multiple FEM columns at a time. Note that
  // auto-vectorization of a block algorithm doesn't work, because the
  // stochastic loop is not the inner-most loop.
  KOKKOS_INLINE_FUNCTION
  void operator()( const typename Kokkos::TeamPolicy< execution_space >::member_type & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A.graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    std::pair<size_type,size_type> work_range =
      compute_work_range(m_A.block.dimension(), num_thread, thread_idx);

    // Prefer that y[ m_A.block.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    VectorValue * const y = & m_y(0,iBlockRow);

    // Leading dimension guaranteed contiguous for LayoutLeft
    for ( size_type j = work_range.first ; j < work_range.second ; ++j )
      y[j] = 0 ;

    const tensor_type& tensor = m_A.block.tensor();

    const size_type iBlockEntryBeg = m_A.graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 14;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const MatrixValue* sh_A[BlockSize];
    const VectorValue* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
        sh_x[col] = & m_x( 0 , iBlockColumn );
        sh_A[col] = & m_A.values( 0 , iBlockEntry + col );
      }

      for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

        const size_type nEntry = tensor.num_entry(iy);
        const size_type iEntryBeg = tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
        size_type iEntry = iEntryBeg;

        VectorValue ytmp = 0 ;

        // Do entries with a blocked loop of size blocksize
        if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
          const size_type nBlock = nEntry / tensor_type::vectorsize;
          const size_type nEntryB = nBlock * tensor_type::vectorsize;
          const size_type iEnd = iEntryBeg + nEntryB;

          typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
          typedef TinyVec<MatrixValue,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
          typedef TinyVec<VectorValue,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
          VecTV vy;
          vy.zero();
          for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
            const size_type *j = &tensor.coord(iEntry,0);
            const size_type *k = &tensor.coord(iEntry,1);
            ValTV c(&(tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj * xk + ak * xj)
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              vy.multiply_add(c, aj);
            }
          }
          ytmp += vy.sum();
        }

        // Do remaining entries with a scalar loop
        for ( ; iEntry<iEntryEnd; ++iEntry) {
          const size_type j = tensor.coord(iEntry,0);
          const size_type k = tensor.coord(iEntry,1);
          ValueType cijk = tensor.value(iEntry);

          for ( size_type col = 0; col < block_size; ++col ) {
            ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
                             sh_A[col][k] * sh_x[col][j] );
          }

        }

        y[iy] += ytmp ;
      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();
    }

  }

#endif

  static void apply( const matrix_type & A ,
                     const block_vector_type & x ,
                     const block_vector_type & y )
  {
    // Generally the block algorithm seems to perform better on the MIC,
    // as long as the stochastic size isn't too big, but doesn't perform
    // any better on the CPU (probably because the CPU has a fat L3 cache
    // to store the sparse 3 tensor).
#ifdef __MIC__
    const bool use_block_algorithm = true;
#else
    const bool use_block_algorithm = false;
#endif

    const size_t row_count = A.graph.row_map.extent(0) - 1 ;
    if (use_block_algorithm) {
#ifdef __MIC__
      const size_t team_size = 4;  // 4 hyperthreads for MIC
#else
      const size_t team_size = 2;  // 2 for everything else
#endif
      const size_t league_size = row_count;
      Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
      Kokkos::parallel_for( config , MultiplyImpl(A,x,y) );
    }
    else {
      Kokkos::parallel_for( row_count , MultiplyImpl(A,x,y) );
    }
  }
};

//----------------------------------------------------------------------------

} /* namespace Stokhos */

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

// Inject some functions into the Kokkos namespace
namespace Kokkos {

using Stokhos::create_mirror_view;
using Stokhos::deep_copy;

} // namespace Kokkos

#endif /* #ifndef STOKHOS_CRSPRODUCTTENSOR_HPP */