Stokhos_CrsProductTensor.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef STOKHOS_CRSPRODUCTTENSOR_HPP
11 #define STOKHOS_CRSPRODUCTTENSOR_HPP
12 
13 #include "Kokkos_Core.hpp"
14 
15 #include "Stokhos_Multiply.hpp"
16 #include "Stokhos_ProductBasis.hpp"
17 #include "Stokhos_Sparse3Tensor.hpp"
18 #include "Teuchos_ParameterList.hpp"
19 
20 
21 #include "Stokhos_TinyVec.hpp"
22 
23 //----------------------------------------------------------------------------
24 //----------------------------------------------------------------------------
25 
26 namespace Stokhos {
27 
44 //! Sparse product tensor with replicated entries to provide subsets with a given coordinate
45 template< typename ValueType, class ExecutionSpace, class Memory = void >
46 class CrsProductTensor {
47 public:
48 
49  typedef ExecutionSpace execution_space;
50  typedef int size_type;
51  typedef ValueType value_type;
52  typedef Memory memory_type;
53 
54  typedef typename Kokkos::ViewTraits< size_type*, execution_space,void,void >::host_mirror_space host_mirror_space ;
55  typedef CrsProductTensor< value_type, host_mirror_space > HostMirror;
56 
57 // Vectorsize used in multiply algorithm
58 #if defined(__AVX__)
59  static const size_type host_vectorsize = 32/sizeof(value_type);
60  static const bool use_intrinsics = true;
61  static const size_type num_entry_align = 1;
62 #elif defined(__MIC__)
63  static const size_type host_vectorsize = 16;
64  static const bool use_intrinsics = true;
65  static const size_type num_entry_align = 8; // avoid use of mask instructions
66 #else
67  static const size_type host_vectorsize = 2;
68  static const bool use_intrinsics = false;
69  static const size_type num_entry_align = 1;
70 #endif
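// (Editorial note) host_vectorsize is the SIMD width used by the multiply
// kernels below, measured in entries of value_type; e.g. with AVX and
// value_type = double this is 32/sizeof(double) = 4 lanes per 32-byte register.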
71  static const size_type cuda_vectorsize = 32;
72  static const bool is_cuda =
73 #if defined( KOKKOS_ENABLE_CUDA )
74  std::is_same<ExecutionSpace,Kokkos::Cuda>::value;
75 #else
76  false ;
77 #endif
78  static const size_type vectorsize = is_cuda ? cuda_vectorsize : host_vectorsize;
79 
80  // Alignment in terms of number of entries of CRS rows
81  static const size_type tensor_align = vectorsize;
82 
83 private:
84 
85  template <class, class, class> friend class CrsProductTensor;
86 
87  typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space, memory_type > vec_type;
88  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > coord_array_type;
89  typedef Kokkos::View< size_type*[2], Kokkos::LayoutLeft, execution_space, memory_type > coord2_array_type;
90  typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space, memory_type > value_array_type;
91  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > entry_array_type;
92  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > row_map_array_type;
93 
93 
94  coord_array_type   m_coord;
95  coord2_array_type  m_coord2;
96  value_array_type   m_value;
97  entry_array_type   m_num_entry;
98  row_map_array_type m_row_map;
99  size_type          m_dim;
100  size_type          m_entry_max;
101  size_type          m_nnz;
102  size_type          m_flops;
103  size_type          m_avg_entries_per_row;
104 
105  struct CijkRowCount {
106  unsigned count;
107  unsigned basis;
108 
109  CijkRowCount()
110  : count(0)
111  , basis(0)
112  {}
113  };
114 
115  struct CompareCijkRowCount {
116  bool operator() (const CijkRowCount& a, const CijkRowCount& b) const {
117  return a.count < b.count;
118  }
119  };
120 
121 public:
122 
123  KOKKOS_INLINE_FUNCTION
124  ~CrsProductTensor() {}
125 
126  KOKKOS_INLINE_FUNCTION
127  CrsProductTensor() :
128  m_coord(),
129  m_coord2(),
130  m_value(),
131  m_num_entry(),
132  m_row_map(),
133  m_dim(0),
134  m_entry_max(0),
135  m_nnz(0),
136  m_flops(0),
137  m_avg_entries_per_row(0) {}
138 
139  template <class M>
140  KOKKOS_INLINE_FUNCTION
141  CrsProductTensor( const CrsProductTensor<value_type,execution_space,M>& rhs ) :
142  m_coord( rhs.m_coord ),
143  m_coord2( rhs.m_coord2 ),
144  m_value( rhs.m_value ),
145  m_num_entry( rhs.m_num_entry ),
146  m_row_map( rhs.m_row_map ),
147  m_dim( rhs.m_dim ),
148  m_entry_max( rhs.m_entry_max ),
149  m_nnz( rhs.m_nnz ),
150  m_flops( rhs.m_flops ),
151  m_avg_entries_per_row( rhs.m_avg_entries_per_row ) {}
152 
153  template <class M>
154  KOKKOS_INLINE_FUNCTION
155  CrsProductTensor&
156  operator=( const CrsProductTensor<value_type,execution_space,M>& rhs )
157  {
158  m_coord = rhs.m_coord;
159  m_coord2 = rhs.m_coord2;
160  m_value = rhs.m_value;
161  m_num_entry = rhs.m_num_entry;
162  m_row_map = rhs.m_row_map;
163  m_dim = rhs.m_dim;
164  m_entry_max = rhs.m_entry_max;
165  m_nnz = rhs.m_nnz;
166  m_flops = rhs.m_flops;
167  m_avg_entries_per_row = rhs.m_avg_entries_per_row;
168  return *this;
169  }
170 
171  //! Dimension of the tensor
172  KOKKOS_INLINE_FUNCTION
173  size_type dimension() const { return m_dim; }
174 
175  //! Is the tensor empty
176  KOKKOS_INLINE_FUNCTION
177  bool is_empty() const { return m_dim == 0; }
178 
179  //! Number of sparse entries
180  KOKKOS_INLINE_FUNCTION
181  size_type entry_count() const
182  { return m_coord.extent(0); }
183 
184  //! Maximum sparse entries for any coordinate
185  KOKKOS_INLINE_FUNCTION
186  size_type entry_maximum() const
187  { return m_entry_max; }
188 
189  //! Begin entries with a coordinate 'i'
190  KOKKOS_INLINE_FUNCTION
191  size_type entry_begin( size_type i ) const
192  { return m_row_map[i]; }
193 
194  //! End entries with a coordinate 'i'
195  KOKKOS_INLINE_FUNCTION
196  size_type entry_end( size_type i ) const
197  { return m_row_map[i] + m_num_entry(i); }
198 
199  //! Number of entries with a coordinate 'i'
200  KOKKOS_INLINE_FUNCTION
201  size_type num_entry( size_type i ) const
202  { return m_num_entry(i); }
203 
204  //! Coordinates of an entry
205  KOKKOS_INLINE_FUNCTION
206  const size_type& coord( const size_type entry, const size_type c ) const
207  { return m_coord2( entry, c ); }
208 
209  //! Coordinates of an entry
210  KOKKOS_INLINE_FUNCTION
211  const size_type& coord( const size_type entry ) const
212  { return m_coord( entry ); }
213 
214  //! Value of an entry
215  KOKKOS_INLINE_FUNCTION
216  const value_type & value( const size_type entry ) const
217  { return m_value( entry ); }
218 
219  //! Number of non-zeros
220  KOKKOS_INLINE_FUNCTION
221  size_type num_non_zeros() const
222  { return m_nnz; }
223 
224  //! Number of flops per multiply-add
225  KOKKOS_INLINE_FUNCTION
226  size_type num_flops() const
227  { return m_flops; }
228 
229  //! Average number of entries per row
230  KOKKOS_INLINE_FUNCTION
231  size_type avg_entries_per_row() const
232  { return m_avg_entries_per_row; }
233 
234  template <typename OrdinalType>
235  static CrsProductTensor
236  create( const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
237  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
238  const Teuchos::ParameterList& params = Teuchos::ParameterList() )
239  {
240  typedef Stokhos::Sparse3Tensor<OrdinalType,ValueType> Cijk_type;
241 
242  // Note (etp 01/08/15): Commenting out the sorting as it causes a really
243  // weird compiler error when compiling with NVCC. It seems to think the
244  // < in CompareCijkRowCount() is part of a template parameter. We don't
245  // seem to use this option, so I am just commenting it out.
246 
247  // bool sort_nnz = false;
248  // if (params.isParameter("Sort Nonzeros"))
249  // sort_nnz = params.get<bool>("Sort Nonzeros");
250 
251  // Compute number of non-zeros for each i
252  const size_type dimension = basis.size();
253  std::vector< size_t > coord_work( dimension, (size_t) 0 );
254  size_type entry_count = 0;
255  for (typename Cijk_type::i_iterator i_it=Cijk.i_begin();
256  i_it!=Cijk.i_end(); ++i_it) {
257  OrdinalType i = index(i_it);
258  for (typename Cijk_type::ik_iterator k_it = Cijk.k_begin(i_it);
259  k_it != Cijk.k_end(i_it); ++k_it) {
260  OrdinalType k = index(k_it);
261  for (typename Cijk_type::ikj_iterator j_it = Cijk.j_begin(k_it);
262  j_it != Cijk.j_end(k_it); ++j_it) {
263  OrdinalType j = index(j_it);
264  if (j >= k) {
265  ++coord_work[i];
266  ++entry_count;
267  }
268  }
269  }
270  }
271 
272  // Compute average nonzeros per row (must be before padding)
273  size_type avg_entries_per_row = entry_count / dimension;
274 
275  // Pad each row to have size divisible by alignment size
276  for ( size_type i = 0; i < dimension; ++i ) {
277  const size_t rem = coord_work[i] % tensor_align;
278  if (rem > 0) {
279  const size_t pad = tensor_align - rem;
280  coord_work[i] += pad;
281  entry_count += pad;
282  }
283  }
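  // (Editorial note) Illustrative example: with value_type = double under AVX,
  // tensor_align = vectorsize = 4, so a row holding 10 entries gets rem = 2 and
  // pad = 2, growing to 12 entries; the padded slots keep value 0 and
  // coordinates (0,0), so they contribute nothing to the multiply.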
284 
285  // Sort based on number of non-zeros
286  std::vector< CijkRowCount > row_count( dimension );
287  for ( size_type i = 0; i < dimension; ++i ) {
288  row_count[i].count = coord_work[i];
289  row_count[i].basis = i;
290  }
291 
292  // Note (etp 01/08/15): See above.
293 
294  // if (sort_nnz)
295  // std::sort( row_count.begin(), row_count.end(), CompareCijkRowCount() );
296  std::vector<size_type> sorted_row_map( dimension );
297  for ( size_type i = 0; i < dimension; ++i ) {
298  coord_work[i] = row_count[i].count;
299  sorted_row_map[ row_count[i].basis ] = i;
300  }
301 
302  // Allocate tensor data
303  // coord and coord2 are initialized to zero because otherwise we get
304  // seg faults in the MIC algorithm when processing the tail of each
305  // tensor row. Not quite sure why as the coord loads are padded to
306  // length 16 and are masked for the remainder (unless it does load x[j]
307  // anyway and masks off the result, so j needs to be valid).
308  CrsProductTensor tensor;
309  tensor.m_coord = coord_array_type("tensor_coord", entry_count );
310  tensor.m_coord2 = coord2_array_type( "tensor_coord2", entry_count );
311  tensor.m_value = value_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_value"), entry_count );
312  tensor.m_num_entry = entry_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_num_entry"), dimension );
313  tensor.m_row_map = row_map_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_row_map"), dimension+1 );
314  tensor.m_dim = dimension;
315  tensor.m_entry_max = 0;
316  tensor.m_avg_entries_per_row = avg_entries_per_row;
317 
318  // Create mirrors; each is a view of the original data when it lives in host memory
319  typename coord_array_type::HostMirror
320  host_coord = Kokkos::create_mirror_view( tensor.m_coord );
321  typename coord2_array_type::HostMirror
322  host_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
323  typename value_array_type::HostMirror
324  host_value = Kokkos::create_mirror_view( tensor.m_value );
325  typename entry_array_type::HostMirror
326  host_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
327  typename entry_array_type::HostMirror
328  host_row_map = Kokkos::create_mirror_view( tensor.m_row_map );
329 
330  // Compute row map
331  size_type sum = 0;
332  host_row_map(0) = 0;
333  for ( size_type i = 0; i < dimension; ++i ) {
334  sum += coord_work[i];
335  host_row_map(i+1) = sum;
336  host_num_entry(i) = 0;
337  }
338 
339  for ( size_type iCoord = 0; iCoord < dimension; ++iCoord ) {
340  coord_work[iCoord] = host_row_map[iCoord];
341  }
342 
343  // Initialize values and coordinates to zero since we will have extra
344  // ones for alignment
345  Kokkos::deep_copy( host_value, 0.0 );
346  Kokkos::deep_copy( host_coord, 0 );
347  Kokkos::deep_copy( host_coord2, 0 );
348 
349  for (typename Cijk_type::i_iterator i_it=Cijk.i_begin();
350  i_it!=Cijk.i_end(); ++i_it) {
351  OrdinalType i = index(i_it);
352  const size_type row = sorted_row_map[i];
353  for (typename Cijk_type::ik_iterator k_it = Cijk.k_begin(i_it);
354  k_it != Cijk.k_end(i_it); ++k_it) {
355  OrdinalType k = index(k_it);
356  for (typename Cijk_type::ikj_iterator j_it = Cijk.j_begin(k_it);
357  j_it != Cijk.j_end(k_it); ++j_it) {
358  OrdinalType j = index(j_it);
359  ValueType c = Stokhos::value(j_it);
360  if (j >= k) {
361  const size_type n = coord_work[row]; ++coord_work[row];
362  host_value(n) = (j != k) ? c : 0.5*c;
363  host_coord2(n,0) = j;
364  host_coord2(n,1) = k;
365  host_coord(n) = ( k << 16 ) | j;
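  // (Editorial note) m_coord packs both indices of an entry into a single
  // 32-bit word, k in the upper 16 bits and j in the lower 16, which assumes
  // the stochastic dimension fits in 16 bits; m_coord2 stores the same pair
  // unpacked for kernels that read j and k separately.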
366  ++host_num_entry(row);
367  ++tensor.m_nnz;
368  }
369  }
370  }
371  // Align num_entry
372  host_num_entry(row) =
373  (host_num_entry(row) + num_entry_align-1) & ~(num_entry_align-1);
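  // (Editorial note) (n + num_entry_align-1) & ~(num_entry_align-1) rounds n
  // up to a multiple of num_entry_align (a power of two); e.g. with
  // num_entry_align = 8, a row count of 13 becomes (13+7) & ~7 = 16.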
374  }
375 
376  // Copy data to device if necessary
377  Kokkos::deep_copy( tensor.m_coord, host_coord );
378  Kokkos::deep_copy( tensor.m_coord2, host_coord2 );
379  Kokkos::deep_copy( tensor.m_value, host_value );
380  Kokkos::deep_copy( tensor.m_num_entry, host_num_entry );
381  Kokkos::deep_copy( tensor.m_row_map, host_row_map );
382 
383  for ( size_type i = 0; i < dimension; ++i ) {
384  tensor.m_entry_max = std::max( tensor.m_entry_max, host_num_entry(i) );
385  }
386 
387  tensor.m_flops = 5*tensor.m_nnz + dimension;
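  // (Editorial note) The flop estimate counts roughly 5 operations per stored
  // entry (the two products a[j]*x[k] and a[k]*x[j], their sum, the multiply
  // by the tensor value, and the accumulation into the row sum) plus one
  // final update per row.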
388 
389  return tensor;
390  }
391 
392  static CrsProductTensor createMeanBased()
393  {
394  const size_type dimension = 1;
395  const size_type entry_count = 1;
396 
397  // Allocate tensor data
398  // coord and coord2 are initialized to zero because otherwise we get
399  // seg faults in the MIC algorithm when processing the tail of each
400  // tensor row. Not quite sure why as the coord loads are padded to
401  // length 16 and are masked for the remainder (unless it does load x[j]
402  // anyway and masks off the result, so j needs to be valid).
403  CrsProductTensor tensor;
404  tensor.m_coord = coord_array_type("tensor_coord", entry_count );
405  tensor.m_coord2 = coord2_array_type( "tensor_coord2", entry_count );
406  tensor.m_value = value_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_value"), entry_count );
407  tensor.m_num_entry = entry_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_num_entry"), dimension );
408  tensor.m_row_map = row_map_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_row_map"), dimension+1 );
409  tensor.m_dim = dimension;
410  tensor.m_entry_max = 1;
411  tensor.m_avg_entries_per_row = 1;
412  tensor.m_nnz = 1;
413  tensor.m_flops = 5*tensor.m_nnz + dimension;
414 
415  // Create mirrors; each is a view of the original data when it lives in host memory
416  typename coord_array_type::HostMirror
417  host_coord = Kokkos::create_mirror_view( tensor.m_coord );
418  typename coord2_array_type::HostMirror
419  host_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
420  typename value_array_type::HostMirror
421  host_value = Kokkos::create_mirror_view( tensor.m_value );
422  typename entry_array_type::HostMirror
423  host_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
424  typename entry_array_type::HostMirror
425  host_row_map = Kokkos::create_mirror_view( tensor.m_row_map );
426 
427  // Compute row map
428  host_row_map(0) = 0;
429  host_row_map(1) = 1;
430  host_num_entry(0) = 1;
431 
432  // Compute tensor values
433  host_value(0) = 0.5;
434  host_coord2(0,0) = 0;
435  host_coord2(0,1) = 0;
436  host_coord(0) = 0;
437 
438  // Copy data to device if necessary
439  Kokkos::deep_copy( tensor.m_coord, host_coord );
440  Kokkos::deep_copy( tensor.m_coord2, host_coord2 );
441  Kokkos::deep_copy( tensor.m_value, host_value );
442  Kokkos::deep_copy( tensor.m_num_entry, host_num_entry );
443  Kokkos::deep_copy( tensor.m_row_map, host_row_map );
444 
445  return tensor;
446  }
447 
448  static HostMirror
449  create_mirror_view( const CrsProductTensor& tensor ) {
450  HostMirror host_tensor;
451 
452  host_tensor.m_coord = Kokkos::create_mirror_view( tensor.m_coord );
453  host_tensor.m_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
454  host_tensor.m_value = Kokkos::create_mirror_view( tensor.m_value );
455  host_tensor.m_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
456  host_tensor.m_row_map = Kokkos::create_mirror_view( tensor.m_row_map );
457 
458  host_tensor.m_dim = tensor.m_dim;
459  host_tensor.m_entry_max = tensor.m_entry_max;
460  host_tensor.m_avg_entries_per_row = tensor.m_avg_entries_per_row;
461  host_tensor.m_nnz = tensor.m_nnz;
462  host_tensor.m_flops = tensor.m_flops;
463 
464  return host_tensor;
465  }
466 
467  template < class DstDevice, class DstMemory >
468  static void
469  deep_copy( const CrsProductTensor<ValueType,DstDevice,DstMemory>& dst,
470  const CrsProductTensor& src ) {
471  Kokkos::deep_copy( dst.m_coord, src.m_coord );
472  Kokkos::deep_copy( dst.m_coord2, src.m_coord2 );
473  Kokkos::deep_copy( dst.m_value, src.m_value );
474  Kokkos::deep_copy( dst.m_num_entry, src.m_num_entry );
475  Kokkos::deep_copy( dst.m_row_map, src.m_row_map );
476  }
477 
478 };
479 
480 template< class Device, typename OrdinalType, typename ValueType>
481 CrsProductTensor<ValueType, Device>
482 create_product_tensor(
483  const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
484  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
485  const Teuchos::ParameterList& params = Teuchos::ParameterList())
486 {
487  return CrsProductTensor<ValueType, Device>::create(basis, Cijk, params );
488 }
489 
490 template< class Device, typename OrdinalType, typename ValueType,
491  class Memory >
492 CrsProductTensor<ValueType, Device, Memory>
493 create_product_tensor(
494  const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
495  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
496  const Teuchos::ParameterList& params = Teuchos::ParameterList())
497 {
498  return CrsProductTensor<ValueType, Device, Memory>::create(
499  basis, Cijk, params );
500 }
501 
502 template< class Device, typename OrdinalType, typename ValueType>
503 CrsProductTensor<ValueType, Device>
504 create_mean_based_product_tensor()
505 {
506  return CrsProductTensor<ValueType, Device>::createMeanBased();
507 }
508 
509 template< class Device, typename OrdinalType, typename ValueType,
510  class Memory >
511 CrsProductTensor<ValueType, Device, Memory>
512 create_mean_based_product_tensor()
513 {
514  return CrsProductTensor<ValueType, Device, Memory>::createMeanBased();
515 }
516 
517 template < class ValueType, class Device, class Memory >
518 inline
519 typename CrsProductTensor<ValueType,Device,Memory>::HostMirror
520 create_mirror_view( const CrsProductTensor<ValueType,Device,Memory>& src )
521 {
522  return CrsProductTensor<ValueType,Device,Memory>::create_mirror_view( src );
523 }
524 
525  template < class ValueType,
526  class DstDevice, class DstMemory,
527  class SrcDevice, class SrcMemory >
528 void
529 deep_copy( const CrsProductTensor<ValueType,DstDevice,DstMemory>& dst,
530  const CrsProductTensor<ValueType,SrcDevice,SrcMemory>& src )
531 {
532  return CrsProductTensor<ValueType,SrcDevice,SrcMemory>::deep_copy( dst, src );
533 }
534 
535 template < typename ValueType, typename Device >
536 class BlockMultiply< CrsProductTensor< ValueType , Device > >
537 {
538 public:
539 
540  typedef Device execution_space;
541  typedef CrsProductTensor< ValueType , execution_space > tensor_type ;
542  typedef typename tensor_type::size_type size_type ;
543 
544 // Whether to use manual or auto-vectorization
545 #ifdef __MIC__
546 #define USE_AUTO_VECTORIZATION 1
547 #else
548 #define USE_AUTO_VECTORIZATION 0
549 #endif
550 
551 #if defined(__INTEL_COMPILER) && USE_AUTO_VECTORIZATION
552 
553  // Version leveraging intel vectorization
554  template< typename MatrixValue , typename VectorValue >
555  KOKKOS_INLINE_FUNCTION
556  static void apply( const tensor_type & tensor ,
557  const MatrixValue * const a ,
558  const VectorValue * const x ,
559  VectorValue * const y ,
560  const VectorValue & alpha = VectorValue(1) )
561  {
562  // The intel compiler doesn't seem to be able to vectorize through
563  // the coord() calls, so extract pointers
564  const size_type * cj = &tensor.coord(0,0);
565  const size_type * ck = &tensor.coord(0,1);
566  const size_type nDim = tensor.dimension();
567 
568  for ( size_type iy = 0 ; iy < nDim ; ++iy ) {
569  const size_type nEntry = tensor.num_entry(iy);
570  const size_type iEntryBeg = tensor.entry_begin(iy);
571  const size_type iEntryEnd = iEntryBeg + nEntry;
572  VectorValue ytmp = 0;
573 
574 #pragma simd vectorlength(tensor_type::vectorsize)
575 #pragma ivdep
576 #pragma vector aligned
577  for (size_type iEntry = iEntryBeg; iEntry<iEntryEnd; ++iEntry) {
578  const size_type j = cj[iEntry]; //tensor.coord(iEntry,0);
579  const size_type k = ck[iEntry]; //tensor.coord(iEntry,1);
580  ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
581  }
582 
583  y[iy] += alpha * ytmp ;
584  }
585  }
586 
587 #elif defined(__MIC__)
588 
589  // Version specific to MIC architecture using manual vectorization
590  template< typename MatrixValue , typename VectorValue >
591  KOKKOS_INLINE_FUNCTION
592  static void apply( const tensor_type & tensor ,
593  const MatrixValue * const a ,
594  const VectorValue * const x ,
595  VectorValue * const y ,
596  const VectorValue & alpha = VectorValue(1) )
597  {
598  const size_type nDim = tensor.dimension();
599  for ( size_type iy = 0 ; iy < nDim ; ++iy ) {
600 
601  const size_type nEntry = tensor.num_entry(iy);
602  const size_type iEntryBeg = tensor.entry_begin(iy);
603  const size_type iEntryEnd = iEntryBeg + nEntry;
604  size_type iEntry = iEntryBeg;
605 
606  VectorValue ytmp = 0 ;
607 
608  const size_type nBlock = nEntry / tensor_type::vectorsize;
609  const size_type nEntryB = nBlock * tensor_type::vectorsize;
610  const size_type iEnd = iEntryBeg + nEntryB;
611 
612  typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
613  TV vy;
614  vy.zero();
615  for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
616  const size_type *j = &tensor.coord(iEntry,0);
617  const size_type *k = &tensor.coord(iEntry,1);
618  TV aj(a, j), ak(a, k), xj(x, j), xk(x, k),
619  c(&(tensor.value(iEntry)));
620 
621  // vy += c * ( aj * xk + ak * xj)
622  aj.times_equal(xk);
623  aj.multiply_add(ak, xj);
624  vy.multiply_add(c, aj);
625 
626  }
627  ytmp += vy.sum();
628 
629  // The number of nonzeros is always constrained to be a multiple of 8
630 
631  const size_type rem = iEntryEnd-iEntry;
632  if (rem >= 8) {
633  typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> TV2;
634  const size_type *j = &tensor.coord(iEntry,0);
635  const size_type *k = &tensor.coord(iEntry,1);
636  TV2 aj(a, j), ak(a, k), xj(x, j), xk(x, k),
637  c(&(tensor.value(iEntry)));
638 
639  // vy += c * ( aj * xk + ak * xj)
640  aj.times_equal(xk);
641  aj.multiply_add(ak, xj);
642  aj.times_equal(c);
643  ytmp += aj.sum();
644  }
645 
646  y[iy] += alpha * ytmp ;
647  }
648  }
649 
650 #else
651 
652  // General version
653  template< typename MatrixValue , typename VectorValue >
654  KOKKOS_INLINE_FUNCTION
655  static void apply( const tensor_type & tensor ,
656  const MatrixValue * const a ,
657  const VectorValue * const x ,
658  VectorValue * const y ,
659  const VectorValue & alpha = VectorValue(1) )
660  {
661  const size_type nDim = tensor.dimension();
662  for ( size_type iy = 0 ; iy < nDim ; ++iy ) {
663 
664  const size_type nEntry = tensor.num_entry(iy);
665  const size_type iEntryBeg = tensor.entry_begin(iy);
666  const size_type iEntryEnd = iEntryBeg + nEntry;
667  size_type iEntry = iEntryBeg;
668 
669  VectorValue ytmp = 0 ;
670 
671  // Do entries with a blocked loop of size vectorsize
672  if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
673  const size_type nBlock = nEntry / tensor_type::vectorsize;
674  const size_type nEntryB = nBlock * tensor_type::vectorsize;
675  const size_type iEnd = iEntryBeg + nEntryB;
676 
677  typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
678  TV vy;
679  vy.zero();
680  for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
681  const size_type *j = &tensor.coord(iEntry,0);
682  const size_type *k = &tensor.coord(iEntry,1);
683  TV aj(a, j), ak(a, k), xj(x, j), xk(x, k), c(&(tensor.value(iEntry)));
684 
685  // vy += c * ( aj * xk + ak * xj)
686  aj.times_equal(xk);
687  aj.multiply_add(ak, xj);
688  vy.multiply_add(c, aj);
689  }
690  ytmp += vy.sum();
691  }
692 
693  // Do remaining entries with a scalar loop
694  for ( ; iEntry<iEntryEnd; ++iEntry) {
695  const size_type j = tensor.coord(iEntry,0);
696  const size_type k = tensor.coord(iEntry,1);
697 
698  ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
699  }
700 
701  y[iy] += alpha * ytmp ;
702  }
703  }
704 #endif
705 
706  KOKKOS_INLINE_FUNCTION
707  static size_type matrix_size( const tensor_type & tensor )
708  { return tensor.dimension(); }
709 
710  KOKKOS_INLINE_FUNCTION
711  static size_type vector_size( const tensor_type & tensor )
712  { return tensor.dimension(); }
713 };
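// (Editorial sketch, not part of the original header.) Every apply() variant
// above evaluates the same scalar recurrence; the hypothetical helper below
// restates it without vectorization, using only the CrsProductTensor
// accessors. Because create() stores only entries with j >= k, the term
// a[j]*x[k] + a[k]*x[j] supplies the symmetric (k,j) contribution, and
// diagonal entries (j == k) carry value 0.5*c so they are not counted twice.
template <typename TensorType, typename Scalar>
void reference_row_multiply( const TensorType& tensor,
                             const Scalar* a, const Scalar* x, Scalar* y,
                             const Scalar alpha = Scalar(1) )
{
  typedef typename TensorType::size_type size_type;
  const size_type nDim = tensor.dimension();
  for ( size_type i = 0; i < nDim; ++i ) {
    Scalar ytmp = 0;
    for ( size_type e = tensor.entry_begin(i); e < tensor.entry_end(i); ++e ) {
      const size_type j = tensor.coord(e,0);
      const size_type k = tensor.coord(e,1);
      ytmp += tensor.value(e) * ( a[j]*x[k] + a[k]*x[j] );
    }
    y[i] += alpha * ytmp;
  }
}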
714 
715 // Specialization of Multiply< BlockCrsMatrix< BlockSpec, ... > > > for
716 // CrsProductTensor, which provides a version that processes blocks of FEM
717 // columns together to reduce the number of global reads of the sparse 3 tensor
718 
719 // Even though this isn't specific to Threads, templating on Device creates a
720 // duplicate specialization error for Cuda. Need to see if we can fix this,
721 // or put the implementation in another class easily specialized for Threads,
722 // OpenMP, ...
723 template< typename ValueType , typename MatrixValue , typename VectorValue ,
724  typename Device >
725 class MultiplyImpl {
726 public:
727 
728  typedef Device execution_space ;
729  typedef CrsProductTensor< ValueType , execution_space > tensor_type ;
730  typedef StochasticProductTensor< ValueType , tensor_type , execution_space > BlockSpec ;
731  typedef typename BlockSpec::size_type size_type ;
732  typedef Kokkos::View< VectorValue** , Kokkos::LayoutLeft , execution_space > block_vector_type ;
733  typedef BlockCrsMatrix< BlockSpec , MatrixValue , execution_space > matrix_type ;
734 
735  const matrix_type m_A ;
736  const block_vector_type m_x ;
737  const block_vector_type m_y ;
738 
739  MultiplyImpl( const matrix_type & A ,
740  const block_vector_type & x ,
741  const block_vector_type & y )
742  : m_A( A )
743  , m_x( x )
744  , m_y( y )
745  {}
746 
747  //--------------------------------------------------------------------------
748  // A( storage_size( m_A.block.size() ) , m_A.graph.row_map.size() );
749  // x( m_A.block.dimension() , m_A.graph.row_map.first_count() );
750  // y( m_A.block.dimension() , m_A.graph.row_map.first_count() );
751  //
752 
753  KOKKOS_INLINE_FUNCTION
754  void operator()( const size_type iBlockRow ) const
755  {
756  // Prefer that y[ m_A.block.dimension() ] be scratch space
757  // on the local thread, but cannot dynamically allocate
758  VectorValue * const y = & m_y(0,iBlockRow);
759 
760  const size_type iEntryBegin = m_A.graph.row_map[ iBlockRow ];
761  const size_type iEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
762 
763  // Leading dimension guaranteed contiguous for LayoutLeft
764  for ( size_type j = 0 ; j < m_A.block.dimension() ; ++j ) { y[j] = 0 ; }
765 
766  for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
767  const VectorValue * const x = & m_x( 0 , m_A.graph.entries(iEntry) );
768  const MatrixValue * const a = & m_A.values( 0 , iEntry );
769 
770  BlockMultiply< BlockSpec >::apply( m_A.block , a , x , y );
771  }
772 
773  }
774 
775  /*
776  * Compute work range = (begin, end) such that adjacent threads write to
777  * separate cache lines
778  */
779  KOKKOS_INLINE_FUNCTION
780  std::pair< size_type , size_type >
781  compute_work_range( const size_type work_count ,
782  const size_type thread_count ,
783  const size_type thread_rank ) const
784  {
785  enum { work_align = 64 / sizeof(VectorValue) };
786  enum { work_shift = Stokhos::power_of_two< work_align >::value };
787  enum { work_mask = work_align - 1 };
788 
789  const size_type work_per_thread =
790  ( ( ( ( work_count + work_mask ) >> work_shift ) + thread_count - 1 ) /
791  thread_count ) << work_shift ;
792 
793  const size_type work_begin =
794  std::min( thread_rank * work_per_thread , work_count );
795  const size_type work_end =
796  std::min( work_begin + work_per_thread , work_count );
797 
798  return std::make_pair( work_begin , work_end );
799  }
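  // (Editorial note) Illustrative example: with VectorValue = double,
  // work_align = 64/8 = 8 entries per 64-byte cache line (work_shift = 3).
  // For work_count = 100 and thread_count = 4, work_per_thread =
  // (((100+7) >> 3) + 3)/4 << 3 = 32, giving thread ranges
  // [0,32), [32,64), [64,96), [96,100).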
800 
801 #if defined(__MIC__)
802 
803  // A MIC-specific version of the block-multiply algorithm, where block here
804  // means processing multiple FEM columns at a time
805  KOKKOS_INLINE_FUNCTION
806  void operator()( const typename Kokkos::TeamPolicy< execution_space >::member_type & device ) const
807  {
808  const size_type iBlockRow = device.league_rank();
809 
810  // Check for valid row
811  const size_type row_count = m_A.graph.row_map.extent(0)-1;
812  if (iBlockRow >= row_count)
813  return;
814 
815  const size_type num_thread = device.team_size();
816  const size_type thread_idx = device.team_rank();
817  std::pair<size_type,size_type> work_range =
818  compute_work_range(m_A.block.dimension(), num_thread, thread_idx);
819 
820  // Prefer that y[ m_A.block.dimension() ] be scratch space
821  // on the local thread, but cannot dynamically allocate
822  VectorValue * const y = & m_y(0,iBlockRow);
823 
824  // Leading dimension guaranteed contiguous for LayoutLeft
825  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
826  y[j] = 0 ;
827 
828  const tensor_type& tensor = m_A.block.tensor();
829 
830  const size_type iBlockEntryBeg = m_A.graph.row_map[ iBlockRow ];
831  const size_type iBlockEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
832  const size_type BlockSize = 9;
833  const size_type numBlock =
834  (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
835 
836  const MatrixValue* sh_A[BlockSize];
837  const VectorValue* sh_x[BlockSize];
838 
839  size_type iBlockEntry = iBlockEntryBeg;
840  for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
841  const size_type block_size =
842  block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
843 
844  for ( size_type col = 0; col < block_size; ++col ) {
845  const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
846  sh_x[col] = & m_x( 0 , iBlockColumn );
847  sh_A[col] = & m_A.values( 0 , iBlockEntry + col );
848  }
849 
850  for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
851 
852  const size_type nEntry = tensor.num_entry(iy);
853  const size_type iEntryBeg = tensor.entry_begin(iy);
854  const size_type iEntryEnd = iEntryBeg + nEntry;
855  size_type iEntry = iEntryBeg;
856 
857  VectorValue ytmp = 0 ;
858 
859  // Do entries with a blocked loop of size blocksize
860  const size_type nBlock = nEntry / tensor_type::vectorsize;
861  const size_type nEntryB = nBlock * tensor_type::vectorsize;
862  const size_type iEnd = iEntryBeg + nEntryB;
863 
864  typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
865  typedef TinyVec<MatrixValue,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
866  typedef TinyVec<VectorValue,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
867  VecTV vy;
868  vy.zero();
869  for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
870  const size_type *j = &tensor.coord(iEntry,0);
871  const size_type *k = &tensor.coord(iEntry,1);
872  ValTV c(&(tensor.value(iEntry)));
873 
874  for ( size_type col = 0; col < block_size; ++col ) {
875  MatTV aj(sh_A[col], j), ak(sh_A[col], k);
876  VecTV xj(sh_x[col], j), xk(sh_x[col], k);
877 
878  // vy += c * ( aj * xk + ak * xj)
879  aj.times_equal(xk);
880  aj.multiply_add(ak, xj);
881  vy.multiply_add(c, aj);
882  }
883  }
884  ytmp += vy.sum();
885 
886  // The number of nonzeros is always constrained to be a multiple of 8
887 
888  const size_type rem = iEntryEnd-iEntry;
889  if (rem >= 8) {
890  typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> ValTV2;
891  typedef TinyVec<MatrixValue,8,tensor_type::use_intrinsics> MatTV2;
892  typedef TinyVec<VectorValue,8,tensor_type::use_intrinsics> VecTV2;
893  const size_type *j = &tensor.coord(iEntry,0);
894  const size_type *k = &tensor.coord(iEntry,1);
895  ValTV2 c(&(tensor.value(iEntry)));
896 
897  for ( size_type col = 0; col < block_size; ++col ) {
898  MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
899  VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);
900 
901  // vy += c * ( aj * xk + ak * xj)
902  aj.times_equal(xk);
903  aj.multiply_add(ak, xj);
904  aj.times_equal(c);
905  ytmp += aj.sum();
906  }
907  }
908 
909  y[iy] += ytmp ;
910  }
911 
912  // Add a team barrier to keep the thread team in-sync before going on
913  // to the next block
914  device.team_barrier();
915  }
916 
917  }
918 
919 #else
920 
921  // A general hand-vectorized version of the block multiply algorithm, where
922  // block here means processing multiple FEM columns at a time. Note that
923  // auto-vectorization of a block algorithm doesn't work, because the
924  // stochastic loop is not the inner-most loop.
925  KOKKOS_INLINE_FUNCTION
926  void operator()( const typename Kokkos::TeamPolicy< execution_space >::member_type & device ) const
927  {
928  const size_type iBlockRow = device.league_rank();
929 
930  // Check for valid row
931  const size_type row_count = m_A.graph.row_map.extent(0)-1;
932  if (iBlockRow >= row_count)
933  return;
934 
935  const size_type num_thread = device.team_size();
936  const size_type thread_idx = device.team_rank();
937  std::pair<size_type,size_type> work_range =
938  compute_work_range(m_A.block.dimension(), num_thread, thread_idx);
939 
940  // Prefer that y[ m_A.block.dimension() ] be scratch space
941  // on the local thread, but cannot dynamically allocate
942  VectorValue * const y = & m_y(0,iBlockRow);
943 
944  // Leading dimension guaranteed contiguous for LayoutLeft
945  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
946  y[j] = 0 ;
947 
948  const tensor_type& tensor = m_A.block.tensor();
949 
950  const size_type iBlockEntryBeg = m_A.graph.row_map[ iBlockRow ];
951  const size_type iBlockEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
952  const size_type BlockSize = 14;
953  const size_type numBlock =
954  (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
955 
956  const MatrixValue* sh_A[BlockSize];
957  const VectorValue* sh_x[BlockSize];
958 
959  size_type iBlockEntry = iBlockEntryBeg;
960  for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
961  const size_type block_size =
962  block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
963 
964  for ( size_type col = 0; col < block_size; ++col ) {
965  const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
966  sh_x[col] = & m_x( 0 , iBlockColumn );
967  sh_A[col] = & m_A.values( 0 , iBlockEntry + col );
968  }
969 
970  for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
971 
972  const size_type nEntry = tensor.num_entry(iy);
973  const size_type iEntryBeg = tensor.entry_begin(iy);
974  const size_type iEntryEnd = iEntryBeg + nEntry;
975  size_type iEntry = iEntryBeg;
976 
977  VectorValue ytmp = 0 ;
978 
979  // Do entries with a blocked loop of size blocksize
980  if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
981  const size_type nBlock = nEntry / tensor_type::vectorsize;
982  const size_type nEntryB = nBlock * tensor_type::vectorsize;
983  const size_type iEnd = iEntryBeg + nEntryB;
984 
985  typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
986  typedef TinyVec<MatrixValue,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
987  typedef TinyVec<VectorValue,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
988  VecTV vy;
989  vy.zero();
990  for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
991  const size_type *j = &tensor.coord(iEntry,0);
992  const size_type *k = &tensor.coord(iEntry,1);
993  ValTV c(&(tensor.value(iEntry)));
994 
995  for ( size_type col = 0; col < block_size; ++col ) {
996  MatTV aj(sh_A[col], j), ak(sh_A[col], k);
997  VecTV xj(sh_x[col], j), xk(sh_x[col], k);
998 
999  // vy += c * ( aj * xk + ak * xj)
1000  aj.times_equal(xk);
1001  aj.multiply_add(ak, xj);
1002  vy.multiply_add(c, aj);
1003  }
1004  }
1005  ytmp += vy.sum();
1006  }
1007 
1008  // Do remaining entries with a scalar loop
1009  for ( ; iEntry<iEntryEnd; ++iEntry) {
1010  const size_type j = tensor.coord(iEntry,0);
1011  const size_type k = tensor.coord(iEntry,1);
1012  ValueType cijk = tensor.value(iEntry);
1013 
1014  for ( size_type col = 0; col < block_size; ++col ) {
1015  ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
1016  sh_A[col][k] * sh_x[col][j] );
1017  }
1018 
1019  }
1020 
1021  y[iy] += ytmp ;
1022  }
1023 
1024  // Add a team barrier to keep the thread team in-sync before going on
1025  // to the next block
1026  device.team_barrier();
1027  }
1028 
1029  }
1030 
1031 #endif
1032 
1033  static void apply( const matrix_type & A ,
1034  const block_vector_type & x ,
1035  const block_vector_type & y )
1036  {
1037  // Generally the block algorithm seems to perform better on the MIC,
1038  // as long as the stochastic size isn't too big, but doesn't perform
1039  // any better on the CPU (probably because the CPU has a fat L3 cache
1040  // to store the sparse 3 tensor).
1041 #ifdef __MIC__
1042  const bool use_block_algorithm = true;
1043 #else
1044  const bool use_block_algorithm = false;
1045 #endif
1046 
1047  const size_t row_count = A.graph.row_map.extent(0) - 1 ;
1048  if (use_block_algorithm) {
1049 #ifdef __MIC__
1050  const size_t team_size = 4; // 4 hyperthreads for MIC
1051 #else
1052  const size_t team_size = 2; // 2 for everything else
1053 #endif
1054  const size_t league_size = row_count;
1055  Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
1056  Kokkos::parallel_for( config , MultiplyImpl(A,x,y) );
1057  }
1058  else {
1059  Kokkos::parallel_for( row_count , MultiplyImpl(A,x,y) );
1060  }
1061  }
1062 };
1063 
1064 //----------------------------------------------------------------------------
1065 
1066 } /* namespace Stokhos */
1067 
1068 //----------------------------------------------------------------------------
1069 //----------------------------------------------------------------------------
1070 
1071 // Inject some functions into the Kokkos namespace
1072 namespace Kokkos {
1073 
1073 
1074  using Stokhos::create_mirror_view;
1075  using Stokhos::deep_copy;
1076 
1077 } // namespace Kokkos
1078 
1079 #endif /* #ifndef STOKHOS_CRSPRODUCTTENSOR_HPP */
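
A brief usage sketch (editorial, not part of the header): it assumes a product basis and its triple-product tensor have already been constructed elsewhere; the function name and variable names below are placeholders.

#include "Stokhos_CrsProductTensor.hpp"

// Editorial example: build the device tensor from an existing basis/Cijk pair,
// then mirror and copy it to the host for inspection.
void example_build_and_mirror(
  const Stokhos::ProductBasis<int,double>& basis,
  const Stokhos::Sparse3Tensor<int,double>& Cijk )
{
  typedef Kokkos::DefaultExecutionSpace exec_space;
  typedef Stokhos::CrsProductTensor<double, exec_space> tensor_type;

  // Assemble the padded CRS representation of C(i,j,k) on the device.
  Teuchos::ParameterList params;
  tensor_type tensor =
    Stokhos::create_product_tensor<exec_space>( basis, Cijk, params );

  // create_mirror_view returns views of the device data when exec_space is
  // host accessible; deep_copy then fills the mirror with the device values.
  tensor_type::HostMirror host_tensor = Stokhos::create_mirror_view( tensor );
  Stokhos::deep_copy( host_tensor, tensor );
}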