Stokhos_CrsProductTensor.hpp
// @HEADER
// *****************************************************************************
// Stokhos Package
//
// Copyright 2009 NTESS and the Stokhos contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef STOKHOS_CRSPRODUCTTENSOR_HPP
#define STOKHOS_CRSPRODUCTTENSOR_HPP

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_ProductBasis.hpp"
#include "Stokhos_Sparse3Tensor.hpp"
#include "Stokhos_TinyVec.hpp"

#include "Teuchos_ParameterList.hpp"

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Stokhos {
/** \brief  Sparse product tensor with replicated entries to
 *          provide subsets with a given coordinate.
 */
template< typename ValueType, class ExecutionSpace, class Memory = void >
class CrsProductTensor {
public:

  typedef ExecutionSpace execution_space;
  typedef int size_type;
  typedef ValueType value_type;
  typedef Memory memory_type;

  typedef typename Kokkos::ViewTraits< size_type*, execution_space,void,void >::host_mirror_space host_mirror_space ;
  typedef CrsProductTensor< value_type, host_mirror_space > host_mirror_type ;
  // Vectorsize used in multiply algorithm
#if defined(__AVX__)
  static const size_type host_vectorsize = 32/sizeof(value_type);
  static const bool use_intrinsics = true;
  static const size_type num_entry_align = 1;
#elif defined(__MIC__)
  static const size_type host_vectorsize = 16;
  static const bool use_intrinsics = true;
  static const size_type num_entry_align = 8; // avoid use of mask instructions
#else
  static const size_type host_vectorsize = 2;
  static const bool use_intrinsics = false;
  static const size_type num_entry_align = 1;
#endif
  static const size_type cuda_vectorsize = 32;
  static const bool is_cuda =
#if defined( KOKKOS_ENABLE_CUDA )
    std::is_same<ExecutionSpace,Kokkos::Cuda>::value;
#else
    false ;
#endif
  static const size_type vectorsize = is_cuda ? cuda_vectorsize : host_vectorsize;

  // Alignment in terms of number of entries of CRS rows
  static const size_type tensor_align = vectorsize;

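  // Illustrative note (added, not in the original source): with the
  // definitions above and value_type = double on an AVX host,
  // host_vectorsize = 32/sizeof(double) = 4, so each CRS row of the tensor
  // is padded to a multiple of 4 entries (tensor_align = 4).  On CUDA the
  // padding unit is a full warp of 32 entries.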
private:

  template <class, class, class> friend class CrsProductTensor;

  typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space, memory_type > vec_type;
  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > coord_array_type;
  typedef Kokkos::View< size_type*[2], Kokkos::LayoutLeft, execution_space, memory_type > coord2_array_type;
  typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space, memory_type > value_array_type;
  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > entry_array_type;
  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > row_map_array_type;

  coord_array_type   m_coord;
  coord2_array_type  m_coord2;
  value_array_type   m_value;
  entry_array_type   m_num_entry;
  row_map_array_type m_row_map;
  size_type          m_dim;
  size_type          m_entry_max;
  size_type          m_avg_entries_per_row;
  size_type          m_nnz;
  size_type          m_flops;

  struct CijkRowCount {
    unsigned count;
    unsigned basis;

    CijkRowCount()
      : count(0)
      , basis(0)
      {}
  };

  struct CompareCijkRowCount {
    bool operator() (const CijkRowCount& a, const CijkRowCount& b) const {
      return a.count < b.count;
    }
  };

public:

  KOKKOS_INLINE_FUNCTION
  ~CrsProductTensor() {}

  KOKKOS_INLINE_FUNCTION
  CrsProductTensor() :
    m_coord(),
    m_coord2(),
    m_value(),
    m_num_entry(),
    m_row_map(),
    m_dim(0),
    m_entry_max(0),
    m_nnz(0),
    m_flops(0),
    m_avg_entries_per_row(0) {}

  template <class M>
  KOKKOS_INLINE_FUNCTION
  CrsProductTensor( const CrsProductTensor<value_type,execution_space,M>& rhs ) :
    m_coord( rhs.m_coord ),
    m_coord2( rhs.m_coord2 ),
    m_value( rhs.m_value ),
    m_num_entry( rhs.m_num_entry ),
    m_row_map( rhs.m_row_map ),
    m_dim( rhs.m_dim ),
    m_entry_max( rhs.m_entry_max ),
    m_nnz( rhs.m_nnz ),
    m_flops( rhs.m_flops ),
    m_avg_entries_per_row( rhs.m_avg_entries_per_row ) {}

  template <class M>
  KOKKOS_INLINE_FUNCTION
  CrsProductTensor&
  operator=( const CrsProductTensor<value_type,execution_space,M>& rhs )
  {
    m_coord = rhs.m_coord;
    m_coord2 = rhs.m_coord2;
    m_value = rhs.m_value;
    m_num_entry = rhs.m_num_entry;
    m_row_map = rhs.m_row_map;
    m_dim = rhs.m_dim;
    m_entry_max = rhs.m_entry_max;
    m_nnz = rhs.m_nnz;
    m_flops = rhs.m_flops;
    m_avg_entries_per_row = rhs.m_avg_entries_per_row;
    return *this;
  }

  /** \brief  Dimension of the tensor */
  KOKKOS_INLINE_FUNCTION
  size_type dimension() const { return m_dim; }

  /** \brief  Is the tensor empty */
  KOKKOS_INLINE_FUNCTION
  bool is_empty() const { return m_dim == 0; }

  /** \brief  Number of sparse entries */
  KOKKOS_INLINE_FUNCTION
  size_type entry_count() const
  { return m_coord.extent(0); }

  /** \brief  Maximum sparse entries for any coordinate */
  KOKKOS_INLINE_FUNCTION
  size_type entry_maximum() const
  { return m_entry_max; }

  /** \brief  Begin entries with a coordinate 'i' */
  KOKKOS_INLINE_FUNCTION
  size_type entry_begin( size_type i ) const
  { return m_row_map[i]; }

  /** \brief  End entries with a coordinate 'i' */
  KOKKOS_INLINE_FUNCTION
  size_type entry_end( size_type i ) const
  { return m_row_map[i] + m_num_entry(i); }

  /** \brief  Number of entries with a coordinate 'i' */
  KOKKOS_INLINE_FUNCTION
  size_type num_entry( size_type i ) const
  { return m_num_entry(i); }

  /** \brief  Coordinates of an entry */
  KOKKOS_INLINE_FUNCTION
  const size_type& coord( const size_type entry, const size_type c ) const
  { return m_coord2( entry, c ); }

  /** \brief  Coordinates of an entry */
  KOKKOS_INLINE_FUNCTION
  const size_type& coord( const size_type entry ) const
  { return m_coord( entry ); }

  /** \brief  Value of an entry */
  KOKKOS_INLINE_FUNCTION
  const value_type & value( const size_type entry ) const
  { return m_value( entry ); }

  /** \brief  Number of non-zeros */
  KOKKOS_INLINE_FUNCTION
  size_type num_non_zeros() const
  { return m_nnz; }

  /** \brief  Number of flops per multiply-add */
  KOKKOS_INLINE_FUNCTION
  size_type num_flops() const
  { return m_flops; }

  /** \brief  Average number of entries per row */
  KOKKOS_INLINE_FUNCTION
  size_type avg_entries_per_row() const
  { return m_avg_entries_per_row; }

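  // Illustrative sketch (added; not part of the original source): a multiply
  // using only the accessors above traverses the CRS structure as
  //
  //   for (size_type i = 0; i < tensor.dimension(); ++i) {
  //     value_type ytmp = 0;
  //     for (size_type n = tensor.entry_begin(i); n < tensor.entry_end(i); ++n) {
  //       const size_type j = tensor.coord(n,0);
  //       const size_type k = tensor.coord(n,1);
  //       ytmp += tensor.value(n) * ( a[j] * x[k] + a[k] * x[j] );
  //     }
  //     y[i] += ytmp;
  //   }
  //
  // which is exactly what the scalar path of BlockMultiply::apply() below does.
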
  template <typename OrdinalType>
  static CrsProductTensor
  create( const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
          const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
          const Teuchos::ParameterList& params = Teuchos::ParameterList() )
  {
    typedef Stokhos::Sparse3Tensor<OrdinalType,ValueType> Cijk_type;

    // Note (etp 01/08/15): Commenting out the sorting as it causes a really
    // weird compiler error when compiling with NVCC.  It seems to think the
    // < in CompareCijkRowCount() is part of a template parameter.  We don't
    // seem to use this option, so I am just commenting it out.

    // bool sort_nnz = false;
    // if (params.isParameter("Sort Nonzeros"))
    //   sort_nnz = params.get<bool>("Sort Nonzeros");

    // Compute number of non-zeros for each i
    const size_type dimension = basis.size();
    std::vector< size_t > coord_work( dimension, (size_t) 0 );
    size_type entry_count = 0;
    for (typename Cijk_type::i_iterator i_it=Cijk.i_begin();
         i_it!=Cijk.i_end(); ++i_it) {
      OrdinalType i = index(i_it);
      for (typename Cijk_type::ik_iterator k_it = Cijk.k_begin(i_it);
           k_it != Cijk.k_end(i_it); ++k_it) {
        OrdinalType k = index(k_it);
        for (typename Cijk_type::ikj_iterator j_it = Cijk.j_begin(k_it);
             j_it != Cijk.j_end(k_it); ++j_it) {
          OrdinalType j = index(j_it);
          if (j >= k) {
            ++coord_work[i];
            ++entry_count;
          }
        }
      }
    }

    // Compute average nonzeros per row (must be before padding)
    size_type avg_entries_per_row = entry_count / dimension;

    // Pad each row to have size divisible by alignment size
    for ( size_type i = 0; i < dimension; ++i ) {
      const size_t rem = coord_work[i] % tensor_align;
      if (rem > 0) {
        const size_t pad = tensor_align - rem;
        coord_work[i] += pad;
        entry_count += pad;
      }
    }
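    // Worked example (added; not in the original source, and assuming
    // tensor_align = 4 as on an AVX host with doubles): a row holding 10
    // nonzeros has rem = 10 % 4 = 2, so pad = 2 dummy slots are appended and
    // the row is stored with 12 entries.  The dummy slots receive zero
    // values and coordinates below, so they contribute nothing to the product.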

    // Sort based on number of non-zeros
    std::vector< CijkRowCount > row_count( dimension );
    for ( size_type i = 0; i < dimension; ++i ) {
      row_count[i].count = coord_work[i];
      row_count[i].basis = i;
    }

    // Note (etp 01/08/15): See above.

    // if (sort_nnz)
    //   std::sort( row_count.begin(), row_count.end(), CompareCijkRowCount() );
    std::vector<size_type> sorted_row_map( dimension );
    for ( size_type i = 0; i < dimension; ++i ) {
      coord_work[i] = row_count[i].count;
      sorted_row_map[ row_count[i].basis ] = i;
    }

    // Allocate tensor data
    // coord and coord2 are initialized to zero because otherwise we get
    // seg faults in the MIC algorithm when processing the tail of each
    // tensor row. Not quite sure why as the coord loads are padded to
    // length 16 and are masked for the remainder (unless it does load x[j]
    // anyway and masks off the result, so j needs to be valid).
    CrsProductTensor tensor;
    tensor.m_coord = coord_array_type( "tensor_coord", entry_count );
    tensor.m_coord2 = coord2_array_type( "tensor_coord2", entry_count );
    tensor.m_value = value_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_value"), entry_count );
    tensor.m_num_entry = entry_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_num_entry"), dimension );
    tensor.m_row_map = row_map_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_row_map"), dimension+1 );
    tensor.m_dim = dimension;
    tensor.m_entry_max = 0;
    tensor.m_avg_entries_per_row = avg_entries_per_row;

    // Create mirror, is a view if is host memory
    typename coord_array_type::host_mirror_type
      host_coord = Kokkos::create_mirror_view( tensor.m_coord );
    typename coord2_array_type::host_mirror_type
      host_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
    typename value_array_type::host_mirror_type
      host_value = Kokkos::create_mirror_view( tensor.m_value );
    typename entry_array_type::host_mirror_type
      host_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
    typename entry_array_type::host_mirror_type
      host_row_map = Kokkos::create_mirror_view( tensor.m_row_map );

    // Compute row map
    size_type sum = 0;
    host_row_map(0) = 0;
    for ( size_type i = 0; i < dimension; ++i ) {
      sum += coord_work[i];
      host_row_map(i+1) = sum;
      host_num_entry(i) = 0;
    }

    for ( size_type iCoord = 0; iCoord < dimension; ++iCoord ) {
      coord_work[iCoord] = host_row_map[iCoord];
    }

    // Initialize values and coordinates to zero since we will have extra
    // ones for alignment
    Kokkos::deep_copy( host_value, 0.0 );
    Kokkos::deep_copy( host_coord, 0 );
    Kokkos::deep_copy( host_coord2, 0 );

    for (typename Cijk_type::i_iterator i_it=Cijk.i_begin();
         i_it!=Cijk.i_end(); ++i_it) {
      OrdinalType i = index(i_it);
      const size_type row = sorted_row_map[i];
      for (typename Cijk_type::ik_iterator k_it = Cijk.k_begin(i_it);
           k_it != Cijk.k_end(i_it); ++k_it) {
        OrdinalType k = index(k_it);
        for (typename Cijk_type::ikj_iterator j_it = Cijk.j_begin(k_it);
             j_it != Cijk.j_end(k_it); ++j_it) {
          OrdinalType j = index(j_it);
          ValueType c = Stokhos::value(j_it);
          if (j >= k) {
            const size_type n = coord_work[row]; ++coord_work[row];
            host_value(n) = (j != k) ? c : 0.5*c;
            host_coord2(n,0) = j;
            host_coord2(n,1) = k;
            host_coord(n) = ( k << 16 ) | j;
            ++host_num_entry(row);
            ++tensor.m_nnz;
          }
        }
      }
      // Align num_entry
      host_num_entry(row) =
        (host_num_entry(row) + num_entry_align-1) & ~(num_entry_align-1);
    }
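    // Explanatory note (added; not in the original source): only entries with
    // j >= k are stored because the multiply kernels below evaluate
    //   value * ( a[j]*x[k] + a[k]*x[j] ),
    // which already accounts for the symmetric (k,j) entry.  A diagonal entry
    // (j == k) would be counted twice by that expression, hence the 0.5*c.
    // The packed coordinate ( k << 16 ) | j keeps both indices in one 32-bit
    // word, which presumes each index fits in 16 bits (basis size < 65536).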

    // Copy data to device if necessary
    Kokkos::deep_copy( tensor.m_coord, host_coord );
    Kokkos::deep_copy( tensor.m_coord2, host_coord2 );
    Kokkos::deep_copy( tensor.m_value, host_value );
    Kokkos::deep_copy( tensor.m_num_entry, host_num_entry );
    Kokkos::deep_copy( tensor.m_row_map, host_row_map );

    for ( size_type i = 0; i < dimension; ++i ) {
      tensor.m_entry_max = std::max( tensor.m_entry_max, host_num_entry(i) );
    }

    tensor.m_flops = 5*tensor.m_nnz + dimension;
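    // Explanatory note (added; not in the original source): each stored entry
    // costs roughly 5 flops in the kernels below -- two multiplies and one add
    // for a[j]*x[k] + a[k]*x[j], plus one multiply and one add to fold in
    // value(entry) -- and the extra 'dimension' term accounts for the per-row
    // accumulation of each result into y.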

    return tensor;
  }

  static CrsProductTensor createMeanBased()
  {
    const size_type dimension = 1;
    const size_type entry_count = 1;

    // Allocate tensor data
    // coord and coord2 are initialized to zero because otherwise we get
    // seg faults in the MIC algorithm when processing the tail of each
    // tensor row. Not quite sure why as the coord loads are padded to
    // length 16 and are masked for the remainder (unless it does load x[j]
    // anyway and masks off the result, so j needs to be valid).
    CrsProductTensor tensor;
    tensor.m_coord = coord_array_type( "tensor_coord", entry_count );
    tensor.m_coord2 = coord2_array_type( "tensor_coord2", entry_count );
    tensor.m_value = value_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_value"), entry_count );
    tensor.m_num_entry = entry_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_num_entry"), dimension );
    tensor.m_row_map = row_map_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_row_map"), dimension+1 );
    tensor.m_dim = dimension;
    tensor.m_entry_max = 1;
    tensor.m_avg_entries_per_row = 1;
    tensor.m_nnz = 1;
    tensor.m_flops = 5*tensor.m_nnz + dimension;

    // Create mirror, is a view if is host memory
    typename coord_array_type::host_mirror_type
      host_coord = Kokkos::create_mirror_view( tensor.m_coord );
    typename coord2_array_type::host_mirror_type
      host_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
    typename value_array_type::host_mirror_type
      host_value = Kokkos::create_mirror_view( tensor.m_value );
    typename entry_array_type::host_mirror_type
      host_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
    typename entry_array_type::host_mirror_type
      host_row_map = Kokkos::create_mirror_view( tensor.m_row_map );

    // Compute row map
    host_row_map(0) = 0;
    host_row_map(1) = 1;
    host_num_entry(0) = 1;

    // Compute tensor values
    host_value(0) = 0.5;
    host_coord2(0,0) = 0;
    host_coord2(0,1) = 0;
    host_coord(0) = 0;

    // Copy data to device if necessary
    Kokkos::deep_copy( tensor.m_coord, host_coord );
    Kokkos::deep_copy( tensor.m_coord2, host_coord2 );
    Kokkos::deep_copy( tensor.m_value, host_value );
    Kokkos::deep_copy( tensor.m_num_entry, host_num_entry );
    Kokkos::deep_copy( tensor.m_row_map, host_row_map );

    return tensor;
  }
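  // Explanatory note (added; not in the original source): the "mean based"
  // tensor is the 1x1 tensor with the single entry c(0,0,0) = 0.5.  Under the
  // symmetric multiply below this yields
  //   y[0] += 0.5 * ( a[0]*x[0] + a[0]*x[0] ) = a[0]*x[0],
  // i.e. an ordinary matrix-vector product using only the mean (zeroth)
  // polynomial chaos coefficient.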

  static host_mirror_type
  create_mirror_view( const CrsProductTensor& tensor ) {
    host_mirror_type host_tensor;

    host_tensor.m_coord = Kokkos::create_mirror_view( tensor.m_coord );
    host_tensor.m_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
    host_tensor.m_value = Kokkos::create_mirror_view( tensor.m_value );
    host_tensor.m_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
    host_tensor.m_row_map = Kokkos::create_mirror_view( tensor.m_row_map );

    host_tensor.m_dim = tensor.m_dim;
    host_tensor.m_entry_max = tensor.m_entry_max;
    host_tensor.m_avg_entries_per_row = tensor.m_avg_entries_per_row;
    host_tensor.m_nnz = tensor.m_nnz;
    host_tensor.m_flops = tensor.m_flops;

    return host_tensor;
  }

  template < class DstDevice, class DstMemory >
  static void
  deep_copy( const CrsProductTensor<ValueType,DstDevice,DstMemory>& dst ,
             const CrsProductTensor& src ) {
    Kokkos::deep_copy( dst.m_coord, src.m_coord );
    Kokkos::deep_copy( dst.m_coord2, src.m_coord2 );
    Kokkos::deep_copy( dst.m_value, src.m_value );
    Kokkos::deep_copy( dst.m_num_entry, src.m_num_entry );
    Kokkos::deep_copy( dst.m_row_map, src.m_row_map );
  }

};

template< class Device, typename OrdinalType, typename ValueType>
CrsProductTensor<ValueType, Device>
create_product_tensor(
  const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
  const Teuchos::ParameterList& params = Teuchos::ParameterList())
{
  return CrsProductTensor<ValueType, Device>::create( basis, Cijk, params );
}

template< class Device, typename OrdinalType, typename ValueType,
          class Memory >
CrsProductTensor<ValueType, Device, Memory>
create_product_tensor(
  const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
  const Teuchos::ParameterList& params = Teuchos::ParameterList())
{
  return CrsProductTensor<ValueType, Device, Memory>::create(
    basis, Cijk, params );
}

template< class Device, typename OrdinalType, typename ValueType>
CrsProductTensor<ValueType, Device>
create_mean_based_product_tensor()
{
  return CrsProductTensor<ValueType, Device>::createMeanBased();
}

template< class Device, typename OrdinalType, typename ValueType,
          class Memory >
CrsProductTensor<ValueType, Device, Memory>
create_mean_based_product_tensor()
{
  return CrsProductTensor<ValueType, Device, Memory>::createMeanBased();
}

template < class ValueType, class Device, class Memory >
inline
typename CrsProductTensor<ValueType,Device,Memory>::host_mirror_type
create_mirror_view( const CrsProductTensor<ValueType,Device,Memory>& src )
{
  return CrsProductTensor<ValueType,Device,Memory>::create_mirror_view( src );
}

template < class ValueType,
           class DstDevice, class DstMemory,
           class SrcDevice, class SrcMemory >
void
deep_copy( const CrsProductTensor<ValueType,DstDevice,DstMemory>& dst ,
           const CrsProductTensor<ValueType,SrcDevice,SrcMemory>& src )
{
  CrsProductTensor<ValueType,DstDevice,DstMemory>::deep_copy( dst, src );
}
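
// Illustrative usage sketch (added; not in the original source -- 'basis' and
// 'Cijk' stand for an already constructed Stokhos::ProductBasis and
// Stokhos::Sparse3Tensor):
//
//   typedef Kokkos::DefaultExecutionSpace Device;
//   typedef Stokhos::CrsProductTensor<double,Device> tensor_type;
//
//   tensor_type C =
//     Stokhos::create_product_tensor<Device>( basis, Cijk );
//   tensor_type::host_mirror_type hC = Stokhos::create_mirror_view( C );
//   Stokhos::deep_copy( hC, C );   // pull the tensor back to host memory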

template < typename ValueType, typename Device >
class BlockMultiply< CrsProductTensor< ValueType , Device > >
{
public:

  typedef Device execution_space;
  typedef CrsProductTensor< ValueType , execution_space > tensor_type ;
  typedef typename tensor_type::size_type size_type ;

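  // Explanatory note (added; not in the original source): every apply()
  // variant below evaluates, for each stochastic row i,
  //
  //   y[i] += alpha * sum_n  value(n) * ( a[j_n]*x[k_n] + a[k_n]*x[j_n] ),
  //
  // where the sum runs over the CRS entries n of row i and (j_n,k_n) are the
  // stored coordinates.  The variants differ only in how the inner sum is
  // vectorized.
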
// Whether to use manual or auto-vectorization
#ifdef __MIC__
#define USE_AUTO_VECTORIZATION 1
#else
#define USE_AUTO_VECTORIZATION 0
#endif

#if defined(__INTEL_COMPILER) && USE_AUTO_VECTORIZATION

  // Version leveraging intel vectorization
  template< typename MatrixValue , typename VectorValue >
  KOKKOS_INLINE_FUNCTION
  static void apply( const tensor_type & tensor ,
                     const MatrixValue * const a ,
                     const VectorValue * const x ,
                     VectorValue * const y ,
                     const VectorValue & alpha = VectorValue(1) )
  {
    // The intel compiler doesn't seem to be able to vectorize through
    // the coord() calls, so extract pointers
    const size_type * cj = &tensor.coord(0,0);
    const size_type * ck = &tensor.coord(0,1);
    const size_type nDim = tensor.dimension();

    for ( size_type iy = 0 ; iy < nDim ; ++iy ) {
      const size_type nEntry = tensor.num_entry(iy);
      const size_type iEntryBeg = tensor.entry_begin(iy);
      const size_type iEntryEnd = iEntryBeg + nEntry;
      VectorValue ytmp = 0;

#pragma simd vectorlength(tensor_type::vectorsize)
#pragma ivdep
#pragma vector aligned
      for (size_type iEntry = iEntryBeg; iEntry<iEntryEnd; ++iEntry) {
        const size_type j = cj[iEntry]; //tensor.coord(iEntry,0);
        const size_type k = ck[iEntry]; //tensor.coord(iEntry,1);
        ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
      }

      y[iy] += alpha * ytmp ;
    }
  }

#elif defined(__MIC__)

  // Version specific to MIC architecture using manual vectorization
  template< typename MatrixValue , typename VectorValue >
  KOKKOS_INLINE_FUNCTION
  static void apply( const tensor_type & tensor ,
                     const MatrixValue * const a ,
                     const VectorValue * const x ,
                     VectorValue * const y ,
                     const VectorValue & alpha = VectorValue(1) )
  {
    const size_type nDim = tensor.dimension();
    for ( size_type iy = 0 ; iy < nDim ; ++iy ) {

      const size_type nEntry = tensor.num_entry(iy);
      const size_type iEntryBeg = tensor.entry_begin(iy);
      const size_type iEntryEnd = iEntryBeg + nEntry;
      size_type iEntry = iEntryBeg;

      VectorValue ytmp = 0 ;

      const size_type nBlock = nEntry / tensor_type::vectorsize;
      const size_type nEntryB = nBlock * tensor_type::vectorsize;
      const size_type iEnd = iEntryBeg + nEntryB;

      typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
      TV vy;
      vy.zero();
      for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
        const size_type *j = &tensor.coord(iEntry,0);
        const size_type *k = &tensor.coord(iEntry,1);
        TV aj(a, j), ak(a, k), xj(x, j), xk(x, k),
           c(&(tensor.value(iEntry)));

        // vy += c * ( aj * xk + ak * xj)
        aj.times_equal(xk);
        aj.multiply_add(ak, xj);
        vy.multiply_add(c, aj);

      }
      ytmp += vy.sum();

      // The number of nonzeros is always constrained to be a multiple of 8

      const size_type rem = iEntryEnd-iEntry;
      if (rem >= 8) {
        typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> TV2;
        const size_type *j = &tensor.coord(iEntry,0);
        const size_type *k = &tensor.coord(iEntry,1);
        TV2 aj(a, j), ak(a, k), xj(x, j), xk(x, k),
            c(&(tensor.value(iEntry)));

        // vy += c * ( aj * xk + ak * xj)
        aj.times_equal(xk);
        aj.multiply_add(ak, xj);
        aj.times_equal(c);
        ytmp += aj.sum();
      }

      y[iy] += alpha * ytmp ;
    }
  }

#else

  // General version
  template< typename MatrixValue , typename VectorValue >
  KOKKOS_INLINE_FUNCTION
  static void apply( const tensor_type & tensor ,
                     const MatrixValue * const a ,
                     const VectorValue * const x ,
                     VectorValue * const y ,
                     const VectorValue & alpha = VectorValue(1) )
  {
    const size_type nDim = tensor.dimension();
    for ( size_type iy = 0 ; iy < nDim ; ++iy ) {

      const size_type nEntry = tensor.num_entry(iy);
      const size_type iEntryBeg = tensor.entry_begin(iy);
      const size_type iEntryEnd = iEntryBeg + nEntry;
      size_type iEntry = iEntryBeg;

      VectorValue ytmp = 0 ;

      // Do entries with a blocked loop of size vectorsize
      if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
        const size_type nBlock = nEntry / tensor_type::vectorsize;
        const size_type nEntryB = nBlock * tensor_type::vectorsize;
        const size_type iEnd = iEntryBeg + nEntryB;

        typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
        TV vy;
        vy.zero();
        for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
          const size_type *j = &tensor.coord(iEntry,0);
          const size_type *k = &tensor.coord(iEntry,1);
          TV aj(a, j), ak(a, k), xj(x, j), xk(x, k), c(&(tensor.value(iEntry)));

          // vy += c * ( aj * xk + ak * xj)
          aj.times_equal(xk);
          aj.multiply_add(ak, xj);
          vy.multiply_add(c, aj);
        }
        ytmp += vy.sum();
      }

      // Do remaining entries with a scalar loop
      for ( ; iEntry<iEntryEnd; ++iEntry) {
        const size_type j = tensor.coord(iEntry,0);
        const size_type k = tensor.coord(iEntry,1);

        ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
      }

      y[iy] += alpha * ytmp ;
    }
  }
#endif

  KOKKOS_INLINE_FUNCTION
  static size_type matrix_size( const tensor_type & tensor )
  { return tensor.dimension(); }

  KOKKOS_INLINE_FUNCTION
  static size_type vector_size( const tensor_type & tensor )
  { return tensor.dimension(); }
};

// Specialization of Multiply< BlockCrsMatrix< BlockSpec, ... > > for
// CrsProductTensor, which provides a version that processes blocks of FEM
// columns together to reduce the number of global reads of the sparse 3 tensor

// Even though this isn't specific to Threads, templating on Device creates a
// duplicate specialization error for Cuda.  Need to see if we can fix this,
// or put the implementation in another class easily specialized for Threads,
// OpenMP, ...
template< typename ValueType , typename MatrixValue , typename VectorValue ,
          typename Device >
class MultiplyImpl {
public:

  typedef Device execution_space ;
  typedef CrsProductTensor< ValueType , execution_space > tensor_type ;
  typedef StochasticProductTensor< ValueType , tensor_type , execution_space > BlockSpec ;
  typedef typename BlockSpec::size_type size_type ;
  typedef Kokkos::View< VectorValue** , Kokkos::LayoutLeft , execution_space > block_vector_type ;
  typedef BlockCrsMatrix< BlockSpec , MatrixValue , execution_space > matrix_type ;

  const matrix_type m_A ;
  const block_vector_type m_x ;
  const block_vector_type m_y ;

  MultiplyImpl( const matrix_type & A ,
                const block_vector_type & x ,
                const block_vector_type & y )
  : m_A( A )
  , m_x( x )
  , m_y( y )
  {}

  //--------------------------------------------------------------------------
  //  A( storage_size( m_A.block.size() ) , m_A.graph.row_map.size() );
  //  x( m_A.block.dimension() , m_A.graph.row_map.first_count() );
  //  y( m_A.block.dimension() , m_A.graph.row_map.first_count() );
  //

  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type iBlockRow ) const
  {
    // Prefer that y[ m_A.block.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    VectorValue * const y = & m_y(0,iBlockRow);

    const size_type iEntryBegin = m_A.graph.row_map[ iBlockRow ];
    const size_type iEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];

    // Leading dimension guaranteed contiguous for LayoutLeft
    for ( size_type j = 0 ; j < m_A.block.dimension() ; ++j ) { y[j] = 0 ; }

    for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
      const VectorValue * const x = & m_x( 0 , m_A.graph.entries(iEntry) );
      const MatrixValue * const a = & m_A.values( 0 , iEntry );

      BlockMultiply< BlockSpec >::apply( m_A.block , a , x , y );
    }

  }

  /*
   * Compute work range = (begin, end) such that adjacent threads write to
   * separate cache lines
   */
  KOKKOS_INLINE_FUNCTION
  std::pair< size_type , size_type >
  compute_work_range( const size_type work_count ,
                      const size_type thread_count ,
                      const size_type thread_rank ) const
  {
    enum { work_align = 64 / sizeof(VectorValue) };
    enum { work_shift = Stokhos::power_of_two< work_align >::value };
    enum { work_mask = work_align - 1 };

    const size_type work_per_thread =
      ( ( ( ( work_count + work_mask ) >> work_shift ) + thread_count - 1 ) /
        thread_count ) << work_shift ;

    const size_type work_begin =
      std::min( thread_rank * work_per_thread , work_count );
    const size_type work_end =
      std::min( work_begin + work_per_thread , work_count );

    return std::make_pair( work_begin , work_end );
  }
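
  // Worked example (added; not in the original source, assuming
  // VectorValue = double and a team of 4 threads): work_align = 64/8 = 8,
  // work_shift = 3, work_mask = 7.  For work_count = 100,
  //   work_per_thread = ceil( ceil(100/8) / 4 ) * 8 = 4 * 8 = 32,
  // so the thread ranges are [0,32), [32,64), [64,96), [96,100) -- each
  // thread's slice starts on a separate 64-byte cache line.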

#if defined(__MIC__)

  // A MIC-specific version of the block-multiply algorithm, where block here
  // means processing multiple FEM columns at a time
  KOKKOS_INLINE_FUNCTION
  void operator()( const typename Kokkos::TeamPolicy< execution_space >::member_type & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A.graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    std::pair<size_type,size_type> work_range =
      compute_work_range(m_A.block.dimension(), num_thread, thread_idx);

    // Prefer that y[ m_A.block.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    VectorValue * const y = & m_y(0,iBlockRow);

    // Leading dimension guaranteed contiguous for LayoutLeft
    for ( size_type j = work_range.first ; j < work_range.second ; ++j )
      y[j] = 0 ;

    const tensor_type& tensor = m_A.block.tensor();

    const size_type iBlockEntryBeg = m_A.graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 9;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const MatrixValue* sh_A[BlockSize];
    const VectorValue* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
        sh_x[col] = & m_x( 0 , iBlockColumn );
        sh_A[col] = & m_A.values( 0 , iBlockEntry + col );
      }

      for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

        const size_type nEntry = tensor.num_entry(iy);
        const size_type iEntryBeg = tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
        size_type iEntry = iEntryBeg;

        VectorValue ytmp = 0 ;

        // Do entries with a blocked loop of size blocksize
        const size_type nBlock = nEntry / tensor_type::vectorsize;
        const size_type nEntryB = nBlock * tensor_type::vectorsize;
        const size_type iEnd = iEntryBeg + nEntryB;

        typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
        typedef TinyVec<MatrixValue,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
        typedef TinyVec<VectorValue,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
        VecTV vy;
        vy.zero();
        for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
          const size_type *j = &tensor.coord(iEntry,0);
          const size_type *k = &tensor.coord(iEntry,1);
          ValTV c(&(tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV xj(sh_x[col], j), xk(sh_x[col], k);

            // vy += c * ( aj * xk + ak * xj)
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            vy.multiply_add(c, aj);
          }
        }
        ytmp += vy.sum();

        // The number of nonzeros is always constrained to be a multiple of 8

        const size_type rem = iEntryEnd-iEntry;
        if (rem >= 8) {
          typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> ValTV2;
          typedef TinyVec<MatrixValue,8,tensor_type::use_intrinsics> MatTV2;
          typedef TinyVec<VectorValue,8,tensor_type::use_intrinsics> VecTV2;
          const size_type *j = &tensor.coord(iEntry,0);
          const size_type *k = &tensor.coord(iEntry,1);
          ValTV2 c(&(tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);

            // vy += c * ( aj * xk + ak * xj)
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            aj.times_equal(c);
            ytmp += aj.sum();
          }
        }

        y[iy] += ytmp ;
      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();
    }

  }


#else

  // A general hand-vectorized version of the block multiply algorithm, where
  // block here means processing multiple FEM columns at a time. Note that
  // auto-vectorization of a block algorithm doesn't work, because the
  // stochastic loop is not the inner-most loop.
  KOKKOS_INLINE_FUNCTION
  void operator()( const typename Kokkos::TeamPolicy< execution_space >::member_type & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A.graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    std::pair<size_type,size_type> work_range =
      compute_work_range(m_A.block.dimension(), num_thread, thread_idx);

    // Prefer that y[ m_A.block.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    VectorValue * const y = & m_y(0,iBlockRow);

    // Leading dimension guaranteed contiguous for LayoutLeft
    for ( size_type j = work_range.first ; j < work_range.second ; ++j )
      y[j] = 0 ;

    const tensor_type& tensor = m_A.block.tensor();

    const size_type iBlockEntryBeg = m_A.graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 14;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const MatrixValue* sh_A[BlockSize];
    const VectorValue* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
        sh_x[col] = & m_x( 0 , iBlockColumn );
        sh_A[col] = & m_A.values( 0 , iBlockEntry + col );
      }

      for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

        const size_type nEntry = tensor.num_entry(iy);
        const size_type iEntryBeg = tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
        size_type iEntry = iEntryBeg;

        VectorValue ytmp = 0 ;

        // Do entries with a blocked loop of size blocksize
        if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
          const size_type nBlock = nEntry / tensor_type::vectorsize;
          const size_type nEntryB = nBlock * tensor_type::vectorsize;
          const size_type iEnd = iEntryBeg + nEntryB;

          typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
          typedef TinyVec<MatrixValue,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
          typedef TinyVec<VectorValue,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
          VecTV vy;
          vy.zero();
          for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
            const size_type *j = &tensor.coord(iEntry,0);
            const size_type *k = &tensor.coord(iEntry,1);
            ValTV c(&(tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj * xk + ak * xj)
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              vy.multiply_add(c, aj);
            }
          }
          ytmp += vy.sum();
        }

        // Do remaining entries with a scalar loop
        for ( ; iEntry<iEntryEnd; ++iEntry) {
          const size_type j = tensor.coord(iEntry,0);
          const size_type k = tensor.coord(iEntry,1);
          ValueType cijk = tensor.value(iEntry);

          for ( size_type col = 0; col < block_size; ++col ) {
            ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
                             sh_A[col][k] * sh_x[col][j] );
          }

        }

        y[iy] += ytmp ;
      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();
    }

  }

#endif

  static void apply( const matrix_type & A ,
                     const block_vector_type & x ,
                     const block_vector_type & y )
  {
    // Generally the block algorithm seems to perform better on the MIC,
    // as long as the stochastic size isn't too big, but doesn't perform
    // any better on the CPU (probably because the CPU has a fat L3 cache
    // to store the sparse 3 tensor).
#ifdef __MIC__
    const bool use_block_algorithm = true;
#else
    const bool use_block_algorithm = false;
#endif

    const size_t row_count = A.graph.row_map.extent(0) - 1 ;
    if (use_block_algorithm) {
#ifdef __MIC__
      const size_t team_size = 4; // 4 hyperthreads for MIC
#else
      const size_t team_size = 2; // 2 for everything else
#endif
      const size_t league_size = row_count;
      Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
      Kokkos::parallel_for( config , MultiplyImpl(A,x,y) );
    }
    else {
      Kokkos::parallel_for( row_count , MultiplyImpl(A,x,y) );
    }
  }
};
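
// Illustrative usage sketch (added; not in the original source).  MultiplyImpl
// is normally driven by the Multiply/BlockMultiply machinery elsewhere in
// Stokhos, but a direct call would look roughly like the following, where
// 'A', 'x', and 'y' stand for an already assembled block CRS matrix and
// compatible block vectors:
//
//   typedef Kokkos::DefaultExecutionSpace Device;
//   typedef MultiplyImpl<double,double,double,Device> impl_type;
//
//   impl_type::matrix_type A = /* assembled BlockCrsMatrix */;
//   impl_type::block_vector_type x = /* input block vector */;
//   impl_type::block_vector_type y = /* output block vector */;
//
//   impl_type::apply( A, x, y );   // y = A*x, one league entry per FEM row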

//----------------------------------------------------------------------------

} /* namespace Stokhos */

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

// Inject some functions into the Kokkos namespace
namespace Kokkos {

  using Stokhos::create_mirror_view;
  using Stokhos::deep_copy;

} // namespace Kokkos

#endif /* #ifndef STOKHOS_CRSPRODUCTTENSOR_HPP */