Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TestMeanMultiply.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #include <iostream>
11 
12 // PCE scalar type
14 
15 // Kokkos CrsMatrix
16 #include "KokkosSparse_CrsMatrix.hpp"
17 #include "KokkosSparse_spmv.hpp"
20 
21 // Stokhos
25 
26 // Utilities
27 #include "Kokkos_Timer.hpp"
28 
29 template< typename IntType >
30 inline
31 IntType map_fem_graph_coord( const IntType & N ,
32  const IntType & i ,
33  const IntType & j ,
34  const IntType & k )
35 {
36  return k + N * ( j + N * i );
37 }
38 
39 inline
40 size_t generate_fem_graph( size_t N ,
41  std::vector< std::vector<size_t> > & graph )
42 {
43  graph.resize( N * N * N , std::vector<size_t>() );
44 
45  size_t total = 0 ;
46 
47  for ( int i = 0 ; i < (int) N ; ++i ) {
48  for ( int j = 0 ; j < (int) N ; ++j ) {
49  for ( int k = 0 ; k < (int) N ; ++k ) {
50 
51  const size_t row = map_fem_graph_coord((int)N,i,j,k);
52 
53  graph[row].reserve(27);
54 
55  for ( int ii = -1 ; ii < 2 ; ++ii ) {
56  for ( int jj = -1 ; jj < 2 ; ++jj ) {
57  for ( int kk = -1 ; kk < 2 ; ++kk ) {
58  if ( 0 <= i + ii && i + ii < (int) N &&
59  0 <= j + jj && j + jj < (int) N &&
60  0 <= k + kk && k + kk < (int) N ) {
61  size_t col = map_fem_graph_coord((int)N,i+ii,j+jj,k+kk);
62 
63  graph[row].push_back(col);
64  }
65  }}}
66  total += graph[row].size();
67  }}}
68 
69  return total ;
70 }
71 
72 template <typename ScalarType, typename OrdinalType, typename Device>
73 void
74 test_mean_multiply(const OrdinalType order,
75  const OrdinalType dim,
76  const OrdinalType nGrid,
77  const OrdinalType iterCount,
78  std::vector<double>& scalar_perf,
79  std::vector<double>& block_left_perf,
80  std::vector<double>& block_right_perf,
81  std::vector<double>& pce_perf,
82  std::vector<double>& block_pce_perf)
83 {
84  typedef ScalarType value_type;
85  typedef OrdinalType ordinal_type;
86  typedef Device execution_space;
87 
90 
91  typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space > scalar_vector_type;
92  typedef Kokkos::View< value_type**, Kokkos::LayoutLeft, execution_space > scalar_left_multi_vector_type;
93  typedef Kokkos::View< value_type**, Kokkos::LayoutRight, execution_space > scalar_right_multi_vector_type;
94  typedef Kokkos::View< pce_type*, Kokkos::LayoutLeft, execution_space > pce_vector_type;
95  typedef Kokkos::View< pce_type**, Kokkos::LayoutLeft, execution_space > pce_multi_vector_type;
96 
97  typedef KokkosSparse::CrsMatrix< value_type, ordinal_type, execution_space > scalar_matrix_type;
98  typedef KokkosSparse::CrsMatrix< pce_type, ordinal_type, execution_space > pce_matrix_type;
99  typedef typename scalar_matrix_type::StaticCrsGraphType matrix_graph_type;
100  typedef typename scalar_matrix_type::values_type scalar_matrix_values_type;
101  typedef typename pce_matrix_type::values_type pce_matrix_values_type;
102 
103  typedef Stokhos::OneDOrthogPolyBasis<ordinal_type,value_type> abstract_basis_type;
108  typedef typename pce_type::cijk_type kokkos_cijk_type;
109 
110  using Teuchos::rcp;
111  using Teuchos::RCP;
112  using Teuchos::Array;
113 
114  // Number of columns for PCE multi-vector apply
115  const ordinal_type num_pce_col = 5;
116 
117  // Create Stochastic Galerkin basis and expansion
118  Array< RCP<const abstract_basis_type> > bases(dim);
119  for (ordinal_type i=0; i<dim; ++i) {
120  bases[i] = Teuchos::rcp(new basis_type(order, true));
121  }
122  RCP<const product_basis_type> basis = rcp(new product_basis_type(bases));
123  RCP<cijk_type> cijk = basis->computeTripleProductTensor();
124  kokkos_cijk_type kokkos_cijk =
125  Stokhos::create_product_tensor<execution_space>(*basis, *cijk);
126  Kokkos::setGlobalCijkTensor(kokkos_cijk);
127 
128  //------------------------------
129  // Generate graph for "FEM" box structure:
130 
131  std::vector< std::vector<size_t> > fem_graph;
132  const size_t fem_length = nGrid * nGrid * nGrid;
133  const size_t graph_length = generate_fem_graph( nGrid , fem_graph );
134 
135  //------------------------------
136  // Generate input vectors:
137 
138  ordinal_type pce_size = basis->size();
139  scalar_left_multi_vector_type xl(Kokkos::ViewAllocateWithoutInitializing("scalar left x"), fem_length, pce_size);
140  scalar_left_multi_vector_type yl(Kokkos::ViewAllocateWithoutInitializing("scalar right y"), fem_length, pce_size);
141  scalar_right_multi_vector_type xr(Kokkos::ViewAllocateWithoutInitializing("scalar right x"), fem_length, pce_size);
142  scalar_right_multi_vector_type yr(Kokkos::ViewAllocateWithoutInitializing("scalar right y"), fem_length, pce_size);
143  std::vector<scalar_vector_type> x_col(pce_size), y_col(pce_size);
144  for (ordinal_type i=0; i<pce_size; ++i) {
145  x_col[i] = scalar_vector_type (Kokkos::ViewAllocateWithoutInitializing("scalar x col"), fem_length);
146  y_col[i] = scalar_vector_type(Kokkos::ViewAllocateWithoutInitializing("scalar y col"), fem_length);
147  Kokkos::deep_copy( x_col[i] , value_type(1.0) );
148  Kokkos::deep_copy( y_col[i] , value_type(0.0) );
149  }
150  pce_vector_type x_pce =
151  Kokkos::make_view<pce_vector_type>(Kokkos::ViewAllocateWithoutInitializing("pce x"),
152  kokkos_cijk, fem_length, pce_size);
153  pce_vector_type y_pce =
154  Kokkos::make_view<pce_vector_type>(Kokkos::ViewAllocateWithoutInitializing("pce y"),
155  kokkos_cijk, fem_length, pce_size);
156  pce_multi_vector_type x_multi_pce =
157  Kokkos::make_view<pce_multi_vector_type>(Kokkos::ViewAllocateWithoutInitializing("pce multi x"),
158  kokkos_cijk, fem_length,
159  num_pce_col, pce_size);
160  pce_multi_vector_type y_multi_pce =
161  Kokkos::make_view<pce_multi_vector_type>(Kokkos::ViewAllocateWithoutInitializing("pce multi y"),
162  kokkos_cijk, fem_length,
163  num_pce_col, pce_size);
164 
165  Kokkos::deep_copy( xl , value_type(1.0) );
166  Kokkos::deep_copy( yl , value_type(0.0) );
167  Kokkos::deep_copy( xr , value_type(1.0) );
168  Kokkos::deep_copy( yr , value_type(0.0) );
169  Kokkos::deep_copy( x_pce , value_type(1.0) );
170  Kokkos::deep_copy( y_pce , value_type(0.0) );
171  Kokkos::deep_copy( x_multi_pce , value_type(1.0) );
172  Kokkos::deep_copy( y_multi_pce , value_type(0.0) );
173 
174  //------------------------------
175  // Generate matrix
176 
177  matrix_graph_type matrix_graph =
178  Kokkos::create_staticcrsgraph<matrix_graph_type>(
179  std::string("test crs graph"), fem_graph);
180  scalar_matrix_values_type scalar_matrix_values =
181  scalar_matrix_values_type(Kokkos::ViewAllocateWithoutInitializing("scalar matrix"), graph_length);
182  pce_matrix_values_type pce_matrix_values =
183  Kokkos::make_view<pce_matrix_values_type>(Kokkos::ViewAllocateWithoutInitializing("pce matrix"), kokkos_cijk, graph_length, 1);
184  scalar_matrix_type scalar_matrix("scalar matrix", fem_length,
185  scalar_matrix_values, matrix_graph);
186  pce_matrix_type pce_matrix("pce matrix", fem_length,
187  pce_matrix_values, matrix_graph);
188 
189  Kokkos::deep_copy( scalar_matrix_values , value_type(1.0) );
190  Kokkos::deep_copy( pce_matrix_values , value_type(1.0) );
191 
192  //------------------------------
193  // Scalar multiply
194 
195  {
196  // warm up
197  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
198  for (ordinal_type col=0; col<pce_size; ++col) {
199  // scalar_vector_type xc =
200  // Kokkos::subview(x, Kokkos::ALL(), col);
201  // scalar_vector_type yc =
202  // Kokkos::subview(y, Kokkos::ALL(), col);
203  // Kokkos::MV_Multiply( yc, scalar_matrix, xc );
204  KokkosSparse::spmv( "N" , value_type(1.0) , scalar_matrix, x_col[col] , value_type(0.0) ,y_col[col]);
205  }
206  }
207 
208  execution_space().fence();
209  Kokkos::Timer clock ;
210  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
211  for (ordinal_type col=0; col<pce_size; ++col) {
212  // scalar_vector_type xc =
213  // Kokkos::subview(x, Kokkos::ALL(), col);
214  // scalar_vector_type yc =
215  // Kokkos::subview(y, Kokkos::ALL(), col);
216  // Kokkos::MV_Multiply( yc, scalar_matrix, xc );
217  KokkosSparse::spmv( "N" , value_type(1.0) , scalar_matrix, x_col[col] , value_type(0.0) ,y_col[col]);
218  }
219  }
220  execution_space().fence();
221 
222  const double seconds_per_iter = clock.seconds() / ((double) iterCount );
223  const double flops = 1.0e-9 * 2.0 * graph_length * pce_size;
224 
225  scalar_perf.resize(5);
226  scalar_perf[0] = fem_length;
227  scalar_perf[1] = pce_size;
228  scalar_perf[2] = graph_length;
229  scalar_perf[3] = seconds_per_iter;
230  scalar_perf[4] = flops / seconds_per_iter;
231  }
232 
233  //------------------------------
234  // Block-left multiply
235 
236  {
237  // warm up
238  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
239  KokkosSparse::spmv( "N" , value_type(1.0) , scalar_matrix, xl , value_type(0.0) ,yl);
240  }
241 
242  execution_space().fence();
243  Kokkos::Timer clock ;
244  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
245  KokkosSparse::spmv( "N" , value_type(1.0) , scalar_matrix, xl , value_type(0.0) ,yl);
246  }
247  execution_space().fence();
248 
249  const double seconds_per_iter = clock.seconds() / ((double) iterCount );
250  const double flops = 1.0e-9 * 2.0 * graph_length * pce_size;
251 
252  block_left_perf.resize(5);
253  block_left_perf[0] = fem_length;
254  block_left_perf[1] = pce_size;
255  block_left_perf[2] = graph_length;
256  block_left_perf[3] = seconds_per_iter;
257  block_left_perf[4] = flops / seconds_per_iter;
258  }
259 
260  //------------------------------
261  // Block-right multiply
262 
263  {
264  // warm up
265  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
266  KokkosSparse::spmv( "N" , value_type(1.0) , scalar_matrix, xr , value_type(0.0) ,yr);
267  }
268 
269  execution_space().fence();
270  Kokkos::Timer clock ;
271  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
272  KokkosSparse::spmv( "N" , value_type(1.0) , scalar_matrix, xr , value_type(0.0) ,yr);
273  }
274  execution_space().fence();
275 
276  const double seconds_per_iter = clock.seconds() / ((double) iterCount );
277  const double flops = 1.0e-9 * 2.0 * graph_length * pce_size;
278 
279  block_right_perf.resize(5);
280  block_right_perf[0] = fem_length;
281  block_right_perf[1] = pce_size;
282  block_right_perf[2] = graph_length;
283  block_right_perf[3] = seconds_per_iter;
284  block_right_perf[4] = flops / seconds_per_iter;
285  }
286 
287  //------------------------------
288  // PCE multiply
289 
290  {
291  // warm up
292  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
293  KokkosSparse::spmv( "N" , value_type(1.0) , pce_matrix, x_pce , value_type(0.0) ,y_pce);
294  }
295 
296  execution_space().fence();
297  Kokkos::Timer clock ;
298  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
299  KokkosSparse::spmv( "N" , value_type(1.0) , pce_matrix, x_pce , value_type(0.0) ,y_pce);
300  }
301  execution_space().fence();
302 
303  const double seconds_per_iter = clock.seconds() / ((double) iterCount );
304  const double flops = 1.0e-9 * 2.0 * graph_length * pce_size;
305 
306  pce_perf.resize(5);
307  pce_perf[0] = fem_length;
308  pce_perf[1] = pce_size;
309  pce_perf[2] = graph_length;
310  pce_perf[3] = seconds_per_iter;
311  pce_perf[4] = flops / seconds_per_iter;
312  }
313 
314  //------------------------------
315  // PCE multi-vector multiply
316 
317  {
318  // warm up
319  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
320  KokkosSparse::spmv( "N" , value_type(1.0) , pce_matrix, x_multi_pce , value_type(0.0) ,y_multi_pce);
321  }
322 
323  execution_space().fence();
324  Kokkos::Timer clock ;
325  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
326  KokkosSparse::spmv( "N" , value_type(1.0) , pce_matrix, x_multi_pce , value_type(0.0) ,y_multi_pce);
327  }
328  execution_space().fence();
329 
330  const double seconds_per_iter = clock.seconds() / ((double) iterCount );
331  const double flops = 1.0e-9 * 2.0 * graph_length * pce_size * num_pce_col;
332 
333  block_pce_perf.resize(5);
334  block_pce_perf[0] = fem_length;
335  block_pce_perf[1] = pce_size;
336  block_pce_perf[2] = graph_length;
337  block_pce_perf[3] = seconds_per_iter;
338  block_pce_perf[4] = flops / seconds_per_iter;
339  }
340 
341 }
342 
343 template <typename Scalar, typename Ordinal, typename Device>
345  const Ordinal nIter,
346  const Ordinal order,
347  const Ordinal min_var,
348  const Ordinal max_var )
349 {
350  std::cout.precision(8);
351  std::cout << std::endl
352  << "\"Grid Size\" , "
353  << "\"FEM Size\" , "
354  << "\"FEM Graph Size\" , "
355  << "\"Dimension\" , "
356  << "\"Order\" , "
357  << "\"PCE Size\" , "
358  << "\"Scalar SpMM Time\" , "
359  << "\"Scalar SpMM Speedup\" , "
360  << "\"Scalar SpMM GFLOPS\" , "
361  << "\"Block-Left SpMM Speedup\" , "
362  << "\"Block-Left SpMM GFLOPS\" , "
363  << "\"Block-Right SpMM Speedup\" , "
364  << "\"Block-Right SpMM GFLOPS\" , "
365  << "\"PCE SpMM Speedup\" , "
366  << "\"PCE SpMM GFLOPS\" , "
367  << "\"Block PCE SpMM Speedup\" , "
368  << "\"Block PCE SpMM GFLOPS\" , "
369  << std::endl;
370 
371  std::vector<double> perf_scalar, perf_block_left, perf_block_right,
372  perf_pce, perf_block_pce;
373  for (Ordinal dim=min_var; dim<=max_var; ++dim) {
374 
375  test_mean_multiply<Scalar,Ordinal,Device>(
376  order, dim, nGrid, nIter, perf_scalar, perf_block_left, perf_block_right,
377  perf_pce, perf_block_pce );
378 
379  std::cout << nGrid << " , "
380  << perf_scalar[0] << " , "
381  << perf_scalar[2] << " , "
382  << dim << " , "
383  << order << " , "
384  << perf_scalar[1] << " , "
385  << perf_scalar[3] << " , "
386  << perf_scalar[4] / perf_scalar[4] << " , "
387  << perf_scalar[4] << " , "
388  << perf_block_left[4]/ perf_scalar[4] << " , "
389  << perf_block_left[4] << " , "
390  << perf_block_right[4]/ perf_scalar[4] << " , "
391  << perf_block_right[4] << " , "
392  << perf_pce[4]/ perf_scalar[4] << " , "
393  << perf_pce[4] << " , "
394  << perf_block_pce[4]/ perf_scalar[4] << " , "
395  << perf_block_pce[4] << " , "
396  << std::endl;
397 
398  }
399 }
400 
401 #define INST_PERF_DRIVER(SCALAR, ORDINAL, DEVICE) \
402  template void performance_test_driver< SCALAR, ORDINAL, DEVICE >( \
403  const ORDINAL nGrid, const ORDINAL nIter, const ORDINAL order, \
404  const ORDINAL min_var, const ORDINAL max_var);
Stokhos::StandardStorage< int, double > storage_type
Sacado::ETPCE::OrthogPoly< double, Stokhos::StandardStorage< int, double > > pce_type
Multivariate orthogonal polynomial basis generated from a total order tensor product of univariate po...
Data structure storing a sparse 3-tensor C(i,j,k) in a a compressed format.
ordinal generate_fem_graph(ordinal N, std::vector< std::vector< ordinal > > &graph)
Definition: TestEpetra.cpp:45
Kokkos::DefaultExecutionSpace execution_space
void test_mean_multiply(const OrdinalType order, const OrdinalType dim, const OrdinalType nGrid, const OrdinalType iterCount, std::vector< double > &scalar_perf, std::vector< double > &block_left_perf, std::vector< double > &block_right_perf, std::vector< double > &pce_perf, std::vector< double > &block_pce_perf)
IntType map_fem_graph_coord(const IntType &N, const IntType &i, const IntType &j, const IntType &k)
Definition: TestEpetra.cpp:35
Stokhos::LegendreBasis< int, double > basis_type
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
void setGlobalCijkTensor(const cijk_type &cijk)
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< view_type >::value, typename CijkType< view_type >::type >::type cijk(const view_type &view)
Legendre polynomial basis.
void performance_test_driver(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
Abstract base class for 1-D orthogonal polynomials.
std::enable_if< Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value >::type spmv(KokkosKernels::Experimental::Controls, const char mode[], const AlphaType &a, const MatrixType &A, const Kokkos::View< InputType, InputP... > &x, const BetaType &b, const Kokkos::View< OutputType, OutputP... > &y, const RANK_ONE)
A comparison functor implementing a strict weak ordering based lexographic ordering.