16 #include "KokkosSparse_CrsMatrix.hpp"
17 #include "KokkosSparse_spmv.hpp"
27 #include "Kokkos_Timer.hpp"
29 template<
typename IntType >
36 return k + N * ( j + N * i );
41 std::vector< std::vector<size_t> > & graph )
43 graph.resize( N * N * N , std::vector<size_t>() );
47 for (
int i = 0 ; i < (int) N ; ++i ) {
48 for (
int j = 0 ;
j < (int) N ; ++
j ) {
49 for (
int k = 0 ; k < (int) N ; ++k ) {
53 graph[row].reserve(27);
55 for (
int ii = -1 ; ii < 2 ; ++ii ) {
56 for (
int jj = -1 ; jj < 2 ; ++jj ) {
57 for (
int kk = -1 ; kk < 2 ; ++kk ) {
58 if ( 0 <= i + ii && i + ii < (
int) N &&
59 0 <=
j + jj &&
j + jj < (
int) N &&
60 0 <= k + kk && k + kk < (
int) N ) {
63 graph[row].push_back(col);
66 total += graph[row].size();
72 template <
typename ScalarType,
typename OrdinalType,
typename Device>
75 const OrdinalType dim,
76 const OrdinalType nGrid,
77 const OrdinalType iterCount,
78 std::vector<double>& scalar_perf,
79 std::vector<double>& block_left_perf,
80 std::vector<double>& block_right_perf,
81 std::vector<double>& pce_perf,
82 std::vector<double>& block_pce_perf)
91 typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space > scalar_vector_type;
92 typedef Kokkos::View< value_type**, Kokkos::LayoutLeft, execution_space > scalar_left_multi_vector_type;
93 typedef Kokkos::View< value_type**, Kokkos::LayoutRight, execution_space > scalar_right_multi_vector_type;
94 typedef Kokkos::View< pce_type*, Kokkos::LayoutLeft, execution_space > pce_vector_type;
95 typedef Kokkos::View< pce_type**, Kokkos::LayoutLeft, execution_space > pce_multi_vector_type;
97 typedef KokkosSparse::CrsMatrix< value_type, ordinal_type, execution_space > scalar_matrix_type;
98 typedef KokkosSparse::CrsMatrix< pce_type, ordinal_type, execution_space > pce_matrix_type;
99 typedef typename scalar_matrix_type::StaticCrsGraphType matrix_graph_type;
100 typedef typename scalar_matrix_type::values_type scalar_matrix_values_type;
101 typedef typename pce_matrix_type::values_type pce_matrix_values_type;
108 typedef typename pce_type::cijk_type kokkos_cijk_type;
115 const ordinal_type num_pce_col = 5;
118 Array< RCP<const abstract_basis_type> > bases(dim);
119 for (ordinal_type i=0; i<dim; ++i) {
122 RCP<const product_basis_type> basis =
rcp(
new product_basis_type(bases));
123 RCP<cijk_type>
cijk = basis->computeTripleProductTensor();
124 kokkos_cijk_type kokkos_cijk =
125 Stokhos::create_product_tensor<execution_space>(*basis, *
cijk);
131 std::vector< std::vector<size_t> > fem_graph;
132 const size_t fem_length = nGrid * nGrid * nGrid;
138 ordinal_type pce_size = basis->size();
139 scalar_left_multi_vector_type xl(Kokkos::ViewAllocateWithoutInitializing(
"scalar left x"), fem_length, pce_size);
140 scalar_left_multi_vector_type yl(Kokkos::ViewAllocateWithoutInitializing(
"scalar right y"), fem_length, pce_size);
141 scalar_right_multi_vector_type xr(Kokkos::ViewAllocateWithoutInitializing(
"scalar right x"), fem_length, pce_size);
142 scalar_right_multi_vector_type yr(Kokkos::ViewAllocateWithoutInitializing(
"scalar right y"), fem_length, pce_size);
143 std::vector<scalar_vector_type> x_col(pce_size), y_col(pce_size);
144 for (ordinal_type i=0; i<pce_size; ++i) {
145 x_col[i] = scalar_vector_type (Kokkos::ViewAllocateWithoutInitializing(
"scalar x col"), fem_length);
146 y_col[i] = scalar_vector_type(Kokkos::ViewAllocateWithoutInitializing(
"scalar y col"), fem_length);
150 pce_vector_type x_pce =
151 Kokkos::make_view<pce_vector_type>(Kokkos::ViewAllocateWithoutInitializing(
"pce x"),
152 kokkos_cijk, fem_length, pce_size);
153 pce_vector_type y_pce =
154 Kokkos::make_view<pce_vector_type>(Kokkos::ViewAllocateWithoutInitializing(
"pce y"),
155 kokkos_cijk, fem_length, pce_size);
156 pce_multi_vector_type x_multi_pce =
157 Kokkos::make_view<pce_multi_vector_type>(Kokkos::ViewAllocateWithoutInitializing(
"pce multi x"),
158 kokkos_cijk, fem_length,
159 num_pce_col, pce_size);
160 pce_multi_vector_type y_multi_pce =
161 Kokkos::make_view<pce_multi_vector_type>(Kokkos::ViewAllocateWithoutInitializing(
"pce multi y"),
162 kokkos_cijk, fem_length,
163 num_pce_col, pce_size);
177 matrix_graph_type matrix_graph =
178 Kokkos::create_staticcrsgraph<matrix_graph_type>(
179 std::string(
"test crs graph"), fem_graph);
180 scalar_matrix_values_type scalar_matrix_values =
181 scalar_matrix_values_type(Kokkos::ViewAllocateWithoutInitializing(
"scalar matrix"), graph_length);
182 pce_matrix_values_type pce_matrix_values =
183 Kokkos::make_view<pce_matrix_values_type>(Kokkos::ViewAllocateWithoutInitializing(
"pce matrix"), kokkos_cijk, graph_length, 1);
184 scalar_matrix_type scalar_matrix(
"scalar matrix", fem_length,
185 scalar_matrix_values, matrix_graph);
186 pce_matrix_type pce_matrix(
"pce matrix", fem_length,
187 pce_matrix_values, matrix_graph);
197 for (ordinal_type iter = 0; iter < iterCount; ++iter) {
198 for (ordinal_type col=0; col<pce_size; ++col) {
209 Kokkos::Timer clock ;
210 for (ordinal_type iter = 0; iter < iterCount; ++iter) {
211 for (ordinal_type col=0; col<pce_size; ++col) {
222 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
223 const double flops = 1.0e-9 * 2.0 * graph_length * pce_size;
225 scalar_perf.resize(5);
226 scalar_perf[0] = fem_length;
227 scalar_perf[1] = pce_size;
228 scalar_perf[2] = graph_length;
229 scalar_perf[3] = seconds_per_iter;
230 scalar_perf[4] = flops / seconds_per_iter;
238 for (ordinal_type iter = 0; iter < iterCount; ++iter) {
243 Kokkos::Timer clock ;
244 for (ordinal_type iter = 0; iter < iterCount; ++iter) {
249 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
250 const double flops = 1.0e-9 * 2.0 * graph_length * pce_size;
252 block_left_perf.resize(5);
253 block_left_perf[0] = fem_length;
254 block_left_perf[1] = pce_size;
255 block_left_perf[2] = graph_length;
256 block_left_perf[3] = seconds_per_iter;
257 block_left_perf[4] = flops / seconds_per_iter;
265 for (ordinal_type iter = 0; iter < iterCount; ++iter) {
270 Kokkos::Timer clock ;
271 for (ordinal_type iter = 0; iter < iterCount; ++iter) {
276 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
277 const double flops = 1.0e-9 * 2.0 * graph_length * pce_size;
279 block_right_perf.resize(5);
280 block_right_perf[0] = fem_length;
281 block_right_perf[1] = pce_size;
282 block_right_perf[2] = graph_length;
283 block_right_perf[3] = seconds_per_iter;
284 block_right_perf[4] = flops / seconds_per_iter;
292 for (ordinal_type iter = 0; iter < iterCount; ++iter) {
297 Kokkos::Timer clock ;
298 for (ordinal_type iter = 0; iter < iterCount; ++iter) {
303 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
304 const double flops = 1.0e-9 * 2.0 * graph_length * pce_size;
307 pce_perf[0] = fem_length;
308 pce_perf[1] = pce_size;
309 pce_perf[2] = graph_length;
310 pce_perf[3] = seconds_per_iter;
311 pce_perf[4] = flops / seconds_per_iter;
319 for (ordinal_type iter = 0; iter < iterCount; ++iter) {
324 Kokkos::Timer clock ;
325 for (ordinal_type iter = 0; iter < iterCount; ++iter) {
330 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
331 const double flops = 1.0e-9 * 2.0 * graph_length * pce_size * num_pce_col;
333 block_pce_perf.resize(5);
334 block_pce_perf[0] = fem_length;
335 block_pce_perf[1] = pce_size;
336 block_pce_perf[2] = graph_length;
337 block_pce_perf[3] = seconds_per_iter;
338 block_pce_perf[4] = flops / seconds_per_iter;
343 template <
typename Scalar,
typename Ordinal,
typename Device>
350 std::cout.precision(8);
351 std::cout << std::endl
352 <<
"\"Grid Size\" , "
354 <<
"\"FEM Graph Size\" , "
355 <<
"\"Dimension\" , "
358 <<
"\"Scalar SpMM Time\" , "
359 <<
"\"Scalar SpMM Speedup\" , "
360 <<
"\"Scalar SpMM GFLOPS\" , "
361 <<
"\"Block-Left SpMM Speedup\" , "
362 <<
"\"Block-Left SpMM GFLOPS\" , "
363 <<
"\"Block-Right SpMM Speedup\" , "
364 <<
"\"Block-Right SpMM GFLOPS\" , "
365 <<
"\"PCE SpMM Speedup\" , "
366 <<
"\"PCE SpMM GFLOPS\" , "
367 <<
"\"Block PCE SpMM Speedup\" , "
368 <<
"\"Block PCE SpMM GFLOPS\" , "
371 std::vector<double> perf_scalar, perf_block_left, perf_block_right,
372 perf_pce, perf_block_pce;
373 for (
Ordinal dim=min_var; dim<=max_var; ++dim) {
375 test_mean_multiply<Scalar,Ordinal,Device>(
376 order, dim, nGrid, nIter, perf_scalar, perf_block_left, perf_block_right,
377 perf_pce, perf_block_pce );
379 std::cout << nGrid <<
" , "
380 << perf_scalar[0] <<
" , "
381 << perf_scalar[2] <<
" , "
384 << perf_scalar[1] <<
" , "
385 << perf_scalar[3] <<
" , "
386 << perf_scalar[4] / perf_scalar[4] <<
" , "
387 << perf_scalar[4] <<
" , "
388 << perf_block_left[4]/ perf_scalar[4] <<
" , "
389 << perf_block_left[4] <<
" , "
390 << perf_block_right[4]/ perf_scalar[4] <<
" , "
391 << perf_block_right[4] <<
" , "
392 << perf_pce[4]/ perf_scalar[4] <<
" , "
393 << perf_pce[4] <<
" , "
394 << perf_block_pce[4]/ perf_scalar[4] <<
" , "
395 << perf_block_pce[4] <<
" , "
401 #define INST_PERF_DRIVER(SCALAR, ORDINAL, DEVICE) \
402 template void performance_test_driver< SCALAR, ORDINAL, DEVICE >( \
403 const ORDINAL nGrid, const ORDINAL nIter, const ORDINAL order, \
404 const ORDINAL min_var, const ORDINAL max_var);
Stokhos::StandardStorage< int, double > storage_type
Sacado::ETPCE::OrthogPoly< double, Stokhos::StandardStorage< int, double > > pce_type
Multivariate orthogonal polynomial basis generated from a total order tensor product of univariate po...
Data structure storing a sparse 3-tensor C(i,j,k) in a a compressed format.
ordinal generate_fem_graph(ordinal N, std::vector< std::vector< ordinal > > &graph)
Kokkos::DefaultExecutionSpace execution_space
void test_mean_multiply(const OrdinalType order, const OrdinalType dim, const OrdinalType nGrid, const OrdinalType iterCount, std::vector< double > &scalar_perf, std::vector< double > &block_left_perf, std::vector< double > &block_right_perf, std::vector< double > &pce_perf, std::vector< double > &block_pce_perf)
IntType map_fem_graph_coord(const IntType &N, const IntType &i, const IntType &j, const IntType &k)
Stokhos::LegendreBasis< int, double > basis_type
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
void setGlobalCijkTensor(const cijk_type &cijk)
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< view_type >::value, typename CijkType< view_type >::type >::type cijk(const view_type &view)
Legendre polynomial basis.
void performance_test_driver(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
Abstract base class for 1-D orthogonal polynomials.
std::enable_if< Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value >::type spmv(KokkosKernels::Experimental::Controls, const char mode[], const AlphaType &a, const MatrixType &A, const Kokkos::View< InputType, InputP... > &x, const BetaType &b, const Kokkos::View< OutputType, OutputP... > &y, const RANK_ONE)
A comparison functor implementing a strict weak ordering based lexographic ordering.