Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MPAssembly/TestAssembly.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #include <iostream>
11 
12 // MP::Vector
15 
16 // Compile-time loops
17 #include "Sacado_mpl_range_c.hpp"
18 #include "Sacado_mpl_for_each.hpp"
19 #include "Sacado_mpl_integral_c.hpp"
20 
21 // Kokkos libraries' headers:
22 #include <Kokkos_UnorderedMap.hpp>
23 #include <Kokkos_Timer.hpp>
24 
25 // Utilities
26 #include <Teuchos_CommHelpers.hpp>
29 
30 // FENL
31 #include <BoxElemFixture.hpp>
32 #include <VectorImport.hpp>
33 #include <fenl_functors.hpp>
34 
35 inline
36 double maximum( const Teuchos::Comm<int>& comm , double local )
37 {
38  double global = 0 ;
39  Teuchos::reduceAll( comm , Teuchos::REDUCE_MAX , 1 , & local , & global );
40  return global ;
41 }
42 
43 struct Perf {
44  size_t global_elem_count ;
45  size_t global_node_count ;
46  double import_time ;
47  double fill_time ;
48 
51  import_time(0) ,
52  fill_time(0) {}
53 
54  void increment(const Perf& p) {
58  fill_time += p.fill_time;
59  }
60 
61  void min(const Perf& p) {
66  }
67 
68  void scale(double s) {
69  import_time *= s;
70  fill_time *= s;
71  }
72 
73  void reduceMax(const Teuchos::Comm<int>& comm) {
74  import_time = maximum( comm , import_time );
75  fill_time = maximum( comm , fill_time );
76  }
77 };
78 
79 template <typename Scalar, typename Device>
81  const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
82  const int use_print ,
83  const int use_trials ,
84  const int use_nodes[] ,
86  Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device >& nodal_residual)
87 {
88  using Teuchos::RCP;
89  using Teuchos::rcp;
90  using Teuchos::rcpFromRef;
91  using Teuchos::arrayView;
93 
95 
97 
98  typedef typename LocalMatrixType::StaticCrsGraphType
99  LocalGraphType ;
100 
102  NodeNodeGraphType ;
103 
104  //typedef Kokkos::Example::FENL::ElementComputationConstantCoefficient CoeffFunctionType;
107  ElementComputationType ;
108 
110  DirichletComputationType ;
111 
112  typedef typename ElementComputationType::vector_type VectorType ;
113 
115  typename FixtureType::comm_list_type ,
116  typename FixtureType::send_nodeid_type ,
117  VectorType > ImportType ;
118 
119  //------------------------------------
120 
121  const int print_flag = use_print && std::is_same< Kokkos::HostSpace , typename Device::memory_space >::value ;
122 
123  const int comm_rank = comm->getRank();
124  const int comm_size = comm->getSize();
125 
126  // Decompose by node to avoid parallel communication in assembly
127 
128  const double bubble_x = 1.0 ;
129  const double bubble_y = 1.0 ;
130  const double bubble_z = 1.0 ;
131 
132  const FixtureType fixture( Kokkos::Example::BoxElemPart::DecomposeNode ,
133  comm_size , comm_rank ,
134  use_nodes[0] , use_nodes[1] , use_nodes[2] ,
135  bubble_x , bubble_y , bubble_z );
136 
137  if ( maximum(*comm, ( fixture.ok() ? 0 : 1 ) ) ) {
138  throw std::runtime_error(std::string("Problem fixture setup failed"));
139  }
140 
141  //------------------------------------
142 
143  const ImportType comm_nodal_import(
144  comm ,
145  fixture.recv_node() ,
146  fixture.send_node() ,
147  fixture.send_nodeid() ,
148  fixture.node_count_owned() ,
149  fixture.node_count() - fixture.node_count_owned() );
150 
151  //------------------------------------
152 
153  const double bc_lower_value = 1 ;
154  const double bc_upper_value = 2 ;
155  //CoeffFunctionType diffusion_coefficient( 1.0 );
156  CoeffFunctionType diffusion_coefficient( 1.0, 0.1, 1.0, 5 );
157  Kokkos::deep_copy( diffusion_coefficient.getRandomVariables(), 1.0 );
158 
159  //------------------------------------
160 
161  if ( print_flag ) {
162  std::cout << "ElemNode {" << std::endl ;
163  for ( unsigned ielem = 0 ; ielem < fixture.elem_count() ; ++ielem ) {
164  std::cout << " elem[" << ielem << "]{" ;
165  for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) {
166  std::cout << " " << fixture.elem_node(ielem,inode);
167  }
168  std::cout << " }" << std::endl ;
169  }
170  std::cout << "}" << std::endl ;
171  }
172 
173  //------------------------------------
174 
175  Kokkos::Timer wall_clock ;
176 
177  Perf perf_stats = Perf() ;
178 
179  for ( int itrial = 0 ; itrial < use_trials ; ++itrial ) {
180 
181  Perf perf = Perf() ;
182 
183  perf.global_elem_count = fixture.elem_count_global();
184  perf.global_node_count = fixture.node_count_global();
185 
186  //----------------------------------
187  // Create the local sparse matrix graph and element-to-graph map
188  // from the element->to->node identifier array.
189  // The graph only has rows for the owned nodes.
190 
191  typename NodeNodeGraphType::Times graph_times;
192  const NodeNodeGraphType
193  mesh_to_graph( fixture.elem_node() , fixture.node_count_owned(),
194  graph_times );
195 
196  // Create the local sparse matrix from the graph:
197  LocalMatrixType jacobian( mesh_to_graph.graph );
198 
199  //----------------------------------
200 
201  // Allocate solution vector for each node in the mesh and residual vector for each owned node
202  VectorType nodal_solution( "nodal_solution" , fixture.node_count() );
203  nodal_residual = VectorType( "nodal_residual" , fixture.node_count_owned() );
204 
205  // Get DeviceConfig structs used by some functors
206  Kokkos::Example::FENL::DeviceConfig dev_config_elem, dev_config_bc;
208  dev_config_bc );
209 
210  // Create element computation functor
211  const ElementComputationType elemcomp( fixture , diffusion_coefficient ,
212  nodal_solution ,
213  mesh_to_graph.elem_graph ,
214  jacobian , nodal_residual ,
215  dev_config_elem );
216 
217  // Create boundary condition functor
218  const DirichletComputationType dirichlet(
219  fixture , nodal_solution , jacobian , nodal_residual ,
220  2 /* apply at 'z' ends */ ,
221  bc_lower_value ,
222  bc_upper_value ,
223  dev_config_bc );
224 
225  Kokkos::deep_copy( nodal_solution , Scalar(1) );
226 
227  //--------------------------------
228 
229  Device().fence();
230  wall_clock.reset();
231 
232  comm_nodal_import( nodal_solution );
233 
234  Device().fence();
235  perf.import_time = wall_clock.seconds();
236 
237  //--------------------------------
238  // Element contributions to residual and jacobian
239 
240  Device().fence();
241  wall_clock.reset();
242 
243  Kokkos::deep_copy( nodal_residual , Scalar(0) );
244  Kokkos::deep_copy( jacobian.values , Scalar(0) );
245 
246  elemcomp.apply();
247 
248  //--------------------------------
249  // Apply boundary conditions
250 
251  dirichlet.apply();
252 
253  Device().fence();
254  perf.fill_time = wall_clock.seconds();
255 
256  //--------------------------------
257 
258  perf.reduceMax(*comm);
259  if (itrial == 0)
260  perf_stats = perf;
261  else
262  perf_stats.min(perf);
263 
264  }
265 
266  return perf_stats ;
267 }
268 
269 template <typename ScalarViewType, typename EnsembleViewType>
270 bool check_residuals(const ScalarViewType& scalar_residual,
271  const EnsembleViewType& ensemble_residual)
272 {
273  const double tol = 1e-14;
274  bool success = true;
276  Teuchos::VerboseObjectBase::getDefaultOStream();
277  std::stringstream buf;
278  Teuchos::FancyOStream fbuf(Teuchos::rcp(&buf,false));
279 
280  typename ScalarViewType::HostMirror host_scalar_residual =
281  Kokkos::create_mirror_view(scalar_residual);
282  typename EnsembleViewType::HostMirror host_ensemble_residual =
283  Kokkos::create_mirror_view(ensemble_residual);
284  Kokkos::deep_copy( host_scalar_residual, scalar_residual );
285  Kokkos::deep_copy( host_ensemble_residual, ensemble_residual );
286 
287  TEUCHOS_TEST_EQUALITY( host_scalar_residual.extent(0),
288  host_ensemble_residual.extent(0), fbuf, success );
289 
290  const size_t num_node = host_scalar_residual.extent(0);
291  const size_t num_ensemble = Kokkos::dimension_scalar(host_ensemble_residual);
292  for (size_t i=0; i<num_node; ++i) {
293  for (size_t j=0; j<num_ensemble; ++j) {
295  host_scalar_residual(i), host_ensemble_residual(i).fastAccessCoeff(j),
296  tol, fbuf, success );
297  }
298  }
299 
300  if (!success)
301  *out << buf.str();
302 
303  return success;
304 }
305 
306 template <class Storage>
307 struct PerformanceDriverOp {
308  typedef typename Storage::value_type Scalar;
309  typedef typename Storage::ordinal_type Ordinal;
312  const int use_print ;
313  const int use_trials ;
314  const int *use_nodes ;
315  const bool check ;
317 
319  const int use_print_ ,
320  const int use_trials_ ,
321  const int use_nodes_[] ,
322  const bool check_ ,
324  comm(comm_),
325  use_print(use_print_),
326  use_trials(use_trials_),
327  use_nodes(use_nodes_),
328  check(check_),
329  dev_config(dev_config_) {}
330 
331  template <typename ArgT>
332  void operator() (ArgT arg) const {
333  const int ensemble = ArgT::value;
334  typedef typename Storage::template apply_N<ensemble> NewStorageApply;
335  typedef typename NewStorageApply::type storage_type;
336  typedef Sacado::MP::Vector<storage_type> mp_vector_type;
337 
338  typedef Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device > scalar_vector_type ;
339  typedef Kokkos::View< mp_vector_type* , Kokkos::LayoutLeft, Device > ensemble_vector_type ;
340 
341  scalar_vector_type scalar_residual;
342  Kokkos::Example::FENL::DeviceConfig scalar_dev_config(0, 1, 1);
343  Perf perf_scalar =
344  fenl_assembly<Scalar,Device>(
346  scalar_dev_config, scalar_residual );
347 
348  ensemble_vector_type ensemble_residual;
349  Kokkos::Example::FENL::DeviceConfig ensemble_dev_config = dev_config;
350 #if defined( KOKKOS_ENABLE_CUDA )
351  const bool is_cuda = std::is_same<Device,Kokkos::Cuda>::value;
352 #else
353  const bool is_cuda = false ;
354 #endif
355  if (is_cuda) {
356  const size_t block_size = dev_config.block_dim.x * dev_config.block_dim.y;
357  ensemble_dev_config.block_dim.x = ensemble;
358  ensemble_dev_config.block_dim.y = block_size/ensemble;
359  }
360  Perf perf_ensemble =
361  fenl_assembly<mp_vector_type,Device>(
363  ensemble_dev_config, ensemble_residual);
364 
365  if (check)
366  check_residuals( scalar_residual, ensemble_residual );
367 
368  perf_scalar.scale( 1000.0 / ( perf_scalar.global_node_count ) );
369  perf_ensemble.scale( 1000.0 / ( ensemble * perf_scalar.global_node_count ) );
370 
371  if (comm->getRank() == 0) {
372  std::cout.precision(3);
373  std::cout << use_nodes[0] << " , "
374  << perf_scalar.global_node_count << " , "
375  << std::setw(2) << ensemble << " , "
376  << std::scientific
377  << perf_scalar.import_time << " , "
378  << perf_ensemble.import_time << " , "
379  << std::fixed << std::setw(6)
380  << perf_scalar.import_time / perf_ensemble.import_time << " , "
381  << std::scientific
382  << perf_scalar.fill_time << " , "
383  << perf_ensemble.fill_time << " , "
384  << std::fixed << std::setw(6)
385  << perf_scalar.fill_time / perf_ensemble.fill_time << " , "
386  << std::endl;
387  }
388  }
389 };
390 
391 template <class Storage, int entry_min, int entry_max, int entry_step>
393  const int use_print ,
394  const int use_trials ,
395  const int use_nodes[] ,
396  const bool check ,
398 {
399  if (comm->getRank() == 0) {
400  std::cout.precision(8);
401  std::cout << std::endl
402  << "\"Grid Size\" , "
403  << "\"FEM Size\" , "
404  << "\"Ensemble Size\" , "
405  << "\"Scalar Import Time (ms)\" , "
406  << "\"Ensemble Import Time (ms)\" , "
407  << "\"Ensemble Import Speedup\" , "
408  << "\"Scalar Fill Time (ms)\" , "
409  << "\"Ensemble Fill Time (ms)\" , "
410  << "\"Ensemble Fill Speedup\" , "
411  << std::endl;
412  }
413 
414  // Loop over [entry_min, entry_max] vector entries per thread
415  typedef Sacado::mpl::range_c< int, entry_min, entry_max+1, entry_step > Range;
416  PerformanceDriverOp<Storage> op(comm, use_print, use_trials,
417  use_nodes, check, dev_config);
418  Sacado::mpl::for_each_no_kokkos<Range> f(op);
419 }
Perf fenl_assembly(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], Kokkos::Example::FENL::DeviceConfig dev_config, Kokkos::View< Scalar *, Kokkos::LayoutLeft, Device > &nodal_residual)
bool check_residuals(const ScalarViewType &scalar_residual, const EnsembleViewType &ensemble_residual)
Stokhos::StandardStorage< int, double > storage_type
void min(const Perf &p)
static void eval(Kokkos::Example::FENL::DeviceConfig &dev_config_elem, Kokkos::Example::FENL::DeviceConfig &dev_config_bc)
#define TEUCHOS_TEST_FLOATING_EQUALITY(v1, v2, tol, out, success)
Kokkos::DefaultExecutionSpace execution_space
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< View< T, P...> >::value, unsigned >::type dimension_scalar(const View< T, P...> &view)
PerformanceDriverOp(const Teuchos::RCP< const Teuchos::Comm< int > > &comm_, const int use_print_, const int use_trials_, const int use_nodes_[], const bool check_, Kokkos::Example::FENL::DeviceConfig dev_config_)
void operator()(ArgT arg) const
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Storage::execution_space Device
void increment(const Perf &p)
Storage::ordinal_type Ordinal
Teuchos::RCP< const Teuchos::Comm< int > > comm
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
expr1 expr1 expr1 expr2 expr1 expr1 c expr2 expr1 c fastAccessCoeff(j)-expr2.val(j)
void scale(double s)
void performance_test_driver(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
#define TEUCHOS_TEST_EQUALITY(v1, v2, out, success)
Kokkos::Example::FENL::DeviceConfig dev_config
double maximum(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, double local)
ScalarType f(const Teuchos::Array< ScalarType > &x, double a, double b)
void reduceMax(const Teuchos::Comm< int > &comm)
Generate a distributed unstructured finite element mesh from a partitioned NX*NY*NZ box of elements...
Stokhos::CrsMatrix< ValueType, Device, Layout >::HostMirror create_mirror_view(const Stokhos::CrsMatrix< ValueType, Device, Layout > &A)