Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MPAssembly/TestAssembly.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #include <iostream>
11 
12 // MP::Vector
15 
16 // Compile-time loops
17 #include "Sacado_mpl_range_c.hpp"
18 #include "Sacado_mpl_for_each.hpp"
19 #include "Sacado_mpl_integral_c.hpp"
20 
21 // Kokkos libraries' headers:
22 #include <Kokkos_UnorderedMap.hpp>
23 #include <Kokkos_StaticCrsGraph.hpp>
24 #include <Kokkos_Timer.hpp>
25 
26 // Utilities
27 #include <Teuchos_CommHelpers.hpp>
30 
31 // FENL
32 #include <BoxElemFixture.hpp>
33 #include <VectorImport.hpp>
34 #include <fenl_functors.hpp>
35 
36 inline
37 double maximum( const Teuchos::Comm<int>& comm , double local )
38 {
39  double global = 0 ;
40  Teuchos::reduceAll( comm , Teuchos::REDUCE_MAX , 1 , & local , & global );
41  return global ;
42 }
43 
44 struct Perf {
45  size_t global_elem_count ;
46  size_t global_node_count ;
47  double import_time ;
48  double fill_time ;
49 
52  import_time(0) ,
53  fill_time(0) {}
54 
55  void increment(const Perf& p) {
59  fill_time += p.fill_time;
60  }
61 
62  void min(const Perf& p) {
67  }
68 
69  void scale(double s) {
70  import_time *= s;
71  fill_time *= s;
72  }
73 
74  void reduceMax(const Teuchos::Comm<int>& comm) {
75  import_time = maximum( comm , import_time );
76  fill_time = maximum( comm , fill_time );
77  }
78 };
79 
80 template <typename Scalar, typename Device>
82  const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
83  const int use_print ,
84  const int use_trials ,
85  const int use_nodes[] ,
87  Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device >& nodal_residual)
88 {
89  using Teuchos::RCP;
90  using Teuchos::rcp;
91  using Teuchos::rcpFromRef;
92  using Teuchos::arrayView;
94 
96 
98 
99  typedef typename LocalMatrixType::StaticCrsGraphType
100  LocalGraphType ;
101 
103  NodeNodeGraphType ;
104 
105  //typedef Kokkos::Example::FENL::ElementComputationConstantCoefficient CoeffFunctionType;
108  ElementComputationType ;
109 
111  DirichletComputationType ;
112 
113  typedef typename ElementComputationType::vector_type VectorType ;
114 
116  typename FixtureType::comm_list_type ,
117  typename FixtureType::send_nodeid_type ,
118  VectorType > ImportType ;
119 
120  //------------------------------------
121 
122  const int print_flag = use_print && std::is_same< Kokkos::HostSpace , typename Device::memory_space >::value ;
123 
124  const int comm_rank = comm->getRank();
125  const int comm_size = comm->getSize();
126 
127  // Decompose by node to avoid parallel communication in assembly
128 
129  const double bubble_x = 1.0 ;
130  const double bubble_y = 1.0 ;
131  const double bubble_z = 1.0 ;
132 
133  const FixtureType fixture( Kokkos::Example::BoxElemPart::DecomposeNode ,
134  comm_size , comm_rank ,
135  use_nodes[0] , use_nodes[1] , use_nodes[2] ,
136  bubble_x , bubble_y , bubble_z );
137 
138  if ( maximum(*comm, ( fixture.ok() ? 0 : 1 ) ) ) {
139  throw std::runtime_error(std::string("Problem fixture setup failed"));
140  }
141 
142  //------------------------------------
143 
144  const ImportType comm_nodal_import(
145  comm ,
146  fixture.recv_node() ,
147  fixture.send_node() ,
148  fixture.send_nodeid() ,
149  fixture.node_count_owned() ,
150  fixture.node_count() - fixture.node_count_owned() );
151 
152  //------------------------------------
153 
154  const double bc_lower_value = 1 ;
155  const double bc_upper_value = 2 ;
156  //CoeffFunctionType diffusion_coefficient( 1.0 );
157  CoeffFunctionType diffusion_coefficient( 1.0, 0.1, 1.0, 5 );
158  Kokkos::deep_copy( diffusion_coefficient.getRandomVariables(), 1.0 );
159 
160  //------------------------------------
161 
162  if ( print_flag ) {
163  std::cout << "ElemNode {" << std::endl ;
164  for ( unsigned ielem = 0 ; ielem < fixture.elem_count() ; ++ielem ) {
165  std::cout << " elem[" << ielem << "]{" ;
166  for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) {
167  std::cout << " " << fixture.elem_node(ielem,inode);
168  }
169  std::cout << " }" << std::endl ;
170  }
171  std::cout << "}" << std::endl ;
172  }
173 
174  //------------------------------------
175 
176  Kokkos::Timer wall_clock ;
177 
178  Perf perf_stats = Perf() ;
179 
180  for ( int itrial = 0 ; itrial < use_trials ; ++itrial ) {
181 
182  Perf perf = Perf() ;
183 
184  perf.global_elem_count = fixture.elem_count_global();
185  perf.global_node_count = fixture.node_count_global();
186 
187  //----------------------------------
188  // Create the local sparse matrix graph and element-to-graph map
189  // from the element->to->node identifier array.
190  // The graph only has rows for the owned nodes.
191 
192  typename NodeNodeGraphType::Times graph_times;
193  const NodeNodeGraphType
194  mesh_to_graph( fixture.elem_node() , fixture.node_count_owned(),
195  graph_times );
196 
197  // Create the local sparse matrix from the graph:
198  LocalMatrixType jacobian( mesh_to_graph.graph );
199 
200  //----------------------------------
201 
202  // Allocate solution vector for each node in the mesh and residual vector for each owned node
203  VectorType nodal_solution( "nodal_solution" , fixture.node_count() );
204  nodal_residual = VectorType( "nodal_residual" , fixture.node_count_owned() );
205 
206  // Get DeviceConfig structs used by some functors
207  Kokkos::Example::FENL::DeviceConfig dev_config_elem, dev_config_bc;
209  dev_config_bc );
210 
211  // Create element computation functor
212  const ElementComputationType elemcomp( fixture , diffusion_coefficient ,
213  nodal_solution ,
214  mesh_to_graph.elem_graph ,
215  jacobian , nodal_residual ,
216  dev_config_elem );
217 
218  // Create boundary condition functor
219  const DirichletComputationType dirichlet(
220  fixture , nodal_solution , jacobian , nodal_residual ,
221  2 /* apply at 'z' ends */ ,
222  bc_lower_value ,
223  bc_upper_value ,
224  dev_config_bc );
225 
226  Kokkos::deep_copy( nodal_solution , Scalar(1) );
227 
228  //--------------------------------
229 
230  Device().fence();
231  wall_clock.reset();
232 
233  comm_nodal_import( nodal_solution );
234 
235  Device().fence();
236  perf.import_time = wall_clock.seconds();
237 
238  //--------------------------------
239  // Element contributions to residual and jacobian
240 
241  Device().fence();
242  wall_clock.reset();
243 
244  Kokkos::deep_copy( nodal_residual , Scalar(0) );
245  Kokkos::deep_copy( jacobian.values , Scalar(0) );
246 
247  elemcomp.apply();
248 
249  //--------------------------------
250  // Apply boundary conditions
251 
252  dirichlet.apply();
253 
254  Device().fence();
255  perf.fill_time = wall_clock.seconds();
256 
257  //--------------------------------
258 
259  perf.reduceMax(*comm);
260  if (itrial == 0)
261  perf_stats = perf;
262  else
263  perf_stats.min(perf);
264 
265  }
266 
267  return perf_stats ;
268 }
269 
270 template <typename ScalarViewType, typename EnsembleViewType>
271 bool check_residuals(const ScalarViewType& scalar_residual,
272  const EnsembleViewType& ensemble_residual)
273 {
274  const double tol = 1e-14;
275  bool success = true;
277  Teuchos::VerboseObjectBase::getDefaultOStream();
278  std::stringstream buf;
279  Teuchos::FancyOStream fbuf(Teuchos::rcp(&buf,false));
280 
281  typename ScalarViewType::HostMirror host_scalar_residual =
282  Kokkos::create_mirror_view(scalar_residual);
283  typename EnsembleViewType::HostMirror host_ensemble_residual =
284  Kokkos::create_mirror_view(ensemble_residual);
285  Kokkos::deep_copy( host_scalar_residual, scalar_residual );
286  Kokkos::deep_copy( host_ensemble_residual, ensemble_residual );
287 
288  TEUCHOS_TEST_EQUALITY( host_scalar_residual.extent(0),
289  host_ensemble_residual.extent(0), fbuf, success );
290 
291  const size_t num_node = host_scalar_residual.extent(0);
292  const size_t num_ensemble = Kokkos::dimension_scalar(host_ensemble_residual);
293  for (size_t i=0; i<num_node; ++i) {
294  for (size_t j=0; j<num_ensemble; ++j) {
296  host_scalar_residual(i), host_ensemble_residual(i).fastAccessCoeff(j),
297  tol, fbuf, success );
298  }
299  }
300 
301  if (!success)
302  *out << buf.str();
303 
304  return success;
305 }
306 
307 template <class Storage>
308 struct PerformanceDriverOp {
309  typedef typename Storage::value_type Scalar;
310  typedef typename Storage::ordinal_type Ordinal;
313  const int use_print ;
314  const int use_trials ;
315  const int *use_nodes ;
316  const bool check ;
318 
320  const int use_print_ ,
321  const int use_trials_ ,
322  const int use_nodes_[] ,
323  const bool check_ ,
325  comm(comm_),
326  use_print(use_print_),
327  use_trials(use_trials_),
328  use_nodes(use_nodes_),
329  check(check_),
330  dev_config(dev_config_) {}
331 
332  template <typename ArgT>
333  void operator() (ArgT arg) const {
334  const int ensemble = ArgT::value;
335  typedef typename Storage::template apply_N<ensemble> NewStorageApply;
336  typedef typename NewStorageApply::type storage_type;
337  typedef Sacado::MP::Vector<storage_type> mp_vector_type;
338 
339  typedef Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device > scalar_vector_type ;
340  typedef Kokkos::View< mp_vector_type* , Kokkos::LayoutLeft, Device > ensemble_vector_type ;
341 
342  scalar_vector_type scalar_residual;
343  Kokkos::Example::FENL::DeviceConfig scalar_dev_config(0, 1, 1);
344  Perf perf_scalar =
345  fenl_assembly<Scalar,Device>(
347  scalar_dev_config, scalar_residual );
348 
349  ensemble_vector_type ensemble_residual;
350  Kokkos::Example::FENL::DeviceConfig ensemble_dev_config = dev_config;
351 #if defined( KOKKOS_ENABLE_CUDA )
352  const bool is_cuda = std::is_same<Device,Kokkos::Cuda>::value;
353 #else
354  const bool is_cuda = false ;
355 #endif
356  if (is_cuda) {
357  const size_t block_size = dev_config.block_dim.x * dev_config.block_dim.y;
358  ensemble_dev_config.block_dim.x = ensemble;
359  ensemble_dev_config.block_dim.y = block_size/ensemble;
360  }
361  Perf perf_ensemble =
362  fenl_assembly<mp_vector_type,Device>(
364  ensemble_dev_config, ensemble_residual);
365 
366  if (check)
367  check_residuals( scalar_residual, ensemble_residual );
368 
369  perf_scalar.scale( 1000.0 / ( perf_scalar.global_node_count ) );
370  perf_ensemble.scale( 1000.0 / ( ensemble * perf_scalar.global_node_count ) );
371 
372  if (comm->getRank() == 0) {
373  std::cout.precision(3);
374  std::cout << use_nodes[0] << " , "
375  << perf_scalar.global_node_count << " , "
376  << std::setw(2) << ensemble << " , "
377  << std::scientific
378  << perf_scalar.import_time << " , "
379  << perf_ensemble.import_time << " , "
380  << std::fixed << std::setw(6)
381  << perf_scalar.import_time / perf_ensemble.import_time << " , "
382  << std::scientific
383  << perf_scalar.fill_time << " , "
384  << perf_ensemble.fill_time << " , "
385  << std::fixed << std::setw(6)
386  << perf_scalar.fill_time / perf_ensemble.fill_time << " , "
387  << std::endl;
388  }
389  }
390 };
391 
392 template <class Storage, int entry_min, int entry_max, int entry_step>
394  const int use_print ,
395  const int use_trials ,
396  const int use_nodes[] ,
397  const bool check ,
399 {
400  if (comm->getRank() == 0) {
401  std::cout.precision(8);
402  std::cout << std::endl
403  << "\"Grid Size\" , "
404  << "\"FEM Size\" , "
405  << "\"Ensemble Size\" , "
406  << "\"Scalar Import Time (ms)\" , "
407  << "\"Ensemble Import Time (ms)\" , "
408  << "\"Ensemble Import Speedup\" , "
409  << "\"Scalar Fill Time (ms)\" , "
410  << "\"Ensemble Fill Time (ms)\" , "
411  << "\"Ensemble Fill Speedup\" , "
412  << std::endl;
413  }
414 
415  // Loop over [entry_min, entry_max] vector entries per thread
416  typedef Sacado::mpl::range_c< int, entry_min, entry_max+1, entry_step > Range;
417  PerformanceDriverOp<Storage> op(comm, use_print, use_trials,
418  use_nodes, check, dev_config);
419  Sacado::mpl::for_each_no_kokkos<Range> f(op);
420 }
Perf fenl_assembly(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], Kokkos::Example::FENL::DeviceConfig dev_config, Kokkos::View< Scalar *, Kokkos::LayoutLeft, Device > &nodal_residual)
bool check_residuals(const ScalarViewType &scalar_residual, const EnsembleViewType &ensemble_residual)
Stokhos::StandardStorage< int, double > storage_type
void min(const Perf &p)
static void eval(Kokkos::Example::FENL::DeviceConfig &dev_config_elem, Kokkos::Example::FENL::DeviceConfig &dev_config_bc)
#define TEUCHOS_TEST_FLOATING_EQUALITY(v1, v2, tol, out, success)
Kokkos::DefaultExecutionSpace execution_space
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< View< T, P...> >::value, unsigned >::type dimension_scalar(const View< T, P...> &view)
PerformanceDriverOp(const Teuchos::RCP< const Teuchos::Comm< int > > &comm_, const int use_print_, const int use_trials_, const int use_nodes_[], const bool check_, Kokkos::Example::FENL::DeviceConfig dev_config_)
void operator()(ArgT arg) const
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Storage::execution_space Device
void increment(const Perf &p)
Storage::ordinal_type Ordinal
Teuchos::RCP< const Teuchos::Comm< int > > comm
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
expr1 expr1 expr1 expr2 expr1 expr1 c expr2 expr1 c fastAccessCoeff(j)-expr2.val(j)
void scale(double s)
void performance_test_driver(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
#define TEUCHOS_TEST_EQUALITY(v1, v2, out, success)
Kokkos::Example::FENL::DeviceConfig dev_config
double maximum(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, double local)
ScalarType f(const Teuchos::Array< ScalarType > &x, double a, double b)
void reduceMax(const Teuchos::Comm< int > &comm)
Generate a distributed unstructured finite element mesh from a partitioned NX*NY*NZ box of elements...
Stokhos::CrsMatrix< ValueType, Device, Layout >::HostMirror create_mirror_view(const Stokhos::CrsMatrix< ValueType, Device, Layout > &A)