Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
FadMPAssembly/TestAssembly.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #include <iostream>
11 
12 // MP::Vector
15 
16 // Compile-time loops
17 #include "Sacado_mpl_range_c.hpp"
18 #include "Sacado_mpl_for_each.hpp"
19 #include "Sacado_mpl_integral_c.hpp"
20 
21 // Kokkos libraries' headers:
22 #include <Kokkos_UnorderedMap.hpp>
23 #include <Kokkos_StaticCrsGraph.hpp>
24 #include <Kokkos_Timer.hpp>
25 
26 // Utilities
27 #include <Teuchos_CommHelpers.hpp>
30 
31 // FENL
32 #include <BoxElemFixture.hpp>
33 #include <VectorImport.hpp>
34 #include <fenl_functors.hpp>
35 
36 struct Perf {
39  double import_time ;
40  double fill_time ;
41 
44  import_time(0) ,
45  fill_time(0) {}
46 
47  void increment(const Perf& p) {
51  fill_time += p.fill_time;
52  }
53 
54  void scale(double s) {
55  import_time *= s;
56  fill_time *= s;
57  }
58 };
59 
60 inline
61 double maximum( const Teuchos::RCP<const Teuchos::Comm<int> >& comm , double local )
62 {
63  double global = 0 ;
64  Teuchos::reduceAll( *comm , Teuchos::REDUCE_MAX , 1 , & local , & global );
65  return global ;
66 }
67 
68 template <typename Scalar, typename Device,
71  const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
72  const int use_print ,
73  const int use_trials ,
74  const int use_nodes[] ,
76  Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device >& nodal_residual)
77 {
78  using Teuchos::RCP;
79  using Teuchos::rcp;
80  using Teuchos::rcpFromRef;
81  using Teuchos::arrayView;
83 
85 
87 
88  typedef typename LocalMatrixType::StaticCrsGraphType LocalGraphType ;
89 
91  NodeNodeGraphType ;
92 
93  //typedef Kokkos::Example::FENL::ElementComputationConstantCoefficient CoeffFunctionType;
96  ElementComputationType ;
97 
98  // typedef Kokkos::Example::FENL::DirichletComputation< FixtureType , LocalMatrixType >
99  // DirichletComputationType ;
100 
101  typedef typename ElementComputationType::vector_type VectorType ;
102 
104  typename FixtureType::comm_list_type ,
105  typename FixtureType::send_nodeid_type ,
106  VectorType > ImportType ;
107 
108  //------------------------------------
109 
110  const int print_flag = use_print && std::is_same< Kokkos::HostSpace , typename Device::memory_space >::value ;
111 
112  const int comm_rank = comm->getRank();
113  const int comm_size = comm->getSize();
114 
115  // Decompose by node to avoid parallel communication in assembly
116 
117  const double bubble_x = 1.0 ;
118  const double bubble_y = 1.0 ;
119  const double bubble_z = 1.0 ;
120 
121  const FixtureType fixture( Kokkos::Example::BoxElemPart::DecomposeNode ,
122  comm_size , comm_rank ,
123  use_nodes[0] , use_nodes[1] , use_nodes[2] ,
124  bubble_x , bubble_y , bubble_z );
125 
126  //------------------------------------
127 
128  const ImportType comm_nodal_import(
129  comm ,
130  fixture.recv_node() ,
131  fixture.send_node() ,
132  fixture.send_nodeid() ,
133  fixture.node_count_owned() ,
134  fixture.node_count() - fixture.node_count_owned() );
135 
136  //------------------------------------
137 
138  // const double bc_lower_value = 1 ;
139  // const double bc_upper_value = 2 ;
140  //CoeffFunctionType diffusion_coefficient( 1.0 );
141  CoeffFunctionType diffusion_coefficient( 1.0, 0.1, 1.0, 5 );
142  Kokkos::deep_copy( diffusion_coefficient.getRandomVariables(), 1.0 );
143 
144  //------------------------------------
145 
146  if ( print_flag ) {
147  std::cout << "ElemNode {" << std::endl ;
148  for ( unsigned ielem = 0 ; ielem < fixture.elem_count() ; ++ielem ) {
149  std::cout << " elem[" << ielem << "]{" ;
150  for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) {
151  std::cout << " " << fixture.elem_node(ielem,inode);
152  }
153  std::cout << " }" << std::endl ;
154  }
155  std::cout << "}" << std::endl ;
156  }
157 
158  //------------------------------------
159 
160  Kokkos::Timer wall_clock ;
161 
162  Perf perf_stats = Perf() ;
163 
164  for ( int itrial = 0 ; itrial < use_trials ; ++itrial ) {
165 
166  Perf perf = Perf() ;
167 
168  perf.global_elem_count = fixture.elem_count_global();
169  perf.global_node_count = fixture.node_count_global();
170 
171  //----------------------------------
172  // Create the local sparse matrix graph and element-to-graph map
173  // from the element->to->node identifier array.
174  // The graph only has rows for the owned nodes.
175 
176  typename NodeNodeGraphType::Times graph_times;
177  const NodeNodeGraphType
178  mesh_to_graph( fixture.elem_node() , fixture.node_count_owned(),
179  graph_times );
180 
181  // Create the local sparse matrix from the graph:
182  LocalMatrixType jacobian( mesh_to_graph.graph );
183 
184  //----------------------------------
185 
186  // Allocate solution vector for each node in the mesh and residual vector for each owned node
187  VectorType nodal_solution( "nodal_solution" , fixture.node_count() );
188  nodal_residual = VectorType( "nodal_residual" , fixture.node_count_owned() );
189 
190  // Get DeviceConfig structs used by some functors
191  Kokkos::Example::FENL::DeviceConfig dev_config_elem, dev_config_bc;
193  dev_config_bc );
194 
195  // Create element computation functor
196  const ElementComputationType elemcomp( fixture , diffusion_coefficient ,
197  nodal_solution ,
198  mesh_to_graph.elem_graph ,
199  jacobian , nodal_residual ,
200  dev_config_elem );
201 
202  // Create boundary condition functor
203  // const DirichletComputationType dirichlet(
204  // fixture , nodal_solution , jacobian , nodal_residual ,
205  // 2 /* apply at 'z' ends */ ,
206  // bc_lower_value ,
207  // bc_upper_value ,
208  // dev_config_bc );
209 
210  Kokkos::deep_copy( nodal_solution , Scalar(1) );
211 
212  //--------------------------------
213 
214  wall_clock.reset();
215 
216  comm_nodal_import( nodal_solution );
217 
218  Device().fence();
219  perf.import_time = maximum( comm , wall_clock.seconds() );
220 
221  //--------------------------------
222  // Element contributions to residual and jacobian
223 
224  wall_clock.reset();
225 
226  Kokkos::deep_copy( nodal_residual , Scalar(0) );
227  Kokkos::deep_copy( jacobian.values , Scalar(0) );
228 
229  elemcomp.apply();
230 
231  //--------------------------------
232  // Apply boundary conditions
233 
234  //dirichlet.apply();
235 
236  Device().fence();
237  perf.fill_time = maximum( comm , wall_clock.seconds() );
238 
239  //--------------------------------
240 
241  perf_stats.increment(perf);
242 
243  }
244 
245  return perf_stats ;
246 }
247 
248 template <typename ScalarViewType, typename EnsembleViewType>
249 bool check_residuals(const ScalarViewType& scalar_residual,
250  const EnsembleViewType& ensemble_residual)
251 {
252  const double tol = 1e-14;
253  bool success = true;
255  Teuchos::VerboseObjectBase::getDefaultOStream();
256  std::stringstream buf;
257  Teuchos::FancyOStream fbuf(Teuchos::rcp(&buf,false));
258 
259  typename ScalarViewType::HostMirror host_scalar_residual =
260  Kokkos::create_mirror_view(scalar_residual);
261  typename EnsembleViewType::HostMirror host_ensemble_residual =
262  Kokkos::create_mirror_view(ensemble_residual);
263  Kokkos::deep_copy( host_scalar_residual, scalar_residual );
264  Kokkos::deep_copy( host_ensemble_residual, ensemble_residual );
265 
266  TEUCHOS_TEST_EQUALITY( host_scalar_residual.extent(0),
267  host_ensemble_residual.extent(0), fbuf, success );
268 
269  const size_t num_node = host_scalar_residual.extent(0);
270  const size_t num_ensemble = Kokkos::dimension_scalar(host_ensemble_residual);
271  for (size_t i=0; i<num_node; ++i) {
272  for (size_t j=0; j<num_ensemble; ++j) {
274  host_scalar_residual(i), host_ensemble_residual(i).fastAccessCoeff(j),
275  tol, fbuf, success );
276  }
277  }
278 
279  if (!success)
280  *out << buf.str();
281 
282  return success;
283 }
284 
285 template <class Storage,
288  typedef typename Storage::value_type Scalar;
289  typedef typename Storage::ordinal_type Ordinal;
292  const int use_print ;
293  const int use_trials ;
294  const int *use_nodes ;
295  const bool check ;
297 
299  const int use_print_ ,
300  const int use_trials_ ,
301  const int use_nodes_[] ,
302  const bool check_ ,
304  comm(comm_),
305  use_print(use_print_),
306  use_trials(use_trials_),
307  use_nodes(use_nodes_),
308  check(check_),
309  dev_config(dev_config_) {}
310 
311  template <typename ArgT>
312  void operator() (ArgT arg) const {
313  const int ensemble = ArgT::value;
314  typedef typename Storage::template apply_N<ensemble> NewStorageApply;
315  typedef typename NewStorageApply::type storage_type;
316  typedef Sacado::MP::Vector<storage_type> mp_vector_type;
317 
318  typedef Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device > scalar_vector_type ;
319  typedef Kokkos::View< mp_vector_type* , Kokkos::LayoutLeft, Device > ensemble_vector_type ;
320 
321  scalar_vector_type scalar_residual;
322  Kokkos::Example::FENL::DeviceConfig scalar_dev_config(0, 1, 1);
323  Perf perf_scalar =
324  fenl_assembly<Scalar,Device,Method>(
325  comm, use_print, use_trials*ensemble, use_nodes,
326  scalar_dev_config, scalar_residual );
327 
328  ensemble_vector_type ensemble_residual;
329  Kokkos::Example::FENL::DeviceConfig ensemble_dev_config = dev_config;
330 #if defined( KOKKOS_ENABLE_CUDA )
331  const bool is_cuda = std::is_same<Device,Kokkos::Cuda>::value;
332 #else
333  const bool is_cuda = false ;
334 #endif
335  if (is_cuda) {
336  const size_t block_size = dev_config.block_dim.x * dev_config.block_dim.y;
337  ensemble_dev_config.block_dim.x = ensemble;
338  ensemble_dev_config.block_dim.y = block_size/ensemble;
339  }
340  Perf perf_ensemble =
341  fenl_assembly<mp_vector_type,Device,Method>(
343  ensemble_dev_config, ensemble_residual);
344 
345  if (check)
346  check_residuals( scalar_residual, ensemble_residual );
347 
348  double s =
349  1000.0 / ( use_trials * ensemble * perf_scalar.global_node_count );
350  perf_scalar.scale(s);
351  perf_ensemble.scale(s);
352 
353  if (comm->getRank() == 0) {
354  std::cout.precision(3);
355  std::cout << use_nodes[0] << " , "
356  << perf_scalar.global_node_count << " , "
357  << std::setw(2) << ensemble << " , "
358  << std::scientific
359  << perf_scalar.import_time << " , "
360  << perf_ensemble.import_time << " , "
361  << std::fixed << std::setw(6)
362  << perf_scalar.import_time / perf_ensemble.import_time << " , "
363  << std::scientific
364  << perf_scalar.fill_time << " , "
365  << perf_ensemble.fill_time << " , "
366  << std::fixed << std::setw(6)
367  << perf_scalar.fill_time / perf_ensemble.fill_time << " , "
368  << std::endl;
369  }
370  }
371 };
372 
373 template <class Storage, int entry_min, int entry_max, int entry_step,
376  const int use_print ,
377  const int use_trials ,
378  const int use_nodes[] ,
379  const bool check ,
381 {
382  if (comm->getRank() == 0) {
383  std::cout.precision(8);
384  std::cout << std::endl
385  << "\"Grid Size\" , "
386  << "\"FEM Size\" , "
387  << "\"Ensemble Size\" , "
388  << "\"Scalar Import Time\" , "
389  << "\"Ensemble Import Time\" , "
390  << "\"Ensemble Import Speedup\" , "
391  << "\"Scalar Fill Time\" , "
392  << "\"Ensemble Fill Time\" , "
393  << "\"Ensemble Fill Speedup\" , "
394  << std::endl;
395  }
396 
397  // Loop over [entry_min, entry_max] vector entries per thread
398  typedef Sacado::mpl::range_c< int, entry_min, entry_max+1, entry_step > Range;
399  PerformanceDriverOp<Storage,Method> op(comm, use_print, use_trials,
400  use_nodes, check, dev_config);
401  Sacado::mpl::for_each_no_kokkos<Range> f(op);
402 }
Perf fenl_assembly(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], Kokkos::Example::FENL::DeviceConfig dev_config, Kokkos::View< Scalar *, Kokkos::LayoutLeft, Device > &nodal_residual)
bool check_residuals(const ScalarViewType &scalar_residual, const EnsembleViewType &ensemble_residual)
Stokhos::StandardStorage< int, double > storage_type
Stokhos::StandardStorage< int, double > Storage
static void eval(Kokkos::Example::FENL::DeviceConfig &dev_config_elem, Kokkos::Example::FENL::DeviceConfig &dev_config_bc)
#define TEUCHOS_TEST_FLOATING_EQUALITY(v1, v2, tol, out, success)
Kokkos::DefaultExecutionSpace execution_space
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< View< T, P...> >::value, unsigned >::type dimension_scalar(const View< T, P...> &view)
PerformanceDriverOp(const Teuchos::RCP< const Teuchos::Comm< int > > &comm_, const int use_print_, const int use_trials_, const int use_nodes_[], const bool check_, Kokkos::Example::FENL::DeviceConfig dev_config_)
void operator()(ArgT arg) const
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Storage::execution_space Device
void increment(const Perf &p)
Teuchos::RCP< const Teuchos::Comm< int > > comm
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
expr1 expr1 expr1 expr2 expr1 expr1 c expr2 expr1 c fastAccessCoeff(j)-expr2.val(j)
void scale(double s)
void performance_test_driver(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
#define TEUCHOS_TEST_EQUALITY(v1, v2, out, success)
Kokkos::Example::FENL::DeviceConfig dev_config
double maximum(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, double local)
ScalarType f(const Teuchos::Array< ScalarType > &x, double a, double b)
Generate a distributed unstructured finite element mesh from a partitioned NX*NY*NZ box of elements...
Stokhos::CrsMatrix< ValueType, Device, Layout >::HostMirror create_mirror_view(const Stokhos::CrsMatrix< ValueType, Device, Layout > &A)