Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
FadMPAssembly/TestAssembly.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #include <iostream>
11 
12 // MP::Vector
15 
16 // Compile-time loops
17 #include "Sacado_mpl_range_c.hpp"
18 #include "Sacado_mpl_for_each.hpp"
19 #include "Sacado_mpl_integral_c.hpp"
20 
21 // Kokkos libraries' headers:
22 #include <Kokkos_UnorderedMap.hpp>
23 #include <Kokkos_Timer.hpp>
24 
25 // Utilities
26 #include <Teuchos_CommHelpers.hpp>
29 
30 // FENL
31 #include <BoxElemFixture.hpp>
32 #include <VectorImport.hpp>
33 #include <fenl_functors.hpp>
34 
35 struct Perf {
38  double import_time ;
39  double fill_time ;
40 
43  import_time(0) ,
44  fill_time(0) {}
45 
46  void increment(const Perf& p) {
50  fill_time += p.fill_time;
51  }
52 
53  void scale(double s) {
54  import_time *= s;
55  fill_time *= s;
56  }
57 };
58 
59 inline
60 double maximum( const Teuchos::RCP<const Teuchos::Comm<int> >& comm , double local )
61 {
62  double global = 0 ;
63  Teuchos::reduceAll( *comm , Teuchos::REDUCE_MAX , 1 , & local , & global );
64  return global ;
65 }
66 
67 template <typename Scalar, typename Device,
70  const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
71  const int use_print ,
72  const int use_trials ,
73  const int use_nodes[] ,
75  Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device >& nodal_residual)
76 {
77  using Teuchos::RCP;
78  using Teuchos::rcp;
79  using Teuchos::rcpFromRef;
80  using Teuchos::arrayView;
82 
84 
86 
87  typedef typename LocalMatrixType::StaticCrsGraphType LocalGraphType ;
88 
90  NodeNodeGraphType ;
91 
92  //typedef Kokkos::Example::FENL::ElementComputationConstantCoefficient CoeffFunctionType;
95  ElementComputationType ;
96 
97  // typedef Kokkos::Example::FENL::DirichletComputation< FixtureType , LocalMatrixType >
98  // DirichletComputationType ;
99 
100  typedef typename ElementComputationType::vector_type VectorType ;
101 
103  typename FixtureType::comm_list_type ,
104  typename FixtureType::send_nodeid_type ,
105  VectorType > ImportType ;
106 
107  //------------------------------------
108 
109  const int print_flag = use_print && std::is_same< Kokkos::HostSpace , typename Device::memory_space >::value ;
110 
111  const int comm_rank = comm->getRank();
112  const int comm_size = comm->getSize();
113 
114  // Decompose by node to avoid parallel communication in assembly
115 
116  const double bubble_x = 1.0 ;
117  const double bubble_y = 1.0 ;
118  const double bubble_z = 1.0 ;
119 
120  const FixtureType fixture( Kokkos::Example::BoxElemPart::DecomposeNode ,
121  comm_size , comm_rank ,
122  use_nodes[0] , use_nodes[1] , use_nodes[2] ,
123  bubble_x , bubble_y , bubble_z );
124 
125  //------------------------------------
126 
127  const ImportType comm_nodal_import(
128  comm ,
129  fixture.recv_node() ,
130  fixture.send_node() ,
131  fixture.send_nodeid() ,
132  fixture.node_count_owned() ,
133  fixture.node_count() - fixture.node_count_owned() );
134 
135  //------------------------------------
136 
137  // const double bc_lower_value = 1 ;
138  // const double bc_upper_value = 2 ;
139  //CoeffFunctionType diffusion_coefficient( 1.0 );
140  CoeffFunctionType diffusion_coefficient( 1.0, 0.1, 1.0, 5 );
141  Kokkos::deep_copy( diffusion_coefficient.getRandomVariables(), 1.0 );
142 
143  //------------------------------------
144 
145  if ( print_flag ) {
146  std::cout << "ElemNode {" << std::endl ;
147  for ( unsigned ielem = 0 ; ielem < fixture.elem_count() ; ++ielem ) {
148  std::cout << " elem[" << ielem << "]{" ;
149  for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) {
150  std::cout << " " << fixture.elem_node(ielem,inode);
151  }
152  std::cout << " }" << std::endl ;
153  }
154  std::cout << "}" << std::endl ;
155  }
156 
157  //------------------------------------
158 
159  Kokkos::Timer wall_clock ;
160 
161  Perf perf_stats = Perf() ;
162 
163  for ( int itrial = 0 ; itrial < use_trials ; ++itrial ) {
164 
165  Perf perf = Perf() ;
166 
167  perf.global_elem_count = fixture.elem_count_global();
168  perf.global_node_count = fixture.node_count_global();
169 
170  //----------------------------------
171  // Create the local sparse matrix graph and element-to-graph map
172  // from the element->to->node identifier array.
173  // The graph only has rows for the owned nodes.
174 
175  typename NodeNodeGraphType::Times graph_times;
176  const NodeNodeGraphType
177  mesh_to_graph( fixture.elem_node() , fixture.node_count_owned(),
178  graph_times );
179 
180  // Create the local sparse matrix from the graph:
181  LocalMatrixType jacobian( mesh_to_graph.graph );
182 
183  //----------------------------------
184 
185  // Allocate solution vector for each node in the mesh and residual vector for each owned node
186  VectorType nodal_solution( "nodal_solution" , fixture.node_count() );
187  nodal_residual = VectorType( "nodal_residual" , fixture.node_count_owned() );
188 
189  // Get DeviceConfig structs used by some functors
190  Kokkos::Example::FENL::DeviceConfig dev_config_elem, dev_config_bc;
192  dev_config_bc );
193 
194  // Create element computation functor
195  const ElementComputationType elemcomp( fixture , diffusion_coefficient ,
196  nodal_solution ,
197  mesh_to_graph.elem_graph ,
198  jacobian , nodal_residual ,
199  dev_config_elem );
200 
201  // Create boundary condition functor
202  // const DirichletComputationType dirichlet(
203  // fixture , nodal_solution , jacobian , nodal_residual ,
204  // 2 /* apply at 'z' ends */ ,
205  // bc_lower_value ,
206  // bc_upper_value ,
207  // dev_config_bc );
208 
209  Kokkos::deep_copy( nodal_solution , Scalar(1) );
210 
211  //--------------------------------
212 
213  wall_clock.reset();
214 
215  comm_nodal_import( nodal_solution );
216 
217  Device().fence();
218  perf.import_time = maximum( comm , wall_clock.seconds() );
219 
220  //--------------------------------
221  // Element contributions to residual and jacobian
222 
223  wall_clock.reset();
224 
225  Kokkos::deep_copy( nodal_residual , Scalar(0) );
226  Kokkos::deep_copy( jacobian.values , Scalar(0) );
227 
228  elemcomp.apply();
229 
230  //--------------------------------
231  // Apply boundary conditions
232 
233  //dirichlet.apply();
234 
235  Device().fence();
236  perf.fill_time = maximum( comm , wall_clock.seconds() );
237 
238  //--------------------------------
239 
240  perf_stats.increment(perf);
241 
242  }
243 
244  return perf_stats ;
245 }
246 
247 template <typename ScalarViewType, typename EnsembleViewType>
248 bool check_residuals(const ScalarViewType& scalar_residual,
249  const EnsembleViewType& ensemble_residual)
250 {
251  const double tol = 1e-14;
252  bool success = true;
254  Teuchos::VerboseObjectBase::getDefaultOStream();
255  std::stringstream buf;
256  Teuchos::FancyOStream fbuf(Teuchos::rcp(&buf,false));
257 
258  typename ScalarViewType::HostMirror host_scalar_residual =
259  Kokkos::create_mirror_view(scalar_residual);
260  typename EnsembleViewType::HostMirror host_ensemble_residual =
261  Kokkos::create_mirror_view(ensemble_residual);
262  Kokkos::deep_copy( host_scalar_residual, scalar_residual );
263  Kokkos::deep_copy( host_ensemble_residual, ensemble_residual );
264 
265  TEUCHOS_TEST_EQUALITY( host_scalar_residual.extent(0),
266  host_ensemble_residual.extent(0), fbuf, success );
267 
268  const size_t num_node = host_scalar_residual.extent(0);
269  const size_t num_ensemble = Kokkos::dimension_scalar(host_ensemble_residual);
270  for (size_t i=0; i<num_node; ++i) {
271  for (size_t j=0; j<num_ensemble; ++j) {
273  host_scalar_residual(i), host_ensemble_residual(i).fastAccessCoeff(j),
274  tol, fbuf, success );
275  }
276  }
277 
278  if (!success)
279  *out << buf.str();
280 
281  return success;
282 }
283 
284 template <class Storage,
287  typedef typename Storage::value_type Scalar;
288  typedef typename Storage::ordinal_type Ordinal;
291  const int use_print ;
292  const int use_trials ;
293  const int *use_nodes ;
294  const bool check ;
296 
298  const int use_print_ ,
299  const int use_trials_ ,
300  const int use_nodes_[] ,
301  const bool check_ ,
303  comm(comm_),
304  use_print(use_print_),
305  use_trials(use_trials_),
306  use_nodes(use_nodes_),
307  check(check_),
308  dev_config(dev_config_) {}
309 
310  template <typename ArgT>
311  void operator() (ArgT arg) const {
312  const int ensemble = ArgT::value;
313  typedef typename Storage::template apply_N<ensemble> NewStorageApply;
314  typedef typename NewStorageApply::type storage_type;
315  typedef Sacado::MP::Vector<storage_type> mp_vector_type;
316 
317  typedef Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device > scalar_vector_type ;
318  typedef Kokkos::View< mp_vector_type* , Kokkos::LayoutLeft, Device > ensemble_vector_type ;
319 
320  scalar_vector_type scalar_residual;
321  Kokkos::Example::FENL::DeviceConfig scalar_dev_config(0, 1, 1);
322  Perf perf_scalar =
323  fenl_assembly<Scalar,Device,Method>(
324  comm, use_print, use_trials*ensemble, use_nodes,
325  scalar_dev_config, scalar_residual );
326 
327  ensemble_vector_type ensemble_residual;
328  Kokkos::Example::FENL::DeviceConfig ensemble_dev_config = dev_config;
329 #if defined( KOKKOS_ENABLE_CUDA )
330  const bool is_cuda = std::is_same<Device,Kokkos::Cuda>::value;
331 #else
332  const bool is_cuda = false ;
333 #endif
334  if (is_cuda) {
335  const size_t block_size = dev_config.block_dim.x * dev_config.block_dim.y;
336  ensemble_dev_config.block_dim.x = ensemble;
337  ensemble_dev_config.block_dim.y = block_size/ensemble;
338  }
339  Perf perf_ensemble =
340  fenl_assembly<mp_vector_type,Device,Method>(
342  ensemble_dev_config, ensemble_residual);
343 
344  if (check)
345  check_residuals( scalar_residual, ensemble_residual );
346 
347  double s =
348  1000.0 / ( use_trials * ensemble * perf_scalar.global_node_count );
349  perf_scalar.scale(s);
350  perf_ensemble.scale(s);
351 
352  if (comm->getRank() == 0) {
353  std::cout.precision(3);
354  std::cout << use_nodes[0] << " , "
355  << perf_scalar.global_node_count << " , "
356  << std::setw(2) << ensemble << " , "
357  << std::scientific
358  << perf_scalar.import_time << " , "
359  << perf_ensemble.import_time << " , "
360  << std::fixed << std::setw(6)
361  << perf_scalar.import_time / perf_ensemble.import_time << " , "
362  << std::scientific
363  << perf_scalar.fill_time << " , "
364  << perf_ensemble.fill_time << " , "
365  << std::fixed << std::setw(6)
366  << perf_scalar.fill_time / perf_ensemble.fill_time << " , "
367  << std::endl;
368  }
369  }
370 };
371 
372 template <class Storage, int entry_min, int entry_max, int entry_step,
375  const int use_print ,
376  const int use_trials ,
377  const int use_nodes[] ,
378  const bool check ,
380 {
381  if (comm->getRank() == 0) {
382  std::cout.precision(8);
383  std::cout << std::endl
384  << "\"Grid Size\" , "
385  << "\"FEM Size\" , "
386  << "\"Ensemble Size\" , "
387  << "\"Scalar Import Time\" , "
388  << "\"Ensemble Import Time\" , "
389  << "\"Ensemble Import Speedup\" , "
390  << "\"Scalar Fill Time\" , "
391  << "\"Ensemble Fill Time\" , "
392  << "\"Ensemble Fill Speedup\" , "
393  << std::endl;
394  }
395 
396  // Loop over [entry_min, entry_max] vector entries per thread
397  typedef Sacado::mpl::range_c< int, entry_min, entry_max+1, entry_step > Range;
398  PerformanceDriverOp<Storage,Method> op(comm, use_print, use_trials,
399  use_nodes, check, dev_config);
400  Sacado::mpl::for_each_no_kokkos<Range> f(op);
401 }
Perf fenl_assembly(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], Kokkos::Example::FENL::DeviceConfig dev_config, Kokkos::View< Scalar *, Kokkos::LayoutLeft, Device > &nodal_residual)
bool check_residuals(const ScalarViewType &scalar_residual, const EnsembleViewType &ensemble_residual)
Stokhos::StandardStorage< int, double > storage_type
Stokhos::StandardStorage< int, double > Storage
static void eval(Kokkos::Example::FENL::DeviceConfig &dev_config_elem, Kokkos::Example::FENL::DeviceConfig &dev_config_bc)
#define TEUCHOS_TEST_FLOATING_EQUALITY(v1, v2, tol, out, success)
Kokkos::DefaultExecutionSpace execution_space
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< View< T, P...> >::value, unsigned >::type dimension_scalar(const View< T, P...> &view)
PerformanceDriverOp(const Teuchos::RCP< const Teuchos::Comm< int > > &comm_, const int use_print_, const int use_trials_, const int use_nodes_[], const bool check_, Kokkos::Example::FENL::DeviceConfig dev_config_)
void operator()(ArgT arg) const
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Storage::execution_space Device
void increment(const Perf &p)
Teuchos::RCP< const Teuchos::Comm< int > > comm
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
expr1 expr1 expr1 expr2 expr1 expr1 c expr2 expr1 c fastAccessCoeff(j)-expr2.val(j)
void scale(double s)
void performance_test_driver(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
#define TEUCHOS_TEST_EQUALITY(v1, v2, out, success)
Kokkos::Example::FENL::DeviceConfig dev_config
double maximum(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, double local)
ScalarType f(const Teuchos::Array< ScalarType > &x, double a, double b)
Generate a distributed unstructured finite element mesh from a partitioned NX*NY*NZ box of elements...
Stokhos::CrsMatrix< ValueType, Device, Layout >::HostMirror create_mirror_view(const Stokhos::CrsMatrix< ValueType, Device, Layout > &A)