Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MPAssembly/TestAssembly.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 #include <iostream>
42 
43 // MP::Vector
46 
47 // Compile-time loops
48 #include "Sacado_mpl_range_c.hpp"
49 #include "Sacado_mpl_for_each.hpp"
50 #include "Sacado_mpl_integral_c.hpp"
51 
52 // Kokkos libraries' headers:
53 #include <Kokkos_UnorderedMap.hpp>
54 #include <Kokkos_StaticCrsGraph.hpp>
55 #include <impl/Kokkos_Timer.hpp>
56 
57 // Utilities
58 #include <Teuchos_CommHelpers.hpp>
61 
62 // FENL
63 #include <BoxElemFixture.hpp>
64 #include <VectorImport.hpp>
65 #include <fenl_functors.hpp>
66 
67 inline
68 double maximum( const Teuchos::Comm<int>& comm , double local )
69 {
70  double global = 0 ;
71  Teuchos::reduceAll( comm , Teuchos::REDUCE_MAX , 1 , & local , & global );
72  return global ;
73 }
74 
75 struct Perf {
76  size_t global_elem_count ;
77  size_t global_node_count ;
78  double import_time ;
79  double fill_time ;
80 
83  import_time(0) ,
84  fill_time(0) {}
85 
86  void increment(const Perf& p) {
90  fill_time += p.fill_time;
91  }
92 
93  void min(const Perf& p) {
98  }
99 
100  void scale(double s) {
101  import_time *= s;
102  fill_time *= s;
103  }
104 
105  void reduceMax(const Teuchos::Comm<int>& comm) {
106  import_time = maximum( comm , import_time );
107  fill_time = maximum( comm , fill_time );
108  }
109 };
110 
111 template <typename Scalar, typename Device>
113  const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
114  const int use_print ,
115  const int use_trials ,
116  const int use_nodes[] ,
118  Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device >& nodal_residual)
119 {
120  using Teuchos::RCP;
121  using Teuchos::rcp;
122  using Teuchos::rcpFromRef;
123  using Teuchos::arrayView;
125 
127 
129 
130  typedef typename LocalMatrixType::StaticCrsGraphType
131  LocalGraphType ;
132 
134  NodeNodeGraphType ;
135 
136  //typedef Kokkos::Example::FENL::ElementComputationConstantCoefficient CoeffFunctionType;
139  ElementComputationType ;
140 
142  DirichletComputationType ;
143 
144  typedef typename ElementComputationType::vector_type VectorType ;
145 
147  typename FixtureType::comm_list_type ,
148  typename FixtureType::send_nodeid_type ,
149  VectorType > ImportType ;
150 
151  //------------------------------------
152 
153  const int print_flag = use_print && Kokkos::Impl::is_same< Kokkos::HostSpace , typename Device::memory_space >::value ;
154 
155  const int comm_rank = comm->getRank();
156  const int comm_size = comm->getSize();
157 
158  // Decompose by node to avoid parallel communication in assembly
159 
160  const double bubble_x = 1.0 ;
161  const double bubble_y = 1.0 ;
162  const double bubble_z = 1.0 ;
163 
164  const FixtureType fixture( Kokkos::Example::BoxElemPart::DecomposeNode ,
165  comm_size , comm_rank ,
166  use_nodes[0] , use_nodes[1] , use_nodes[2] ,
167  bubble_x , bubble_y , bubble_z );
168 
169  if ( maximum(*comm, ( fixture.ok() ? 0 : 1 ) ) ) {
170  throw std::runtime_error(std::string("Problem fixture setup failed"));
171  }
172 
173  //------------------------------------
174 
175  const ImportType comm_nodal_import(
176  comm ,
177  fixture.recv_node() ,
178  fixture.send_node() ,
179  fixture.send_nodeid() ,
180  fixture.node_count_owned() ,
181  fixture.node_count() - fixture.node_count_owned() );
182 
183  //------------------------------------
184 
185  const double bc_lower_value = 1 ;
186  const double bc_upper_value = 2 ;
187  //CoeffFunctionType diffusion_coefficient( 1.0 );
188  CoeffFunctionType diffusion_coefficient( 1.0, 0.1, 1.0, 5 );
189  Kokkos::deep_copy( diffusion_coefficient.getRandomVariables(), 1.0 );
190 
191  //------------------------------------
192 
193  if ( print_flag ) {
194  std::cout << "ElemNode {" << std::endl ;
195  for ( unsigned ielem = 0 ; ielem < fixture.elem_count() ; ++ielem ) {
196  std::cout << " elem[" << ielem << "]{" ;
197  for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) {
198  std::cout << " " << fixture.elem_node(ielem,inode);
199  }
200  std::cout << " }" << std::endl ;
201  }
202  std::cout << "}" << std::endl ;
203  }
204 
205  //------------------------------------
206 
207  Kokkos::Impl::Timer wall_clock ;
208 
209  Perf perf_stats = Perf() ;
210 
211  for ( int itrial = 0 ; itrial < use_trials ; ++itrial ) {
212 
213  Perf perf = Perf() ;
214 
215  perf.global_elem_count = fixture.elem_count_global();
216  perf.global_node_count = fixture.node_count_global();
217 
218  //----------------------------------
219  // Create the local sparse matrix graph and element-to-graph map
220  // from the element->to->node identifier array.
221  // The graph only has rows for the owned nodes.
222 
223  typename NodeNodeGraphType::Times graph_times;
224  const NodeNodeGraphType
225  mesh_to_graph( fixture.elem_node() , fixture.node_count_owned(),
226  graph_times );
227 
228  // Create the local sparse matrix from the graph:
229  LocalMatrixType jacobian( mesh_to_graph.graph );
230 
231  //----------------------------------
232 
233  // Allocate solution vector for each node in the mesh and residual vector for each owned node
234  VectorType nodal_solution( "nodal_solution" , fixture.node_count() );
235  nodal_residual = VectorType( "nodal_residual" , fixture.node_count_owned() );
236 
237  // Get DeviceConfig structs used by some functors
238  Kokkos::Example::FENL::DeviceConfig dev_config_elem, dev_config_bc;
240  dev_config_bc );
241 
242  // Create element computation functor
243  const ElementComputationType elemcomp( fixture , diffusion_coefficient ,
244  nodal_solution ,
245  mesh_to_graph.elem_graph ,
246  jacobian , nodal_residual ,
247  dev_config_elem );
248 
249  // Create boundary condition functor
250  const DirichletComputationType dirichlet(
251  fixture , nodal_solution , jacobian , nodal_residual ,
252  2 /* apply at 'z' ends */ ,
253  bc_lower_value ,
254  bc_upper_value ,
255  dev_config_bc );
256 
257  Kokkos::deep_copy( nodal_solution , Scalar(1) );
258 
259  //--------------------------------
260 
261  Device::fence();
262  wall_clock.reset();
263 
264  comm_nodal_import( nodal_solution );
265 
266  Device::fence();
267  perf.import_time = wall_clock.seconds();
268 
269  //--------------------------------
270  // Element contributions to residual and jacobian
271 
272  Device::fence();
273  wall_clock.reset();
274 
275  Kokkos::deep_copy( nodal_residual , Scalar(0) );
276  Kokkos::deep_copy( jacobian.values , Scalar(0) );
277 
278  elemcomp.apply();
279 
280  //--------------------------------
281  // Apply boundary conditions
282 
283  dirichlet.apply();
284 
285  Device::fence();
286  perf.fill_time = wall_clock.seconds();
287 
288  //--------------------------------
289 
290  perf.reduceMax(*comm);
291  if (itrial == 0)
292  perf_stats = perf;
293  else
294  perf_stats.min(perf);
295 
296  }
297 
298  return perf_stats ;
299 }
300 
301 template <typename ScalarViewType, typename EnsembleViewType>
302 bool check_residuals(const ScalarViewType& scalar_residual,
303  const EnsembleViewType& ensemble_residual)
304 {
305  const double tol = 1e-14;
306  bool success = true;
308  Teuchos::VerboseObjectBase::getDefaultOStream();
309  std::stringstream buf;
310  Teuchos::FancyOStream fbuf(Teuchos::rcp(&buf,false));
311 
312  typename ScalarViewType::HostMirror host_scalar_residual =
313  Kokkos::create_mirror_view(scalar_residual);
314  typename EnsembleViewType::HostMirror host_ensemble_residual =
315  Kokkos::create_mirror_view(ensemble_residual);
316  Kokkos::deep_copy( host_scalar_residual, scalar_residual );
317  Kokkos::deep_copy( host_ensemble_residual, ensemble_residual );
318 
319  TEUCHOS_TEST_EQUALITY( host_scalar_residual.extent(0),
320  host_ensemble_residual.extent(0), fbuf, success );
321 
322  const size_t num_node = host_scalar_residual.extent(0);
323  const size_t num_ensemble = Kokkos::dimension_scalar(host_ensemble_residual);
324  for (size_t i=0; i<num_node; ++i) {
325  for (size_t j=0; j<num_ensemble; ++j) {
327  host_scalar_residual(i), host_ensemble_residual(i).fastAccessCoeff(j),
328  tol, fbuf, success );
329  }
330  }
331 
332  if (!success)
333  *out << buf.str();
334 
335  return success;
336 }
337 
338 template <class Storage>
339 struct PerformanceDriverOp {
340  typedef typename Storage::value_type Scalar;
341  typedef typename Storage::ordinal_type Ordinal;
344  const int use_print ;
345  const int use_trials ;
346  const int *use_nodes ;
347  const bool check ;
349 
351  const int use_print_ ,
352  const int use_trials_ ,
353  const int use_nodes_[] ,
354  const bool check_ ,
356  comm(comm_),
357  use_print(use_print_),
358  use_trials(use_trials_),
359  use_nodes(use_nodes_),
360  check(check_),
361  dev_config(dev_config_) {}
362 
363  template <typename ArgT>
364  void operator() (ArgT arg) const {
365  const int ensemble = ArgT::value;
366  typedef typename Storage::template apply_N<ensemble> NewStorageApply;
367  typedef typename NewStorageApply::type storage_type;
368  typedef Sacado::MP::Vector<storage_type> mp_vector_type;
369 
370  typedef Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device > scalar_vector_type ;
371  typedef Kokkos::View< mp_vector_type* , Kokkos::LayoutLeft, Device > ensemble_vector_type ;
372 
373  scalar_vector_type scalar_residual;
374  Kokkos::Example::FENL::DeviceConfig scalar_dev_config(0, 1, 1);
375  Perf perf_scalar =
376  fenl_assembly<Scalar,Device>(
378  scalar_dev_config, scalar_residual );
379 
380  ensemble_vector_type ensemble_residual;
381  Kokkos::Example::FENL::DeviceConfig ensemble_dev_config = dev_config;
382 #if defined( KOKKOS_ENABLE_CUDA )
383  const bool is_cuda = Kokkos::Impl::is_same<Device,Kokkos::Cuda>::value;
384 #else
385  const bool is_cuda = false ;
386 #endif
387  if (is_cuda) {
388  const size_t block_size = dev_config.block_dim.x * dev_config.block_dim.y;
389  ensemble_dev_config.block_dim.x = ensemble;
390  ensemble_dev_config.block_dim.y = block_size/ensemble;
391  }
392  Perf perf_ensemble =
393  fenl_assembly<mp_vector_type,Device>(
395  ensemble_dev_config, ensemble_residual);
396 
397  if (check)
398  check_residuals( scalar_residual, ensemble_residual );
399 
400  perf_scalar.scale( 1000.0 / ( perf_scalar.global_node_count ) );
401  perf_ensemble.scale( 1000.0 / ( ensemble * perf_scalar.global_node_count ) );
402 
403  if (comm->getRank() == 0) {
404  std::cout.precision(3);
405  std::cout << use_nodes[0] << " , "
406  << perf_scalar.global_node_count << " , "
407  << std::setw(2) << ensemble << " , "
408  << std::scientific
409  << perf_scalar.import_time << " , "
410  << perf_ensemble.import_time << " , "
411  << std::fixed << std::setw(6)
412  << perf_scalar.import_time / perf_ensemble.import_time << " , "
413  << std::scientific
414  << perf_scalar.fill_time << " , "
415  << perf_ensemble.fill_time << " , "
416  << std::fixed << std::setw(6)
417  << perf_scalar.fill_time / perf_ensemble.fill_time << " , "
418  << std::endl;
419  }
420  }
421 };
422 
423 template <class Storage, int entry_min, int entry_max, int entry_step>
425  const int use_print ,
426  const int use_trials ,
427  const int use_nodes[] ,
428  const bool check ,
430 {
431  if (comm->getRank() == 0) {
432  std::cout.precision(8);
433  std::cout << std::endl
434  << "\"Grid Size\" , "
435  << "\"FEM Size\" , "
436  << "\"Ensemble Size\" , "
437  << "\"Scalar Import Time (ms)\" , "
438  << "\"Ensemble Import Time (ms)\" , "
439  << "\"Ensemble Import Speedup\" , "
440  << "\"Scalar Fill Time (ms)\" , "
441  << "\"Ensemble Fill Time (ms)\" , "
442  << "\"Ensemble Fill Speedup\" , "
443  << std::endl;
444  }
445 
446  // Loop over [entry_min, entry_max] vector entries per thread
447  typedef Sacado::mpl::range_c< int, entry_min, entry_max+1, entry_step > Range;
448  PerformanceDriverOp<Storage> op(comm, use_print, use_trials,
449  use_nodes, check, dev_config);
450  Sacado::mpl::for_each_no_kokkos<Range> f(op);
451 }
Perf fenl_assembly(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], Kokkos::Example::FENL::DeviceConfig dev_config, Kokkos::View< Scalar *, Kokkos::LayoutLeft, Device > &nodal_residual)
bool check_residuals(const ScalarViewType &scalar_residual, const EnsembleViewType &ensemble_residual)
Stokhos::StandardStorage< int, double > storage_type
void min(const Perf &p)
static void eval(Kokkos::Example::FENL::DeviceConfig &dev_config_elem, Kokkos::Example::FENL::DeviceConfig &dev_config_bc)
#define TEUCHOS_TEST_FLOATING_EQUALITY(v1, v2, tol, out, success)
Kokkos::DefaultExecutionSpace execution_space
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< View< T, P...> >::value, unsigned >::type dimension_scalar(const View< T, P...> &view)
PerformanceDriverOp(const Teuchos::RCP< const Teuchos::Comm< int > > &comm_, const int use_print_, const int use_trials_, const int use_nodes_[], const bool check_, Kokkos::Example::FENL::DeviceConfig dev_config_)
void operator()(ArgT arg) const
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Storage::execution_space Device
void increment(const Perf &p)
Storage::ordinal_type Ordinal
Teuchos::RCP< const Teuchos::Comm< int > > comm
TEUCHOS_DEPRECATED void reduceAll(const Comm< Ordinal > &comm, const EReductionType reductType, const Packet &send, Packet *globalReduct)
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
expr1 expr1 expr1 expr2 expr1 expr1 c expr2 expr1 c fastAccessCoeff(j)-expr2.val(j)
void scale(double s)
void performance_test_driver(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
#define TEUCHOS_TEST_EQUALITY(v1, v2, out, success)
Kokkos::Example::FENL::DeviceConfig dev_config
double maximum(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, double local)
ScalarType f(const Teuchos::Array< ScalarType > &x, double a, double b)
void reduceMax(const Teuchos::Comm< int > &comm)
Generate a distributed unstructured finite element mesh from a partitioned NX*NY*NZ box of elements...
Stokhos::CrsMatrix< ValueType, Device, Layout >::HostMirror create_mirror_view(const Stokhos::CrsMatrix< ValueType, Device, Layout > &A)