Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
FadMPAssembly/TestAssembly.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 #include <iostream>
42 
43 // MP::Vector
46 
47 // Compile-time loops
48 #include "Sacado_mpl_range_c.hpp"
49 #include "Sacado_mpl_for_each.hpp"
50 #include "Sacado_mpl_integral_c.hpp"
51 
52 // Kokkos libraries' headers:
53 #include <Kokkos_UnorderedMap.hpp>
54 #include <Kokkos_StaticCrsGraph.hpp>
55 #include <impl/Kokkos_Timer.hpp>
56 
57 // Utilities
58 #include <Teuchos_CommHelpers.hpp>
61 
62 // FENL
63 #include <BoxElemFixture.hpp>
64 #include <VectorImport.hpp>
65 #include <fenl_functors.hpp>
66 
67 struct Perf {
70  double import_time ;
71  double fill_time ;
72 
75  import_time(0) ,
76  fill_time(0) {}
77 
78  void increment(const Perf& p) {
82  fill_time += p.fill_time;
83  }
84 
85  void scale(double s) {
86  import_time *= s;
87  fill_time *= s;
88  }
89 };
90 
91 inline
92 double maximum( const Teuchos::RCP<const Teuchos::Comm<int> >& comm , double local )
93 {
94  double global = 0 ;
95  Teuchos::reduceAll( *comm , Teuchos::REDUCE_MAX , 1 , & local , & global );
96  return global ;
97 }
98 
99 template <typename Scalar, typename Device,
102  const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
103  const int use_print ,
104  const int use_trials ,
105  const int use_nodes[] ,
107  Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device >& nodal_residual)
108 {
109  using Teuchos::RCP;
110  using Teuchos::rcp;
111  using Teuchos::rcpFromRef;
112  using Teuchos::arrayView;
114 
116 
118 
119  typedef typename LocalMatrixType::StaticCrsGraphType LocalGraphType ;
120 
122  NodeNodeGraphType ;
123 
124  //typedef Kokkos::Example::FENL::ElementComputationConstantCoefficient CoeffFunctionType;
127  ElementComputationType ;
128 
129  // typedef Kokkos::Example::FENL::DirichletComputation< FixtureType , LocalMatrixType >
130  // DirichletComputationType ;
131 
132  typedef typename ElementComputationType::vector_type VectorType ;
133 
135  typename FixtureType::comm_list_type ,
136  typename FixtureType::send_nodeid_type ,
137  VectorType > ImportType ;
138 
139  //------------------------------------
140 
141  const int print_flag = use_print && Kokkos::Impl::is_same< Kokkos::HostSpace , typename Device::memory_space >::value ;
142 
143  const int comm_rank = comm->getRank();
144  const int comm_size = comm->getSize();
145 
146  // Decompose by node to avoid parallel communication in assembly
147 
148  const double bubble_x = 1.0 ;
149  const double bubble_y = 1.0 ;
150  const double bubble_z = 1.0 ;
151 
152  const FixtureType fixture( Kokkos::Example::BoxElemPart::DecomposeNode ,
153  comm_size , comm_rank ,
154  use_nodes[0] , use_nodes[1] , use_nodes[2] ,
155  bubble_x , bubble_y , bubble_z );
156 
157  //------------------------------------
158 
159  const ImportType comm_nodal_import(
160  comm ,
161  fixture.recv_node() ,
162  fixture.send_node() ,
163  fixture.send_nodeid() ,
164  fixture.node_count_owned() ,
165  fixture.node_count() - fixture.node_count_owned() );
166 
167  //------------------------------------
168 
169  // const double bc_lower_value = 1 ;
170  // const double bc_upper_value = 2 ;
171  //CoeffFunctionType diffusion_coefficient( 1.0 );
172  CoeffFunctionType diffusion_coefficient( 1.0, 0.1, 1.0, 5 );
173  Kokkos::deep_copy( diffusion_coefficient.getRandomVariables(), 1.0 );
174 
175  //------------------------------------
176 
177  if ( print_flag ) {
178  std::cout << "ElemNode {" << std::endl ;
179  for ( unsigned ielem = 0 ; ielem < fixture.elem_count() ; ++ielem ) {
180  std::cout << " elem[" << ielem << "]{" ;
181  for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) {
182  std::cout << " " << fixture.elem_node(ielem,inode);
183  }
184  std::cout << " }" << std::endl ;
185  }
186  std::cout << "}" << std::endl ;
187  }
188 
189  //------------------------------------
190 
191  Kokkos::Impl::Timer wall_clock ;
192 
193  Perf perf_stats = Perf() ;
194 
195  for ( int itrial = 0 ; itrial < use_trials ; ++itrial ) {
196 
197  Perf perf = Perf() ;
198 
199  perf.global_elem_count = fixture.elem_count_global();
200  perf.global_node_count = fixture.node_count_global();
201 
202  //----------------------------------
203  // Create the local sparse matrix graph and element-to-graph map
204  // from the element->to->node identifier array.
205  // The graph only has rows for the owned nodes.
206 
207  typename NodeNodeGraphType::Times graph_times;
208  const NodeNodeGraphType
209  mesh_to_graph( fixture.elem_node() , fixture.node_count_owned(),
210  graph_times );
211 
212  // Create the local sparse matrix from the graph:
213  LocalMatrixType jacobian( mesh_to_graph.graph );
214 
215  //----------------------------------
216 
217  // Allocate solution vector for each node in the mesh and residual vector for each owned node
218  VectorType nodal_solution( "nodal_solution" , fixture.node_count() );
219  nodal_residual = VectorType( "nodal_residual" , fixture.node_count_owned() );
220 
221  // Get DeviceConfig structs used by some functors
222  Kokkos::Example::FENL::DeviceConfig dev_config_elem, dev_config_bc;
224  dev_config_bc );
225 
226  // Create element computation functor
227  const ElementComputationType elemcomp( fixture , diffusion_coefficient ,
228  nodal_solution ,
229  mesh_to_graph.elem_graph ,
230  jacobian , nodal_residual ,
231  dev_config_elem );
232 
233  // Create boundary condition functor
234  // const DirichletComputationType dirichlet(
235  // fixture , nodal_solution , jacobian , nodal_residual ,
236  // 2 /* apply at 'z' ends */ ,
237  // bc_lower_value ,
238  // bc_upper_value ,
239  // dev_config_bc );
240 
241  Kokkos::deep_copy( nodal_solution , Scalar(1) );
242 
243  //--------------------------------
244 
245  wall_clock.reset();
246 
247  comm_nodal_import( nodal_solution );
248 
249  Device::fence();
250  perf.import_time = maximum( comm , wall_clock.seconds() );
251 
252  //--------------------------------
253  // Element contributions to residual and jacobian
254 
255  wall_clock.reset();
256 
257  Kokkos::deep_copy( nodal_residual , Scalar(0) );
258  Kokkos::deep_copy( jacobian.values , Scalar(0) );
259 
260  elemcomp.apply();
261 
262  //--------------------------------
263  // Apply boundary conditions
264 
265  //dirichlet.apply();
266 
267  Device::fence();
268  perf.fill_time = maximum( comm , wall_clock.seconds() );
269 
270  //--------------------------------
271 
272  perf_stats.increment(perf);
273 
274  }
275 
276  return perf_stats ;
277 }
278 
279 template <typename ScalarViewType, typename EnsembleViewType>
280 bool check_residuals(const ScalarViewType& scalar_residual,
281  const EnsembleViewType& ensemble_residual)
282 {
283  const double tol = 1e-14;
284  bool success = true;
286  Teuchos::VerboseObjectBase::getDefaultOStream();
287  std::stringstream buf;
288  Teuchos::FancyOStream fbuf(Teuchos::rcp(&buf,false));
289 
290  typename ScalarViewType::HostMirror host_scalar_residual =
291  Kokkos::create_mirror_view(scalar_residual);
292  typename EnsembleViewType::HostMirror host_ensemble_residual =
293  Kokkos::create_mirror_view(ensemble_residual);
294  Kokkos::deep_copy( host_scalar_residual, scalar_residual );
295  Kokkos::deep_copy( host_ensemble_residual, ensemble_residual );
296 
297  TEUCHOS_TEST_EQUALITY( host_scalar_residual.extent(0),
298  host_ensemble_residual.extent(0), fbuf, success );
299 
300  const size_t num_node = host_scalar_residual.extent(0);
301  const size_t num_ensemble = Kokkos::dimension_scalar(host_ensemble_residual);
302  for (size_t i=0; i<num_node; ++i) {
303  for (size_t j=0; j<num_ensemble; ++j) {
305  host_scalar_residual(i), host_ensemble_residual(i).fastAccessCoeff(j),
306  tol, fbuf, success );
307  }
308  }
309 
310  if (!success)
311  *out << buf.str();
312 
313  return success;
314 }
315 
316 template <class Storage,
319  typedef typename Storage::value_type Scalar;
320  typedef typename Storage::ordinal_type Ordinal;
323  const int use_print ;
324  const int use_trials ;
325  const int *use_nodes ;
326  const bool check ;
328 
330  const int use_print_ ,
331  const int use_trials_ ,
332  const int use_nodes_[] ,
333  const bool check_ ,
335  comm(comm_),
336  use_print(use_print_),
337  use_trials(use_trials_),
338  use_nodes(use_nodes_),
339  check(check_),
340  dev_config(dev_config_) {}
341 
342  template <typename ArgT>
343  void operator() (ArgT arg) const {
344  const int ensemble = ArgT::value;
345  typedef typename Storage::template apply_N<ensemble> NewStorageApply;
346  typedef typename NewStorageApply::type storage_type;
347  typedef Sacado::MP::Vector<storage_type> mp_vector_type;
348 
349  typedef Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device > scalar_vector_type ;
350  typedef Kokkos::View< mp_vector_type* , Kokkos::LayoutLeft, Device > ensemble_vector_type ;
351 
352  scalar_vector_type scalar_residual;
353  Kokkos::Example::FENL::DeviceConfig scalar_dev_config(0, 1, 1);
354  Perf perf_scalar =
355  fenl_assembly<Scalar,Device,Method>(
356  comm, use_print, use_trials*ensemble, use_nodes,
357  scalar_dev_config, scalar_residual );
358 
359  ensemble_vector_type ensemble_residual;
360  Kokkos::Example::FENL::DeviceConfig ensemble_dev_config = dev_config;
361 #if defined( KOKKOS_ENABLE_CUDA )
362  const bool is_cuda = Kokkos::Impl::is_same<Device,Kokkos::Cuda>::value;
363 #else
364  const bool is_cuda = false ;
365 #endif
366  if (is_cuda) {
367  const size_t block_size = dev_config.block_dim.x * dev_config.block_dim.y;
368  ensemble_dev_config.block_dim.x = ensemble;
369  ensemble_dev_config.block_dim.y = block_size/ensemble;
370  }
371  Perf perf_ensemble =
372  fenl_assembly<mp_vector_type,Device,Method>(
374  ensemble_dev_config, ensemble_residual);
375 
376  if (check)
377  check_residuals( scalar_residual, ensemble_residual );
378 
379  double s =
380  1000.0 / ( use_trials * ensemble * perf_scalar.global_node_count );
381  perf_scalar.scale(s);
382  perf_ensemble.scale(s);
383 
384  if (comm->getRank() == 0) {
385  std::cout.precision(3);
386  std::cout << use_nodes[0] << " , "
387  << perf_scalar.global_node_count << " , "
388  << std::setw(2) << ensemble << " , "
389  << std::scientific
390  << perf_scalar.import_time << " , "
391  << perf_ensemble.import_time << " , "
392  << std::fixed << std::setw(6)
393  << perf_scalar.import_time / perf_ensemble.import_time << " , "
394  << std::scientific
395  << perf_scalar.fill_time << " , "
396  << perf_ensemble.fill_time << " , "
397  << std::fixed << std::setw(6)
398  << perf_scalar.fill_time / perf_ensemble.fill_time << " , "
399  << std::endl;
400  }
401  }
402 };
403 
404 template <class Storage, int entry_min, int entry_max, int entry_step,
407  const int use_print ,
408  const int use_trials ,
409  const int use_nodes[] ,
410  const bool check ,
412 {
413  if (comm->getRank() == 0) {
414  std::cout.precision(8);
415  std::cout << std::endl
416  << "\"Grid Size\" , "
417  << "\"FEM Size\" , "
418  << "\"Ensemble Size\" , "
419  << "\"Scalar Import Time\" , "
420  << "\"Ensemble Import Time\" , "
421  << "\"Ensemble Import Speedup\" , "
422  << "\"Scalar Fill Time\" , "
423  << "\"Ensemble Fill Time\" , "
424  << "\"Ensemble Fill Speedup\" , "
425  << std::endl;
426  }
427 
428  // Loop over [entry_min, entry_max] vector entries per thread
429  typedef Sacado::mpl::range_c< int, entry_min, entry_max+1, entry_step > Range;
430  PerformanceDriverOp<Storage,Method> op(comm, use_print, use_trials,
431  use_nodes, check, dev_config);
432  Sacado::mpl::for_each_no_kokkos<Range> f(op);
433 }
Perf fenl_assembly(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], Kokkos::Example::FENL::DeviceConfig dev_config, Kokkos::View< Scalar *, Kokkos::LayoutLeft, Device > &nodal_residual)
bool check_residuals(const ScalarViewType &scalar_residual, const EnsembleViewType &ensemble_residual)
Stokhos::StandardStorage< int, double > storage_type
Stokhos::StandardStorage< int, double > Storage
static void eval(Kokkos::Example::FENL::DeviceConfig &dev_config_elem, Kokkos::Example::FENL::DeviceConfig &dev_config_bc)
#define TEUCHOS_TEST_FLOATING_EQUALITY(v1, v2, tol, out, success)
Kokkos::DefaultExecutionSpace execution_space
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< View< T, P...> >::value, unsigned >::type dimension_scalar(const View< T, P...> &view)
PerformanceDriverOp(const Teuchos::RCP< const Teuchos::Comm< int > > &comm_, const int use_print_, const int use_trials_, const int use_nodes_[], const bool check_, Kokkos::Example::FENL::DeviceConfig dev_config_)
void operator()(ArgT arg) const
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Storage::execution_space Device
void increment(const Perf &p)
Teuchos::RCP< const Teuchos::Comm< int > > comm
TEUCHOS_DEPRECATED void reduceAll(const Comm< Ordinal > &comm, const EReductionType reductType, const Packet &send, Packet *globalReduct)
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
expr1 expr1 expr1 expr2 expr1 expr1 c expr2 expr1 c fastAccessCoeff(j)-expr2.val(j)
void scale(double s)
void performance_test_driver(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
#define TEUCHOS_TEST_EQUALITY(v1, v2, out, success)
Kokkos::Example::FENL::DeviceConfig dev_config
double maximum(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, double local)
ScalarType f(const Teuchos::Array< ScalarType > &x, double a, double b)
Generate a distributed unstructured finite element mesh from a partitioned NX*NY*NZ box of elements...
Stokhos::CrsMatrix< ValueType, Device, Layout >::HostMirror create_mirror_view(const Stokhos::CrsMatrix< ValueType, Device, Layout > &A)