Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
fad_kokkos_hierarchical.cpp
Go to the documentation of this file.
1 //#define SACADO_VIEW_CUDA_HIERARCHICAL 1
2 //#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD_STRIDED 1
3 #define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
4 #define SACADO_KOKKOS_USE_MEMORY_POOL 1
5 #define SACADO_ALIGN_SFAD 1
6 
7 #include "Sacado.hpp"
8 
11 
12 #include "Kokkos_Core.hpp"
13 #include "Kokkos_MemoryPool.hpp"
14 #include "impl/Kokkos_Timer.hpp"
15 #include <cstdio>
16 #include <algorithm>
17 
18 // Advection kernel.
19 
20 template<typename FluxView, typename WgbView, typename SrcView,
21  typename WbsView, typename ResidualView>
23 
24 template<typename FluxView, typename WgbView, typename SrcView,
25  typename WbsView, typename ResidualView>
28 create_advection_kernel(const FluxView& flux, const WgbView& bg,
29  const SrcView& src, const WbsView& bs,
30  const ResidualView& residual,
31  const typename FluxView::non_const_value_type& coeff);
32 
33 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
34 
35 // Version of advection kernel that supports flat and hierarchical parallelism
36 // when using the hierarchical_dfad approach. This requires no changes to the
37 // kernel beyond supporting a team policy.
38 template<typename FluxView, typename WgbView, typename SrcView,
39  typename WbsView, typename ResidualView>
40 struct AdvectionKernel {
41  typedef typename FluxView::non_const_value_type scalar_type;
42  typedef typename FluxView::execution_space execution_space;
43  typedef typename Kokkos::TeamPolicy<execution_space>::member_type team_handle;
44 
45  const FluxView flux_m_i;
46  const WgbView wgb;
47  const SrcView src_m_i;
48  const WbsView wbs;
49  const ResidualView residual_m_i;
51  const size_t ncells;
53 
54  // VS isn't used in this kernel
55  template <unsigned VS> struct HierarchicalFlatTag {};
56  template <unsigned VS> struct HierarchicalTeamTag {};
57 
59  AdvectionKernel(const FluxView& flux, const WgbView& gb,
60  const SrcView& src, const WbsView& bs,
61  const ResidualView& residual, const scalar_type& c) :
62  flux_m_i(flux),
63  wgb(gb),
64  src_m_i(src),
65  wbs(bs),
66  residual_m_i(residual),
67  coeff(c),
68  ncells(flux_m_i.extent(0)),
69  num_basis(wgb.extent(1)),
70  num_points(wgb.extent(2)),
71  num_dim((wgb.extent(3)))
72  {
73  }
74 
76  size_t num_cells() const { return ncells; }
77 
79  void operator() (const size_t cell, const int basis) const {
80  scalar_type value(0),value2(0);
81  for (int qp=0; qp<num_points; ++qp) {
82  for (int dim=0; dim<num_dim; ++dim)
83  value += flux_m_i(cell,qp,dim)*wgb(cell,basis,qp,dim);
84  value2 += src_m_i(cell,qp)*wbs(cell,basis,qp);
85  }
86  residual_m_i(cell,basis) = coeff*(value+value2);
87  }
88 
90  void operator() (const size_t cell) const {
91  for (int basis=0; basis<num_basis; ++basis) {
92  (*this)(cell,basis);
93  }
94  }
95 
96  template <unsigned VS>
98  void operator() (const HierarchicalFlatTag<VS>, const team_handle& team) const {
99  const size_t cell = team.league_rank()*team.team_size() + team.team_rank();
100  if (cell < ncells)
101  (*this)(cell);
102  }
103 
104  template <unsigned VS>
106  void operator() (const HierarchicalTeamTag<VS>, const team_handle& team) const {
107  const size_t cell = team.league_rank();
108  const int team_size = team.team_size();
109  for (int basis=team.team_rank(); basis<num_basis; basis+=team_size)
110  (*this)(cell, basis);
111  }
112 };
113 
114 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL)
115 
116 // Version of advection kernel that supports flat and hierarchical parallelism
117 // when using the hierarchical approach. This requires a separate scalar type
118 // for temporaries, and partitioning of non-temporary scalars.
119 template<typename FluxView, typename WgbView, typename SrcView,
120  typename WbsView, typename ResidualView>
121 struct AdvectionKernel {
122  typedef typename FluxView::non_const_value_type scalar_type;
123  typedef typename Kokkos::ThreadLocalScalarType<FluxView>::type local_scalar_type;
124  typedef typename FluxView::execution_space execution_space;
125  typedef typename Kokkos::TeamPolicy<execution_space>::member_type team_handle;
126  enum { stride = Kokkos::ViewScalarStride<FluxView>::stride };
127 
128  const FluxView flux_m_i;
129  const WgbView wgb;
130  const SrcView src_m_i;
131  const WbsView wbs;
132  const ResidualView residual_m_i;
133  const scalar_type coeff;
134  const size_t ncells;
135  const int num_basis, num_points, num_dim;
136 
137  // VS isn't used in this kernel
138  template <unsigned VS> struct HierarchicalFlatTag {};
139  template <unsigned VS> struct HierarchicalTeamTag {};
140 
142  AdvectionKernel(const FluxView& flux, const WgbView& gb,
143  const SrcView& src, const WbsView& bs,
144  const ResidualView& residual, const scalar_type& c) :
145  flux_m_i(flux),
146  wgb(gb),
147  src_m_i(src),
148  wbs(bs),
149  residual_m_i(residual),
150  coeff(c),
151  ncells(flux_m_i.extent(0)),
152  num_basis(wgb.extent(1)),
153  num_points(wgb.extent(2)),
154  num_dim((wgb.extent(3)))
155  {
156  }
157 
159  size_t num_cells() const { return ncells; }
160 
162  void operator() (const size_t cell, const int basis) const {
163  local_scalar_type value(0),value2(0);
164  local_scalar_type c = Sacado::partition_scalar<stride>(coeff);
165  for (int qp=0; qp<num_points; ++qp) {
166  for (int dim=0; dim<num_dim; ++dim)
167  value += flux_m_i(cell,qp,dim)*wgb(cell,basis,qp,dim);
168  value2 += src_m_i(cell,qp)*wbs(cell,basis,qp);
169  }
170  residual_m_i(cell,basis) = c*(value+value2);
171  }
172 
174  void operator() (const size_t cell) const {
175  for (int basis=0; basis<num_basis; ++basis) {
176  (*this)(cell,basis);
177  }
178  }
179 
180  template <unsigned VS>
182  void operator() (const HierarchicalFlatTag<VS>, const team_handle& team) const {
183  const size_t cell = team.league_rank()*team.team_size() + team.team_rank();
184  if (cell < ncells)
185  (*this)(cell);
186  }
187 
188  template <unsigned VS>
190  void operator() (const HierarchicalTeamTag<VS>, const team_handle& team) const {
191  const size_t cell = team.league_rank();
192  const int team_size = team.team_size();
193  for (int basis=team.team_rank(); basis<num_basis; basis+=team_size)
194  (*this)(cell, basis);
195  }
196 };
197 
198 #else
199 
200 // Version of advection kernel that supports flat and hierarchical parallelism
201 // when using the partitioning approach. This requires additional code to
202 // create thread-local views.
203 template<typename FluxView, typename WgbView, typename SrcView,
204  typename WbsView, typename ResidualView>
205 struct AdvectionKernel {
206  typedef typename FluxView::non_const_value_type scalar_type;
207  typedef typename FluxView::execution_space execution_space;
208  typedef typename Kokkos::TeamPolicy<execution_space>::member_type team_handle;
209 
210  const FluxView flux_m_i;
211  const WgbView wgb;
212  const SrcView src_m_i;
213  const WbsView wbs;
214  const ResidualView residual_m_i;
215  const scalar_type coeff;
216  const size_t ncells;
217  const int num_basis, num_points, num_dim;
218 
219  template <unsigned VS> struct HierarchicalFlatTag {};
220  template <unsigned VS> struct HierarchicalTeamTag {};
221  template <unsigned VS> struct PartitionedTag {};
222 
224  AdvectionKernel(const FluxView& flux, const WgbView& gb,
225  const SrcView& src, const WbsView& bs,
226  const ResidualView& residual, const scalar_type& c) :
227  flux_m_i(flux),
228  wgb(gb),
229  src_m_i(src),
230  wbs(bs),
231  residual_m_i(residual),
232  coeff(c),
233  ncells(flux_m_i.extent(0)),
234  num_basis(wgb.extent(1)),
235  num_points(wgb.extent(2)),
236  num_dim((wgb.extent(3)))
237  {
238  }
239 
241  size_t num_cells() const { return ncells; }
242 
244  void operator() (const size_t cell, const int basis) const {
245  scalar_type value(0),value2(0);
246  for (int qp=0; qp<num_points; ++qp) {
247  for (int dim=0; dim<num_dim; ++dim)
248  value += flux_m_i(cell,qp,dim)*wgb(cell,basis,qp,dim);
249  value2 += src_m_i(cell,qp)*wbs(cell,basis,qp);
250  }
251  residual_m_i(cell,basis) = coeff*(value+value2);
252  }
253 
255  void operator() (const size_t cell) const {
256  for (int basis=0; basis<num_basis; ++basis) {
257  (*this)(cell,basis);
258  }
259  }
260 
261  template <unsigned VS>
263  void operator() (const PartitionedTag<VS>, const size_t cell, const int basis) const {
264  // VS is the "vector" size == blockDim.x for CUDA. This is also set in
265  // the team execution policy, but we don't seem to have a way to access it.
266  // We also don't have a way to access the vector lane index from the team
267  // handle.
268 #ifdef __CUDA_ARCH__
269  const unsigned k = threadIdx.x;
270 #else
271  const unsigned k = 0;
272 #endif
273 
274  // Partition each view based on Cuda thread (vector) index
275  auto flux_part = Kokkos::partition<VS>(flux_m_i, k, VS);
276  auto wgb_part = Kokkos::partition<VS>(wgb, k, VS);
277  auto src_part = Kokkos::partition<VS>(src_m_i, k, VS);
278  auto wbs_part = Kokkos::partition<VS>(wbs, k, VS);
279  auto resid_part = Kokkos::partition<VS>(residual_m_i, k, VS);
280  auto coeff_part = Sacado::partition_scalar<VS>(coeff);
281 
282  // Now run the kernel with thread-local view's
283  auto kernel_part = create_advection_kernel(flux_part, wgb_part, src_part,
284  wbs_part, resid_part,
285  coeff_part);
286  kernel_part(cell, basis);
287  }
288 
289  template <unsigned VS>
291  void operator() (const HierarchicalFlatTag<VS>, const team_handle& team) const {
292  const size_t cell = team.league_rank()*team.team_size() + team.team_rank();
293  if (cell < ncells)
294  for (int basis=0; basis<num_basis; ++basis)
295  (*this)(PartitionedTag<VS>(), cell, basis);
296  }
297 
298  template <unsigned VS>
300  void operator() (const HierarchicalTeamTag<VS>, const team_handle& team) const {
301  const size_t cell = team.league_rank();
302  const int team_size = team.team_size();
303  for (int basis=team.team_rank(); basis<num_basis; basis+=team_size)
304  (*this)(PartitionedTag<VS>(), cell, basis);
305  }
306 };
307 
308 #endif
309 
310 template<typename FluxView, typename WgbView, typename SrcView,
311  typename WbsView, typename ResidualView>
314 create_advection_kernel(const FluxView& flux, const WgbView& bg,
315  const SrcView& src, const WbsView& bs,
316  const ResidualView& residual,
317  const typename FluxView::non_const_value_type& coeff)
318 {
320  return kernel_type(flux,bg,src,bs,residual,coeff);
321 }
322 
323 template<typename KernelType>
324 void run_flat(const KernelType& kernel) {
325  typedef typename KernelType::execution_space execution_space;
326  Kokkos::RangePolicy<execution_space> policy(0,kernel.num_cells());
327  Kokkos::parallel_for(policy, kernel);
328 }
329 
330 template<typename KernelType>
331 void run_hierarchical_flat(const KernelType& kernel) {
332  typedef typename KernelType::execution_space execution_space;
333 #if defined (KOKKOS_ENABLE_CUDA)
334  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
335 #else
336  const bool is_cuda = false;
337 #endif
338  const unsigned vector_size = is_cuda ? 32 : 1;
339  if (is_cuda) {
340  const unsigned team_size = 256 / vector_size;
341  typedef typename KernelType::template HierarchicalFlatTag<vector_size> tag_type;
342  typedef Kokkos::TeamPolicy<execution_space,tag_type> policy_type;
343  const size_t range = (kernel.num_cells()+team_size-1)/team_size;
344  policy_type policy(range,team_size,vector_size);
345  Kokkos::parallel_for(policy, kernel);
346  }
347  else {
348  run_flat(kernel);
349  }
350 }
351 
352 template<typename KernelType>
353 void run_hierarchical_team(const KernelType& kernel) {
354  typedef typename KernelType::execution_space execution_space;
355 #if defined (KOKKOS_ENABLE_CUDA)
356  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
357 #else
358  const bool is_cuda = false;
359 #endif
360  const unsigned vector_size = is_cuda ? 32 : 1;
361  if (is_cuda) {
362  const unsigned team_size = 256 / vector_size;
363  typedef typename KernelType::template HierarchicalTeamTag<vector_size> tag_type;
364  typedef Kokkos::TeamPolicy<execution_space,tag_type> policy_type;
365  policy_type policy(kernel.num_cells(),team_size,vector_size);
366  Kokkos::parallel_for(policy, kernel);
367  }
368  else {
369  run_flat(kernel);
370  }
371 }
372 
373 template <typename T> struct FadTypeName;
374 template <typename T> struct FadTypeName< Sacado::Fad::DFad<T> > {
375  static std::string eval() {
376  return std::string("dfad")
377 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
378  + std::string(", mempool")
379 #endif
380 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD_STRIDED)
381  + std::string(", strided")
382 #endif
383  ;
384  }
385 };
386 template <typename T, int N> struct FadTypeName< Sacado::Fad::SFad<T,N> > {
387  static std::string eval() {
388 #if defined(SACADO_ALIGN_SFAD)
389  return "sfad, aligned";
390 #else
391  return "sfad";
392 #endif
393  }
394 };
395 template <typename T, int N> struct FadTypeName< Sacado::Fad::SLFad<T,N> > {
396  static std::string eval() { return "slfad"; }
397 };
398 
399 template<typename ExecSpace, int DIM, int N>
400 struct DrekarTest {
401 
402  int ncells;
405  int ndim;
406 
407  struct MomFluxTag {};
408  struct MomFluxTagConst {};
410 
411  typedef Kokkos::View<double****[N+1],ExecSpace> t_4DView;
412  typedef Kokkos::View<double***[N+1],ExecSpace> t_3DView;
413  typedef Kokkos::View<double**[N+1],ExecSpace> t_2DView;
414 
415  typedef Kokkos::View<const double***[N+1],ExecSpace,Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_3DView_const;
416 
417 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
418  typedef Sacado::Fad::DFad<double> FadType; // Must be DFad in this case
419 #else
421 #endif
422  typedef Kokkos::View<FadType****,ExecSpace> t_4DViewFad;
423  typedef Kokkos::View<FadType***,ExecSpace> t_3DViewFad;
424  typedef Kokkos::View<FadType**,ExecSpace> t_2DViewFad;
425 
426  typedef typename ExecSpace::array_layout DefaultLayout;
427 #if defined(KOKKOS_ENABLE_CUDA)
428  static const int FadStride =
429  std::is_same< ExecSpace, Kokkos::Cuda >::value ? 32 : 1;
430 #if defined(SACADO_ALIGN_SFAD)
431  static const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
432  typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
433 #else
434  typedef FadType AlignedFadType;
435 #endif
436 #else
437  static const int FadStride = 1;
439 #endif
441  typedef Kokkos::View<AlignedFadType****,ContLayout,ExecSpace> t_4DViewFadCont;
442  typedef Kokkos::View<AlignedFadType***,ContLayout,ExecSpace> t_3DViewFadCont;
443  typedef Kokkos::View<AlignedFadType**,ContLayout,ExecSpace> t_2DViewFadCont;
444 
445  typedef typename Kokkos::TeamPolicy<ExecSpace>::member_type team_handle;
446 
447  typedef Kokkos::View<double****[N+1],Kokkos::LayoutRight,ExecSpace> t_4DView_team;
448  typedef Kokkos::View<double***[N+1],Kokkos::LayoutRight,ExecSpace> t_3DView_team;
449  typedef Kokkos::View<double**[N+1],Kokkos::LayoutRight,ExecSpace> t_2DView_team;
450  typedef Kokkos::View<const double***[N+1],Kokkos::LayoutRight,ExecSpace,Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_3DView_const_team;
451 
452 
453  typedef Kokkos::View<double[N+1],typename ExecSpace::scratch_memory_space,Kokkos::MemoryTraits<Kokkos::Unmanaged> > t_shared_scalar;
454 
458 
462 
467 
472 
474 
475  DrekarTest(int ncells_, int num_basis_, int num_points_):
476  ncells(ncells_) ,
477  num_basis(num_basis_) ,
478  num_points(num_points_) ,
479  ndim(DIM),
480  coeff(N, 0.0)
481  {
482  wgb_fad = t_4DViewFad("",ncells,num_basis,num_points,ndim,N+1);
483  wbs_fad = t_3DViewFad("",ncells,num_basis,num_points,N+1);
484  flux_m_i_fad = t_3DViewFad("",ncells,num_points,ndim,N+1);
485  src_m_i_fad = t_2DViewFad("",ncells,num_points,N+1);
486  residual_m_i_fad = t_2DViewFad("",ncells,num_basis,N+1);
488 
489  wgb_fad_cont = t_4DViewFadCont("",ncells,num_basis,num_points,ndim,N+1);
490  wbs_fad_cont = t_3DViewFadCont("",ncells,num_basis,num_points,N+1);
491  flux_m_i_fad_cont = t_3DViewFadCont("",ncells,num_points,ndim,N+1);
492  src_m_i_fad_cont = t_2DViewFadCont("",ncells,num_points,N+1);
493  residual_m_i_fad_cont = t_2DViewFadCont("",ncells,num_basis,N+1);
495 
496  wgb = t_4DView("",ncells,num_basis,num_points,ndim);
497  wbs = t_3DView("",ncells,num_basis,num_points);
498  flux_m_i_const = flux_m_i = t_3DView("",ncells,num_points,ndim);
499  src_m_i = t_2DView("",ncells,num_points);
500  residual_m_i = t_2DView("",ncells,num_basis);
502 
503  residual_m_i_const = t_2DView("",ncells,num_basis);
504  Kokkos::deep_copy( residual_m_i_const, 0.0 );
505 
506  twgb = t_4DView_team("",ncells,num_basis,num_points,ndim);
507  twbs = t_3DView_team("",ncells,num_basis,num_points);
509  tsrc_m_i = t_2DView_team("",ncells,num_points);
510  tresidual_m_i = t_2DView_team("",ncells,num_basis);
512 
513  for (int i=0; i<N; ++i)
514  coeff.fastAccessDx(i) = generate_fad(1,1,1,1,N,0,0,0,0,i);
515  coeff.val() = generate_fad(1,1,1,1,N,0,0,0,0,N);
516  }
517 
518  typename FadType::value_type
519  generate_fad( const size_t n0, const size_t n1,
520  const size_t n2, const size_t n3, const int fad_size,
521  const size_t i0, const size_t i1,
522  const size_t i2, const size_t i3,
523  const int i_fad)
524  {
525  typedef typename FadType::value_type scalar;
526  const scalar x0 = 10.0 + scalar(n0) / scalar(i0+1);
527  const scalar x1 = 100.0 + scalar(n1) / scalar(i1+1);
528  const scalar x2 = 1000.0 + scalar(n2) / scalar(i2+1);
529  const scalar x3 = 10000.0 + scalar(n3) / scalar(i3+1);
530  const scalar x = x0 + x1 + x2 + x3;
531  if (i_fad == fad_size)
532  return x;
533  const scalar x_fad = 1.0 + scalar(fad_size) / scalar(i_fad+1);
534  return x + x_fad;
535  }
536 
537  template <typename V1, typename V2, typename V3, typename V4, typename V5>
538  void init_fad(const V1& v1, const V2& v2, const V3& v3, const V4& v4, const V5& v5)
539  {
540  // Kokkos::deep_copy(typename V1::array_type(v1), 1.0);
541  // Kokkos::deep_copy(typename V2::array_type(v2), 2.0);
542  // Kokkos::deep_copy(typename V3::array_type(v3), 3.0);
543  // Kokkos::deep_copy(typename V4::array_type(v4), 4.0);
544 
545  auto v1_h = Kokkos::create_mirror_view(v1);
546  auto v2_h = Kokkos::create_mirror_view(v2);
547  auto v3_h = Kokkos::create_mirror_view(v3);
548  auto v4_h = Kokkos::create_mirror_view(v4);
549  for (int cell=0; cell<ncells; ++cell) {
550  for (int basis=0; basis<num_basis; ++basis) {
551  for (int qp=0; qp<num_points; ++qp) {
552  for (int dim=0; dim<ndim; ++dim) {
553  for (int i=0; i<N; ++i)
554  v1_h(cell,basis,qp,dim).fastAccessDx(i) =
555  generate_fad(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,i);
556  v1_h(cell,basis,qp,dim).val() =
557  generate_fad(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,N);
558  }
559  for (int i=0; i<N; ++i)
560  v2_h(cell,basis,qp).fastAccessDx(i) =
561  generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,i);
562  v2_h(cell,basis,qp).val() =
563  generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,N);
564  }
565  }
566  for (int qp=0; qp<num_points; ++qp) {
567  for (int dim=0; dim<ndim; ++dim) {
568  for (int i=0; i<N; ++i)
569  v3_h(cell,qp,dim).fastAccessDx(i) =
570  generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,i);
571  v3_h(cell,qp,dim).val() =
572  generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,N);
573  }
574  for (int i=0; i<N; ++i)
575  v4_h(cell,qp).fastAccessDx(i) =
576  generate_fad(ncells,1,num_points,1,N,cell,0,qp,0,i);
577  v4_h(cell,qp).val() =
578  generate_fad(ncells,1,num_points,1,N,cell,0,qp,0,N);
579  }
580  }
581 
582  Kokkos::deep_copy( v1, v1_h );
583  Kokkos::deep_copy( v2, v2_h );
584  Kokkos::deep_copy( v3, v3_h );
585  Kokkos::deep_copy( v4, v4_h );
586 
587  Kokkos::deep_copy(typename V5::array_type(v5), 0.0);
588  }
589 
590  template <typename V1, typename V2, typename V3, typename V4, typename V5>
591  void init_array(const V1& v1, const V2& v2, const V3& v3, const V4& v4, const V5& v5)
592  {
593  // Kokkos::deep_copy(typename V1::array_type(v1), 1.0);
594  // Kokkos::deep_copy(typename V2::array_type(v2), 2.0);
595  // Kokkos::deep_copy(typename V3::array_type(v3), 3.0);
596  // Kokkos::deep_copy(typename V4::array_type(v4), 4.0);
597 
598  auto v1_h = Kokkos::create_mirror_view(v1);
599  auto v2_h = Kokkos::create_mirror_view(v2);
600  auto v3_h = Kokkos::create_mirror_view(v3);
601  auto v4_h = Kokkos::create_mirror_view(v4);
602  for (int cell=0; cell<ncells; ++cell) {
603  for (int basis=0; basis<num_basis; ++basis) {
604  for (int qp=0; qp<num_points; ++qp) {
605  for (int dim=0; dim<ndim; ++dim) {
606  for (int i=0; i<N; ++i)
607  v1_h(cell,basis,qp,dim,i) =
608  generate_fad(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,i);
609  v1_h(cell,basis,qp,dim,N) =
610  generate_fad(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,N);
611  }
612  for (int i=0; i<N; ++i)
613  v2_h(cell,basis,qp,i) =
614  generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,i);
615  v2_h(cell,basis,qp,N) =
616  generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,N);
617  }
618  }
619  for (int qp=0; qp<num_points; ++qp) {
620  for (int dim=0; dim<ndim; ++dim) {
621  for (int i=0; i<N; ++i)
622  v3_h(cell,qp,dim,i) =
623  generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,i);
624  v3_h(cell,qp,dim,N) =
625  generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,N);
626  }
627  for (int i=0; i<N; ++i)
628  v4_h(cell,qp,i) =
629  generate_fad(ncells,1,num_points,1,N,cell,0,qp,0,i);
630  v4_h(cell,qp,N) =
631  generate_fad(ncells,1,num_points,1,N,cell,0,qp,0,N);
632  }
633  }
634 
635  Kokkos::deep_copy( v1, v1_h );
636  Kokkos::deep_copy( v2, v2_h );
637  Kokkos::deep_copy( v3, v3_h );
638  Kokkos::deep_copy( v4, v4_h );
639 
640  Kokkos::deep_copy(typename V5::array_type(v5), 0.0);
641  }
642 
643  template <typename View1, typename View2>
644  typename std::enable_if< !Kokkos::is_view_fad<View2>::value, bool>::type
645  check(const View1& v_gold, const View2& v, const double tol)
646  {
647  // Copy to host
648  typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold);
649  typename View2::HostMirror v_h = Kokkos::create_mirror_view(v);
650  Kokkos::deep_copy(v_gold_h, v_gold);
651  Kokkos::deep_copy(v_h, v);
652 
653  typedef typename View1::value_type value_type;
654 
655  const size_t n0 = v_gold_h.extent(0);
656  const size_t n1 = v_gold_h.extent(1);
657  const size_t n2 = v_gold_h.extent(2);
658 
659  bool success = true;
660  for ( size_t i0 = 0 ; i0 < n0 ; ++i0 ) {
661  for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
662  for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
663  value_type x_gold = v_gold_h(i0,i1,i2);
664  value_type x = v_h(i0,i1,i2);
665  if (std::abs(x_gold-x) > tol*std::abs(x_gold)) {
666  std::cout << "Comparison failed! x_gold("
667  << i0 << "," << i1 << "," << i2 << ") = "
668  << x_gold << " , x = " << x
669  << std::endl;
670  success = false;
671  }
672  }
673  }
674  }
675 
676  return success;
677  }
678 
679  template <typename View1, typename View2>
680  typename std::enable_if< Kokkos::is_view_fad<View2>::value, bool>::type
681  check(const View1& v_gold, const View2& v, const double tol)
682  {
683  // Copy to host
684  typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold);
685  typename View2::HostMirror v_h = Kokkos::create_mirror_view(v);
686  Kokkos::deep_copy(v_gold_h, v_gold);
687  Kokkos::deep_copy(v_h, v);
688 
689  typedef typename View1::value_type value_type;
690 
691  const size_t n0 = v_gold_h.extent(0);
692  const size_t n1 = v_gold_h.extent(1);
693  const size_t n2 = v_gold_h.extent(2);
694 
695  bool success = true;
696  for ( size_t i0 = 0 ; i0 < n0 ; ++i0 ) {
697  for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
698  for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
699  value_type x_gold = v_gold_h(i0,i1,i2);
700  value_type x = (i2 == n2-1) ? v_h(i0,i1).val() : v_h(i0,i1).dx(i2);
701  if (std::abs(x_gold-x) > tol*std::abs(x_gold)) {
702  std::cout << "Comparison failed! x_gold("
703  << i0 << "," << i1 << "," << i2 << ") = "
704  << x_gold << " , x = " << x
705  << std::endl;
706  success = false;
707  }
708  }
709  }
710  }
711 
712  return success;
713  }
714 
716  void operator() (const MomFluxTag, const std::size_t &cell) const {
717  for (int basis=0; basis<num_basis; ++basis) {
718  double value[N+1],value2[N+1];
719  for (int k=0; k<N+1; ++k) {
720  value[k] = 0.0;
721  value2[k] = 0.0;
722  }
723  for (int qp=0; qp<num_points; ++qp) {
724  for (int dim=0; dim<DIM; ++dim) {
725  const double flux_val = flux_m_i(cell,qp,dim,N);
726  const double wgb_val = wgb(cell,basis,qp,dim,N);
727  value[N] += flux_val*wgb_val;
728  for(int k = 0; k<N;k++)
729  value[k] += flux_val*wgb(cell,basis,qp,dim,k)+flux_m_i(cell,qp,dim,k)*wgb_val;
730  }
731  const double src_val = src_m_i(cell,qp,N);
732  const double wbs_val = wbs(cell,basis,qp,N);
733  value2[N] += src_val*wbs_val;
734  for(int k = 0; k<N;k++)
735  value2[k] += src_val*wbs(cell,basis,qp,k)+src_m_i(cell,qp,k)*wbs_val;
736  }
737  for(int k = 0; k<N; k++)
738  residual_m_i(cell,basis,k) =
739  coeff.val()*(value[k]+value2[k]) +
740  coeff.fastAccessDx(k)*(value[N]+value2[N]);
741  residual_m_i(cell,basis,N)= coeff.val()*(value[N]+value2[N]);
742  }
743  }
744 
746  void operator() (const MomFluxTagConst, const std::size_t &cell) const {
747  for (int basis=0; basis<num_basis; ++basis) {
748  double value[N+1],value2[N+1];
749  for (int k=0; k<N+1; ++k) {
750  value[k] = 0.0;
751  value2[k] = 0.0;
752  }
753  for (int qp=0; qp<num_points; ++qp) {
754  for (int dim=0; dim<DIM; ++dim) {
755  const double flux_val = flux_m_i(cell,qp,dim,N);
756  const double wgb_val = wgb(cell,basis,qp,dim,N);
757  value[N] += flux_val*wgb_val;
758  for(int k = 0; k<N;k++)
759  value[k] += flux_val*wgb(cell,basis,qp,dim,k)+flux_m_i_const(cell,qp,dim,k)*wgb_val;
760  }
761  const double src_val = src_m_i(cell,qp,N);
762  const double wbs_val = wbs(cell,basis,qp,N);
763  value2[N] += src_val*wbs_val;
764  for(int k = 0; k<N;k++)
765  value2[k] += src_val*wbs(cell,basis,qp,k)+src_m_i(cell,qp,k)*wbs_val;
766  }
767  for(int k = 0; k<N; k++)
768  residual_m_i_const(cell,basis,k) =
769  coeff.val()*(value[k]+value2[k]) +
770  coeff.fastAccessDx(k)*(value[N]+value2[N]);
771  residual_m_i_const(cell,basis,N)= coeff.val()*(value[N]+value2[N]);
772  }
773  }
774 
775 
777  void compute_one(const MomFluxTagConstTeam, const team_handle& team, const int &cell, const int& basis,
778  const t_shared_scalar& value, const t_shared_scalar& value2) const {
779  for (int qp=0; qp<num_points; ++qp) {
780  for (int dim=0; dim<DIM; ++dim) {
781  const double flux_val = tflux_m_i(cell,qp,dim,N);
782  const double wgb_val = twgb(cell,basis,qp,dim,N);
783  Kokkos::single(Kokkos::PerThread(team), [&] () {
784  value[N] += flux_val*wgb_val;
785  });
786  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N), [&] (const int& k) {
787  value[k] += flux_val*twgb(cell,basis,qp,dim,k)+tflux_m_i_const(cell,qp,dim,k)*wgb_val;
788  });
789  }
790  const double src_val = tsrc_m_i(cell,qp,N);
791  const double wbs_val = twbs(cell,basis,qp,N);
792  Kokkos::single(Kokkos::PerThread(team), [&] () {
793  value2[N] += src_val*wbs_val;
794  });
795  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N), [&] (const int& k) {
796  value2[k] += src_val*twbs(cell,basis,qp,k)+tsrc_m_i(cell,qp,k)*wbs_val;
797  });
798  }
799  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N), [&] (const int& k) {
800  tresidual_m_i(cell,basis,k) =
801  coeff.val()*(value[k]+value2[k]) +
802  coeff.fastAccessDx(k)*(value[N]+value2[N]);
803  });
804  Kokkos::single(Kokkos::PerThread(team), [&] () {
805  tresidual_m_i(cell,basis,N) = coeff.val()*(value[N]+value2[N]);
806  });
807  }
808 
810  void operator() (const MomFluxTagConstTeam, const team_handle& team) const {
811  t_shared_scalar value1(team.thread_scratch(0));
812  t_shared_scalar value2(team.thread_scratch(0));
813 
814  const int cell = team.league_rank();
815  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_basis), [&] (const int& basis) {
816  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N+1), [&] (const int& k) {
817  value1(k) = 0;
818  value2(k) = 0;
819  });
820  compute_one(MomFluxTagConstTeam(),team,cell,basis,value1,value2);
821  });
822  }
823 
824  void compute(const int ntrial, const bool do_check) {
825  auto kernel_flat =
827  Kokkos::Impl::Timer timer;
828  for (int i=0; i<ntrial; ++i) {
829  run_flat(kernel_flat);
830  }
831  Kokkos::fence();
832  double time_fad = timer.seconds() / ntrial / ncells;
833 
834  auto kernel_team =
836  timer.reset();
837  for (int i=0; i<ntrial; ++i) {
838  run_hierarchical_team(kernel_team);
839  }
840  Kokkos::fence();
841  double time_fad_cont = timer.seconds() / ntrial / ncells;
842 
843  timer.reset();
844  for (int i=0; i<ntrial; ++i)
845  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace,MomFluxTag>(0,ncells), *this);
846  Kokkos::fence();
847  double time = timer.seconds() / ntrial / ncells;
848 
849  timer.reset();
850  for (int i=0; i<ntrial; ++i)
851  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace,MomFluxTagConst>(0,ncells), *this);
852  Kokkos::fence();
853  double time_const = timer.seconds() / ntrial / ncells;
854 
855  timer.reset();
856  for (int i=0; i<ntrial; ++i)
857  Kokkos::parallel_for(Kokkos::TeamPolicy<ExecSpace,MomFluxTagConstTeam>(ncells,num_basis,32).set_scratch_size(0,Kokkos::PerThread(64*8*2)), *this);
858  Kokkos::fence();
859  double time_team = timer.seconds() / ntrial / ncells;
860 
861  printf("%5d %9.3e %9.3e %9.3e %9.3e %9.3e\n",ncells,time_fad,time_fad_cont,time,time_const,time_team);
862 
863  if (do_check) {
864  const double tol = 1e-14;
869  }
870  }
871 };
872 
873 template <typename ExecSpace>
874 void run(const int cell_begin, const int cell_end, const int cell_step,
875  const int nbasis, const int npoint, const int ntrial, const bool check)
876 {
877  const int fad_dim = 50;
878  const int dim = 3;
879  typedef DrekarTest<ExecSpace,dim,fad_dim> test_type;
880 
881  // The kernel allocates 2*N double's per warp on Cuda. Approximate
882  // the maximum number of warps as the maximum concurrency / 32.
883  // Include a fudge factor of 1.2 since memory pool treats a block as full
884  // once it reaches 80% capacity
885  std::cout << "concurrency = " << ExecSpace::concurrency() << std::endl;
886  const size_t block_size = fad_dim*sizeof(double);
887  size_t nkernels = ExecSpace::concurrency()*2;
888 #if defined(KOKKOS_ENABLE_CUDA)
889  if (std::is_same<ExecSpace, Kokkos::Cuda>::value)
890  nkernels /= 32;
891 #endif
892  size_t mem_pool_size =
893  static_cast<size_t>(1.2*nkernels*block_size);
894  const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
895  std::cout << "Memory pool size = " << mem_pool_size / (1024.0 * 1024.0)
896  << " MB" << std::endl;
897  ExecSpace exec_space;
898  Sacado::createGlobalMemoryPool(exec_space, mem_pool_size,
899  block_size, block_size, superblock_size);
900 
901 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL)
902  std::cout << "hierarchical";
903 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
904  std::cout << "hierarchical_dfad";
905 #else
906  std::cout << "partitioned";
907 #endif
909  << ":" << std::endl;
910 
911  printf("ncell flat hier analytic const team\n");
912  for(int i=cell_begin; i<=cell_end; i+=cell_step) {
913  test_type test(i,nbasis,npoint);
914  test.compute(ntrial, check);
915  }
916 
918 }
919 
920 int main(int argc, char* argv[]) {
921  bool success = true;
922  try {
923 
924  // Set up command line options
926  clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
927 #ifdef KOKKOS_ENABLE_SERIAL
928  bool serial = 0;
929  clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
930 #endif
931 #ifdef KOKKOS_ENABLE_OPENMP
932  int openmp = 0;
933  clp.setOption("openmp", &openmp, "Number of OpenMP threads");
934 #endif
935 #ifdef KOKKOS_ENABLE_THREADS
936  int threads = 0;
937  clp.setOption("threads", &threads, "Number of pThreads threads");
938 #endif
939 #ifdef KOKKOS_ENABLE_CUDA
940  bool cuda = 0;
941  clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
942 #endif
943  int numa = 0;
944  clp.setOption("numa", &numa,
945  "Number of NUMA domains to use (set to 0 to use all NUMAs");
946  int cores_per_numa = 0;
947  clp.setOption("cores-per-numa", &cores_per_numa,
948  "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
949  bool print_config = false;
950  clp.setOption("print-config", "no-print-config", &print_config,
951  "Whether to print Kokkos device configuration");
952  int cell_begin = 100;
953  clp.setOption("begin", &cell_begin, "Starting number of cells");
954  int cell_end = 8000;
955  clp.setOption("end", &cell_end, "Ending number of cells");
956  int cell_step = 100;
957  clp.setOption("step", &cell_step, "Cell increment");
958  int nbasis = 8;
959  clp.setOption("basis", &nbasis, "Number of basis functions");
960  int npoint = 8;
961  clp.setOption("point", &npoint, "Number of integration points");
962  int ntrial = 5;
963  clp.setOption("trial", &ntrial, "Number of trials");
964  bool check = false;
965  clp.setOption("check", "no-check", &check,
966  "Check correctness of results");
967 
968  // Parse options
969  switch (clp.parse(argc, argv)) {
971  return 0;
974  return 1;
976  break;
977  }
978 
979  Kokkos::InitArguments init_args;
980  init_args.num_threads = -1;
981  #ifdef KOKKOS_ENABLE_OPENMP
982  if(openmp) init_args.num_threads = openmp;
983  #endif
984  #ifdef KOKKOS_ENABLE_THREADS
985  if(threads) init_args.num_threads = threads;
986  #endif
987 
988  Kokkos::initialize(init_args);
989  if (print_config)
990  Kokkos::print_configuration(std::cout, true);
991 
992 #ifdef KOKKOS_ENABLE_SERIAL
993  if (serial) {
994  using Kokkos::Serial;
995  run<Serial>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
996  }
997 #endif
998 
999 #ifdef KOKKOS_ENABLE_OPENMP
1000  if (openmp) {
1001  using Kokkos::OpenMP;
1002  run<OpenMP>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
1003  }
1004 #endif
1005 
1006 #ifdef KOKKOS_ENABLE_THREADS
1007  if (threads) {
1008  using Kokkos::Threads;
1009  run<Threads>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
1010  }
1011 #endif
1012 
1013 #ifdef KOKKOS_ENABLE_CUDA
1014  if (cuda) {
1015  using Kokkos::Cuda;
1016  run<Cuda>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
1017  }
1018 #endif
1019  Kokkos::finalize();
1020  }
1021  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
1022 
1023  return !success;
1024 }
t_3DView_team tflux_m_i
static const int FadStride
Kokkos::TeamPolicy< ExecSpace >::member_type team_handle
abs(expr.val())
FluxView::non_const_value_type scalar_type
t_3DViewFadCont wbs_fad_cont
KOKKOS_INLINE_FUNCTION size_t num_cells() const
Kokkos::View< AlignedFadType **, ContLayout, ExecSpace > t_2DViewFadCont
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
void run_hierarchical_team(const KernelType &kernel)
t_3DViewFadCont flux_m_i_fad_cont
AlignedFadType coeff
DrekarTest(int ncells_, int num_basis_, int num_points_)
GeneralFad< StaticStorage< T, Num > > SLFad
const scalar_type coeff
KOKKOS_INLINE_FUNCTION AdvectionKernel< FluxView, WgbView, SrcView, WbsView, ResidualView > create_advection_kernel(const FluxView &flux, const WgbView &bg, const SrcView &src, const WbsView &bs, const ResidualView &residual, const typename FluxView::non_const_value_type &coeff)
void run(const int cell_begin, const int cell_end, const int cell_step, const int nbasis, const int npoint, const int ntrial, const bool check)
Kokkos::View< double **[N+1], ExecSpace > t_2DView
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
t_3DView_const_team tflux_m_i_const
Kokkos::View< AlignedFadType ***, ContLayout, ExecSpace > t_3DViewFadCont
Kokkos::View< double **[N+1], Kokkos::LayoutRight, ExecSpace > t_2DView_team
t_4DViewFadCont wgb_fad_cont
KOKKOS_INLINE_FUNCTION void operator()(const MomFluxTag, const std::size_t &cell) const
#define KOKKOS_INLINE_FUNCTION
t_2DView_team tresidual_m_i
Sacado::Fad::DFad< double > FadType
void compute(const int ntrial, const bool do_check)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
FadType::value_type generate_fad(const size_t n0, const size_t n1, const size_t n2, const size_t n3, const int fad_size, const size_t i0, const size_t i1, const size_t i2, const size_t i3, const int i_fad)
std::enable_if< Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
GeneralFad< DynamicStorage< T > > DFad
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
const ResidualView residual_m_i
void init_array(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
FluxView::execution_space execution_space
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
int main()
Definition: ad_example.cpp:191
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
Kokkos::View< AlignedFadType ****, ContLayout, ExecSpace > t_4DViewFadCont
void run_flat(const KernelType &kernel)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
KOKKOS_INLINE_FUNCTION void operator()(const size_t cell, const int basis) const
Kokkos::View< double ***[N+1], Kokkos::LayoutRight, ExecSpace > t_3DView_team
Kokkos::TeamPolicy< execution_space >::member_type team_handle
void init_fad(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
t_2DViewFadCont residual_m_i_fad_cont
void run_hierarchical_flat(const KernelType &kernel)
void setDocString(const char doc_string[])
const double tol
Kokkos::View< double[N+1], typename ExecSpace::scratch_memory_space, Kokkos::MemoryTraits< Kokkos::Unmanaged > > t_shared_scalar
t_3DView_const flux_m_i_const
Kokkos::View< FadType ****, ExecSpace > t_4DViewFad
Kokkos::View< FadType ***, ExecSpace > t_3DViewFad
void test()
ExecSpace::array_layout DefaultLayout
Kokkos::LayoutContiguous< DefaultLayout, FadStride > ContLayout
Kokkos::View< const double ***[N+1], ExecSpace, Kokkos::MemoryTraits< Kokkos::RandomAccess > > t_3DView_const
void destroyGlobalMemoryPool(const ExecSpace &space)
KOKKOS_INLINE_FUNCTION void compute_one(const MomFluxTagConstTeam, const team_handle &team, const int &cell, const int &basis, const t_shared_scalar &value, const t_shared_scalar &value2) const
Kokkos::View< double ***[N+1], ExecSpace > t_3DView
Kokkos::View< const double ***[N+1], Kokkos::LayoutRight, ExecSpace, Kokkos::MemoryTraits< Kokkos::RandomAccess > > t_3DView_const_team
t_2DViewFadCont src_m_i_fad_cont
t_2DView_team tsrc_m_i
t_2DViewFad residual_m_i_fad
Kokkos::View< double ****[N+1], Kokkos::LayoutRight, ExecSpace > t_4DView_team
GeneralFad< StaticFixedStorage< T, Num > > SFad
Kokkos::View< double ****[N+1], ExecSpace > t_4DView
KOKKOS_INLINE_FUNCTION AdvectionKernel(const FluxView &flux, const WgbView &gb, const SrcView &src, const WbsView &bs, const ResidualView &residual, const scalar_type &c)
Kokkos::View< FadType **, ExecSpace > t_2DViewFad