Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
const_basis/advection_hierarchical_dfad.cpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Sacado Package
4 //
5 // Copyright 2006 NTESS and the Sacado contributors.
6 // SPDX-License-Identifier: LGPL-2.1-or-later
7 // *****************************************************************************
8 // @HEADER
9 
10 #define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
11 #define SACADO_KOKKOS_USE_MEMORY_POOL 1
12 
13 #include "Sacado.hpp"
15 #include "common.hpp"
16 
17 #include "Kokkos_Timer.hpp"
18 
19 template<typename FluxView, typename WgbView, typename SrcView,
20  typename WbsView, typename ResidualView>
21 void run_dfad_hierarchical_team(const FluxView& flux, const WgbView& wgb,
22  const SrcView& src, const WbsView& wbs,
23  const ResidualView& residual)
24 {
25  typedef typename ResidualView::execution_space execution_space;
26  typedef typename ResidualView::non_const_value_type scalar_type;
27  typedef Kokkos::TeamPolicy<execution_space> policy_type;
28  typedef typename policy_type::member_type team_member;
29 
30  const size_t num_cells = wgb.extent(0);
31  const int num_basis = wgb.extent(1);
32  const int num_points = wgb.extent(2);
33  const int num_dim = wgb.extent(3);
34 
35  const bool is_cuda = is_cuda_space<execution_space>::value;
36  const int vector_size = is_cuda ? 32 : 1;
37  const int team_size = is_cuda ? 256/vector_size : 1;
38 
39  policy_type policy(num_cells,team_size,vector_size);
40  Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team)
41  {
42  const int team_rank = team.team_rank();
43  const size_t cell = team.league_rank();
44  scalar_type value, value2;
45  for (int basis=team_rank; basis<num_basis; basis+=team_size) {
46  value = 0.0;
47  value2 = 0.0;
48  for (int qp=0; qp<num_points; ++qp) {
49  for (int dim=0; dim<num_dim; ++dim)
50  value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
51  value2 += src(cell,qp)*wbs(cell,basis,qp);
52  }
53  residual(cell,basis) = value+value2;
54  }
55  });
56 }
57 
58 template<typename FluxView, typename WgbView, typename SrcView,
59  typename WbsView, typename ResidualView>
61  const FluxView& flux, const WgbView& wgb,
62  const SrcView& src, const WbsView& wbs,
63  const ResidualView& residual)
64 {
65  typedef typename ResidualView::execution_space execution_space;
66  typedef typename ResidualView::non_const_value_type scalar_type;
67  typedef Kokkos::TeamPolicy<execution_space> policy_type;
68  typedef typename policy_type::member_type team_member;
69  typedef Kokkos::View<scalar_type*, typename execution_space::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > tmp_scratch_type;
70 
71  const size_t num_cells = wgb.extent(0);
72  const int num_basis = wgb.extent(1);
73  const int num_points = wgb.extent(2);
74  const int num_dim = wgb.extent(3);
75 
76  const bool is_cuda = is_cuda_space<execution_space>::value;
77  const int vector_size = is_cuda ? 32 : 1;
78  const int team_size = is_cuda ? 256/vector_size : 1;
79  const int fad_size = Kokkos::dimension_scalar(residual);
80  const size_t bytes = 2*tmp_scratch_type::shmem_size(team_size,fad_size);
81  policy_type policy(num_cells,team_size,vector_size);
82 
83  Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)),
84  KOKKOS_LAMBDA (const team_member& team)
85  {
86  const int team_rank = team.team_rank();
87  const size_t cell = team.league_rank();
88  tmp_scratch_type value(team.team_scratch(0), team_size, fad_size);
89  tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size);
90  for (int basis=team_rank; basis<num_basis; basis+=team_size) {
91  value(team_rank) = 0.0;
92  value2(team_rank) = 0.0;
93  for (int qp=0; qp<num_points; ++qp) {
94  for (int dim=0; dim<num_dim; ++dim)
95  value(team_rank) += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
96  value2(team_rank) += src(cell,qp)*wbs(cell,basis,qp);
97  }
98  residual(cell,basis) = value(team_rank)+value2(team_rank);
99  }
100  });
101 }
102 
103 template <int N, typename ExecSpace>
104 double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points,
105  int ndim, int ntrial, bool check)
106 {
108 
109  typedef typename ExecSpace::array_layout DefaultLayout;
110  typedef Kokkos::LayoutContiguous<DefaultLayout> ContLayout;
111  typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
112  typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
113  typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
114  typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
115 
116  t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
117  t_3DView_d wbs("",ncells,num_basis,num_points);
118  t_3DView flux("",ncells,num_points,ndim,N+1);
119  t_2DView src("",ncells,num_points,N+1);
120  t_2DView residual("",ncells,num_basis,N+1);
121  init_fad(wgb, wbs, flux, src, residual);
122 
123  // Create memory pool for DFad
124  // The kernel allocates 2*N double's per warp on Cuda. Approximate
125  // the maximum number of warps as the maximum concurrency / 32.
126  // Include a fudge factor of 1.2 since memory pool treats a block as full
127  // once it reaches 80% capacity
128  const size_t block_size = N*sizeof(double);
129  size_t nkernels = ExecSpace().concurrency()*2;
131  nkernels /= 32;
132  const size_t mem_pool_size = static_cast<size_t>(1.2*nkernels*block_size);
133  const size_t superblock_size =
134  std::max<size_t>(nkernels / 100, 1) * block_size;
135  ExecSpace exec_space;
136  Sacado::createGlobalMemoryPool(exec_space, mem_pool_size,
137  block_size, block_size, superblock_size);
138 
139  // Run once to warm up, complete any UVM transfers
140  run_dfad_hierarchical_team(flux, wgb, src, wbs, residual);
141 
142  // Time execution
143  Kokkos::fence();
144  Kokkos::Timer timer;
145  for (int i=0; i<ntrial; ++i)
146  run_dfad_hierarchical_team(flux, wgb, src, wbs, residual);
147  Kokkos::fence();
148  double time = timer.seconds() / ntrial / ncells;
149 
150  // Check result
151  if (check)
152  check_residual(flux, wgb, src, wbs, residual);
153 
154  // Destroy memory pool
156 
157  return time;
158 }
159 
160 template <int N, typename ExecSpace>
162  int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
163 {
165 
166  typedef typename ExecSpace::array_layout DefaultLayout;
167  typedef Kokkos::LayoutContiguous<DefaultLayout> ContLayout;
168  typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
169  typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
170  typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
171  typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
172 
173  t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
174  t_3DView_d wbs("",ncells,num_basis,num_points);
175  t_3DView flux("",ncells,num_points,ndim,N+1);
176  t_2DView src("",ncells,num_points,N+1);
177  t_2DView residual("",ncells,num_basis,N+1);
178  init_fad(wgb, wbs, flux, src, residual);
179 
180  // Run once to warm up, complete any UVM transfers
181  run_dfad_hierarchical_team_scratch(flux, wgb, src, wbs, residual);
182 
183  // Time execution
184  Kokkos::fence();
185  Kokkos::Timer timer;
186  for (int i=0; i<ntrial; ++i)
187  run_dfad_hierarchical_team_scratch(flux, wgb, src, wbs, residual);
188  Kokkos::fence();
189  double time = timer.seconds() / ntrial / ncells;
190 
191  // Check result
192  if (check)
193  check_residual(flux, wgb, src, wbs, residual);
194 
195  return time;
196 }
197 
198 #define INST_FUNC_N_DEV(N,DEV) \
199  template double time_dfad_hierarchical_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
200  template double time_dfad_hierarchical_team_scratch< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
201 
202 #define INST_FUNC_DEV(DEV) \
203  INST_FUNC_N_DEV( fad_dim, DEV )
204 
205 #ifdef KOKKOS_ENABLE_SERIAL
206 INST_FUNC_DEV(Kokkos::Serial)
207 #endif
208 
209 #ifdef KOKKOS_ENABLE_OPENMP
210 INST_FUNC_DEV(Kokkos::OpenMP)
211 #endif
212 
213 #ifdef KOKKOS_ENABLE_THREADS
214 INST_FUNC_DEV(Kokkos::Threads)
215 #endif
216 
217 #ifdef KOKKOS_ENABLE_CUDA
218 INST_FUNC_DEV(Kokkos::Cuda)
219 #endif
double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
Sacado::Fad::DFad< double > FadType
void init_fad(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
const int N
void run_dfad_hierarchical_team(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
void check_residual(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
int value
void run_dfad_hierarchical_team_scratch(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
double time_dfad_hierarchical_team_scratch(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
void destroyGlobalMemoryPool(const ExecSpace &space)