Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
const_basis/advection_hierarchical_dfad.cpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Sacado Package
5 // Copyright (2006) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // This library is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU Lesser General Public License as
12 // published by the Free Software Foundation; either version 2.1 of the
13 // License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 // USA
24 // Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
25 // (etphipp@sandia.gov).
26 //
27 // ***********************************************************************
28 // @HEADER
29 
30 #define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
31 #define SACADO_KOKKOS_USE_MEMORY_POOL 1
32 
33 #include "Sacado.hpp"
35 #include "common.hpp"
36 
37 #include "impl/Kokkos_Timer.hpp"
38 
39 template<typename FluxView, typename WgbView, typename SrcView,
40  typename WbsView, typename ResidualView>
41 void run_dfad_hierarchical_team(const FluxView& flux, const WgbView& wgb,
42  const SrcView& src, const WbsView& wbs,
43  const ResidualView& residual)
44 {
45  typedef typename ResidualView::execution_space execution_space;
46  typedef typename ResidualView::non_const_value_type scalar_type;
47  typedef Kokkos::TeamPolicy<execution_space> policy_type;
48  typedef typename policy_type::member_type team_member;
49 
50  const size_t num_cells = wgb.extent(0);
51  const int num_basis = wgb.extent(1);
52  const int num_points = wgb.extent(2);
53  const int num_dim = wgb.extent(3);
54 
55  const bool is_cuda = is_cuda_space<execution_space>::value;
56  const int vector_size = is_cuda ? 32 : 1;
57  const int team_size = is_cuda ? 256/vector_size : 1;
58 
59  policy_type policy(num_cells,team_size,vector_size);
60  Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team)
61  {
62  const int team_rank = team.team_rank();
63  const size_t cell = team.league_rank();
64  scalar_type value, value2;
65  for (int basis=team_rank; basis<num_basis; basis+=team_size) {
66  value = 0.0;
67  value2 = 0.0;
68  for (int qp=0; qp<num_points; ++qp) {
69  for (int dim=0; dim<num_dim; ++dim)
70  value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
71  value2 += src(cell,qp)*wbs(cell,basis,qp);
72  }
73  residual(cell,basis) = value+value2;
74  }
75  });
76 }
77 
78 template<typename FluxView, typename WgbView, typename SrcView,
79  typename WbsView, typename ResidualView>
81  const FluxView& flux, const WgbView& wgb,
82  const SrcView& src, const WbsView& wbs,
83  const ResidualView& residual)
84 {
85  typedef typename ResidualView::execution_space execution_space;
86  typedef typename ResidualView::non_const_value_type scalar_type;
87  typedef Kokkos::TeamPolicy<execution_space> policy_type;
88  typedef typename policy_type::member_type team_member;
89  typedef Kokkos::View<scalar_type*, typename execution_space::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > tmp_scratch_type;
90 
91  const size_t num_cells = wgb.extent(0);
92  const int num_basis = wgb.extent(1);
93  const int num_points = wgb.extent(2);
94  const int num_dim = wgb.extent(3);
95 
96  const bool is_cuda = is_cuda_space<execution_space>::value;
97  const int vector_size = is_cuda ? 32 : 1;
98  const int team_size = is_cuda ? 256/vector_size : 1;
99  const int fad_size = Kokkos::dimension_scalar(residual);
100  const size_t bytes = 2*tmp_scratch_type::shmem_size(team_size,fad_size);
101  policy_type policy(num_cells,team_size,vector_size);
102 
103  Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)),
104  KOKKOS_LAMBDA (const team_member& team)
105  {
106  const int team_rank = team.team_rank();
107  const size_t cell = team.league_rank();
108  tmp_scratch_type value(team.team_scratch(0), team_size, fad_size);
109  tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size);
110  for (int basis=team_rank; basis<num_basis; basis+=team_size) {
111  value(team_rank) = 0.0;
112  value2(team_rank) = 0.0;
113  for (int qp=0; qp<num_points; ++qp) {
114  for (int dim=0; dim<num_dim; ++dim)
115  value(team_rank) += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
116  value2(team_rank) += src(cell,qp)*wbs(cell,basis,qp);
117  }
118  residual(cell,basis) = value(team_rank)+value2(team_rank);
119  }
120  });
121 }
122 
123 template <int N, typename ExecSpace>
124 double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points,
125  int ndim, int ntrial, bool check)
126 {
128 
129  typedef typename ExecSpace::array_layout DefaultLayout;
130  typedef Kokkos::LayoutContiguous<DefaultLayout> ContLayout;
131  typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
132  typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
133  typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
134  typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
135 
136  t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
137  t_3DView_d wbs("",ncells,num_basis,num_points);
138  t_3DView flux("",ncells,num_points,ndim,N+1);
139  t_2DView src("",ncells,num_points,N+1);
140  t_2DView residual("",ncells,num_basis,N+1);
141  init_fad(wgb, wbs, flux, src, residual);
142 
143  // Create memory pool for DFad
144  // The kernel allocates 2*N double's per warp on Cuda. Approximate
145  // the maximum number of warps as the maximum concurrency / 32.
146  // Include a fudge factor of 1.2 since memory pool treats a block as full
147  // once it reaches 80% capacity
148  const size_t block_size = N*sizeof(double);
149  size_t nkernels = ExecSpace::concurrency()*2;
151  nkernels /= 32;
152  const size_t mem_pool_size = static_cast<size_t>(1.2*nkernels*block_size);
153  const size_t superblock_size =
154  std::max<size_t>(nkernels / 100, 1) * block_size;
155  ExecSpace exec_space;
156  Sacado::createGlobalMemoryPool(exec_space, mem_pool_size,
157  block_size, block_size, superblock_size);
158 
159  // Run once to warm up, complete any UVM transfers
160  run_dfad_hierarchical_team(flux, wgb, src, wbs, residual);
161 
162  // Time execution
163  Kokkos::fence();
164  Kokkos::Impl::Timer timer;
165  for (int i=0; i<ntrial; ++i)
166  run_dfad_hierarchical_team(flux, wgb, src, wbs, residual);
167  Kokkos::fence();
168  double time = timer.seconds() / ntrial / ncells;
169 
170  // Check result
171  if (check)
172  check_residual(flux, wgb, src, wbs, residual);
173 
174  // Destroy memory pool
176 
177  return time;
178 }
179 
180 template <int N, typename ExecSpace>
182  int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
183 {
185 
186  typedef typename ExecSpace::array_layout DefaultLayout;
187  typedef Kokkos::LayoutContiguous<DefaultLayout> ContLayout;
188  typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
189  typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
190  typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
191  typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
192 
193  t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
194  t_3DView_d wbs("",ncells,num_basis,num_points);
195  t_3DView flux("",ncells,num_points,ndim,N+1);
196  t_2DView src("",ncells,num_points,N+1);
197  t_2DView residual("",ncells,num_basis,N+1);
198  init_fad(wgb, wbs, flux, src, residual);
199 
200  // Run once to warm up, complete any UVM transfers
201  run_dfad_hierarchical_team_scratch(flux, wgb, src, wbs, residual);
202 
203  // Time execution
204  Kokkos::fence();
205  Kokkos::Impl::Timer timer;
206  for (int i=0; i<ntrial; ++i)
207  run_dfad_hierarchical_team_scratch(flux, wgb, src, wbs, residual);
208  Kokkos::fence();
209  double time = timer.seconds() / ntrial / ncells;
210 
211  // Check result
212  if (check)
213  check_residual(flux, wgb, src, wbs, residual);
214 
215  return time;
216 }
217 
218 #define INST_FUNC_N_DEV(N,DEV) \
219  template double time_dfad_hierarchical_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
220  template double time_dfad_hierarchical_team_scratch< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
221 
222 #define INST_FUNC_DEV(DEV) \
223  INST_FUNC_N_DEV( fad_dim, DEV )
224 
225 #ifdef KOKKOS_ENABLE_SERIAL
226 INST_FUNC_DEV(Kokkos::Serial)
227 #endif
228 
229 #ifdef KOKKOS_ENABLE_OPENMP
230 INST_FUNC_DEV(Kokkos::OpenMP)
231 #endif
232 
233 #ifdef KOKKOS_ENABLE_THREADS
234 INST_FUNC_DEV(Kokkos::Threads)
235 #endif
236 
237 #ifdef KOKKOS_ENABLE_CUDA
238 INST_FUNC_DEV(Kokkos::Cuda)
239 #endif
double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
Sacado::Fad::DFad< double > FadType
void init_fad(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
const int N
void run_dfad_hierarchical_team(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
void check_residual(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
void run_dfad_hierarchical_team_scratch(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
double time_dfad_hierarchical_team_scratch(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
void destroyGlobalMemoryPool(const ExecSpace &space)