34 #include "impl/Kokkos_Timer.hpp"
36 template<
typename FluxView,
typename WgbView,
typename SrcView,
37 typename WbsView,
typename ResidualView>
39 const SrcView& src,
const WbsView& wbs,
40 const ResidualView& residual)
42 typedef typename ResidualView::execution_space execution_space;
43 typedef typename ResidualView::non_const_value_type scalar_type;
45 const size_t num_cells = wgb.extent(0);
46 const int num_basis = wgb.extent(1);
47 const int num_points = wgb.extent(2);
48 const int num_dim = wgb.extent(3);
50 Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>( 0,num_cells ),
51 KOKKOS_LAMBDA (
const size_t cell)
53 scalar_type value, value2;
54 for (
int basis=0; basis<num_basis; ++basis) {
57 for (
int qp=0; qp<num_points; ++qp) {
58 for (
int dim=0; dim<num_dim; ++dim)
59 value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
60 value2 += src(cell,qp)*wbs(cell,basis,qp);
62 residual(cell,basis) = value+value2;
67 template<
typename FluxView,
typename WgbView,
typename SrcView,
68 typename WbsView,
typename ResidualView>
70 const SrcView& src,
const WbsView& wbs,
71 const ResidualView& residual)
73 typedef typename ResidualView::execution_space execution_space;
74 typedef typename ResidualView::non_const_value_type scalar_type;
75 typedef Kokkos::TeamPolicy<execution_space> policy_type;
76 typedef typename policy_type::member_type team_member;
77 typedef Kokkos::View<scalar_type*, typename execution_space::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > tmp_scratch_type;
79 const size_t num_cells = wgb.extent(0);
80 const int num_basis = wgb.extent(1);
81 const int num_points = wgb.extent(2);
82 const int num_dim = wgb.extent(3);
84 const int vector_size = 1;
86 const int fad_size = Kokkos::dimension_scalar(residual);
87 const size_t range = (num_cells+team_size-1)/team_size;
88 const size_t bytes = 2*tmp_scratch_type::shmem_size(team_size,fad_size);
89 policy_type policy(range,team_size,vector_size);
91 Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)),
92 KOKKOS_LAMBDA (
const team_member& team)
94 const int team_rank = team.team_rank();
95 tmp_scratch_type value(team.team_scratch(0), team_size, fad_size);
96 tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size);
97 const size_t cell = team.league_rank()*team_size + team_rank;
98 if (cell < num_cells) {
99 for (
int basis=0; basis<num_basis; ++basis) {
100 value(team_rank) = 0.0;
101 value2(team_rank) = 0.0;
102 for (
int qp=0; qp<num_points; ++qp) {
103 for (
int dim=0; dim<num_dim; ++dim)
104 value(team_rank) += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
105 value2(team_rank) += src(cell,qp)*wbs(cell,basis,qp);
107 residual(cell,basis) = value(team_rank)+value2(team_rank);
113 template<
int N,
typename FluxView,
typename WgbView,
typename SrcView,
114 typename WbsView,
typename ResidualView>
116 const SrcView& src,
const WbsView& wbs,
117 const ResidualView& residual)
119 typedef typename ResidualView::execution_space execution_space;
120 typedef typename ResidualView::non_const_value_type scalar_type;
122 const size_t num_cells = wgb.extent(0);
123 const int num_basis = wgb.extent(1);
124 const int num_points = wgb.extent(2);
125 const int num_dim = wgb.extent(3);
127 Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>( 0,num_cells ),
128 KOKKOS_LAMBDA (
const size_t cell)
130 scalar_type value[
N+1],value2[
N+1];
131 for (
int basis=0; basis<num_basis; ++basis) {
132 for (
int k=0; k<
N+1; ++k) {
136 for (
int qp=0; qp<num_points; ++qp) {
137 for (
int dim=0; dim<num_dim; ++dim) {
138 const scalar_type flux_val = flux(cell,qp,dim,N);
139 const scalar_type wgb_val = wgb(cell,basis,qp,dim,N);
140 value[
N] += flux_val*wgb_val;
141 for(
int k=0; k<
N; k++)
143 flux_val*wgb(cell,basis,qp,dim,k)+flux(cell,qp,dim,k)*wgb_val;
145 const scalar_type src_val = src(cell,qp,N);
146 const scalar_type wbs_val = wbs(cell,basis,qp,N);
147 value2[
N] += src_val*wbs_val;
148 for(
int k=0; k<
N; k++)
149 value2[k] += src_val*wbs(cell,basis,qp,k)+src(cell,qp,k)*wbs_val;
151 for(
int k=0; k<=
N; k++)
152 residual(cell,basis,k) = value[k]+value2[k];
157 template<
int N,
typename FluxView,
typename WgbView,
typename SrcView,
158 typename WbsView,
typename ResidualView>
160 const SrcView& src,
const WbsView& wbs,
161 const ResidualView& residual)
163 typedef typename ResidualView::execution_space execution_space;
164 typedef typename ResidualView::non_const_value_type scalar_type;
165 typedef Kokkos::TeamPolicy<execution_space> policy_type;
166 typedef typename policy_type::member_type team_member;
167 typedef Kokkos::View<scalar_type[N+1], typename execution_space::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > tmp_scratch_type;
169 const size_t num_cells = wgb.extent(0);
170 const int num_basis = wgb.extent(1);
171 int num_points = wgb.extent(2);
172 int num_dim = wgb.extent(3);
174 const size_t bytes = 2*tmp_scratch_type::shmem_size();
175 policy_type policy(num_cells,num_basis,32);
176 Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerThread(bytes)),
177 KOKKOS_LAMBDA (
const team_member& team)
179 tmp_scratch_type value(team.thread_scratch(0));
180 tmp_scratch_type value2(team.thread_scratch(0));
181 const size_t cell = team.league_rank();
182 Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_basis),
183 [&] (
const int& basis)
185 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,
N+1),
191 for (
int qp=0; qp<num_points; ++qp) {
192 for (
int dim=0; dim<num_dim; ++dim) {
193 const scalar_type flux_val = flux(cell,qp,dim,
N);
194 const scalar_type wgb_val = wgb(cell,basis,qp,dim,
N);
195 Kokkos::single(Kokkos::PerThread(team), [&] () {
196 value[
N] += flux_val*wgb_val;
198 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,
N),
202 flux_val*wgb(cell,basis,qp,dim,k)+flux(cell,qp,dim,k)*wgb_val;
205 const scalar_type src_val = src(cell,qp,
N);
206 const scalar_type wbs_val = wbs(cell,basis,qp,
N);
207 Kokkos::single(Kokkos::PerThread(team), [&] () {
208 value2[
N] += src_val*wbs_val;
210 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,
N),
213 value2[k] += src_val*wbs(cell,basis,qp,k)+src(cell,qp,k)*wbs_val;
216 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,
N+1),
219 residual(cell,basis,k) = value[k]+value2[k];
225 template <
typename FadType,
int N,
typename ExecSpace>
227 int ntrial,
bool check)
229 typedef Kokkos::View<FadType****,ExecSpace> t_4DView;
230 typedef Kokkos::View<FadType***,ExecSpace> t_3DView;
231 typedef Kokkos::View<FadType**,ExecSpace> t_2DView;
233 t_4DView wgb(
"",ncells,num_basis,num_points,ndim,
N+1);
234 t_3DView flux(
"",ncells,num_points,ndim,
N+1);
235 t_3DView wbs(
"",ncells,num_basis,num_points,
N+1);
236 t_2DView src(
"",ncells,num_points,
N+1);
237 t_2DView residual(
"",ncells,num_basis,
N+1);
238 init_fad(wgb, wbs, flux, src, residual);
245 Kokkos::Impl::Timer timer;
246 for (
int i=0; i<ntrial; ++i)
249 double time = timer.seconds() / ntrial / ncells;
258 template <
typename FadType,
int N,
typename ExecSpace>
260 int ntrial,
bool check)
262 typedef Kokkos::View<FadType****,ExecSpace> t_4DView;
263 typedef Kokkos::View<FadType***,ExecSpace> t_3DView;
264 typedef Kokkos::View<FadType**,ExecSpace> t_2DView;
266 t_4DView wgb(
"",ncells,num_basis,num_points,ndim,
N+1);
267 t_3DView flux(
"",ncells,num_points,ndim,
N+1);
268 t_3DView wbs(
"",ncells,num_basis,num_points,
N+1);
269 t_2DView src(
"",ncells,num_points,
N+1);
270 t_2DView residual(
"",ncells,num_basis,
N+1);
271 init_fad(wgb, wbs, flux, src, residual);
278 Kokkos::Impl::Timer timer;
279 for (
int i=0; i<ntrial; ++i)
282 double time = timer.seconds() / ntrial / ncells;
291 template <
int N,
typename ExecSpace>
293 int ntrial,
bool check)
295 typedef Kokkos::View<double****[N+1],ExecSpace> t_4DView;
296 typedef Kokkos::View<double***[N+1],ExecSpace> t_3DView;
297 typedef Kokkos::View<double**[N+1],ExecSpace> t_2DView;
299 t_4DView wgb(
"",ncells,num_basis,num_points,ndim);
300 t_3DView flux(
"",ncells,num_points,ndim);
301 t_3DView wbs(
"",ncells,num_basis,num_points);
302 t_2DView src(
"",ncells,num_points);
303 t_2DView residual(
"",ncells,num_basis);
307 run_analytic_flat<N>(flux, wgb, src, wbs, residual);
311 Kokkos::Impl::Timer timer;
312 for (
int i=0; i<ntrial; ++i)
313 run_analytic_flat<N>(flux, wgb, src, wbs, residual);
315 double time = timer.seconds() / ntrial / ncells;
324 template <
int N,
typename ExecSpace>
326 int ntrial,
bool check)
328 typedef Kokkos::View<double****[N+1],ExecSpace> t_4DView;
329 typedef Kokkos::View<double***[N+1],ExecSpace> t_3DView;
330 typedef Kokkos::View<double**[N+1],ExecSpace> t_2DView;
331 typedef Kokkos::View<const double***[N+1],ExecSpace,Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_3DView_const;
333 t_4DView wgb(
"",ncells,num_basis,num_points,ndim);
334 t_3DView flux(
"",ncells,num_points,ndim);
335 t_3DView wbs(
"",ncells,num_basis,num_points);
336 t_2DView src(
"",ncells,num_points);
337 t_2DView residual(
"",ncells,num_basis);
340 t_3DView_const flux_const = flux;
343 run_analytic_flat<N>(flux_const, wgb, src, wbs, residual);
347 Kokkos::Impl::Timer timer;
348 for (
int i=0; i<ntrial; ++i)
349 run_analytic_flat<N>(flux_const, wgb, src, wbs, residual);
351 double time = timer.seconds() / ntrial / ncells;
360 template <
int N,
typename ExecSpace>
362 int ntrial,
bool check)
364 typedef Kokkos::View<double****[N+1],ExecSpace> t_4DView;
365 typedef Kokkos::View<double***[N+1],ExecSpace> t_3DView;
366 typedef Kokkos::View<double**[N+1],ExecSpace> t_2DView;
367 typedef Kokkos::View<const double***[N+1],ExecSpace,Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_3DView_const;
369 t_4DView wgb(
"",ncells,num_basis,num_points,ndim);
370 t_3DView flux(
"",ncells,num_points,ndim);
371 t_3DView wbs(
"",ncells,num_basis,num_points);
372 t_2DView src(
"",ncells,num_points);
373 t_2DView residual(
"",ncells,num_basis);
376 t_3DView_const flux_const = flux;
379 run_analytic_team<N>(flux_const, wgb, src, wbs, residual);
383 Kokkos::Impl::Timer timer;
384 for (
int i=0; i<ntrial; ++i)
385 run_analytic_team<N>(flux_const, wgb, src, wbs, residual);
387 double time = timer.seconds() / ntrial / ncells;
396 #define INST_FUNC_FAD_N_DEV(FAD,N,DEV) \
397 template double time_fad_flat< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
398 template double time_fad_scratch< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
400 #define INST_FUNC_N_DEV(N,DEV) \
401 INST_FUNC_FAD_N_DEV(SFadType,N,DEV) \
402 INST_FUNC_FAD_N_DEV(SLFadType,N,DEV) \
403 INST_FUNC_FAD_N_DEV(DFadType,N,DEV) \
404 template double time_analytic_flat< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
405 template double time_analytic_const< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
406 template double time_analytic_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
408 #define INST_FUNC_DEV(DEV) \
409 INST_FUNC_N_DEV( fad_dim, DEV )
411 #ifdef KOKKOS_ENABLE_SERIAL
415 #ifdef KOKKOS_ENABLE_OPENMP
419 #ifdef KOKKOS_ENABLE_THREADS
423 #ifdef KOKKOS_ENABLE_CUDA
void run_fad_scratch(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
double time_analytic_flat(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
double time_analytic_const(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
double time_fad_flat(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
double time_fad_scratch(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
void run_analytic_flat(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
void run_analytic_team(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
void init_array(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
void init_fad(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
void check_residual(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
#define INST_FUNC_DEV(DEV)
void run_fad_flat(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
double time_analytic_team(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)