30 #define SACADO_VIEW_CUDA_HIERARCHICAL 1
31 #define SACADO_ALIGN_SFAD 1
37 #include "impl/Kokkos_Timer.hpp"
// Flat hierarchical residual kernel: each team thread is assigned one cell
// and walks every basis function of that cell serially.
//
// NOTE(review): the leading integers on each line look like line numbers of
// the original file. The gaps (e.g. 41, 44, 55, 62, 66-67, 72) mean that the
// line carrying the function name, the braces, and a few statements are not
// visible in this extract.
//
// Arguments, as they are used in the body below:
//   flux     - read as flux(cell, qp, dim)
//   wgb      - read as wgb(cell, basis, qp, dim); its extents size all loops
//   src      - read as src(cell, qp)
//   wbs      - read as wbs(cell, basis, qp)
//   residual - written as residual(cell, basis)
39 template<
typename FluxView,
typename WgbView,
typename SrcView,
40 typename WbsView,
typename ResidualView>
42 const SrcView& src,
const WbsView& wbs,
43 const ResidualView& residual)
// Execution space and team-policy types are all derived from the residual view.
45 typedef typename ResidualView::execution_space execution_space;
// Scalar type used for the per-thread accumulators declared in the kernel.
46 typedef typename Kokkos::ThreadLocalScalarType<ResidualView>::type local_scalar_type;
47 typedef Kokkos::TeamPolicy<execution_space> policy_type;
48 typedef typename policy_type::member_type team_member;
// Problem sizes are taken from the four extents of wgb.
50 const size_t num_cells = wgb.extent(0);
51 const int num_basis = wgb.extent(1);
52 const int num_points = wgb.extent(2);
53 const int num_dim = wgb.extent(3);
// is_cuda is defined on a line that is not part of this extract.
// When it is set: vector length 32 and 256/32 = 8 threads per team;
// otherwise both are 1.
56 const int vector_size = is_cuda ? 32 : 1;
57 const int team_size = is_cuda ? 256/vector_size : 1;
// League size = num_cells / team_size, rounded up, so that
// range * team_size >= num_cells.
58 const size_t range = (num_cells+team_size-1)/team_size;
60 policy_type policy(range,team_size,vector_size);
61 Kokkos::parallel_for(policy, KOKKOS_LAMBDA (
const team_member& team)
// Cell handled by this thread.
// NOTE(review): because the league size is rounded up, this index can be
// >= num_cells in the last team when num_cells is not a multiple of
// team_size. No bounds guard is visible here - confirm one exists on the
// lines missing from this extract, or that callers only pass multiples.
63 const size_t cell = team.league_rank()*team_size + team.team_rank();
// Accumulators for the flux term (value) and the source term (value2).
// NOTE(review): they are declared outside the basis loop; presumably the
// missing original lines 66-67 reset them for each basis - verify.
64 local_scalar_type value, value2;
65 for (
int basis=0; basis<num_basis; ++basis) {
68 for (
int qp=0; qp<num_points; ++qp) {
69 for (
int dim=0; dim<num_dim; ++dim)
// Flux contribution, summed over quadrature points and dimensions.
70 value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
// Source contribution, summed over quadrature points.
71 value2 += src(cell,qp)*wbs(cell,basis,qp);
// Store the sum of both contributions for this (cell, basis) pair.
73 residual(cell,basis) = value+value2;
// Team hierarchical residual kernel: one league member (team) per cell, with
// the basis functions of that cell distributed over the threads of the team.
//
// NOTE(review): the leading integers on each line look like line numbers of
// the original file. The gaps (e.g. 80, 83, 94, 100, 105-106, 111) mean that
// the line carrying the function name, the braces, and a few statements are
// not visible in this extract.
//
// Arguments, as they are used in the body below:
//   flux     - read as flux(cell, qp, dim)
//   wgb      - read as wgb(cell, basis, qp, dim); its extents size all loops
//   src      - read as src(cell, qp)
//   wbs      - read as wbs(cell, basis, qp)
//   residual - written as residual(cell, basis)
78 template<
typename FluxView,
typename WgbView,
typename SrcView,
79 typename WbsView,
typename ResidualView>
81 const SrcView& src,
const WbsView& wbs,
82 const ResidualView& residual)
// Execution space and team-policy types are all derived from the residual view.
84 typedef typename ResidualView::execution_space execution_space;
// Scalar type used for the per-thread accumulators declared in the kernel.
85 typedef typename Kokkos::ThreadLocalScalarType<ResidualView>::type local_scalar_type;
86 typedef Kokkos::TeamPolicy<execution_space> policy_type;
87 typedef typename policy_type::member_type team_member;
// Problem sizes are taken from the four extents of wgb.
89 const size_t num_cells = wgb.extent(0);
90 const int num_basis = wgb.extent(1);
91 const int num_points = wgb.extent(2);
92 const int num_dim = wgb.extent(3);
// is_cuda is defined on a line that is not part of this extract.
// When it is set: vector length 32 and 256/32 = 8 threads per team;
// otherwise both are 1.
95 const int vector_size = is_cuda ? 32 : 1;
96 const int team_size = is_cuda ? 256/vector_size : 1;
// League size equals the number of cells, so league_rank() is the cell index.
98 policy_type policy(num_cells,team_size,vector_size);
99 Kokkos::parallel_for(policy, KOKKOS_LAMBDA (
const team_member& team)
101 const int team_rank = team.team_rank();
102 const size_t cell = team.league_rank();
// Accumulators for the flux term (value) and the source term (value2).
// NOTE(review): they are declared outside the basis loop; presumably the
// missing original lines 105-106 reset them for each basis - verify.
103 local_scalar_type value, value2;
// Each thread starts at its own rank and advances by team_size, so the
// threads of a team cover disjoint basis indices.
104 for (
int basis=team_rank; basis<num_basis; basis+=team_size) {
107 for (
int qp=0; qp<num_points; ++qp) {
108 for (
int dim=0; dim<num_dim; ++dim)
// Flux contribution, summed over quadrature points and dimensions.
109 value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
// Source contribution, summed over quadrature points.
110 value2 += src(cell,qp)*wbs(cell,basis,qp);
// Store the sum of both contributions for this (cell, basis) pair.
112 residual(cell,basis) = value+value2;
// Timing driver (flat variant): allocates the five views, fills them through
// init_fad, runs ntrial timed repetitions and normalizes the elapsed time.
//
// NOTE(review): the leading integers on each line look like line numbers of
// the original file. The line with the function name and the first
// parameters (ncells, num_basis, num_points are used below), the braces, the
// body of the timed loop, and the return statement are not visible here.
//
// Template parameters:
//   FadType   - Fad scalar type; must provide apply_N<> when SACADO_ALIGN_SFAD
//               is defined (see the typedef below)
//   N         - derivative count used to size the Fad views (N+1 below)
//   ExecSpace - Kokkos execution space the views are allocated for
117 template <
typename FadType,
int N,
typename ExecSpace>
119 int ndim,
int ntrial,
bool check)
122 #if defined(SACADO_ALIGN_SFAD)
// Round N up to the next multiple of FadStride (FadStride is defined on a
// line not shown in this extract) and rebind the Fad type to that size.
123 static const int Nalign = ((
N+FadStride-1)/FadStride)*FadStride;
124 typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
// Presumably the #else branch (original line 125 is missing): without
// alignment the Fad type is used unchanged.
126 typedef FadType AlignedFadType;
// NOTE(review): the views below use ContLayout, which is defined on a line
// not shown here; DefaultLayout itself is not referenced in the visible lines.
129 typedef typename ExecSpace::array_layout DefaultLayout;
// Plain double views hold the weights; Fad views hold flux, source, residual.
131 typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
132 typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
133 typedef Kokkos::View<AlignedFadType***,ContLayout,ExecSpace> t_3DView;
134 typedef Kokkos::View<AlignedFadType**,ContLayout,ExecSpace> t_2DView;
136 t_4DView_d wgb(
"",ncells,num_basis,num_points,ndim);
137 t_3DView_d wbs(
"",ncells,num_basis,num_points);
// The Fad views take one extra trailing extent of N+1 - presumably the Fad
// storage dimension (N derivatives plus the value); confirm against the
// Sacado Kokkos-view documentation.
138 t_3DView flux(
"",ncells,num_points,ndim,
N+1);
139 t_2DView src(
"",ncells,num_points,
N+1);
140 t_2DView residual(
"",ncells,num_basis,
N+1);
// init_fad is defined elsewhere in the file.
141 init_fad(wgb, wbs, flux, src, residual);
// Timed section: ntrial repetitions. The loop body is on a line that is not
// part of this extract.
148 Kokkos::Impl::Timer timer;
149 for (
int i=0; i<ntrial; ++i)
// Elapsed seconds divided by the trial count and by the cell count.
152 double time = timer.seconds() / ntrial / ncells;
// Timing driver (team variant): allocates the five views, fills them through
// init_fad, runs ntrial timed repetitions and normalizes the elapsed time.
//
// NOTE(review): the leading integers on each line look like line numbers of
// the original file. The line with the function name and the first
// parameters (ncells, num_basis, num_points are used below), the braces, the
// body of the timed loop, and the return statement are not visible here.
//
// Template parameters:
//   FadType   - Fad scalar type; must provide apply_N<> when SACADO_ALIGN_SFAD
//               is defined (see the typedef below)
//   N         - derivative count used to size the Fad views (N+1 below)
//   ExecSpace - Kokkos execution space the views are allocated for
161 template <
typename FadType,
int N,
typename ExecSpace>
163 int ndim,
int ntrial,
bool check)
166 #if defined(SACADO_ALIGN_SFAD)
// Round N up to the next multiple of FadStride (FadStride is defined on a
// line not shown in this extract) and rebind the Fad type to that size.
167 static const int Nalign = ((
N+FadStride-1)/FadStride)*FadStride;
168 typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
// Presumably the #else branch (original line 169 is missing): without
// alignment the Fad type is used unchanged.
170 typedef FadType AlignedFadType;
// NOTE(review): the views below use ContLayout, which is defined on a line
// not shown here; DefaultLayout itself is not referenced in the visible lines.
173 typedef typename ExecSpace::array_layout DefaultLayout;
// Plain double views hold the weights; Fad views hold flux, source, residual.
175 typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
176 typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
177 typedef Kokkos::View<AlignedFadType***,ContLayout,ExecSpace> t_3DView;
178 typedef Kokkos::View<AlignedFadType**,ContLayout,ExecSpace> t_2DView;
180 t_4DView_d wgb(
"",ncells,num_basis,num_points,ndim);
181 t_3DView_d wbs(
"",ncells,num_basis,num_points);
// The Fad views take one extra trailing extent of N+1 - presumably the Fad
// storage dimension (N derivatives plus the value); confirm against the
// Sacado Kokkos-view documentation.
182 t_3DView flux(
"",ncells,num_points,ndim,
N+1);
183 t_2DView src(
"",ncells,num_points,
N+1);
184 t_2DView residual(
"",ncells,num_basis,
N+1);
// init_fad is defined elsewhere in the file.
185 init_fad(wgb, wbs, flux, src, residual);
// Timed section: ntrial repetitions. The loop body is on a line that is not
// part of this extract.
192 Kokkos::Impl::Timer timer;
193 for (
int i=0; i<ntrial; ++i)
// Elapsed seconds divided by the trial count and by the cell count.
196 double time = timer.seconds() / ntrial / ncells;
// Explicitly instantiates both timing drivers (time_fad_hierarchical_flat and
// time_fad_hierarchical_team) for one combination of Fad type FAD, derivative
// count N and execution space DEV.
205 #define INST_FUNC_FAD_N_DEV(FAD,N,DEV) \
206 template double time_fad_hierarchical_flat< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
207 template double time_fad_hierarchical_team< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
// Instantiates the timing drivers on execution space DEV for SFadType and
// SLFadType, both with fad_dim derivatives. SFadType, SLFadType and fad_dim
// are defined outside the lines visible here.
209 #define INST_FUNC_DEV(DEV) \
210 INST_FUNC_FAD_N_DEV( SFadType, fad_dim, DEV ) \
211 INST_FUNC_FAD_N_DEV( SLFadType, fad_dim, DEV )
213 #ifdef KOKKOS_ENABLE_SERIAL
217 #ifdef KOKKOS_ENABLE_OPENMP
221 #ifdef KOKKOS_ENABLE_THREADS
225 #ifdef KOKKOS_ENABLE_CUDA
void run_fad_hierarchical_flat(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
#define INST_FUNC_DEV(DEV)
double time_fad_hierarchical_team(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
void init_fad(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
void check_residual(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
void run_fad_hierarchical_team(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)