10 #define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
11 #define SACADO_KOKKOS_USE_MEMORY_POOL 1
17 #include "Kokkos_Timer.hpp"
19 template<
typename FluxView,
typename WgbView,
typename SrcView,
20 typename WbsView,
typename ResidualView>
22 const SrcView& src,
const WbsView& wbs,
23 const ResidualView& residual)
25 typedef typename ResidualView::execution_space execution_space;
26 typedef typename ResidualView::non_const_value_type scalar_type;
27 typedef Kokkos::TeamPolicy<execution_space> policy_type;
28 typedef typename policy_type::member_type team_member;
30 const size_t num_cells = wgb.extent(0);
31 const int num_basis = wgb.extent(1);
32 const int num_points = wgb.extent(2);
33 const int num_dim = wgb.extent(3);
36 const int vector_size = is_cuda ? 32 : 1;
37 const int team_size = is_cuda ? 256/vector_size : 1;
39 policy_type policy(num_cells,team_size,vector_size);
40 Kokkos::parallel_for(policy, KOKKOS_LAMBDA (
const team_member& team)
42 const int team_rank = team.team_rank();
43 const size_t cell = team.league_rank();
44 scalar_type
value, value2;
45 for (
int basis=team_rank; basis<num_basis; basis+=team_size) {
48 for (
int qp=0; qp<num_points; ++qp) {
49 for (
int dim=0; dim<num_dim; ++dim)
50 value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
51 value2 += src(cell,qp)*wbs(cell,basis,qp);
53 residual(cell,basis) = value+value2;
58 template<
typename FluxView,
typename WgbView,
typename SrcView,
59 typename WbsView,
typename ResidualView>
61 const FluxView& flux,
const WgbView& wgb,
62 const SrcView& src,
const WbsView& wbs,
63 const ResidualView& residual)
65 typedef typename ResidualView::execution_space execution_space;
66 typedef typename ResidualView::non_const_value_type scalar_type;
67 typedef Kokkos::TeamPolicy<execution_space> policy_type;
68 typedef typename policy_type::member_type team_member;
69 typedef Kokkos::View<scalar_type*, typename execution_space::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > tmp_scratch_type;
71 const size_t num_cells = wgb.extent(0);
72 const int num_basis = wgb.extent(1);
73 const int num_points = wgb.extent(2);
74 const int num_dim = wgb.extent(3);
77 const int vector_size = is_cuda ? 32 : 1;
78 const int team_size = is_cuda ? 256/vector_size : 1;
79 const int fad_size = Kokkos::dimension_scalar(residual);
80 const size_t bytes = 2*tmp_scratch_type::shmem_size(team_size,fad_size);
81 policy_type policy(num_cells,team_size,vector_size);
83 Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)),
84 KOKKOS_LAMBDA (
const team_member& team)
86 const int team_rank = team.team_rank();
87 const size_t cell = team.league_rank();
88 tmp_scratch_type
value(team.team_scratch(0), team_size, fad_size);
89 tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size);
90 for (
int basis=team_rank; basis<num_basis; basis+=team_size) {
91 value(team_rank) = 0.0;
92 value2(team_rank) = 0.0;
93 for (
int qp=0; qp<num_points; ++qp) {
94 for (
int dim=0; dim<num_dim; ++dim)
95 value(team_rank) += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
96 value2(team_rank) += src(cell,qp)*wbs(cell,basis,qp);
98 residual(cell,basis) =
value(team_rank)+value2(team_rank);
103 template <
int N,
typename ExecSpace>
105 int ndim,
int ntrial,
bool check)
109 typedef typename ExecSpace::array_layout DefaultLayout;
111 typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
112 typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
113 typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
114 typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
116 t_4DView_d wgb(
"",ncells,num_basis,num_points,ndim);
117 t_3DView_d wbs(
"",ncells,num_basis,num_points);
118 t_3DView flux(
"",ncells,num_points,ndim,
N+1);
119 t_2DView src(
"",ncells,num_points,
N+1);
120 t_2DView residual(
"",ncells,num_basis,
N+1);
121 init_fad(wgb, wbs, flux, src, residual);
128 const size_t block_size =
N*
sizeof(double);
129 size_t nkernels = ExecSpace().concurrency()*2;
132 const size_t mem_pool_size =
static_cast<size_t>(1.2*nkernels*block_size);
133 const size_t superblock_size =
134 std::max<size_t>(nkernels / 100, 1) * block_size;
135 ExecSpace exec_space;
137 block_size, block_size, superblock_size);
145 for (
int i=0;
i<ntrial; ++
i)
148 double time = timer.seconds() / ntrial / ncells;
160 template <
int N,
typename ExecSpace>
162 int ncells,
int num_basis,
int num_points,
int ndim,
int ntrial,
bool check)
166 typedef typename ExecSpace::array_layout DefaultLayout;
168 typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
169 typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
170 typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
171 typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
173 t_4DView_d wgb(
"",ncells,num_basis,num_points,ndim);
174 t_3DView_d wbs(
"",ncells,num_basis,num_points);
175 t_3DView flux(
"",ncells,num_points,ndim,
N+1);
176 t_2DView src(
"",ncells,num_points,
N+1);
177 t_2DView residual(
"",ncells,num_basis,
N+1);
178 init_fad(wgb, wbs, flux, src, residual);
186 for (
int i=0;
i<ntrial; ++
i)
189 double time = timer.seconds() / ntrial / ncells;
198 #define INST_FUNC_N_DEV(N,DEV) \
199 template double time_dfad_hierarchical_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
200 template double time_dfad_hierarchical_team_scratch< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
202 #define INST_FUNC_DEV(DEV) \
203 INST_FUNC_N_DEV( fad_dim, DEV )
205 #ifdef KOKKOS_ENABLE_SERIAL
209 #ifdef KOKKOS_ENABLE_OPENMP
213 #ifdef KOKKOS_ENABLE_THREADS
217 #ifdef KOKKOS_ENABLE_CUDA
double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
Sacado::Fad::DFad< double > FadType
#define INST_FUNC_DEV(DEV)
void init_fad(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
void run_dfad_hierarchical_team(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
void check_residual(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
void run_dfad_hierarchical_team_scratch(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
double time_dfad_hierarchical_team_scratch(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
void destroyGlobalMemoryPool(const ExecSpace &space)