30 #define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
31 #define SACADO_KOKKOS_USE_MEMORY_POOL 1
37 #include "impl/Kokkos_Timer.hpp"
39 template<
typename FluxView,
typename WgbView,
typename SrcView,
40 typename WbsView,
typename ResidualView>
42 const SrcView& src,
const WbsView& wbs,
43 const ResidualView& residual)
45 typedef typename ResidualView::execution_space execution_space;
46 typedef typename ResidualView::non_const_value_type scalar_type;
47 typedef Kokkos::TeamPolicy<execution_space> policy_type;
48 typedef typename policy_type::member_type team_member;
50 const size_t num_cells = wgb.extent(0);
51 const int num_basis = wgb.extent(1);
52 const int num_points = wgb.extent(2);
53 const int num_dim = wgb.extent(3);
56 const int vector_size = is_cuda ? 32 : 1;
57 const int team_size = is_cuda ? 256/vector_size : 1;
59 policy_type policy(num_cells,team_size,vector_size);
60 Kokkos::parallel_for(policy, KOKKOS_LAMBDA (
const team_member& team)
62 const int team_rank = team.team_rank();
63 const size_t cell = team.league_rank();
64 scalar_type value, value2;
65 for (
int basis=team_rank; basis<num_basis; basis+=team_size) {
68 for (
int qp=0; qp<num_points; ++qp) {
69 for (
int dim=0; dim<num_dim; ++dim)
70 value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
71 value2 += src(cell,qp)*wbs(cell,basis,qp);
73 residual(cell,basis) = value+value2;
78 template<
typename FluxView,
typename WgbView,
typename SrcView,
79 typename WbsView,
typename ResidualView>
81 const FluxView& flux,
const WgbView& wgb,
82 const SrcView& src,
const WbsView& wbs,
83 const ResidualView& residual)
85 typedef typename ResidualView::execution_space execution_space;
86 typedef typename ResidualView::non_const_value_type scalar_type;
87 typedef Kokkos::TeamPolicy<execution_space> policy_type;
88 typedef typename policy_type::member_type team_member;
89 typedef Kokkos::View<scalar_type*, typename execution_space::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > tmp_scratch_type;
91 const size_t num_cells = wgb.extent(0);
92 const int num_basis = wgb.extent(1);
93 const int num_points = wgb.extent(2);
94 const int num_dim = wgb.extent(3);
97 const int vector_size = is_cuda ? 32 : 1;
98 const int team_size = is_cuda ? 256/vector_size : 1;
99 const int fad_size = Kokkos::dimension_scalar(residual);
100 const size_t bytes = 2*tmp_scratch_type::shmem_size(team_size,fad_size);
101 policy_type policy(num_cells,team_size,vector_size);
103 Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)),
104 KOKKOS_LAMBDA (
const team_member& team)
106 const int team_rank = team.team_rank();
107 const size_t cell = team.league_rank();
108 tmp_scratch_type value(team.team_scratch(0), team_size, fad_size);
109 tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size);
110 for (
int basis=team_rank; basis<num_basis; basis+=team_size) {
111 value(team_rank) = 0.0;
112 value2(team_rank) = 0.0;
113 for (
int qp=0; qp<num_points; ++qp) {
114 for (
int dim=0; dim<num_dim; ++dim)
115 value(team_rank) += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
116 value2(team_rank) += src(cell,qp)*wbs(cell,basis,qp);
118 residual(cell,basis) = value(team_rank)+value2(team_rank);
123 template <
int N,
typename ExecSpace>
125 int ndim,
int ntrial,
bool check)
129 typedef typename ExecSpace::array_layout DefaultLayout;
131 typedef Kokkos::View<FadType****,ContLayout,ExecSpace> t_4DView;
132 typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
133 typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
135 t_4DView wgb(
"",ncells,num_basis,num_points,ndim,
N+1);
136 t_3DView flux(
"",ncells,num_points,ndim,
N+1);
137 t_3DView wbs(
"",ncells,num_basis,num_points,
N+1);
138 t_2DView src(
"",ncells,num_points,
N+1);
139 t_2DView residual(
"",ncells,num_basis,
N+1);
140 init_fad(wgb, wbs, flux, src, residual);
147 const size_t block_size =
N*
sizeof(double);
148 size_t nkernels = ExecSpace::concurrency()*2;
151 const size_t mem_pool_size =
static_cast<size_t>(1.2*nkernels*block_size);
152 const size_t superblock_size =
153 std::max<size_t>(nkernels / 100, 1) * block_size;
154 ExecSpace exec_space;
156 block_size, block_size, superblock_size);
163 Kokkos::Impl::Timer timer;
164 for (
int i=0; i<ntrial; ++i)
167 double time = timer.seconds() / ntrial / ncells;
179 template <
int N,
typename ExecSpace>
181 int ncells,
int num_basis,
int num_points,
int ndim,
int ntrial,
bool check)
185 typedef typename ExecSpace::array_layout DefaultLayout;
187 typedef Kokkos::View<FadType****,ContLayout,ExecSpace> t_4DView;
188 typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
189 typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
191 t_4DView wgb(
"",ncells,num_basis,num_points,ndim,
N+1);
192 t_3DView flux(
"",ncells,num_points,ndim,
N+1);
193 t_3DView wbs(
"",ncells,num_basis,num_points,
N+1);
194 t_2DView src(
"",ncells,num_points,
N+1);
195 t_2DView residual(
"",ncells,num_basis,
N+1);
196 init_fad(wgb, wbs, flux, src, residual);
203 Kokkos::Impl::Timer timer;
204 for (
int i=0; i<ntrial; ++i)
207 double time = timer.seconds() / ntrial / ncells;
216 #define INST_FUNC_N_DEV(N,DEV) \
217 template double time_dfad_hierarchical_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
218 template double time_dfad_hierarchical_team_scratch< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
220 #define INST_FUNC_DEV(DEV) \
221 INST_FUNC_N_DEV( fad_dim, DEV )
223 #ifdef KOKKOS_ENABLE_SERIAL
227 #ifdef KOKKOS_ENABLE_OPENMP
231 #ifdef KOKKOS_ENABLE_THREADS
235 #ifdef KOKKOS_ENABLE_CUDA
double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
Sacado::Fad::DFad< double > FadType
void init_fad(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
void run_dfad_hierarchical_team(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
void check_residual(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
#define INST_FUNC_DEV(DEV)
void run_dfad_hierarchical_team_scratch(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
double time_dfad_hierarchical_team_scratch(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
void destroyGlobalMemoryPool(const ExecSpace &space)