30 #define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1 
   31 #define SACADO_KOKKOS_USE_MEMORY_POOL 1 
   37 #include "impl/Kokkos_Timer.hpp" 
   39 template<
typename FluxView, 
typename WgbView, 
typename SrcView,
 
   40          typename WbsView, 
typename ResidualView>
 
   42                                 const SrcView& src, 
const WbsView& wbs,
 
   43                                 const ResidualView& residual)
 
   45   typedef typename ResidualView::execution_space execution_space;
 
   46   typedef typename ResidualView::non_const_value_type scalar_type;
 
   47   typedef Kokkos::TeamPolicy<execution_space> policy_type;
 
   48   typedef typename policy_type::member_type team_member;
 
   50   const size_t num_cells = wgb.extent(0);
 
   51   const int num_basis    = wgb.extent(1);
 
   52   const int num_points   = wgb.extent(2);
 
   53   const int num_dim      = wgb.extent(3);
 
   56   const int vector_size  = is_cuda ? 32 : 1;
 
   57   const int team_size    = is_cuda ? 256/vector_size : 1;
 
   59   policy_type policy(num_cells,team_size,vector_size);
 
   60   Kokkos::parallel_for(policy, KOKKOS_LAMBDA (
const team_member& team)
 
   62     const int team_rank = team.team_rank();
 
   63     const size_t cell = team.league_rank();
 
   64     scalar_type value, value2;
 
   65     for (
int basis=team_rank; basis<num_basis; basis+=team_size) {
 
   68       for (
int qp=0; qp<num_points; ++qp) {
 
   69         for (
int dim=0; dim<num_dim; ++dim)
 
   70           value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
 
   71         value2 += src(cell,qp)*wbs(cell,basis,qp);
 
   73       residual(cell,basis) = value+value2;
 
   78 template<
typename FluxView, 
typename WgbView, 
typename SrcView,
 
   79          typename WbsView, 
typename ResidualView>
 
   81   const FluxView& flux, 
const WgbView& wgb,
 
   82   const SrcView& src, 
const WbsView& wbs,
 
   83   const ResidualView& residual)
 
   85   typedef typename ResidualView::execution_space execution_space;
 
   86   typedef typename ResidualView::non_const_value_type scalar_type;
 
   87   typedef Kokkos::TeamPolicy<execution_space> policy_type;
 
   88   typedef typename policy_type::member_type team_member;
 
   89   typedef Kokkos::View<scalar_type*, typename execution_space::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > tmp_scratch_type;
 
   91   const size_t num_cells = wgb.extent(0);
 
   92   const int num_basis    = wgb.extent(1);
 
   93   const int num_points   = wgb.extent(2);
 
   94   const int num_dim      = wgb.extent(3);
 
   97   const int vector_size  = is_cuda ? 32 : 1;
 
   98   const int team_size    = is_cuda ? 256/vector_size : 1;
 
   99   const int fad_size     = Kokkos::dimension_scalar(residual);
 
  100   const size_t bytes     = 2*tmp_scratch_type::shmem_size(team_size,fad_size);
 
  101   policy_type policy(num_cells,team_size,vector_size);
 
  103   Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)),
 
  104                        KOKKOS_LAMBDA (
const team_member& team)
 
  106     const int team_rank = team.team_rank();
 
  107     const size_t cell = team.league_rank();
 
  108     tmp_scratch_type value(team.team_scratch(0), team_size, fad_size);
 
  109     tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size);
 
  110     for (
int basis=team_rank; basis<num_basis; basis+=team_size) {
 
  111       value(team_rank) = 0.0;
 
  112       value2(team_rank) = 0.0;
 
  113       for (
int qp=0; qp<num_points; ++qp) {
 
  114         for (
int dim=0; dim<num_dim; ++dim)
 
  115           value(team_rank) += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
 
  116         value2(team_rank) += src(cell,qp)*wbs(cell,basis,qp);
 
  118       residual(cell,basis) = value(team_rank)+value2(team_rank);
 
  123 template <
int N, 
typename ExecSpace>
 
  125                                    int ndim, 
int ntrial, 
bool check)
 
  129   typedef typename ExecSpace::array_layout DefaultLayout;
 
  131   typedef Kokkos::View<FadType****,ContLayout,ExecSpace> t_4DView;
 
  132   typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
 
  133   typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
 
  135   t_4DView wgb(
"",ncells,num_basis,num_points,ndim,
N+1);
 
  136   t_3DView flux(
"",ncells,num_points,ndim,
N+1);
 
  137   t_3DView wbs(
"",ncells,num_basis,num_points,
N+1);
 
  138   t_2DView src(
"",ncells,num_points,
N+1);
 
  139   t_2DView residual(
"",ncells,num_basis,
N+1);
 
  140   init_fad(wgb, wbs, flux, src, residual);
 
  147   const size_t block_size = 
N*
sizeof(double);
 
  148   size_t nkernels = ExecSpace::concurrency()*2;
 
  151   const size_t mem_pool_size = 
static_cast<size_t>(1.2*nkernels*block_size);
 
  152   const size_t superblock_size =
 
  153     std::max<size_t>(nkernels / 100, 1) * block_size;
 
  154   ExecSpace exec_space;
 
  156                                  block_size, block_size, superblock_size);
 
  163   Kokkos::Impl::Timer timer;
 
  164   for (
int i=0; i<ntrial; ++i)
 
  167   double time = timer.seconds() / ntrial / ncells;
 
  179 template <
int N, 
typename ExecSpace>
 
  181   int ncells, 
int num_basis, 
int num_points, 
int ndim, 
int ntrial, 
bool check)
 
  185   typedef typename ExecSpace::array_layout DefaultLayout;
 
  187   typedef Kokkos::View<FadType****,ContLayout,ExecSpace> t_4DView;
 
  188   typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
 
  189   typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
 
  191   t_4DView wgb(
"",ncells,num_basis,num_points,ndim,
N+1);
 
  192   t_3DView flux(
"",ncells,num_points,ndim,
N+1);
 
  193   t_3DView wbs(
"",ncells,num_basis,num_points,
N+1);
 
  194   t_2DView src(
"",ncells,num_points,
N+1);
 
  195   t_2DView residual(
"",ncells,num_basis,
N+1);
 
  196   init_fad(wgb, wbs, flux, src, residual);
 
  203   Kokkos::Impl::Timer timer;
 
  204   for (
int i=0; i<ntrial; ++i)
 
  207   double time = timer.seconds() / ntrial / ncells;
 
  216 #define INST_FUNC_N_DEV(N,DEV) \ 
  217   template double time_dfad_hierarchical_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ 
  218   template double time_dfad_hierarchical_team_scratch< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); 
  220 #define INST_FUNC_DEV(DEV) \ 
  221   INST_FUNC_N_DEV( fad_dim, DEV ) 
  223 #ifdef KOKKOS_ENABLE_SERIAL 
  227 #ifdef KOKKOS_ENABLE_OPENMP 
  231 #ifdef KOKKOS_ENABLE_THREADS 
  235 #ifdef KOKKOS_ENABLE_CUDA 
double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
 
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
 
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
 
Sacado::Fad::DFad< double > FadType
 
void init_fad(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
 
void run_dfad_hierarchical_team(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
 
void check_residual(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
 
#define INST_FUNC_DEV(DEV)
 
void run_dfad_hierarchical_team_scratch(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
 
double time_dfad_hierarchical_team_scratch(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
 
void destroyGlobalMemoryPool(const ExecSpace &space)