3 #define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
4 #define SACADO_KOKKOS_USE_MEMORY_POOL 1
5 #define SACADO_ALIGN_SFAD 1
12 #include "Kokkos_Core.hpp"
13 #include "Kokkos_MemoryPool.hpp"
14 #include "impl/Kokkos_Timer.hpp"
20 template<
typename FluxView,
typename WgbView,
typename SrcView,
21 typename WbsView,
typename ResidualView>
24 template<
typename FluxView,
typename WgbView,
typename SrcView,
25 typename WbsView,
typename ResidualView>
29 const SrcView& src,
const WbsView& bs,
30 const ResidualView& residual,
31 const typename FluxView::non_const_value_type&
coeff);
33 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
38 template<
typename FluxView,
typename WgbView,
typename SrcView,
39 typename WbsView,
typename ResidualView>
43 typedef typename Kokkos::TeamPolicy<execution_space>::member_type
team_handle;
60 const SrcView& src,
const WbsView& bs,
79 void operator() (
const size_t cell,
const int basis)
const {
82 for (
int dim=0; dim<
num_dim; ++dim)
83 value +=
flux_m_i(cell,qp,dim)*
wgb(cell,basis,qp,dim);
91 for (
int basis=0; basis<
num_basis; ++basis) {
96 template <
unsigned VS>
99 const size_t cell = team.league_rank()*team.team_size() + team.team_rank();
104 template <
unsigned VS>
107 const size_t cell = team.league_rank();
108 const int team_size = team.team_size();
109 for (
int basis=team.team_rank(); basis<
num_basis; basis+=team_size)
110 (*
this)(cell, basis);
114 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL)
119 template<
typename FluxView,
typename WgbView,
typename SrcView,
120 typename WbsView,
typename ResidualView>
122 typedef typename FluxView::non_const_value_type
scalar_type;
123 typedef typename Kokkos::ThreadLocalScalarType<FluxView>::type local_scalar_type;
125 typedef typename Kokkos::TeamPolicy<execution_space>::member_type
team_handle;
126 enum { stride = Kokkos::ViewScalarStride<FluxView>::stride };
138 template <
unsigned VS>
struct HierarchicalFlatTag {};
139 template <
unsigned VS>
struct HierarchicalTeamTag {};
143 const SrcView& src,
const WbsView& bs,
162 void operator() (
const size_t cell,
const int basis)
const {
163 local_scalar_type value(0),value2(0);
164 local_scalar_type
c = Sacado::partition_scalar<stride>(
coeff);
166 for (
int dim=0; dim<
num_dim; ++dim)
167 value +=
flux_m_i(cell,qp,dim)*
wgb(cell,basis,qp,dim);
168 value2 +=
src_m_i(cell,qp)*
wbs(cell,basis,qp);
175 for (
int basis=0; basis<
num_basis; ++basis) {
180 template <
unsigned VS>
183 const size_t cell = team.league_rank()*team.team_size() + team.team_rank();
188 template <
unsigned VS>
191 const size_t cell = team.league_rank();
192 const int team_size = team.team_size();
193 for (
int basis=team.team_rank(); basis<
num_basis; basis+=team_size)
194 (*
this)(cell, basis);
203 template<
typename FluxView,
typename WgbView,
typename SrcView,
204 typename WbsView,
typename ResidualView>
206 typedef typename FluxView::non_const_value_type
scalar_type;
208 typedef typename Kokkos::TeamPolicy<execution_space>::member_type
team_handle;
219 template <
unsigned VS>
struct HierarchicalFlatTag {};
220 template <
unsigned VS>
struct HierarchicalTeamTag {};
221 template <
unsigned VS>
struct PartitionedTag {};
225 const SrcView& src,
const WbsView& bs,
226 const ResidualView& residual,
const scalar_type& c) :
235 num_points(
wgb.extent(2)),
244 void operator() (
const size_t cell,
const int basis)
const {
247 for (
int dim=0; dim<
num_dim; ++dim)
248 value +=
flux_m_i(cell,qp,dim)*
wgb(cell,basis,qp,dim);
249 value2 +=
src_m_i(cell,qp)*
wbs(cell,basis,qp);
256 for (
int basis=0; basis<
num_basis; ++basis) {
261 template <
unsigned VS>
263 void operator() (
const PartitionedTag<VS>,
const size_t cell,
const int basis)
const {
269 const unsigned k = threadIdx.x;
271 const unsigned k = 0;
275 auto flux_part = Kokkos::partition<VS>(
flux_m_i, k, VS);
276 auto wgb_part = Kokkos::partition<VS>(
wgb, k, VS);
277 auto src_part = Kokkos::partition<VS>(
src_m_i, k, VS);
278 auto wbs_part = Kokkos::partition<VS>(
wbs, k, VS);
279 auto resid_part = Kokkos::partition<VS>(
residual_m_i, k, VS);
280 auto coeff_part = Sacado::partition_scalar<VS>(
coeff);
284 wbs_part, resid_part,
286 kernel_part(cell, basis);
289 template <
unsigned VS>
292 const size_t cell = team.league_rank()*team.team_size() + team.team_rank();
294 for (
int basis=0; basis<
num_basis; ++basis)
295 (*
this)(PartitionedTag<VS>(), cell, basis);
298 template <
unsigned VS>
301 const size_t cell = team.league_rank();
302 const int team_size = team.team_size();
303 for (
int basis=team.team_rank(); basis<
num_basis; basis+=team_size)
304 (*
this)(PartitionedTag<VS>(), cell, basis);
310 template<
typename FluxView,
typename WgbView,
typename SrcView,
311 typename WbsView,
typename ResidualView>
315 const SrcView& src,
const WbsView& bs,
316 const ResidualView& residual,
317 const typename FluxView::non_const_value_type& coeff)
320 return kernel_type(flux,bg,src,bs,residual,coeff);
323 template<
typename KernelType>
325 typedef typename KernelType::execution_space execution_space;
326 Kokkos::RangePolicy<execution_space> policy(0,kernel.num_cells());
327 Kokkos::parallel_for(policy, kernel);
330 template<
typename KernelType>
332 typedef typename KernelType::execution_space execution_space;
333 #if defined (KOKKOS_ENABLE_CUDA)
334 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
336 const bool is_cuda =
false;
338 const unsigned vector_size = is_cuda ? 32 : 1;
340 const unsigned team_size = 256 / vector_size;
341 typedef typename KernelType::template HierarchicalFlatTag<vector_size> tag_type;
342 typedef Kokkos::TeamPolicy<execution_space,tag_type> policy_type;
343 const size_t range = (kernel.num_cells()+team_size-1)/team_size;
344 policy_type policy(range,team_size,vector_size);
345 Kokkos::parallel_for(policy, kernel);
352 template<
typename KernelType>
354 typedef typename KernelType::execution_space execution_space;
355 #if defined (KOKKOS_ENABLE_CUDA)
356 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
358 const bool is_cuda =
false;
360 const unsigned vector_size = is_cuda ? 32 : 1;
362 const unsigned team_size = 256 / vector_size;
363 typedef typename KernelType::template HierarchicalTeamTag<vector_size> tag_type;
364 typedef Kokkos::TeamPolicy<execution_space,tag_type> policy_type;
365 policy_type policy(kernel.num_cells(),team_size,vector_size);
366 Kokkos::parallel_for(policy, kernel);
376 return std::string(
"dfad")
377 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
378 + std::string(
", mempool")
380 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD_STRIDED)
381 + std::string(
", strided")
388 #if defined(SACADO_ALIGN_SFAD)
389 return "sfad, aligned";
396 static std::string
eval() {
return "slfad"; }
399 template<
typename ExecSpace,
int DIM,
int N>
411 typedef Kokkos::View<double****[N+1],ExecSpace>
t_4DView;
412 typedef Kokkos::View<double***[N+1],ExecSpace>
t_3DView;
413 typedef Kokkos::View<double**[N+1],ExecSpace>
t_2DView;
415 typedef Kokkos::View<const double***[N+1],ExecSpace,Kokkos::MemoryTraits<Kokkos::RandomAccess> >
t_3DView_const;
417 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
427 #if defined(KOKKOS_ENABLE_CUDA)
429 std::is_same< ExecSpace, Kokkos::Cuda >::value ? 32 : 1;
430 #if defined(SACADO_ALIGN_SFAD)
432 typedef typename FadType::template apply_N<Nalign>::type
AlignedFadType;
445 typedef typename Kokkos::TeamPolicy<ExecSpace>::member_type
team_handle;
447 typedef Kokkos::View<double****[N+1],Kokkos::LayoutRight,ExecSpace>
t_4DView_team;
448 typedef Kokkos::View<double***[N+1],Kokkos::LayoutRight,ExecSpace>
t_3DView_team;
449 typedef Kokkos::View<double**[N+1],Kokkos::LayoutRight,ExecSpace>
t_2DView_team;
450 typedef Kokkos::View<const double***[N+1],Kokkos::LayoutRight,ExecSpace,Kokkos::MemoryTraits<Kokkos::RandomAccess> >
t_3DView_const_team;
453 typedef Kokkos::View<double[N+1],typename ExecSpace::scratch_memory_space,Kokkos::MemoryTraits<Kokkos::Unmanaged> >
t_shared_scalar;
477 num_basis(num_basis_) ,
478 num_points(num_points_) ,
513 for (
int i=0; i<N; ++i)
518 typename FadType::value_type
520 const size_t n2,
const size_t n3,
const int fad_size,
521 const size_t i0,
const size_t i1,
522 const size_t i2,
const size_t i3,
525 typedef typename FadType::value_type scalar;
526 const scalar x0 = 10.0 + scalar(n0) / scalar(i0+1);
527 const scalar x1 = 100.0 + scalar(n1) / scalar(i1+1);
528 const scalar x2 = 1000.0 + scalar(n2) / scalar(i2+1);
529 const scalar x3 = 10000.0 + scalar(n3) / scalar(i3+1);
530 const scalar x = x0 + x1 + x2 + x3;
531 if (i_fad == fad_size)
533 const scalar x_fad = 1.0 + scalar(fad_size) / scalar(i_fad+1);
537 template <
typename V1,
typename V2,
typename V3,
typename V4,
typename V5>
538 void init_fad(
const V1& v1,
const V2& v2,
const V3& v3,
const V4& v4,
const V5& v5)
545 auto v1_h = Kokkos::create_mirror_view(v1);
546 auto v2_h = Kokkos::create_mirror_view(v2);
547 auto v3_h = Kokkos::create_mirror_view(v3);
548 auto v4_h = Kokkos::create_mirror_view(v4);
549 for (
int cell=0; cell<
ncells; ++cell) {
550 for (
int basis=0; basis<
num_basis; ++basis) {
552 for (
int dim=0; dim<
ndim; ++dim) {
553 for (
int i=0; i<N; ++i)
554 v1_h(cell,basis,qp,dim).fastAccessDx(i) =
555 generate_fad(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,i);
556 v1_h(cell,basis,qp,dim).val() =
557 generate_fad(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,N);
559 for (
int i=0; i<N; ++i)
560 v2_h(cell,basis,qp).fastAccessDx(i) =
561 generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,i);
562 v2_h(cell,basis,qp).val() =
563 generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,N);
567 for (
int dim=0; dim<
ndim; ++dim) {
568 for (
int i=0; i<N; ++i)
569 v3_h(cell,qp,dim).fastAccessDx(i) =
570 generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,i);
571 v3_h(cell,qp,dim).val() =
572 generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,N);
574 for (
int i=0; i<N; ++i)
575 v4_h(cell,qp).fastAccessDx(i) =
577 v4_h(cell,qp).val() =
582 Kokkos::deep_copy( v1, v1_h );
583 Kokkos::deep_copy( v2, v2_h );
584 Kokkos::deep_copy( v3, v3_h );
585 Kokkos::deep_copy( v4, v4_h );
587 Kokkos::deep_copy(
typename V5::array_type(v5), 0.0);
590 template <
typename V1,
typename V2,
typename V3,
typename V4,
typename V5>
591 void init_array(
const V1& v1,
const V2& v2,
const V3& v3,
const V4& v4,
const V5& v5)
598 auto v1_h = Kokkos::create_mirror_view(v1);
599 auto v2_h = Kokkos::create_mirror_view(v2);
600 auto v3_h = Kokkos::create_mirror_view(v3);
601 auto v4_h = Kokkos::create_mirror_view(v4);
602 for (
int cell=0; cell<
ncells; ++cell) {
603 for (
int basis=0; basis<
num_basis; ++basis) {
605 for (
int dim=0; dim<
ndim; ++dim) {
606 for (
int i=0; i<N; ++i)
607 v1_h(cell,basis,qp,dim,i) =
608 generate_fad(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,i);
609 v1_h(cell,basis,qp,dim,N) =
610 generate_fad(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,N);
612 for (
int i=0; i<N; ++i)
613 v2_h(cell,basis,qp,i) =
614 generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,i);
615 v2_h(cell,basis,qp,N) =
616 generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,N);
620 for (
int dim=0; dim<
ndim; ++dim) {
621 for (
int i=0; i<N; ++i)
622 v3_h(cell,qp,dim,i) =
623 generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,i);
624 v3_h(cell,qp,dim,N) =
625 generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,N);
627 for (
int i=0; i<N; ++i)
635 Kokkos::deep_copy( v1, v1_h );
636 Kokkos::deep_copy( v2, v2_h );
637 Kokkos::deep_copy( v3, v3_h );
638 Kokkos::deep_copy( v4, v4_h );
640 Kokkos::deep_copy(
typename V5::array_type(v5), 0.0);
643 template <
typename View1,
typename View2>
644 typename std::enable_if< !Kokkos::is_view_fad<View2>::value,
bool>::type
645 check(
const View1& v_gold,
const View2& v,
const double tol)
648 typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold);
649 typename View2::HostMirror v_h = Kokkos::create_mirror_view(v);
650 Kokkos::deep_copy(v_gold_h, v_gold);
651 Kokkos::deep_copy(v_h, v);
653 typedef typename View1::value_type value_type;
655 const size_t n0 = v_gold_h.extent(0);
656 const size_t n1 = v_gold_h.extent(1);
657 const size_t n2 = v_gold_h.extent(2);
660 for (
size_t i0 = 0 ; i0 < n0 ; ++i0 ) {
661 for (
size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
662 for (
size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
663 value_type x_gold = v_gold_h(i0,i1,i2);
664 value_type x = v_h(i0,i1,i2);
666 std::cout <<
"Comparison failed! x_gold("
667 << i0 <<
"," << i1 <<
"," << i2 <<
") = "
668 << x_gold <<
" , x = " << x
679 template <
typename View1,
typename View2>
680 typename std::enable_if< Kokkos::is_view_fad<View2>::value,
bool>::type
681 check(
const View1& v_gold,
const View2& v,
const double tol)
684 typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold);
685 typename View2::HostMirror v_h = Kokkos::create_mirror_view(v);
686 Kokkos::deep_copy(v_gold_h, v_gold);
687 Kokkos::deep_copy(v_h, v);
689 typedef typename View1::value_type value_type;
691 const size_t n0 = v_gold_h.extent(0);
692 const size_t n1 = v_gold_h.extent(1);
693 const size_t n2 = v_gold_h.extent(2);
696 for (
size_t i0 = 0 ; i0 < n0 ; ++i0 ) {
697 for (
size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
698 for (
size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
699 value_type x_gold = v_gold_h(i0,i1,i2);
700 value_type x = (i2 == n2-1) ? v_h(i0,i1).val() : v_h(i0,i1).dx(i2);
702 std::cout <<
"Comparison failed! x_gold("
703 << i0 <<
"," << i1 <<
"," << i2 <<
") = "
704 << x_gold <<
" , x = " << x
717 for (
int basis=0; basis<
num_basis; ++basis) {
718 double value[N+1],value2[N+1];
719 for (
int k=0; k<N+1; ++k) {
724 for (
int dim=0; dim<DIM; ++dim) {
725 const double flux_val =
flux_m_i(cell,qp,dim,N);
726 const double wgb_val =
wgb(cell,basis,qp,dim,N);
727 value[N] += flux_val*wgb_val;
728 for(
int k = 0; k<N;k++)
729 value[k] += flux_val*
wgb(cell,basis,qp,dim,k)+
flux_m_i(cell,qp,dim,k)*wgb_val;
731 const double src_val =
src_m_i(cell,qp,N);
732 const double wbs_val =
wbs(cell,basis,qp,N);
733 value2[N] += src_val*wbs_val;
734 for(
int k = 0; k<N;k++)
735 value2[k] += src_val*
wbs(cell,basis,qp,k)+
src_m_i(cell,qp,k)*wbs_val;
737 for(
int k = 0; k<N; k++)
739 coeff.val()*(value[k]+value2[k]) +
740 coeff.fastAccessDx(k)*(value[N]+value2[N]);
747 for (
int basis=0; basis<
num_basis; ++basis) {
748 double value[N+1],value2[N+1];
749 for (
int k=0; k<N+1; ++k) {
754 for (
int dim=0; dim<DIM; ++dim) {
755 const double flux_val =
flux_m_i(cell,qp,dim,N);
756 const double wgb_val =
wgb(cell,basis,qp,dim,N);
757 value[N] += flux_val*wgb_val;
758 for(
int k = 0; k<N;k++)
759 value[k] += flux_val*
wgb(cell,basis,qp,dim,k)+
flux_m_i_const(cell,qp,dim,k)*wgb_val;
761 const double src_val =
src_m_i(cell,qp,N);
762 const double wbs_val =
wbs(cell,basis,qp,N);
763 value2[N] += src_val*wbs_val;
764 for(
int k = 0; k<N;k++)
765 value2[k] += src_val*
wbs(cell,basis,qp,k)+
src_m_i(cell,qp,k)*wbs_val;
767 for(
int k = 0; k<N; k++)
769 coeff.val()*(value[k]+value2[k]) +
770 coeff.fastAccessDx(k)*(value[N]+value2[N]);
780 for (
int dim=0; dim<DIM; ++dim) {
781 const double flux_val =
tflux_m_i(cell,qp,dim,N);
782 const double wgb_val =
twgb(cell,basis,qp,dim,N);
783 Kokkos::single(Kokkos::PerThread(team), [&] () {
784 value[N] += flux_val*wgb_val;
786 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N), [&] (
const int& k) {
790 const double src_val =
tsrc_m_i(cell,qp,N);
791 const double wbs_val =
twbs(cell,basis,qp,N);
792 Kokkos::single(Kokkos::PerThread(team), [&] () {
793 value2[N] += src_val*wbs_val;
795 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N), [&] (
const int& k) {
796 value2[k] += src_val*
twbs(cell,basis,qp,k)+
tsrc_m_i(cell,qp,k)*wbs_val;
799 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N), [&] (
const int& k) {
801 coeff.val()*(value[k]+value2[k]) +
802 coeff.fastAccessDx(k)*(value[N]+value2[N]);
804 Kokkos::single(Kokkos::PerThread(team), [&] () {
814 const int cell = team.league_rank();
815 Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_basis), [&] (
const int& basis) {
816 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N+1), [&] (
const int& k) {
824 void compute(
const int ntrial,
const bool do_check) {
827 Kokkos::Impl::Timer timer;
828 for (
int i=0; i<ntrial; ++i) {
832 double time_fad = timer.seconds() / ntrial /
ncells;
837 for (
int i=0; i<ntrial; ++i) {
841 double time_fad_cont = timer.seconds() / ntrial /
ncells;
844 for (
int i=0; i<ntrial; ++i)
845 Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace,MomFluxTag>(0,
ncells), *
this);
847 double time = timer.seconds() / ntrial /
ncells;
850 for (
int i=0; i<ntrial; ++i)
851 Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace,MomFluxTagConst>(0,
ncells), *
this);
853 double time_const = timer.seconds() / ntrial /
ncells;
856 for (
int i=0; i<ntrial; ++i)
857 Kokkos::parallel_for(Kokkos::TeamPolicy<ExecSpace,MomFluxTagConstTeam>(
ncells,num_basis,32).set_scratch_size(0,Kokkos::PerThread(64*8*2)), *
this);
859 double time_team = timer.seconds() / ntrial /
ncells;
861 printf(
"%5d %9.3e %9.3e %9.3e %9.3e %9.3e\n",
ncells,time_fad,time_fad_cont,time,time_const,time_team);
864 const double tol = 1e-14;
873 template <
typename ExecSpace>
874 void run(
const int cell_begin,
const int cell_end,
const int cell_step,
875 const int nbasis,
const int npoint,
const int ntrial,
const bool check)
877 const int fad_dim = 50;
885 std::cout <<
"concurrency = " << ExecSpace::concurrency() << std::endl;
886 const size_t block_size = fad_dim*
sizeof(double);
887 size_t nkernels = ExecSpace::concurrency()*2;
888 #if defined(KOKKOS_ENABLE_CUDA)
889 if (std::is_same<ExecSpace, Kokkos::Cuda>::value)
892 size_t mem_pool_size =
893 static_cast<size_t>(1.2*nkernels*block_size);
894 const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
895 std::cout <<
"Memory pool size = " << mem_pool_size / (1024.0 * 1024.0)
896 <<
" MB" << std::endl;
897 ExecSpace exec_space;
899 block_size, block_size, superblock_size);
901 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL)
902 std::cout <<
"hierarchical";
903 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
904 std::cout <<
"hierarchical_dfad";
906 std::cout <<
"partitioned";
911 printf(
"ncell flat hier analytic const team\n");
912 for(
int i=cell_begin; i<=cell_end; i+=cell_step) {
913 test_type
test(i,nbasis,npoint);
914 test.compute(ntrial, check);
920 int main(
int argc,
char* argv[]) {
926 clp.
setDocString(
"This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
927 #ifdef KOKKOS_ENABLE_SERIAL
929 clp.
setOption(
"serial",
"no-serial", &serial,
"Whether to run Serial");
931 #ifdef KOKKOS_ENABLE_OPENMP
933 clp.
setOption(
"openmp", &openmp,
"Number of OpenMP threads");
935 #ifdef KOKKOS_ENABLE_THREADS
937 clp.
setOption(
"threads", &threads,
"Number of pThreads threads");
939 #ifdef KOKKOS_ENABLE_CUDA
941 clp.
setOption(
"cuda",
"no-cuda", &cuda,
"Whether to run CUDA");
945 "Number of NUMA domains to use (set to 0 to use all NUMAs");
946 int cores_per_numa = 0;
947 clp.
setOption(
"cores-per-numa", &cores_per_numa,
948 "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
949 bool print_config =
false;
950 clp.
setOption(
"print-config",
"no-print-config", &print_config,
951 "Whether to print Kokkos device configuration");
952 int cell_begin = 100;
953 clp.
setOption(
"begin", &cell_begin,
"Starting number of cells");
955 clp.
setOption(
"end", &cell_end,
"Ending number of cells");
957 clp.
setOption(
"step", &cell_step,
"Cell increment");
959 clp.
setOption(
"basis", &nbasis,
"Number of basis functions");
961 clp.
setOption(
"point", &npoint,
"Number of integration points");
963 clp.
setOption(
"trial", &ntrial,
"Number of trials");
965 clp.
setOption(
"check",
"no-check", &check,
966 "Check correctness of results");
969 switch (clp.
parse(argc, argv)) {
979 Kokkos::InitArguments init_args;
980 init_args.num_threads = -1;
981 #ifdef KOKKOS_ENABLE_OPENMP
982 if(openmp) init_args.num_threads = openmp;
984 #ifdef KOKKOS_ENABLE_THREADS
985 if(threads) init_args.num_threads = threads;
988 Kokkos::initialize(init_args);
990 Kokkos::print_configuration(std::cout,
true);
992 #ifdef KOKKOS_ENABLE_SERIAL
994 using Kokkos::Serial;
995 run<Serial>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
999 #ifdef KOKKOS_ENABLE_OPENMP
1001 using Kokkos::OpenMP;
1002 run<OpenMP>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
1006 #ifdef KOKKOS_ENABLE_THREADS
1008 using Kokkos::Threads;
1009 run<Threads>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
1013 #ifdef KOKKOS_ENABLE_CUDA
1016 run<Cuda>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
static const int FadStride
Kokkos::TeamPolicy< ExecSpace >::member_type team_handle
FluxView::non_const_value_type scalar_type
t_3DViewFadCont wbs_fad_cont
KOKKOS_INLINE_FUNCTION size_t num_cells() const
Kokkos::View< AlignedFadType **, ContLayout, ExecSpace > t_2DViewFadCont
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
void run_hierarchical_team(const KernelType &kernel)
t_3DViewFadCont flux_m_i_fad_cont
DrekarTest(int ncells_, int num_basis_, int num_points_)
GeneralFad< StaticStorage< T, Num > > SLFad
KOKKOS_INLINE_FUNCTION AdvectionKernel< FluxView, WgbView, SrcView, WbsView, ResidualView > create_advection_kernel(const FluxView &flux, const WgbView &bg, const SrcView &src, const WbsView &bs, const ResidualView &residual, const typename FluxView::non_const_value_type &coeff)
void run(const int cell_begin, const int cell_end, const int cell_step, const int nbasis, const int npoint, const int ntrial, const bool check)
Kokkos::View< double **[N+1], ExecSpace > t_2DView
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
t_3DView_const_team tflux_m_i_const
Kokkos::View< AlignedFadType ***, ContLayout, ExecSpace > t_3DViewFadCont
Kokkos::View< double **[N+1], Kokkos::LayoutRight, ExecSpace > t_2DView_team
t_4DViewFadCont wgb_fad_cont
KOKKOS_INLINE_FUNCTION void operator()(const MomFluxTag, const std::size_t &cell) const
#define KOKKOS_INLINE_FUNCTION
t_2DView_team tresidual_m_i
Sacado::Fad::DFad< double > FadType
void compute(const int ntrial, const bool do_check)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
FadType::value_type generate_fad(const size_t n0, const size_t n1, const size_t n2, const size_t n3, const int fad_size, const size_t i0, const size_t i1, const size_t i2, const size_t i3, const int i_fad)
std::enable_if< Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
GeneralFad< DynamicStorage< T > > DFad
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
const ResidualView residual_m_i
static std::string eval()
void init_array(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
FluxView::execution_space execution_space
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
static std::string eval()
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
Kokkos::View< AlignedFadType ****, ContLayout, ExecSpace > t_4DViewFadCont
void run_flat(const KernelType &kernel)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
KOKKOS_INLINE_FUNCTION void operator()(const size_t cell, const int basis) const
Kokkos::View< double ***[N+1], Kokkos::LayoutRight, ExecSpace > t_3DView_team
Kokkos::TeamPolicy< execution_space >::member_type team_handle
void init_fad(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
t_2DViewFadCont residual_m_i_fad_cont
void run_hierarchical_flat(const KernelType &kernel)
t_2DView residual_m_i_const
void setDocString(const char doc_string[])
Kokkos::View< double[N+1], typename ExecSpace::scratch_memory_space, Kokkos::MemoryTraits< Kokkos::Unmanaged > > t_shared_scalar
t_3DView_const flux_m_i_const
Kokkos::View< FadType ****, ExecSpace > t_4DViewFad
Kokkos::View< FadType ***, ExecSpace > t_3DViewFad
ExecSpace::array_layout DefaultLayout
Kokkos::LayoutContiguous< DefaultLayout, FadStride > ContLayout
static std::string eval()
Kokkos::View< const double ***[N+1], ExecSpace, Kokkos::MemoryTraits< Kokkos::RandomAccess > > t_3DView_const
void destroyGlobalMemoryPool(const ExecSpace &space)
KOKKOS_INLINE_FUNCTION void compute_one(const MomFluxTagConstTeam, const team_handle &team, const int &cell, const int &basis, const t_shared_scalar &value, const t_shared_scalar &value2) const
Kokkos::View< double ***[N+1], ExecSpace > t_3DView
Kokkos::View< const double ***[N+1], Kokkos::LayoutRight, ExecSpace, Kokkos::MemoryTraits< Kokkos::RandomAccess > > t_3DView_const_team
t_2DViewFadCont src_m_i_fad_cont
t_2DViewFad residual_m_i_fad
Kokkos::View< double ****[N+1], Kokkos::LayoutRight, ExecSpace > t_4DView_team
GeneralFad< StaticFixedStorage< T, Num > > SFad
Kokkos::View< double ****[N+1], ExecSpace > t_4DView
KOKKOS_INLINE_FUNCTION AdvectionKernel(const FluxView &flux, const WgbView &gb, const SrcView &src, const WbsView &bs, const ResidualView &residual, const scalar_type &c)
Kokkos::View< FadType **, ExecSpace > t_2DViewFad