10 #define SACADO_VIEW_CUDA_HIERARCHICAL 1
11 #define SACADO_ALIGN_SFAD 1
17 #include "Kokkos_Timer.hpp"
19 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
22 typedef typename Kokkos::ThreadLocalScalarType<ViewTypeC>::type scalar_type;
23 typedef typename ViewTypeC::execution_space execution_space;
25 #if defined (KOKKOS_ENABLE_CUDA)
27 const unsigned vector_size = is_cuda ? 32 : 1;
28 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
29 #elif defined (KOKKOS_ENABLE_HIP)
31 const unsigned vector_size = is_hip ? 64 : 1;
32 const unsigned team_size = is_hip ? 128 / vector_size : 1;
34 const unsigned vector_size = 1;
35 const unsigned team_size = 1;
38 const int m = A.extent(0);
39 const int n = A.extent(1);
40 const int range = (m+team_size-1)/team_size;
42 typedef Kokkos::TeamPolicy<execution_space> Policy;
44 Policy( range,team_size,vector_size ),
45 KOKKOS_LAMBDA (
const typename Policy::member_type& team) {
46 const int i = team.league_rank()*team.team_size() + team.team_rank();
51 for (
int j=0; j<n; ++j)
58 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
62 const double tol = 1.0e-14;
63 typedef typename ViewTypeC::value_type value_type;
64 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
65 Kokkos::deep_copy(h_c, c);
66 const size_t m = A.extent(0);
67 const size_t n = A.extent(1);
68 const size_t p = Kokkos::dimension_scalar(A);
69 for (
size_t i=0;
i<m; ++
i) {
70 for (
size_t j=0; j<
p; ++j) {
71 value_type t = (j == p-1 ? n : 2*n);
73 std::cout <<
"Comparison failed! " <<
i <<
"," << j <<
" : "
74 << h_c(
i).fastAccessDx(j) <<
" , " << t << std::endl;
80 template <
typename FadType,
typename ... ViewArgs>
83 const size_t nloop,
const bool check)
85 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
86 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
87 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
88 typedef typename ViewTypeA::execution_space execution_space;
90 #if defined (KOKKOS_ENABLE_CUDA)
92 const int FadStride = is_cuda ? 32 : 1;
93 #elif defined (KOKKOS_ENABLE_HIP)
95 const int FadStride = is_hip ? 64 : 1;
97 const int FadStride = 1;
100 #if defined(SACADO_ALIGN_SFAD)
102 const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
103 const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride :
p;
104 typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
106 typedef FadType AlignedFadType;
114 typedef Kokkos::View<AlignedFadType**, ConLayoutA, execution_space> ConViewTypeA;
115 typedef Kokkos::View<AlignedFadType*, ConLayoutB, execution_space> ConViewTypeB;
116 typedef Kokkos::View<AlignedFadType*, ConLayoutC, execution_space> ConViewTypeC;
118 ConViewTypeA
A(
"A",m,n,pa+1);
119 ConViewTypeB b(
"B",n,pa+1);
120 ConViewTypeC
c(
"c",m,pa+1);
125 Kokkos::deep_copy(
typename ConViewTypeA::array_type(A), 1.0);
126 Kokkos::deep_copy(
typename ConViewTypeB::array_type(b), 1.0);
128 Kokkos::Timer wall_clock;
133 execution_space().fence();
136 for (
size_t l=0; l<nloop; l++) {
139 execution_space().fence();
141 perf.
time = wall_clock.seconds() / nloop;
155 #define INST_FUNC_FAD_DEV(FAD,DEV) \
156 template Perf do_time_fad_hierarchical< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
157 template Perf do_time_fad_hierarchical< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
158 template Perf do_time_fad_hierarchical< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check );
160 #define INST_FUNC_DEV(DEV) \
161 INST_FUNC_FAD_DEV( SFad_type, DEV ) \
162 INST_FUNC_FAD_DEV( SLFad_type, DEV )
164 #ifdef KOKKOS_ENABLE_SERIAL
168 #ifdef KOKKOS_ENABLE_OPENMP
172 #ifdef KOKKOS_ENABLE_THREADS
176 #ifdef KOKKOS_ENABLE_CUDA
180 #ifdef KOKKOS_ENABLE_HIP
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void run_mat_vec_hierarchical(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void check_deriv_hierarchical(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Sacado::Fad::DFad< double > FadType
Base template specification for static size.
#define INST_FUNC_DEV(DEV)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
Sacado::Fad::SFad< double, SFadSize > SFad_type
Perf do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
Sacado::Fad::SLFad< double, SLFadSize > SLFad_type