// Feature macros defined before Kokkos_Timer.hpp (and any other headers not
// visible in this excerpt) are included.
// NOTE(review): presumably these switch Sacado to its hierarchical DFad view
// specialization on CUDA and to its Kokkos memory-pool allocation path; the
// headers that read them are not shown here -- confirm.
10 #define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
11 #define SACADO_KOKKOS_USE_MEMORY_POOL 1
17 #include "Kokkos_Timer.hpp"
// Team-policy kernel launch templated on the view types of A, b and c.
// NOTE(review): the signature line and several body lines of this function
// are not present in this excerpt; the comments below describe only what the
// visible lines do.
19 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
22 typedef typename ViewTypeC::value_type scalar_type;
23 typedef typename ViewTypeC::execution_space execution_space;
// Backend-dependent launch shape. With CUDA enabled: vector_size is 32 when
// is_cuda holds, else 1. With HIP enabled: 64 when is_hip holds, else 1.
// In both cases team_size is 128 / vector_size on the device and 1 otherwise.
// is_cuda / is_hip are defined on lines not visible here.
25 #if defined (KOKKOS_ENABLE_CUDA)
27 const unsigned vector_size = is_cuda ? 32 : 1;
28 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
29 #elif defined (KOKKOS_ENABLE_HIP)
31 const unsigned vector_size = is_hip ? 64 : 1;
32 const unsigned team_size = is_hip ? 128 / vector_size : 1;
// Remaining case (presumably the #else branch, whose directive is not
// visible): one thread per team, vector length 1.
34 const unsigned vector_size = 1;
35 const unsigned team_size = 1;
38 const int m = A.extent(0);
39 const int n = A.extent(1);
// ceil(m / team_size): enough teams that every row index 0..m-1 is assigned
// to some (league_rank, team_rank) pair.
40 const int range = (m+team_size-1)/team_size;
42 typedef Kokkos::TeamPolicy<execution_space> Policy;
// Policy arguments, in order: number of teams, threads per team, vector length.
44 Policy( range,team_size,vector_size ),
45 KOKKOS_LAMBDA (
const typename Policy::member_type& team) {
// Row index owned by this team member.
// NOTE(review): because `range` is rounded up, i can be >= m for the last
// team; any bounds guard would be on lines not visible here -- confirm.
46 const int i = team.league_rank()*team.team_size() + team.team_rank();
// Walk the n = A.extent(1) columns (loop body not visible in this excerpt).
51 for (
int j=0; j<n; ++j)
// Team-policy kernel launch that accumulates each row's result in per-team
// scratch memory. Takes the views A, b and c by const reference.
// NOTE(review): the function-name line and the tail of the body are not
// present in this excerpt; the comments below describe only the visible lines.
58 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
60 const ViewTypeA&
A,
const ViewTypeB& b,
const ViewTypeC&
c) {
61 typedef typename ViewTypeC::value_type scalar_type;
62 typedef typename ViewTypeC::execution_space execution_space;
63 typedef Kokkos::TeamPolicy<execution_space> Policy;
64 typedef typename Policy::member_type team_member;
// Unmanaged rank-1 view of scalar_type living in the execution space's
// scratch memory space; used as the per-team temporary below.
65 typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
// Backend-dependent launch shape. With CUDA enabled: VectorSize is 32 when
// is_cuda holds, else 1. With HIP enabled: 64 when is_hip holds, else 1.
// In both cases TeamSize is 128 / VectorSize on the device and 1 otherwise.
// is_cuda / is_hip are defined on lines not visible here.
67 #if defined (KOKKOS_ENABLE_CUDA)
69 const unsigned VectorSize = is_cuda ? 32 : 1;
70 const unsigned TeamSize = is_cuda ? 128 / VectorSize : 1;
71 #elif defined (KOKKOS_ENABLE_HIP)
73 const unsigned VectorSize = is_hip ? 64 : 1;
74 const unsigned TeamSize = is_hip ? 128 / VectorSize : 1;
// Remaining case (presumably the #else branch, whose directive is not
// visible): one thread per team, vector length 1.
76 const unsigned VectorSize = 1;
77 const unsigned TeamSize = 1;
80 const int m = A.extent(0);
81 const int n = A.extent(1);
// NOTE(review): presumably the Fad scalar dimension of A's entries; it is
// forwarded as the extra size argument when sizing/constructing the scratch
// view -- confirm against dimension_scalar's documentation.
82 const int p = dimension_scalar(A);
// ceil(m / TeamSize): number of teams needed to cover all m rows.
83 const int N = (m+TeamSize-1)/TeamSize;
85 Policy policy(N, TeamSize, VectorSize);
// Scratch bytes for a TmpScratchSpace with TeamSize entries and extra size p.
86 const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
// Request `bytes` of scratch level 0 per team for this launch.
88 policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
89 KOKKOS_LAMBDA (
const team_member& team) {
90 const int team_rank = team.team_rank();
91 const int team_size = team.team_size();
// Wrap the team's level-0 scratch allocation: one entry per team member.
92 TmpScratchSpace t(team.team_scratch(0), team_size,
p);
// Row index owned by this team member.
// NOTE(review): because N is rounded up, i can be >= m for the last team;
// any bounds guard would be on lines not visible here -- confirm.
93 const int i = team.league_rank()*team_size + team_rank;
// Accumulate the dot product of row i of A with b into this member's
// scratch entry. Initialization of t(team_rank) and the copy into c are
// not visible in this excerpt.
96 for (
int j=0; j<n; ++j)
97 t(team_rank) +=
A(i,j)*b(j);
// Host-side check of the result view c against expected values.
// NOTE(review): the signature line and the comparison condition are not
// present in this excerpt; the comments below describe only the visible lines.
104 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
// Tolerance for the comparison; its use is on a line not visible here.
108 const double tol = 1.0e-14;
109 typedef typename ViewTypeC::value_type value_type;
// Bring c to the host so its entries can be inspected.
110 typename ViewTypeC::host_mirror_type h_c = Kokkos::create_mirror_view(c);
111 Kokkos::deep_copy(h_c, c);
112 const size_t m = A.extent(0);
113 const size_t n = A.extent(1);
114 const size_t p = Kokkos::dimension_scalar(A);
// Visit every row i and every index j in [0, p).
115 for (
size_t i=0;
i<m; ++
i) {
116 for (
size_t j=0; j<
p; ++j) {
// Expected value for slot j: n for the last slot (j == p-1), 2*n otherwise.
// NOTE(review): presumably this matches A and b being filled with 1.0 in
// both values and derivatives by the timing drivers -- confirm.
117 value_type t = (j == p-1 ? n : 2*n);
// Report the mismatching entry: indices, the computed fastAccessDx(j) value
// and the expected value. The guarding condition is not visible here.
119 std::cout <<
"Comparison failed! " <<
i <<
"," << j <<
" : "
120 << h_c(
i).fastAccessDx(j) <<
" , " << t << std::endl;
// Timing driver: allocates A, b and c, fills A and b with 1.0, sizes a memory
// pool, and records the average wall-clock time per loop iteration.
// NOTE(review): the signature line, the kernel/check calls and the memory-pool
// create/destroy calls are not present in this excerpt; the comments below
// describe only the visible lines.
126 template <
typename FadType,
typename ... ViewArgs>
129 const size_t nloop,
const bool check)
131 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
132 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
133 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
134 typedef typename ViewTypeA::execution_space execution_space;
// View types with explicit layouts; ConLayoutA/B/C are defined on lines not
// visible here.
138 typedef Kokkos::View<FadType**, ConLayoutA, execution_space> ConViewTypeA;
139 typedef Kokkos::View<FadType*, ConLayoutB, execution_space> ConViewTypeB;
140 typedef Kokkos::View<FadType*, ConLayoutC, execution_space> ConViewTypeC;
// A is m x n, b has n entries, c has m entries; p+1 is passed as the
// trailing size argument of each view.
142 ConViewTypeA
A(
"A",m,n,p+1);
143 ConViewTypeB b(
"B",n,p+1);
144 ConViewTypeC
c(
"c",m,p+1);
// Fill A and b with 1.0 through a converted view type: ::type for
// KOKKOS_VERSION >= 40799, ::array_type in the second pair (presumably the
// #else branch, whose directive is not visible).
149 #if KOKKOS_VERSION >= 40799
150 Kokkos::deep_copy(
typename ConViewTypeA::type(A), 1.0);
151 Kokkos::deep_copy(
typename ConViewTypeB::type(b), 1.0);
153 Kokkos::deep_copy(
typename ConViewTypeA::array_type(A), 1.0);
154 Kokkos::deep_copy(
typename ConViewTypeB::array_type(b), 1.0);
157 Kokkos::Timer wall_clock;
// Warp width used for pool sizing: 32 on CUDA, 64 on HIP, 1 otherwise.
// is_cuda / is_hip are defined on lines not visible here.
160 #if defined (KOKKOS_ENABLE_CUDA)
162 const size_t warp_dim = is_cuda ? 32 : 1;
163 #elif defined (KOKKOS_ENABLE_HIP)
165 const size_t warp_dim = is_hip ? 64 : 1;
167 const size_t warp_dim = 1;
// Memory-pool sizing: one block holds p doubles; nkernels is the execution
// space's concurrency divided by the warp width; the pool gets 1.2x the
// space for nkernels blocks (truncated to size_t); a superblock holds
// max(nkernels / 100, 1) blocks.
170 const size_t concurrency = execution_space().concurrency();
171 const size_t block_size = p*
sizeof(double);
172 const size_t nkernels = concurrency / warp_dim;
173 const size_t mem_pool_size =
174 static_cast<size_t>(1.2*nkernels*block_size);
175 const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
// NOTE(review): presumably handed to the memory-pool creation call on a line
// not visible here -- confirm.
176 execution_space space;
// Wait for outstanding work in the execution space before continuing.
185 execution_space().fence();
// Repeat nloop times (loop body not visible in this excerpt).
188 for (
size_t l=0; l<nloop; l++) {
191 execution_space().fence();
// Average seconds per iteration. Whether the timer is reset just before the
// loop is not visible here.
193 perf.
time = wall_clock.seconds() / nloop;
// Timing driver: allocates A, b and c, fills A and b with 1.0, and records the
// average wall-clock time per loop iteration.
// NOTE(review): the signature's name line and the kernel/check calls are not
// present in this excerpt; the comments below describe only the visible lines.
206 template <
typename FadType,
typename ... ViewArgs>
209 const size_t m,
const size_t n,
const size_t p,
const size_t nloop,
212 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
213 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
214 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
215 typedef typename ViewTypeA::execution_space execution_space;
// View types with explicit layouts; ConLayoutA/B/C are defined on lines not
// visible here.
219 typedef Kokkos::View<FadType**, ConLayoutA, execution_space> ConViewTypeA;
220 typedef Kokkos::View<FadType*, ConLayoutB, execution_space> ConViewTypeB;
221 typedef Kokkos::View<FadType*, ConLayoutC, execution_space> ConViewTypeC;
// A is m x n, b has n entries, c has m entries; p+1 is passed as the
// trailing size argument of each view.
223 ConViewTypeA
A(
"A",m,n,p+1);
224 ConViewTypeB b(
"B",n,p+1);
225 ConViewTypeC
c(
"c",m,p+1);
// Fill A and b with 1.0 through a converted view type: ::type for
// KOKKOS_VERSION >= 40799, ::array_type in the second pair (presumably the
// #else branch, whose directive is not visible).
230 #if KOKKOS_VERSION >= 40799
231 Kokkos::deep_copy(
typename ConViewTypeA::type(A), 1.0);
232 Kokkos::deep_copy(
typename ConViewTypeB::type(b), 1.0);
234 Kokkos::deep_copy(
typename ConViewTypeA::array_type(A), 1.0);
235 Kokkos::deep_copy(
typename ConViewTypeB::array_type(b), 1.0);
238 Kokkos::Timer wall_clock;
// Wait for outstanding work in the execution space before continuing.
243 execution_space().fence();
// Repeat nloop times (loop body not visible in this excerpt).
246 for (
size_t l=0; l<nloop; l++) {
249 execution_space().fence();
// Average seconds per iteration. Whether the timer is reset just before the
// loop is not visible here.
251 perf.
time = wall_clock.seconds() / nloop;
// Explicit-instantiation helper. For a Fad type FAD and device DEV it
// instantiates both timing entry points, do_time_fad_hierarchical_dfad and
// do_time_fad_hierarchical_dfad_scratch, each with three view-argument sets:
// (Kokkos::LayoutLeft, DEV), (Kokkos::LayoutRight, DEV) and (DEV) alone.
// Every instantiation has the signature
//   Perf f(const size_t m, const size_t n, const size_t p,
//          const size_t nloop, const bool check).
264 #define INST_FUNC_FAD_DEV(FAD,DEV) \
265 template Perf do_time_fad_hierarchical_dfad< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
266 template Perf do_time_fad_hierarchical_dfad< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
267 template Perf do_time_fad_hierarchical_dfad< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
268 template Perf do_time_fad_hierarchical_dfad_scratch< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
269 template Perf do_time_fad_hierarchical_dfad_scratch< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
270 template Perf do_time_fad_hierarchical_dfad_scratch< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check );
// Per-device instantiation helper: expands INST_FUNC_FAD_DEV for DFad_type on
// the given device DEV.
// NOTE(review): DFad_type is declared on a line not visible in this excerpt --
// confirm which Fad type it names.
272 #define INST_FUNC_DEV(DEV) \
273 INST_FUNC_FAD_DEV( DFad_type, DEV )
275 #ifdef KOKKOS_ENABLE_SERIAL
279 #ifdef KOKKOS_ENABLE_OPENMP
283 #ifdef KOKKOS_ENABLE_THREADS
287 #ifdef KOKKOS_ENABLE_CUDA
291 #ifdef KOKKOS_ENABLE_HIP
Sacado::Fad::DFad< double > DFad_type
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
void check_deriv_hierarchical_dfad(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Sacado::Fad::DFad< double > FadType
Perf do_time_fad_hierarchical_dfad_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
Perf do_time_fad_hierarchical_dfad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
#define INST_FUNC_DEV(DEV)
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
void run_mat_vec_hierarchical_dfad_scratch(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void destroyGlobalMemoryPool(const ExecSpace &space)
void run_mat_vec_hierarchical_dfad(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)