Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
mat_vec_hierarchical_dfad.cpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Sacado Package
4 //
5 // Copyright 2006 NTESS and the Sacado contributors.
6 // SPDX-License-Identifier: LGPL-2.1-or-later
7 // *****************************************************************************
8 // @HEADER
9 
10 #define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
11 #define SACADO_KOKKOS_USE_MEMORY_POOL 1
12 
13 #include "Sacado.hpp"
14 
16 
17 #include "Kokkos_Timer.hpp"
18 
19 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
20 void run_mat_vec_hierarchical_dfad(const ViewTypeA& A, const ViewTypeB& b,
21  const ViewTypeC& c) {
22  typedef typename ViewTypeC::value_type scalar_type;
23  typedef typename ViewTypeC::execution_space execution_space;
24 
25 #if defined (KOKKOS_ENABLE_CUDA)
27  const unsigned vector_size = is_cuda ? 32 : 1;
28  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
29 #elif defined (KOKKOS_ENABLE_HIP)
31  const unsigned vector_size = is_hip ? 64 : 1;
32  const unsigned team_size = is_hip ? 128 / vector_size : 1;
33 #else
34  const unsigned vector_size = 1;
35  const unsigned team_size = 1;
36 #endif
37 
38  const int m = A.extent(0);
39  const int n = A.extent(1);
40  const int range = (m+team_size-1)/team_size;
41 
42  typedef Kokkos::TeamPolicy<execution_space> Policy;
43  Kokkos::parallel_for(
44  Policy( range,team_size,vector_size ),
45  KOKKOS_LAMBDA (const typename Policy::member_type& team) {
46  const int i = team.league_rank()*team.team_size() + team.team_rank();
47  if (i >= m)
48  return;
49 
50  scalar_type t = 0.0;
51  for (int j=0; j<n; ++j)
52  t += A(i,j)*b(j);
53  c(i) = t;
54  }
55  );
56 }
57 
58 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
60  const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {
61  typedef typename ViewTypeC::value_type scalar_type;
62  typedef typename ViewTypeC::execution_space execution_space;
63  typedef Kokkos::TeamPolicy<execution_space> Policy;
64  typedef typename Policy::member_type team_member;
65  typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
66 
67 #if defined (KOKKOS_ENABLE_CUDA)
69  const unsigned VectorSize = is_cuda ? 32 : 1;
70  const unsigned TeamSize = is_cuda ? 128 / VectorSize : 1;
71 #elif defined (KOKKOS_ENABLE_HIP)
73  const unsigned VectorSize = is_hip ? 64 : 1;
74  const unsigned TeamSize = is_hip ? 128 / VectorSize : 1;
75 #else
76  const unsigned VectorSize = 1;
77  const unsigned TeamSize = 1;
78 #endif
79 
80  const int m = A.extent(0);
81  const int n = A.extent(1);
82  const int p = dimension_scalar(A);
83  const int N = (m+TeamSize-1)/TeamSize;
84 
85  Policy policy(N, TeamSize, VectorSize);
86  const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
87  Kokkos::parallel_for(
88  policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
89  KOKKOS_LAMBDA (const team_member& team) {
90  const int team_rank = team.team_rank();
91  const int team_size = team.team_size();
92  TmpScratchSpace t(team.team_scratch(0), team_size, p);
93  const int i = team.league_rank()*team_size + team_rank;
94  if (i < m) {
95  t(team_rank) = 0.0;
96  for (int j=0; j<n; ++j)
97  t(team_rank) += A(i,j)*b(j);
98  c(i) = t(team_rank);
99  }
100  }
101  );
102 }
103 
104 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
105 void
106 check_deriv_hierarchical_dfad(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
107 {
108  const double tol = 1.0e-14;
109  typedef typename ViewTypeC::value_type value_type;
110  typename ViewTypeC::host_mirror_type h_c = Kokkos::create_mirror_view(c);
111  Kokkos::deep_copy(h_c, c);
112  const size_t m = A.extent(0);
113  const size_t n = A.extent(1);
114  const size_t p = Kokkos::dimension_scalar(A);
115  for (size_t i=0; i<m; ++i) {
116  for (size_t j=0; j<p; ++j) {
117  value_type t = (j == p-1 ? n : 2*n);
118  if (std::abs(h_c(i).fastAccessDx(j)- t) > tol) {
119  std::cout << "Comparison failed! " << i << "," << j << " : "
120  << h_c(i).fastAccessDx(j) << " , " << t << std::endl;
121  }
122  }
123  }
124 }
125 
126 template <typename FadType, typename ... ViewArgs>
127 Perf
128 do_time_fad_hierarchical_dfad(const size_t m, const size_t n, const size_t p,
129  const size_t nloop, const bool check)
130 {
131  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
132  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
133  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
134  typedef typename ViewTypeA::execution_space execution_space;
138  typedef Kokkos::View<FadType**, ConLayoutA, execution_space> ConViewTypeA;
139  typedef Kokkos::View<FadType*, ConLayoutB, execution_space> ConViewTypeB;
140  typedef Kokkos::View<FadType*, ConLayoutC, execution_space> ConViewTypeC;
141 
142  ConViewTypeA A("A",m,n,p+1);
143  ConViewTypeB b("B",n,p+1);
144  ConViewTypeC c("c",m,p+1);
145 
146  // FadType a(p, 1.0);
147  // for (size_t k=0; k<p; ++k)
148  // a.fastAccessDx(k) = 1.0;
149 #if KOKKOS_VERSION >= 40799
150  Kokkos::deep_copy(typename ConViewTypeA::type(A), 1.0);
151  Kokkos::deep_copy(typename ConViewTypeB::type(b), 1.0);
152 #else
153  Kokkos::deep_copy(typename ConViewTypeA::array_type(A), 1.0);
154  Kokkos::deep_copy(typename ConViewTypeB::array_type(b), 1.0);
155 #endif
156 
157  Kokkos::Timer wall_clock;
158  Perf perf;
159 
160 #if defined (KOKKOS_ENABLE_CUDA)
162  const size_t warp_dim = is_cuda ? 32 : 1;
163 #elif defined (KOKKOS_ENABLE_HIP)
165  const size_t warp_dim = is_hip ? 64 : 1;
166 #else
167  const size_t warp_dim = 1;
168 #endif
169 
170  const size_t concurrency = execution_space().concurrency();
171  const size_t block_size = p*sizeof(double);
172  const size_t nkernels = concurrency / warp_dim;
173  const size_t mem_pool_size =
174  static_cast<size_t>(1.2*nkernels*block_size);
175  const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
176  execution_space space;
177  Sacado::createGlobalMemoryPool(space, mem_pool_size,
178  block_size,
179  block_size,
180  superblock_size
181  );
182 
183  // Execute the kernel once to warm up
185  execution_space().fence();
186 
187  wall_clock.reset();
188  for (size_t l=0; l<nloop; l++) {
190  }
191  execution_space().fence();
192 
193  perf.time = wall_clock.seconds() / nloop;
194  perf.flops = m*n*(2+4*p);
195  perf.throughput = perf.flops / perf.time / 1.0e9;
196 
197  if (check) {
199  }
200 
202 
203  return perf;
204 }
205 
206 template <typename FadType, typename ... ViewArgs>
207 Perf
209  const size_t m, const size_t n, const size_t p, const size_t nloop,
210  const bool check)
211 {
212  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
213  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
214  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
215  typedef typename ViewTypeA::execution_space execution_space;
219  typedef Kokkos::View<FadType**, ConLayoutA, execution_space> ConViewTypeA;
220  typedef Kokkos::View<FadType*, ConLayoutB, execution_space> ConViewTypeB;
221  typedef Kokkos::View<FadType*, ConLayoutC, execution_space> ConViewTypeC;
222 
223  ConViewTypeA A("A",m,n,p+1);
224  ConViewTypeB b("B",n,p+1);
225  ConViewTypeC c("c",m,p+1);
226 
227  // FadType a(p, 1.0);
228  // for (size_t k=0; k<p; ++k)
229  // a.fastAccessDx(k) = 1.0;
230 #if KOKKOS_VERSION >= 40799
231  Kokkos::deep_copy(typename ConViewTypeA::type(A), 1.0);
232  Kokkos::deep_copy(typename ConViewTypeB::type(b), 1.0);
233 #else
234  Kokkos::deep_copy(typename ConViewTypeA::array_type(A), 1.0);
235  Kokkos::deep_copy(typename ConViewTypeB::array_type(b), 1.0);
236 #endif
237 
238  Kokkos::Timer wall_clock;
239  Perf perf;
240 
241  // Execute the kernel once to warm up
243  execution_space().fence();
244 
245  wall_clock.reset();
246  for (size_t l=0; l<nloop; l++) {
248  }
249  execution_space().fence();
250 
251  perf.time = wall_clock.seconds() / nloop;
252  perf.flops = m*n*(2+4*p);
253  perf.throughput = perf.flops / perf.time / 1.0e9;
254 
255  if (check) {
257  }
258 
259  return perf;
260 }
261 
263 
264 #define INST_FUNC_FAD_DEV(FAD,DEV) \
265  template Perf do_time_fad_hierarchical_dfad< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
266  template Perf do_time_fad_hierarchical_dfad< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
267  template Perf do_time_fad_hierarchical_dfad< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
268  template Perf do_time_fad_hierarchical_dfad_scratch< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
269  template Perf do_time_fad_hierarchical_dfad_scratch< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
270  template Perf do_time_fad_hierarchical_dfad_scratch< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check );
271 
272 #define INST_FUNC_DEV(DEV) \
273  INST_FUNC_FAD_DEV( DFad_type, DEV )
274 
275 #ifdef KOKKOS_ENABLE_SERIAL
276 INST_FUNC_DEV(Kokkos::Serial)
277 #endif
278 
279 #ifdef KOKKOS_ENABLE_OPENMP
280 INST_FUNC_DEV(Kokkos::OpenMP)
281 #endif
282 
283 #ifdef KOKKOS_ENABLE_THREADS
284 INST_FUNC_DEV(Kokkos::Threads)
285 #endif
286 
287 #ifdef KOKKOS_ENABLE_CUDA
288 INST_FUNC_DEV(Kokkos::Cuda)
289 #endif
290 
291 #ifdef KOKKOS_ENABLE_HIP
292 INST_FUNC_DEV(Kokkos::HIP)
293 #endif
Sacado::Fad::DFad< double > DFad_type
Definition: mat_vec.cpp:520
abs(expr.val())
double flops
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
double time
void check_deriv_hierarchical_dfad(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Sacado::Fad::DFad< double > FadType
Perf do_time_fad_hierarchical_dfad_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
#define A
Definition: Sacado_rad.hpp:552
Perf do_time_fad_hierarchical_dfad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
const char * p
int value
const int N
#define INST_FUNC_DEV(DEV)
double throughput
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
const double tol
void run_mat_vec_hierarchical_dfad_scratch(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void destroyGlobalMemoryPool(const ExecSpace &space)
int n
void run_mat_vec_hierarchical_dfad(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)