Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
mat_vec_hierarchical.cpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Sacado Package
4 //
5 // Copyright 2006 NTESS and the Sacado contributors.
6 // SPDX-License-Identifier: LGPL-2.1-or-later
7 // *****************************************************************************
8 // @HEADER
9 
10 #define SACADO_VIEW_CUDA_HIERARCHICAL 1
11 #define SACADO_ALIGN_SFAD 1
12 
#include <type_traits>

#include "Sacado.hpp"

#include "mat_vec_hierarchical.hpp"

#include "Kokkos_Timer.hpp"
18 
19 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
20 void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
21  const ViewTypeC& c) {
22  typedef typename Kokkos::ThreadLocalScalarType<ViewTypeC>::type scalar_type;
23  typedef typename ViewTypeC::execution_space execution_space;
24 
25 #if defined (KOKKOS_ENABLE_CUDA)
27  const unsigned vector_size = is_cuda ? 32 : 1;
28  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
29 #elif defined (KOKKOS_ENABLE_HIP)
31  const unsigned vector_size = is_hip ? 64 : 1;
32  const unsigned team_size = is_hip ? 128 / vector_size : 1;
33 #else
34  const unsigned vector_size = 1;
35  const unsigned team_size = 1;
36 #endif
37 
38  const int m = A.extent(0);
39  const int n = A.extent(1);
40  const int range = (m+team_size-1)/team_size;
41 
42  typedef Kokkos::TeamPolicy<execution_space> Policy;
43  Kokkos::parallel_for(
44  Policy( range,team_size,vector_size ),
45  KOKKOS_LAMBDA (const typename Policy::member_type& team) {
46  const int i = team.league_rank()*team.team_size() + team.team_rank();
47  if (i >= m)
48  return;
49 
50  scalar_type t = 0.0;
51  for (int j=0; j<n; ++j)
52  t += A(i,j)*b(j);
53  c(i) = t;
54  }
55  );
56 }
57 
58 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
59 void
60 check_deriv_hierarchical(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
61 {
62  const double tol = 1.0e-14;
63  typedef typename ViewTypeC::value_type value_type;
64  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
65  Kokkos::deep_copy(h_c, c);
66  const size_t m = A.extent(0);
67  const size_t n = A.extent(1);
68  const size_t p = Kokkos::dimension_scalar(A);
69  for (size_t i=0; i<m; ++i) {
70  for (size_t j=0; j<p; ++j) {
71  value_type t = (j == p-1 ? n : 2*n);
72  if (std::abs(h_c(i).fastAccessDx(j)- t) > tol) {
73  std::cout << "Comparison failed! " << i << "," << j << " : "
74  << h_c(i).fastAccessDx(j) << " , " << t << std::endl;
75  }
76  }
77  }
78 }
79 
80 template <typename FadType, typename ... ViewArgs>
81 Perf
82 do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p,
83  const size_t nloop, const bool check)
84 {
85  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
86  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
87  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
88  typedef typename ViewTypeA::execution_space execution_space;
89 
90 #if defined (KOKKOS_ENABLE_CUDA)
92  const int FadStride = is_cuda ? 32 : 1;
93 #elif defined (KOKKOS_ENABLE_HIP)
95  const int FadStride = is_hip ? 64 : 1;
96 #else
97  const int FadStride = 1;
98 #endif
99 
100 #if defined(SACADO_ALIGN_SFAD)
102  const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
103  const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride : p;
104  typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
105 #else
106  typedef FadType AlignedFadType;
107  const size_t pa = p;
108 #endif
109 
113 
114  typedef Kokkos::View<AlignedFadType**, ConLayoutA, execution_space> ConViewTypeA;
115  typedef Kokkos::View<AlignedFadType*, ConLayoutB, execution_space> ConViewTypeB;
116  typedef Kokkos::View<AlignedFadType*, ConLayoutC, execution_space> ConViewTypeC;
117 
118  ConViewTypeA A("A",m,n,pa+1);
119  ConViewTypeB b("B",n,pa+1);
120  ConViewTypeC c("c",m,pa+1);
121 
122  // AlignedFadType a(pa, 1.0);
123  // for (size_t k=0; k<pa; ++k)
124  // a.fastAccessDx(k) = 1.0;
125  Kokkos::deep_copy(typename ConViewTypeA::array_type(A), 1.0);
126  Kokkos::deep_copy(typename ConViewTypeB::array_type(b), 1.0);
127 
128  Kokkos::Timer wall_clock;
129  Perf perf;
130 
131  // Execute the kernel once to warm up
132  run_mat_vec_hierarchical( A, b, c );
133  execution_space().fence();
134 
135  wall_clock.reset();
136  for (size_t l=0; l<nloop; l++) {
137  run_mat_vec_hierarchical( A, b, c );
138  }
139  execution_space().fence();
140 
141  perf.time = wall_clock.seconds() / nloop;
142  perf.flops = m*n*(2+4*p);
143  perf.throughput = perf.flops / perf.time / 1.0e9;
144 
145  if (check) {
146  check_deriv_hierarchical(A, b, c);
147  }
148 
149  return perf;
150 }
151 
154 
// Explicit template instantiations of do_time_fad_hierarchical.
//
// INST_FUNC_FAD_DEV(FAD,DEV) instantiates the routine for one Fad type on one
// device three times: with Kokkos::LayoutLeft, with Kokkos::LayoutRight, and
// with no layout argument at all.
#define INST_FUNC_FAD_DEV(FAD,DEV)                                          \
  template Perf                                                             \
  do_time_fad_hierarchical< FAD, Kokkos::LayoutLeft, DEV > (                \
    const size_t m, const size_t n, const size_t p,                         \
    const size_t nloop, const bool check );                                 \
  template Perf                                                             \
  do_time_fad_hierarchical< FAD, Kokkos::LayoutRight, DEV > (               \
    const size_t m, const size_t n, const size_t p,                         \
    const size_t nloop, const bool check );                                 \
  template Perf                                                             \
  do_time_fad_hierarchical< FAD, DEV > (                                    \
    const size_t m, const size_t n, const size_t p,                         \
    const size_t nloop, const bool check );

// INST_FUNC_DEV(DEV) covers both Fad flavours for a device.  SFad_type and
// SLFad_type must already be declared at this point (their declarations are
// not visible in this excerpt).
#define INST_FUNC_DEV(DEV)                                                  \
  INST_FUNC_FAD_DEV( SFad_type, DEV )                                       \
  INST_FUNC_FAD_DEV( SLFad_type, DEV )

// Instantiate only for the backends enabled in this Kokkos build.
#if defined(KOKKOS_ENABLE_SERIAL)
INST_FUNC_DEV(Kokkos::Serial)
#endif

#if defined(KOKKOS_ENABLE_OPENMP)
INST_FUNC_DEV(Kokkos::OpenMP)
#endif

#if defined(KOKKOS_ENABLE_THREADS)
INST_FUNC_DEV(Kokkos::Threads)
#endif

#if defined(KOKKOS_ENABLE_CUDA)
INST_FUNC_DEV(Kokkos::Cuda)
#endif

#if defined(KOKKOS_ENABLE_HIP)
INST_FUNC_DEV(Kokkos::HIP)
#endif
const char * p
abs(expr.val())
double flops
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void run_mat_vec_hierarchical(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
double time
void check_deriv_hierarchical(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Sacado::Fad::DFad< double > FadType
Base template specification for static size.
#define INST_FUNC_DEV(DEV)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
#define A
Definition: Sacado_rad.hpp:552
Sacado::Fad::SFad< double, SFadSize > SFad_type
Definition: mat_vec.cpp:494
Perf do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
const int N
int value
double throughput
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
const double tol
Sacado::Fad::SLFad< double, SLFadSize > SLFad_type
Definition: mat_vec.cpp:495
int n