Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
mat_vec_hierarchical.cpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Sacado Package
5 // Copyright (2006) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // This library is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU Lesser General Public License as
12 // published by the Free Software Foundation; either version 2.1 of the
13 // License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 // USA
24 // Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
25 // (etphipp@sandia.gov).
26 //
27 // ***********************************************************************
28 // @HEADER
29 
30 #define SACADO_VIEW_CUDA_HIERARCHICAL 1
31 #define SACADO_ALIGN_SFAD 1
32 
#include <cmath>
#include <cstddef>
#include <iostream>
#include <type_traits>

#include "Sacado.hpp"

#include "mat_vec_hierarchical.hpp"

#include "Kokkos_Timer.hpp"
38 
39 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
40 void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
41  const ViewTypeC& c) {
42  typedef typename Kokkos::ThreadLocalScalarType<ViewTypeC>::type scalar_type;
43  typedef typename ViewTypeC::execution_space execution_space;
44 
45 #if defined (KOKKOS_ENABLE_CUDA)
47  const unsigned vector_size = is_cuda ? 32 : 1;
48  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
49 #elif defined (KOKKOS_ENABLE_HIP)
51  const unsigned vector_size = is_hip ? 64 : 1;
52  const unsigned team_size = is_hip ? 128 / vector_size : 1;
53 #else
54  const unsigned vector_size = 1;
55  const unsigned team_size = 1;
56 #endif
57 
58  const int m = A.extent(0);
59  const int n = A.extent(1);
60  const int range = (m+team_size-1)/team_size;
61 
62  typedef Kokkos::TeamPolicy<execution_space> Policy;
63  Kokkos::parallel_for(
64  Policy( range,team_size,vector_size ),
65  KOKKOS_LAMBDA (const typename Policy::member_type& team) {
66  const int i = team.league_rank()*team.team_size() + team.team_rank();
67  if (i >= m)
68  return;
69 
70  scalar_type t = 0.0;
71  for (int j=0; j<n; ++j)
72  t += A(i,j)*b(j);
73  c(i) = t;
74  }
75  );
76 }
77 
78 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
79 void
80 check_deriv_hierarchical(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
81 {
82  const double tol = 1.0e-14;
83  typedef typename ViewTypeC::value_type value_type;
84  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
85  Kokkos::deep_copy(h_c, c);
86  const size_t m = A.extent(0);
87  const size_t n = A.extent(1);
88  const size_t p = Kokkos::dimension_scalar(A);
89  for (size_t i=0; i<m; ++i) {
90  for (size_t j=0; j<p; ++j) {
91  value_type t = (j == p-1 ? n : 2*n);
92  if (std::abs(h_c(i).fastAccessDx(j)- t) > tol) {
93  std::cout << "Comparison failed! " << i << "," << j << " : "
94  << h_c(i).fastAccessDx(j) << " , " << t << std::endl;
95  }
96  }
97  }
98 }
99 
100 template <typename FadType, typename ... ViewArgs>
101 Perf
102 do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p,
103  const size_t nloop, const bool check)
104 {
105  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
106  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
107  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
108  typedef typename ViewTypeA::execution_space execution_space;
109 
110 #if defined (KOKKOS_ENABLE_CUDA)
112  const int FadStride = is_cuda ? 32 : 1;
113 #elif defined (KOKKOS_ENABLE_HIP)
115  const int FadStride = is_hip ? 64 : 1;
116 #else
117  const int FadStride = 1;
118 #endif
119 
120 #if defined(SACADO_ALIGN_SFAD)
122  const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
123  const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride : p;
124  typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
125 #else
126  typedef FadType AlignedFadType;
127  const size_t pa = p;
128 #endif
129 
133 
134  typedef Kokkos::View<AlignedFadType**, ConLayoutA, execution_space> ConViewTypeA;
135  typedef Kokkos::View<AlignedFadType*, ConLayoutB, execution_space> ConViewTypeB;
136  typedef Kokkos::View<AlignedFadType*, ConLayoutC, execution_space> ConViewTypeC;
137 
138  ConViewTypeA A("A",m,n,pa+1);
139  ConViewTypeB b("B",n,pa+1);
140  ConViewTypeC c("c",m,pa+1);
141 
142  // AlignedFadType a(pa, 1.0);
143  // for (size_t k=0; k<pa; ++k)
144  // a.fastAccessDx(k) = 1.0;
145  Kokkos::deep_copy(typename ConViewTypeA::array_type(A), 1.0);
146  Kokkos::deep_copy(typename ConViewTypeB::array_type(b), 1.0);
147 
148  Kokkos::Timer wall_clock;
149  Perf perf;
150 
151  // Execute the kernel once to warm up
152  run_mat_vec_hierarchical( A, b, c );
153  execution_space().fence();
154 
155  wall_clock.reset();
156  for (size_t l=0; l<nloop; l++) {
157  run_mat_vec_hierarchical( A, b, c );
158  }
159  execution_space().fence();
160 
161  perf.time = wall_clock.seconds() / nloop;
162  perf.flops = m*n*(2+4*p);
163  perf.throughput = perf.flops / perf.time / 1.0e9;
164 
165  if (check) {
166  check_deriv_hierarchical(A, b, c);
167  }
168 
169  return perf;
170 }
171 
174 
// Explicitly instantiates do_time_fad_hierarchical for Fad type FAD on
// device DEV three times: with Kokkos::LayoutLeft, with Kokkos::LayoutRight,
// and with no explicit layout argument.
#define INST_FUNC_FAD_DEV(FAD,DEV) \
  template Perf do_time_fad_hierarchical< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
  template Perf do_time_fad_hierarchical< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
  template Perf do_time_fad_hierarchical< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check );
179 
180 #define INST_FUNC_DEV(DEV) \
181  INST_FUNC_FAD_DEV( SFad_type, DEV ) \
182  INST_FUNC_FAD_DEV( SLFad_type, DEV )
183 
// Emit the explicit instantiations for every Kokkos backend enabled in this
// build; a backend that is not enabled contributes nothing.
#ifdef KOKKOS_ENABLE_SERIAL
INST_FUNC_DEV(Kokkos::Serial)
#endif

#ifdef KOKKOS_ENABLE_OPENMP
INST_FUNC_DEV(Kokkos::OpenMP)
#endif

#ifdef KOKKOS_ENABLE_THREADS
INST_FUNC_DEV(Kokkos::Threads)
#endif

#ifdef KOKKOS_ENABLE_CUDA
INST_FUNC_DEV(Kokkos::Cuda)
#endif

#ifdef KOKKOS_ENABLE_HIP
INST_FUNC_DEV(Kokkos::HIP)
#endif
const char * p
abs(expr.val())
double flops
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void run_mat_vec_hierarchical(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
double time
void check_deriv_hierarchical(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Sacado::Fad::DFad< double > FadType
Base template specification for static size.
#define INST_FUNC_DEV(DEV)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
#define A
Definition: Sacado_rad.hpp:572
Sacado::Fad::SFad< double, SFadSize > SFad_type
Definition: mat_vec.cpp:514
Perf do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
const int N
int value
double throughput
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
const double tol
Sacado::Fad::SLFad< double, SLFadSize > SLFad_type
Definition: mat_vec.cpp:515
int n