Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
mat_vec.cpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Sacado Package
4 //
5 // Copyright 2006 NTESS and the Sacado contributors.
6 // SPDX-License-Identifier: LGPL-2.1-or-later
7 // *****************************************************************************
8 // @HEADER
9 
10 //#define SACADO_DISABLE_FAD_VIEW_SPEC
11 
12 #include "Sacado.hpp"
13 
14 #include "mat_vec.hpp"
15 
16 #include "Kokkos_Timer.hpp"
17 
18 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
19 void run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {
20  typedef typename ViewTypeC::value_type scalar_type;
21  typedef typename ViewTypeC::execution_space execution_space;
22 
23  const int m = A.extent(0);
24  const int n = A.extent(1);
25  Kokkos::parallel_for(
26  Kokkos::RangePolicy<execution_space>( 0,m ),
27  KOKKOS_LAMBDA (const int i) {
28  scalar_type t = 0.0;
29  for (int j=0; j<n; ++j)
30  t += A(i,j)*b(j);
31  c(i) = t;
32  }
33  );
34 }
35 
36 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
37 void
38 run_mat_vec_scratch(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
39 {
40  typedef typename ViewTypeC::value_type scalar_type;
41  typedef typename ViewTypeC::execution_space execution_space;
42  typedef Kokkos::TeamPolicy<execution_space> Policy;
43  typedef typename Policy::member_type team_member;
44  typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
45 
46  const int m = A.extent(0);
47  const int n = A.extent(1);
48  const int p = dimension_scalar(A);
49 
50 #ifdef KOKKOS_ENABLE_CUDA
52 #else
53  const bool is_cuda = false;
54 #endif
55  const int TeamSize = is_cuda ? 128 : 1;
56  const int N = (m+TeamSize-1)/TeamSize;
57  Policy policy(N, TeamSize, 1);
58  const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
59  Kokkos::parallel_for(
60  policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
61  KOKKOS_LAMBDA (const team_member& team) {
62  const int team_rank = team.team_rank();
63  const int team_size = team.team_size();
64  TmpScratchSpace t(team.team_scratch(0), team_size, p);
65  const int i = team.league_rank()*team_size + team_rank;
66  if (i < m) {
67  t(team_rank) = 0.0;
68  for (int j=0; j<n; ++j)
69  t(team_rank) += A(i,j)*b(j);
70  c(i) = t(team_rank);
71  }
72  }
73  );
74 }
75 
76 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
77 void
78 run_mat_vec_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
79 {
80  typedef typename ViewTypeC::execution_space execution_space;
81 
82  const int m = A.extent(0);
83  const int n = A.extent(1);
84  const int p = A.extent(2)-1;
85  Kokkos::parallel_for(
86  Kokkos::RangePolicy<execution_space>( 0,m ),
87  KOKKOS_LAMBDA (const int i) {
88  c(i,p) = 0.0;
89  for (int k=0; k<p; ++k)
90  c(i,k) = 0.0;
91  for (int j=0; j<n; ++j) {
92  c(i,p) += A(i,j,p)*b(j,p);
93  for (int k=0; k<p; ++k) {
94  c(i,k) += A(i,j,k)*b(j,p) + A(i,j,p)*b(j,k);
95  }
96  }
97  }
98  );
99 }
100 
101 template <int MaxP, typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
102 void
103 run_mat_vec_deriv_sl(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
104 {
105  typedef typename ViewTypeC::value_type scalar_type;
106  typedef typename ViewTypeC::execution_space execution_space;
107 
108  const int m = A.extent(0);
109  const int n = A.extent(1);
110  const int p = A.extent(2)-1;
111  Kokkos::parallel_for(
112  Kokkos::RangePolicy<execution_space>( 0,m ),
113  KOKKOS_LAMBDA (const int i) {
114  scalar_type cv = 0.0;
115  scalar_type t[MaxP];
116  for (int k=0; k<p; ++k)
117  t[k] = 0.0;
118 
119  for (int j=0; j<n; ++j) {
120  scalar_type av = A(i,j,p);
121  scalar_type bv = b(j,p);
122  cv += av*bv;
123  for (int k=0; k<p; ++k) {
124  t[k] += A(i,j,k)*bv + av*b(j,k);
125  }
126  }
127 
128  for (int k=0; k<p; ++k)
129  c(i,k) = t[k];
130  c(i,p) = cv;
131  }
132  );
133 }
134 
135 template <int p, typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
136 void
137 run_mat_vec_deriv_s(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
138 {
139  typedef typename ViewTypeC::value_type scalar_type;
140  typedef typename ViewTypeC::execution_space execution_space;
141 
142  const int m = A.extent(0);
143  const int n = A.extent(1);
144  Kokkos::parallel_for(
145  Kokkos::RangePolicy<execution_space>( 0,m ),
146  KOKKOS_LAMBDA (const int i) {
147  scalar_type cv = 0.0;
148  scalar_type t[p];
149  for (int k=0; k<p; ++k)
150  t[k] = 0.0;
151 
152  for (int j=0; j<n; ++j) {
153  const scalar_type av = A(i,j,p);
154  const scalar_type bv = b(j,p);
155  cv += av*bv;
156 
157 // Using simd here results in much better performance. Othewise the compiler
158 // appears to try and vectorize the j loop with gather instructions, which
159 // doesn't work very well.
160 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
161 #pragma simd
162 #endif
163  for (int k=0; k<p; ++k) {
164  t[k] += A(i,j,k)*bv + av*b(j,k);
165  }
166  }
167 
168  for (int k=0; k<p; ++k)
169  c(i,k) = t[k];
170  c(i,p) = cv;
171  }
172  );
173 }
174 
175 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
176 void
177 check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
178 {
179  const double tol = 1.0e-14;
180  typedef typename ViewTypeC::value_type value_type;
181  typename ViewTypeC::host_mirror_type h_c = Kokkos::create_mirror_view(c);
182  Kokkos::deep_copy(h_c, c);
183  const size_t m = A.extent(0);
184  const size_t n = A.extent(1);
185  for (size_t i=0; i<m; ++i) {
186  value_type t = n;
187  if (std::abs(h_c(i)- t) > tol) {
188  std::cout << "Comparison failed! " << i << " : " << h_c(i) << " , " << t
189  << std::endl;
190  }
191  }
192 }
193 
194 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
195 void
196 check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
197 {
198  const double tol = 1.0e-14;
199  typedef typename ViewTypeC::value_type value_type;
200  typename ViewTypeC::host_mirror_type h_c = Kokkos::create_mirror_view(c);
201  Kokkos::deep_copy(h_c, c);
202  const size_t m = A.extent(0);
203  const size_t n = A.extent(1);
204  const size_t p = A.extent(2);
205  for (size_t i=0; i<m; ++i) {
206  for (size_t j=0; j<p; ++j) {
207  value_type t = (j == p-1 ? n : 2*n);
208  if (std::abs(h_c(i,j)- t) > tol) {
209  std::cout << "Comparison failed! " << i << "," << j << " : "
210  << h_c(i,j) << " , " << t << std::endl;
211  }
212  }
213  }
214 }
215 
216 template <typename ... ViewArgs>
217 Perf
218 do_time_val(const size_t m, const size_t n, const size_t nloop,
219  const bool check)
220 {
221  typedef Kokkos::View<double**, ViewArgs...> ViewTypeA;
222  typedef Kokkos::View<double*, ViewArgs...> ViewTypeB;
223  typedef Kokkos::View<double*, ViewArgs...> ViewTypeC;
224  typedef typename ViewTypeA::execution_space execution_space;
225 
226  ViewTypeA A("A",m,n);
227  ViewTypeB b("B",n);
228  ViewTypeC c("c",m);
229 
230  Kokkos::deep_copy(A, 1.0);
231  Kokkos::deep_copy(b, 1.0);
232 
233  Kokkos::Timer wall_clock;
234  Perf perf;
235 
236  // Execute the kernel once to warm up
237  run_mat_vec( A, b, c );
238  execution_space().fence();
239 
240  wall_clock.reset();
241  for (size_t l=0; l<nloop; l++) {
242  run_mat_vec( A, b, c );
243  }
244  execution_space().fence();
245 
246  perf.time = wall_clock.seconds() / nloop;
247  perf.flops = m*n*2;
248  perf.throughput = perf.flops / perf.time / 1.0e9;
249 
250  if (check)
251  check_val(A,b,c);
252 
253  return perf;
254 }
255 
256 template <typename FadType, typename ... ViewArgs>
257 Perf
258 do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop,
259  const bool check)
260 {
261  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
262  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
263  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
264  typedef typename ViewTypeA::execution_space execution_space;
265 
266  // Set amount of memory available for dynamic memory allocation on GPU
267 #ifdef KOKKOS_ENABLE_CUDA
269  std::is_same<FadType,Sacado::Fad::DFad<double> >::value) {
270  const size_t concurrency = execution_space().concurrency();
271  const size_t mem = std::min(m,concurrency) * p * sizeof(double);
272  //std::cout << "mem = " << mem / (1024*1024) << " MB" << std::endl;
273  cudaDeviceSetLimit(cudaLimitMallocHeapSize, mem);
274  }
275 #endif
276 
277 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
278  ViewTypeA A("A",m,n,p+1);
279  ViewTypeB b("B",n,p+1);
280  ViewTypeC c("c",m,p+1);
281 #else
282  ViewTypeA A("A",m,n);
283  ViewTypeB b("B",n);
284  ViewTypeC c("c",m);
285 #endif
286 
287  // FadType a(p, 1.0);
288  // for (size_t k=0; k<p; ++k)
289  // a.fastAccessDx(k) = 1.0;
290  Kokkos::deep_copy(typename ViewTypeA::array_type(A), 1.0);
291  Kokkos::deep_copy(typename ViewTypeB::array_type(b), 1.0);
292 
293  Kokkos::Timer wall_clock;
294  Perf perf;
295 
296  // Execute the kernel once to warm up
297  run_mat_vec( A, b, c );
298  execution_space().fence();
299 
300  wall_clock.reset();
301  for (size_t l=0; l<nloop; l++) {
302  run_mat_vec( A, b, c );
303  }
304  execution_space().fence();
305 
306  perf.time = wall_clock.seconds() / nloop;
307  perf.flops = m*n*(2+4*p);
308  perf.throughput = perf.flops / perf.time / 1.0e9;
309 
310 // FIXME: this needs a new way of getting a flattened Kokkos::View from FadView
311 #if !defined(SACADO_DISABLE_FAD_VIEW_SPEC) && !defined(SACADO_HAS_NEW_KOKKOS_VIEW_IMPL)
312  if (check) {
313  typename ViewTypeA::array_type A_flat = A;
314  typename ViewTypeB::array_type b_flat = b;
315  typename ViewTypeC::array_type c_flat = c;
316  check_deriv(A_flat, b_flat, c_flat);
317  }
318 #endif
319 
320  return perf;
321 }
322 
323 template <typename FadType, typename ... ViewArgs>
324 Perf
325 do_time_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop,
326  const bool check)
327 {
328  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
329  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
330  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
331  typedef typename ViewTypeA::execution_space execution_space;
332 
333 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
334  ViewTypeA A("A",m,n,p+1);
335  ViewTypeB b("B",n,p+1);
336  ViewTypeC c("c",m,p+1);
337 #else
338  ViewTypeA A("A",m,n);
339  ViewTypeB b("B",n);
340  ViewTypeC c("c",m);
341 #endif
342 
343  // FadType a(p, 1.0);
344  // for (size_t k=0; k<p; ++k)
345  // a.fastAccessDx(k) = 1.0;
346  Kokkos::deep_copy(typename ViewTypeA::array_type(A), 1.0);
347  Kokkos::deep_copy(typename ViewTypeB::array_type(b), 1.0);
348 
349  Kokkos::Timer wall_clock;
350  Perf perf;
351 
352  // Execute the kernel once to warm up
353  run_mat_vec_scratch( A, b, c );
354  execution_space().fence();
355 
356  wall_clock.reset();
357  for (size_t l=0; l<nloop; l++) {
358  run_mat_vec_scratch( A, b, c );
359  }
360  execution_space().fence();
361 
362  perf.time = wall_clock.seconds() / nloop;
363  perf.flops = m*n*(2+4*p);
364  perf.throughput = perf.flops / perf.time / 1.0e9;
365 
366 // FIXME: this needs a new way of getting a flattened Kokkos::View from FadView
367 #if !defined(SACADO_DISABLE_FAD_VIEW_SPEC) && !defined(SACADO_HAS_NEW_KOKKOS_VIEW_IMPL)
368  if (check) {
369  typename ViewTypeA::array_type A_flat = A;
370  typename ViewTypeB::array_type b_flat = b;
371  typename ViewTypeC::array_type c_flat = c;
372  check_deriv(A_flat, b_flat, c_flat);
373  }
374 #endif
375 
376  return perf;
377 }
378 
379 template <typename ... ViewArgs>
380 Perf
381 do_time_analytic(const size_t m, const size_t n, const size_t p,
382  const size_t nloop, const bool check)
383 {
384  typedef Kokkos::View<double***, ViewArgs...> ViewTypeA;
385  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
386  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
387  typedef typename ViewTypeA::execution_space execution_space;
388 
389  ViewTypeA A("A",m,n,p+1);
390  ViewTypeB b("B",n,p+1);
391  ViewTypeC c("c",m,p+1);
392 
393  Kokkos::deep_copy(A, 1.0);
394  Kokkos::deep_copy(b, 1.0);
395 
396  Kokkos::Timer wall_clock;
397  Perf perf;
398 
399  // Execute the kernel once to warm up
400  run_mat_vec_deriv( A, b, c );
401  execution_space().fence();
402 
403  for (size_t l=0; l<nloop; l++) {
404  run_mat_vec_deriv( A, b, c );
405  }
406  execution_space().fence();
407 
408  perf.time = wall_clock.seconds() / nloop;
409  perf.flops = m*n*(2+4*p);
410  perf.throughput = perf.flops / perf.time / 1.0e9;
411 
412  if (check)
413  check_deriv(A,b,c);
414 
415  return perf;
416 }
417 
418 template <int MaxP, typename ... ViewArgs>
419 Perf
420 do_time_analytic_sl(const size_t m, const size_t n, const size_t p,
421  const size_t nloop, const bool check)
422 {
423  typedef Kokkos::View<double***, ViewArgs...> ViewTypeA;
424  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
425  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
426  typedef typename ViewTypeA::execution_space execution_space;
427 
428  ViewTypeA A("A",m,n,p+1);
429  ViewTypeB b("B",n,p+1);
430  ViewTypeC c("c",m,p+1);
431 
432  Kokkos::deep_copy(A, 1.0);
433  Kokkos::deep_copy(b, 1.0);
434 
435  Kokkos::Timer wall_clock;
436  Perf perf;
437 
438  // Execute the kernel once to warm up
439  run_mat_vec_deriv_sl<MaxP>( A, b, c );
440  execution_space().fence();
441 
442  for (size_t l=0; l<nloop; l++) {
443  run_mat_vec_deriv_sl<MaxP>( A, b, c );
444  }
445  execution_space().fence();
446 
447  perf.time = wall_clock.seconds() / nloop;
448  perf.flops = m*n*(2+4*p);
449  perf.throughput = perf.flops / perf.time / 1.0e9;
450 
451  if (check)
452  check_deriv(A,b,c);
453 
454  return perf;
455 }
456 
457 template <int p, typename ... ViewArgs>
458 Perf
459 do_time_analytic_s(const size_t m, const size_t n,
460  const size_t nloop, const bool check)
461 {
462  typedef Kokkos::View<double**[p+1], ViewArgs...> ViewTypeA;
463  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
464  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
465  typedef typename ViewTypeA::execution_space execution_space;
466 
467  ViewTypeA A("A",m,n);
468  ViewTypeB b("B",n,p+1);
469  ViewTypeC c("c",m,p+1);
470 
471  Kokkos::deep_copy(A, 1.0);
472  Kokkos::deep_copy(b, 1.0);
473 
474  Kokkos::Timer wall_clock;
475  Perf perf;
476 
477  // Execute the kernel once to warm up
478  run_mat_vec_deriv_s<p>( A, b, c );
479  execution_space().fence();
480 
481  for (size_t l=0; l<nloop; l++) {
482  run_mat_vec_deriv_s<p>( A, b, c );
483  }
484  execution_space().fence();
485 
486  perf.time = wall_clock.seconds() / nloop;
487  perf.flops = m*n*(2+4*p);
488  perf.throughput = perf.flops / perf.time / 1.0e9;
489 
490  if (check)
491  check_deriv(A,b,c);
492 
493  return perf;
494 }
495 
499 
// Explicitly instantiates the plain-double timing drivers for device DEV:
// do_time_val, do_time_analytic, do_time_analytic_sl<SLFadSize> and
// do_time_analytic_s<SFadSize>, each for LayoutLeft, LayoutRight and the
// view's default layout.
#define INST_FUNC_VAL_DEV(DEV) \
  template Perf do_time_val< Kokkos::LayoutLeft, DEV > ( \
    const size_t m, const size_t n, const size_t nloop, const bool check ); \
  template Perf do_time_val< Kokkos::LayoutRight, DEV > ( \
    const size_t m, const size_t n, const size_t nloop, const bool check ); \
  template Perf do_time_val< DEV > ( \
    const size_t m, const size_t n, const size_t nloop, const bool check ); \
  template Perf do_time_analytic< Kokkos::LayoutLeft, DEV > ( \
    const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
  template Perf do_time_analytic< Kokkos::LayoutRight, DEV > ( \
    const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
  template Perf do_time_analytic< DEV > ( \
    const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
  template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutLeft, DEV > ( \
    const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
  template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutRight, DEV > ( \
    const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
  template Perf do_time_analytic_sl< SLFadSize, DEV > ( \
    const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
  template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutLeft, DEV > ( \
    const size_t m, const size_t n, const size_t nloop, const bool check); \
  template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutRight, DEV > ( \
    const size_t m, const size_t n, const size_t nloop, const bool check); \
  template Perf do_time_analytic_s< SFadSize, DEV > ( \
    const size_t m, const size_t n, const size_t nloop, const bool check);
513 
// Explicitly instantiates the Fad-based timing drivers (do_time_fad and
// do_time_scratch) for scalar type FAD on device DEV, each for LayoutLeft,
// LayoutRight and the view's default layout.
#define INST_FUNC_FAD_DEV(FAD,DEV) \
  template Perf do_time_fad< FAD, Kokkos::LayoutLeft, DEV > ( \
    const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
  template Perf do_time_fad< FAD, Kokkos::LayoutRight, DEV > ( \
    const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
  template Perf do_time_fad< FAD, DEV > ( \
    const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
  template Perf do_time_scratch< FAD, Kokkos::LayoutLeft, DEV > ( \
    const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
  template Perf do_time_scratch< FAD, Kokkos::LayoutRight, DEV > ( \
    const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
  template Perf do_time_scratch< FAD, DEV > ( \
    const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check );
521 
522 #define INST_FUNC_DEV(DEV) \
523  INST_FUNC_VAL_DEV( DEV ) \
524  INST_FUNC_FAD_DEV( SFad_type, DEV ) \
525  INST_FUNC_FAD_DEV( SLFad_type, DEV ) \
526  INST_FUNC_FAD_DEV( DFad_type, DEV )
527 
// Explicit instantiations, one group per Kokkos backend enabled in this
// build.

#if defined(KOKKOS_ENABLE_SERIAL)
INST_FUNC_DEV(Kokkos::Serial)
#endif

#if defined(KOKKOS_ENABLE_OPENMP)
INST_FUNC_DEV(Kokkos::OpenMP)
#endif

#if defined(KOKKOS_ENABLE_THREADS)
INST_FUNC_DEV(Kokkos::Threads)
#endif

#if defined(KOKKOS_ENABLE_CUDA)
INST_FUNC_DEV(Kokkos::Cuda)
#endif

#if defined(KOKKOS_ENABLE_HIP)
INST_FUNC_DEV(Kokkos::HIP)
#endif
Sacado::Fad::DFad< double > DFad_type
Definition: mat_vec.cpp:498
double do_time_analytic(int nderiv, int nloop)
Definition: fad_expr.cpp:72
abs(expr.val())
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
double flops
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void run_mat_vec_scratch(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:38
double time
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
Definition: mat_vec.cpp:459
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Definition: mat_vec.cpp:420
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
#define A
Definition: Sacado_rad.hpp:552
SimpleFad< ValueT > min(const SimpleFad< ValueT > &a, const SimpleFad< ValueT > &b)
Sacado::Fad::SFad< double, SFadSize > SFad_type
Definition: mat_vec.cpp:496
const char * p
Perf do_time_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Definition: mat_vec.cpp:325
#define INST_FUNC_DEV(DEV)
Definition: mat_vec.cpp:522
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
int value
const int N
void run_mat_vec_deriv_sl(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:103
void run_mat_vec_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:78
double throughput
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
const double tol
void run_mat_vec_deriv_s(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:137
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Sacado::Fad::SLFad< double, SLFadSize > SLFad_type
Definition: mat_vec.cpp:497
int n