Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
mat_vec.cpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Sacado Package
4 //
5 // Copyright 2006 NTESS and the Sacado contributors.
6 // SPDX-License-Identifier: LGPL-2.1-or-later
7 // *****************************************************************************
8 // @HEADER
9 
10 //#define SACADO_DISABLE_FAD_VIEW_SPEC
11 
12 #include "Sacado.hpp"
13 
14 #include "mat_vec.hpp"
15 
16 #include "Kokkos_Timer.hpp"
17 
18 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
19 void run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {
20  typedef typename ViewTypeC::value_type scalar_type;
21  typedef typename ViewTypeC::execution_space execution_space;
22 
23  const int m = A.extent(0);
24  const int n = A.extent(1);
25  Kokkos::parallel_for(
26  Kokkos::RangePolicy<execution_space>( 0,m ),
27  KOKKOS_LAMBDA (const int i) {
28  scalar_type t = 0.0;
29  for (int j=0; j<n; ++j)
30  t += A(i,j)*b(j);
31  c(i) = t;
32  }
33  );
34 }
35 
36 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
37 void
38 run_mat_vec_scratch(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
39 {
40  typedef typename ViewTypeC::value_type scalar_type;
41  typedef typename ViewTypeC::execution_space execution_space;
42  typedef Kokkos::TeamPolicy<execution_space> Policy;
43  typedef typename Policy::member_type team_member;
44  typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
45 
46  const int m = A.extent(0);
47  const int n = A.extent(1);
48  const int p = dimension_scalar(A);
49 
50 #ifdef KOKKOS_ENABLE_CUDA
52 #else
53  const bool is_cuda = false;
54 #endif
55  const int TeamSize = is_cuda ? 128 : 1;
56  const int N = (m+TeamSize-1)/TeamSize;
57  Policy policy(N, TeamSize, 1);
58  const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
59  Kokkos::parallel_for(
60  policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
61  KOKKOS_LAMBDA (const team_member& team) {
62  const int team_rank = team.team_rank();
63  const int team_size = team.team_size();
64  TmpScratchSpace t(team.team_scratch(0), team_size, p);
65  const int i = team.league_rank()*team_size + team_rank;
66  if (i < m) {
67  t(team_rank) = 0.0;
68  for (int j=0; j<n; ++j)
69  t(team_rank) += A(i,j)*b(j);
70  c(i) = t(team_rank);
71  }
72  }
73  );
74 }
75 
76 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
77 void
78 run_mat_vec_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
79 {
80  typedef typename ViewTypeC::execution_space execution_space;
81 
82  const int m = A.extent(0);
83  const int n = A.extent(1);
84  const int p = A.extent(2)-1;
85  Kokkos::parallel_for(
86  Kokkos::RangePolicy<execution_space>( 0,m ),
87  KOKKOS_LAMBDA (const int i) {
88  c(i,p) = 0.0;
89  for (int k=0; k<p; ++k)
90  c(i,k) = 0.0;
91  for (int j=0; j<n; ++j) {
92  c(i,p) += A(i,j,p)*b(j,p);
93  for (int k=0; k<p; ++k) {
94  c(i,k) += A(i,j,k)*b(j,p) + A(i,j,p)*b(j,k);
95  }
96  }
97  }
98  );
99 }
100 
101 template <int MaxP, typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
102 void
103 run_mat_vec_deriv_sl(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
104 {
105  typedef typename ViewTypeC::value_type scalar_type;
106  typedef typename ViewTypeC::execution_space execution_space;
107 
108  const int m = A.extent(0);
109  const int n = A.extent(1);
110  const int p = A.extent(2)-1;
111  Kokkos::parallel_for(
112  Kokkos::RangePolicy<execution_space>( 0,m ),
113  KOKKOS_LAMBDA (const int i) {
114  scalar_type cv = 0.0;
115  scalar_type t[MaxP];
116  for (int k=0; k<p; ++k)
117  t[k] = 0.0;
118 
119  for (int j=0; j<n; ++j) {
120  scalar_type av = A(i,j,p);
121  scalar_type bv = b(j,p);
122  cv += av*bv;
123  for (int k=0; k<p; ++k) {
124  t[k] += A(i,j,k)*bv + av*b(j,k);
125  }
126  }
127 
128  for (int k=0; k<p; ++k)
129  c(i,k) = t[k];
130  c(i,p) = cv;
131  }
132  );
133 }
134 
135 template <int p, typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
136 void
137 run_mat_vec_deriv_s(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
138 {
139  typedef typename ViewTypeC::value_type scalar_type;
140  typedef typename ViewTypeC::execution_space execution_space;
141 
142  const int m = A.extent(0);
143  const int n = A.extent(1);
144  Kokkos::parallel_for(
145  Kokkos::RangePolicy<execution_space>( 0,m ),
146  KOKKOS_LAMBDA (const int i) {
147  scalar_type cv = 0.0;
148  scalar_type t[p];
149  for (int k=0; k<p; ++k)
150  t[k] = 0.0;
151 
152  for (int j=0; j<n; ++j) {
153  const scalar_type av = A(i,j,p);
154  const scalar_type bv = b(j,p);
155  cv += av*bv;
156 
157 // Using simd here results in much better performance. Othewise the compiler
158 // appears to try and vectorize the j loop with gather instructions, which
159 // doesn't work very well.
160 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
161 #pragma simd
162 #endif
163  for (int k=0; k<p; ++k) {
164  t[k] += A(i,j,k)*bv + av*b(j,k);
165  }
166  }
167 
168  for (int k=0; k<p; ++k)
169  c(i,k) = t[k];
170  c(i,p) = cv;
171  }
172  );
173 }
174 
175 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
176 void
177 check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
178 {
179  const double tol = 1.0e-14;
180  typedef typename ViewTypeC::value_type value_type;
181  typename ViewTypeC::host_mirror_type h_c = Kokkos::create_mirror_view(c);
182  Kokkos::deep_copy(h_c, c);
183  const size_t m = A.extent(0);
184  const size_t n = A.extent(1);
185  for (size_t i=0; i<m; ++i) {
186  value_type t = n;
187  if (std::abs(h_c(i)- t) > tol) {
188  std::cout << "Comparison failed! " << i << " : " << h_c(i) << " , " << t
189  << std::endl;
190  }
191  }
192 }
193 
194 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
195 void
196 check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
197 {
198  const double tol = 1.0e-14;
199  typedef typename ViewTypeC::value_type value_type;
200  typename ViewTypeC::host_mirror_type h_c = Kokkos::create_mirror_view(c);
201  Kokkos::deep_copy(h_c, c);
202  const size_t m = A.extent(0);
203  const size_t n = A.extent(1);
204  const size_t p = A.extent(2);
205  for (size_t i=0; i<m; ++i) {
206  for (size_t j=0; j<p; ++j) {
207  value_type t = (j == p-1 ? n : 2*n);
208  if (std::abs(h_c(i,j)- t) > tol) {
209  std::cout << "Comparison failed! " << i << "," << j << " : "
210  << h_c(i,j) << " , " << t << std::endl;
211  }
212  }
213  }
214 }
215 
216 template <typename ... ViewArgs>
217 Perf
218 do_time_val(const size_t m, const size_t n, const size_t nloop,
219  const bool check)
220 {
221  typedef Kokkos::View<double**, ViewArgs...> ViewTypeA;
222  typedef Kokkos::View<double*, ViewArgs...> ViewTypeB;
223  typedef Kokkos::View<double*, ViewArgs...> ViewTypeC;
224  typedef typename ViewTypeA::execution_space execution_space;
225 
226  ViewTypeA A("A",m,n);
227  ViewTypeB b("B",n);
228  ViewTypeC c("c",m);
229 
230  Kokkos::deep_copy(A, 1.0);
231  Kokkos::deep_copy(b, 1.0);
232 
233  Kokkos::Timer wall_clock;
234  Perf perf;
235 
236  // Execute the kernel once to warm up
237  run_mat_vec( A, b, c );
238  execution_space().fence();
239 
240  wall_clock.reset();
241  for (size_t l=0; l<nloop; l++) {
242  run_mat_vec( A, b, c );
243  }
244  execution_space().fence();
245 
246  perf.time = wall_clock.seconds() / nloop;
247  perf.flops = m*n*2;
248  perf.throughput = perf.flops / perf.time / 1.0e9;
249 
250  if (check)
251  check_val(A,b,c);
252 
253  return perf;
254 }
255 
256 template <typename FadType, typename ... ViewArgs>
257 Perf
258 do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop,
259  const bool check)
260 {
261  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
262  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
263  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
264  typedef typename ViewTypeA::execution_space execution_space;
265 
266  // Set amount of memory available for dynamic memory allocation on GPU
267 #ifdef KOKKOS_ENABLE_CUDA
269  std::is_same<FadType,Sacado::Fad::DFad<double> >::value) {
270  const size_t concurrency = execution_space().concurrency();
271  const size_t mem = std::min(m,concurrency) * p * sizeof(double);
272  //std::cout << "mem = " << mem / (1024*1024) << " MB" << std::endl;
273  cudaDeviceSetLimit(cudaLimitMallocHeapSize, mem);
274  }
275 #endif
276 
277 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
278  ViewTypeA A("A",m,n,p+1);
279  ViewTypeB b("B",n,p+1);
280  ViewTypeC c("c",m,p+1);
281 #else
282  ViewTypeA A("A",m,n);
283  ViewTypeB b("B",n);
284  ViewTypeC c("c",m);
285 #endif
286 
287  // FadType a(p, 1.0);
288  // for (size_t k=0; k<p; ++k)
289  // a.fastAccessDx(k) = 1.0;
290 #if KOKKOS_VERSION >= 40799
291  Kokkos::deep_copy(typename ViewTypeA::type(A), 1.0);
292  Kokkos::deep_copy(typename ViewTypeB::type(b), 1.0);
293 #else
294  Kokkos::deep_copy(typename ViewTypeA::array_type(A), 1.0);
295  Kokkos::deep_copy(typename ViewTypeB::array_type(b), 1.0);
296 #endif
297 
298  Kokkos::Timer wall_clock;
299  Perf perf;
300 
301  // Execute the kernel once to warm up
302  run_mat_vec( A, b, c );
303  execution_space().fence();
304 
305  wall_clock.reset();
306  for (size_t l=0; l<nloop; l++) {
307  run_mat_vec( A, b, c );
308  }
309  execution_space().fence();
310 
311  perf.time = wall_clock.seconds() / nloop;
312  perf.flops = m*n*(2+4*p);
313  perf.throughput = perf.flops / perf.time / 1.0e9;
314 
315 // FIXME: this needs a new way of getting a flattened Kokkos::View from FadView
316 #if !defined(SACADO_DISABLE_FAD_VIEW_SPEC) && !defined(SACADO_HAS_NEW_KOKKOS_VIEW_IMPL)
317  if (check) {
318 #if KOKKOS_VERSION >= 40799
319  typename ViewTypeA::type A_flat = A;
320  typename ViewTypeB::type b_flat = b;
321  typename ViewTypeC::type c_flat = c;
322 #else
323  typename ViewTypeA::array_type A_flat = A;
324  typename ViewTypeB::array_type b_flat = b;
325  typename ViewTypeC::array_type c_flat = c;
326 #endif
327  check_deriv(A_flat, b_flat, c_flat);
328  }
329 #endif
330 
331  return perf;
332 }
333 
334 template <typename FadType, typename ... ViewArgs>
335 Perf
336 do_time_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop,
337  const bool check)
338 {
339  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
340  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
341  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
342  typedef typename ViewTypeA::execution_space execution_space;
343 
344 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
345  ViewTypeA A("A",m,n,p+1);
346  ViewTypeB b("B",n,p+1);
347  ViewTypeC c("c",m,p+1);
348 #else
349  ViewTypeA A("A",m,n);
350  ViewTypeB b("B",n);
351  ViewTypeC c("c",m);
352 #endif
353 
354  // FadType a(p, 1.0);
355  // for (size_t k=0; k<p; ++k)
356  // a.fastAccessDx(k) = 1.0;
357 #if KOKKOS_VERSION >= 40799
358  Kokkos::deep_copy(typename ViewTypeA::type(A), 1.0);
359  Kokkos::deep_copy(typename ViewTypeB::type(b), 1.0);
360 #else
361  Kokkos::deep_copy(typename ViewTypeA::array_type(A), 1.0);
362  Kokkos::deep_copy(typename ViewTypeB::array_type(b), 1.0);
363 #endif
364 
365  Kokkos::Timer wall_clock;
366  Perf perf;
367 
368  // Execute the kernel once to warm up
369  run_mat_vec_scratch( A, b, c );
370  execution_space().fence();
371 
372  wall_clock.reset();
373  for (size_t l=0; l<nloop; l++) {
374  run_mat_vec_scratch( A, b, c );
375  }
376  execution_space().fence();
377 
378  perf.time = wall_clock.seconds() / nloop;
379  perf.flops = m*n*(2+4*p);
380  perf.throughput = perf.flops / perf.time / 1.0e9;
381 
382 // FIXME: this needs a new way of getting a flattened Kokkos::View from FadView
383 #if !defined(SACADO_DISABLE_FAD_VIEW_SPEC) && !defined(SACADO_HAS_NEW_KOKKOS_VIEW_IMPL)
384  if (check) {
385 #if KOKKOS_VERSION >= 40799
386  typename ViewTypeA::type A_flat = A;
387  typename ViewTypeB::type b_flat = b;
388  typename ViewTypeC::type c_flat = c;
389 #else
390  typename ViewTypeA::array_type A_flat = A;
391  typename ViewTypeB::array_type b_flat = b;
392  typename ViewTypeC::array_type c_flat = c;
393 #endif
394  check_deriv(A_flat, b_flat, c_flat);
395  }
396 #endif
397 
398  return perf;
399 }
400 
401 template <typename ... ViewArgs>
402 Perf
403 do_time_analytic(const size_t m, const size_t n, const size_t p,
404  const size_t nloop, const bool check)
405 {
406  typedef Kokkos::View<double***, ViewArgs...> ViewTypeA;
407  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
408  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
409  typedef typename ViewTypeA::execution_space execution_space;
410 
411  ViewTypeA A("A",m,n,p+1);
412  ViewTypeB b("B",n,p+1);
413  ViewTypeC c("c",m,p+1);
414 
415  Kokkos::deep_copy(A, 1.0);
416  Kokkos::deep_copy(b, 1.0);
417 
418  Kokkos::Timer wall_clock;
419  Perf perf;
420 
421  // Execute the kernel once to warm up
422  run_mat_vec_deriv( A, b, c );
423  execution_space().fence();
424 
425  for (size_t l=0; l<nloop; l++) {
426  run_mat_vec_deriv( A, b, c );
427  }
428  execution_space().fence();
429 
430  perf.time = wall_clock.seconds() / nloop;
431  perf.flops = m*n*(2+4*p);
432  perf.throughput = perf.flops / perf.time / 1.0e9;
433 
434  if (check)
435  check_deriv(A,b,c);
436 
437  return perf;
438 }
439 
440 template <int MaxP, typename ... ViewArgs>
441 Perf
442 do_time_analytic_sl(const size_t m, const size_t n, const size_t p,
443  const size_t nloop, const bool check)
444 {
445  typedef Kokkos::View<double***, ViewArgs...> ViewTypeA;
446  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
447  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
448  typedef typename ViewTypeA::execution_space execution_space;
449 
450  ViewTypeA A("A",m,n,p+1);
451  ViewTypeB b("B",n,p+1);
452  ViewTypeC c("c",m,p+1);
453 
454  Kokkos::deep_copy(A, 1.0);
455  Kokkos::deep_copy(b, 1.0);
456 
457  Kokkos::Timer wall_clock;
458  Perf perf;
459 
460  // Execute the kernel once to warm up
461  run_mat_vec_deriv_sl<MaxP>( A, b, c );
462  execution_space().fence();
463 
464  for (size_t l=0; l<nloop; l++) {
465  run_mat_vec_deriv_sl<MaxP>( A, b, c );
466  }
467  execution_space().fence();
468 
469  perf.time = wall_clock.seconds() / nloop;
470  perf.flops = m*n*(2+4*p);
471  perf.throughput = perf.flops / perf.time / 1.0e9;
472 
473  if (check)
474  check_deriv(A,b,c);
475 
476  return perf;
477 }
478 
479 template <int p, typename ... ViewArgs>
480 Perf
481 do_time_analytic_s(const size_t m, const size_t n,
482  const size_t nloop, const bool check)
483 {
484  typedef Kokkos::View<double**[p+1], ViewArgs...> ViewTypeA;
485  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
486  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
487  typedef typename ViewTypeA::execution_space execution_space;
488 
489  ViewTypeA A("A",m,n);
490  ViewTypeB b("B",n,p+1);
491  ViewTypeC c("c",m,p+1);
492 
493  Kokkos::deep_copy(A, 1.0);
494  Kokkos::deep_copy(b, 1.0);
495 
496  Kokkos::Timer wall_clock;
497  Perf perf;
498 
499  // Execute the kernel once to warm up
500  run_mat_vec_deriv_s<p>( A, b, c );
501  execution_space().fence();
502 
503  for (size_t l=0; l<nloop; l++) {
504  run_mat_vec_deriv_s<p>( A, b, c );
505  }
506  execution_space().fence();
507 
508  perf.time = wall_clock.seconds() / nloop;
509  perf.flops = m*n*(2+4*p);
510  perf.throughput = perf.flops / perf.time / 1.0e9;
511 
512  if (check)
513  check_deriv(A,b,c);
514 
515  return perf;
516 }
517 
521 
// Explicitly instantiates the double-valued timing drivers (do_time_val,
// do_time_analytic, do_time_analytic_sl, do_time_analytic_s) for device DEV,
// once each with LayoutLeft, LayoutRight, and no explicit layout argument.
#define INST_FUNC_VAL_DEV(DEV) \
  template Perf do_time_val< Kokkos::LayoutLeft, DEV >( \
    const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_val< Kokkos::LayoutRight, DEV >( \
    const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_val< DEV >( \
    const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic< Kokkos::LayoutLeft, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic< Kokkos::LayoutRight, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic< DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutLeft, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutRight, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic_sl< SLFadSize, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutLeft, DEV >( \
    const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutRight, DEV >( \
    const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic_s< SFadSize, DEV >( \
    const size_t, const size_t, const size_t, const bool );
535 
// Explicitly instantiates the Fad timing drivers (do_time_fad,
// do_time_scratch) for scalar type FAD on device DEV, once each with
// LayoutLeft, LayoutRight, and no explicit layout argument.
#define INST_FUNC_FAD_DEV(FAD,DEV) \
  template Perf do_time_fad< FAD, Kokkos::LayoutLeft, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_fad< FAD, Kokkos::LayoutRight, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_fad< FAD, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_scratch< FAD, Kokkos::LayoutLeft, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_scratch< FAD, Kokkos::LayoutRight, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_scratch< FAD, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool );
543 
// Instantiates every timing driver for device DEV: the double-valued drivers
// plus the Fad drivers for SFad_type, SLFad_type and DFad_type.
#define INST_FUNC_DEV(DEV)               \
  INST_FUNC_VAL_DEV( DEV )               \
  INST_FUNC_FAD_DEV( SFad_type, DEV )    \
  INST_FUNC_FAD_DEV( SLFad_type, DEV )   \
  INST_FUNC_FAD_DEV( DFad_type, DEV )

// Emit the instantiations for each Kokkos backend enabled in this build.
#if defined(KOKKOS_ENABLE_SERIAL)
INST_FUNC_DEV(Kokkos::Serial)
#endif

#if defined(KOKKOS_ENABLE_OPENMP)
INST_FUNC_DEV(Kokkos::OpenMP)
#endif

#if defined(KOKKOS_ENABLE_THREADS)
INST_FUNC_DEV(Kokkos::Threads)
#endif

#if defined(KOKKOS_ENABLE_CUDA)
INST_FUNC_DEV(Kokkos::Cuda)
#endif

#if defined(KOKKOS_ENABLE_HIP)
INST_FUNC_DEV(Kokkos::HIP)
#endif
Sacado::Fad::DFad< double > DFad_type
Definition: mat_vec.cpp:520
double do_time_analytic(int nderiv, int nloop)
Definition: fad_expr.cpp:72
abs(expr.val())
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
double flops
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void run_mat_vec_scratch(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:38
double time
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
Definition: mat_vec.cpp:481
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Definition: mat_vec.cpp:442
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
#define A
Definition: Sacado_rad.hpp:552
SimpleFad< ValueT > min(const SimpleFad< ValueT > &a, const SimpleFad< ValueT > &b)
Sacado::Fad::SFad< double, SFadSize > SFad_type
Definition: mat_vec.cpp:518
const char * p
Perf do_time_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Definition: mat_vec.cpp:336
#define INST_FUNC_DEV(DEV)
Definition: mat_vec.cpp:544
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
int value
const int N
void run_mat_vec_deriv_sl(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:103
void run_mat_vec_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:78
double throughput
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
const double tol
void run_mat_vec_deriv_s(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:137
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Sacado::Fad::SLFad< double, SLFadSize > SLFad_type
Definition: mat_vec.cpp:519
int n