Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
mat_vec.cpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Sacado Package
4 //
5 // Copyright 2006 NTESS and the Sacado contributors.
6 // SPDX-License-Identifier: LGPL-2.1-or-later
7 // *****************************************************************************
8 // @HEADER
9 
10 //#define SACADO_DISABLE_FAD_VIEW_SPEC
11 
12 #include "Sacado.hpp"
13 
14 #include "mat_vec.hpp"
15 
16 #include "Kokkos_Timer.hpp"
17 
18 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
19 void run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {
20  typedef typename ViewTypeC::value_type scalar_type;
21  typedef typename ViewTypeC::execution_space execution_space;
22 
23  const int m = A.extent(0);
24  const int n = A.extent(1);
25  Kokkos::parallel_for(
26  Kokkos::RangePolicy<execution_space>( 0,m ),
27  KOKKOS_LAMBDA (const int i) {
28  scalar_type t = 0.0;
29  for (int j=0; j<n; ++j)
30  t += A(i,j)*b(j);
31  c(i) = t;
32  }
33  );
34 }
35 
36 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
37 void
38 run_mat_vec_scratch(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
39 {
40  typedef typename ViewTypeC::value_type scalar_type;
41  typedef typename ViewTypeC::execution_space execution_space;
42  typedef Kokkos::TeamPolicy<execution_space> Policy;
43  typedef typename Policy::member_type team_member;
44  typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
45 
46  const int m = A.extent(0);
47  const int n = A.extent(1);
48  const int p = dimension_scalar(A);
49 
50 #ifdef KOKKOS_ENABLE_CUDA
52 #else
53  const bool is_cuda = false;
54 #endif
55  const int TeamSize = is_cuda ? 128 : 1;
56  const int N = (m+TeamSize-1)/TeamSize;
57  Policy policy(N, TeamSize, 1);
58  const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
59  Kokkos::parallel_for(
60  policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
61  KOKKOS_LAMBDA (const team_member& team) {
62  const int team_rank = team.team_rank();
63  const int team_size = team.team_size();
64  TmpScratchSpace t(team.team_scratch(0), team_size, p);
65  const int i = team.league_rank()*team_size + team_rank;
66  if (i < m) {
67  t(team_rank) = 0.0;
68  for (int j=0; j<n; ++j)
69  t(team_rank) += A(i,j)*b(j);
70  c(i) = t(team_rank);
71  }
72  }
73  );
74 }
75 
76 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
77 void
78 run_mat_vec_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
79 {
80  typedef typename ViewTypeC::execution_space execution_space;
81 
82  const int m = A.extent(0);
83  const int n = A.extent(1);
84  const int p = A.extent(2)-1;
85  Kokkos::parallel_for(
86  Kokkos::RangePolicy<execution_space>( 0,m ),
87  KOKKOS_LAMBDA (const int i) {
88  c(i,p) = 0.0;
89  for (int k=0; k<p; ++k)
90  c(i,k) = 0.0;
91  for (int j=0; j<n; ++j) {
92  c(i,p) += A(i,j,p)*b(j,p);
93  for (int k=0; k<p; ++k) {
94  c(i,k) += A(i,j,k)*b(j,p) + A(i,j,p)*b(j,k);
95  }
96  }
97  }
98  );
99 }
100 
101 template <int MaxP, typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
102 void
103 run_mat_vec_deriv_sl(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
104 {
105  typedef typename ViewTypeC::value_type scalar_type;
106  typedef typename ViewTypeC::execution_space execution_space;
107 
108  const int m = A.extent(0);
109  const int n = A.extent(1);
110  const int p = A.extent(2)-1;
111  Kokkos::parallel_for(
112  Kokkos::RangePolicy<execution_space>( 0,m ),
113  KOKKOS_LAMBDA (const int i) {
114  scalar_type cv = 0.0;
115  scalar_type t[MaxP];
116  for (int k=0; k<p; ++k)
117  t[k] = 0.0;
118 
119  for (int j=0; j<n; ++j) {
120  scalar_type av = A(i,j,p);
121  scalar_type bv = b(j,p);
122  cv += av*bv;
123  for (int k=0; k<p; ++k) {
124  t[k] += A(i,j,k)*bv + av*b(j,k);
125  }
126  }
127 
128  for (int k=0; k<p; ++k)
129  c(i,k) = t[k];
130  c(i,p) = cv;
131  }
132  );
133 }
134 
135 template <int p, typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
136 void
137 run_mat_vec_deriv_s(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
138 {
139  typedef typename ViewTypeC::value_type scalar_type;
140  typedef typename ViewTypeC::execution_space execution_space;
141 
142  const int m = A.extent(0);
143  const int n = A.extent(1);
144  Kokkos::parallel_for(
145  Kokkos::RangePolicy<execution_space>( 0,m ),
146  KOKKOS_LAMBDA (const int i) {
147  scalar_type cv = 0.0;
148  scalar_type t[p];
149  for (int k=0; k<p; ++k)
150  t[k] = 0.0;
151 
152  for (int j=0; j<n; ++j) {
153  const scalar_type av = A(i,j,p);
154  const scalar_type bv = b(j,p);
155  cv += av*bv;
156 
157 // Using simd here results in much better performance. Othewise the compiler
158 // appears to try and vectorize the j loop with gather instructions, which
159 // doesn't work very well.
160 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
161 #pragma simd
162 #endif
163  for (int k=0; k<p; ++k) {
164  t[k] += A(i,j,k)*bv + av*b(j,k);
165  }
166  }
167 
168  for (int k=0; k<p; ++k)
169  c(i,k) = t[k];
170  c(i,p) = cv;
171  }
172  );
173 }
174 
175 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
176 void
177 check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
178 {
179  const double tol = 1.0e-14;
180  typedef typename ViewTypeC::value_type value_type;
181  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
182  Kokkos::deep_copy(h_c, c);
183  const size_t m = A.extent(0);
184  const size_t n = A.extent(1);
185  for (size_t i=0; i<m; ++i) {
186  value_type t = n;
187  if (std::abs(h_c(i)- t) > tol) {
188  std::cout << "Comparison failed! " << i << " : " << h_c(i) << " , " << t
189  << std::endl;
190  }
191  }
192 }
193 
194 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
195 void
196 check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
197 {
198  const double tol = 1.0e-14;
199  typedef typename ViewTypeC::value_type value_type;
200  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
201  Kokkos::deep_copy(h_c, c);
202  const size_t m = A.extent(0);
203  const size_t n = A.extent(1);
204  const size_t p = A.extent(2);
205  for (size_t i=0; i<m; ++i) {
206  for (size_t j=0; j<p; ++j) {
207  value_type t = (j == p-1 ? n : 2*n);
208  if (std::abs(h_c(i,j)- t) > tol) {
209  std::cout << "Comparison failed! " << i << "," << j << " : "
210  << h_c(i,j) << " , " << t << std::endl;
211  }
212  }
213  }
214 }
215 
216 template <typename ... ViewArgs>
217 Perf
218 do_time_val(const size_t m, const size_t n, const size_t nloop,
219  const bool check)
220 {
221  typedef Kokkos::View<double**, ViewArgs...> ViewTypeA;
222  typedef Kokkos::View<double*, ViewArgs...> ViewTypeB;
223  typedef Kokkos::View<double*, ViewArgs...> ViewTypeC;
224  typedef typename ViewTypeA::execution_space execution_space;
225 
226  ViewTypeA A("A",m,n);
227  ViewTypeB b("B",n);
228  ViewTypeC c("c",m);
229 
230  Kokkos::deep_copy(A, 1.0);
231  Kokkos::deep_copy(b, 1.0);
232 
233  Kokkos::Timer wall_clock;
234  Perf perf;
235 
236  // Execute the kernel once to warm up
237  run_mat_vec( A, b, c );
238  execution_space().fence();
239 
240  wall_clock.reset();
241  for (size_t l=0; l<nloop; l++) {
242  run_mat_vec( A, b, c );
243  }
244  execution_space().fence();
245 
246  perf.time = wall_clock.seconds() / nloop;
247  perf.flops = m*n*2;
248  perf.throughput = perf.flops / perf.time / 1.0e9;
249 
250  if (check)
251  check_val(A,b,c);
252 
253  return perf;
254 }
255 
256 template <typename FadType, typename ... ViewArgs>
257 Perf
258 do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop,
259  const bool check)
260 {
261  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
262  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
263  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
264  typedef typename ViewTypeA::execution_space execution_space;
265 
266  // Set amount of memory available for dynamic memory allocation on GPU
267 #ifdef KOKKOS_ENABLE_CUDA
269  std::is_same<FadType,Sacado::Fad::DFad<double> >::value) {
270  const size_t concurrency = execution_space().concurrency();
271  const size_t mem = std::min(m,concurrency) * p * sizeof(double);
272  //std::cout << "mem = " << mem / (1024*1024) << " MB" << std::endl;
273  cudaDeviceSetLimit(cudaLimitMallocHeapSize, mem);
274  }
275 #endif
276 
277 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
278  ViewTypeA A("A",m,n,p+1);
279  ViewTypeB b("B",n,p+1);
280  ViewTypeC c("c",m,p+1);
281 #else
282  ViewTypeA A("A",m,n);
283  ViewTypeB b("B",n);
284  ViewTypeC c("c",m);
285 #endif
286 
287  // FadType a(p, 1.0);
288  // for (size_t k=0; k<p; ++k)
289  // a.fastAccessDx(k) = 1.0;
290  Kokkos::deep_copy(typename ViewTypeA::array_type(A), 1.0);
291  Kokkos::deep_copy(typename ViewTypeB::array_type(b), 1.0);
292 
293  Kokkos::Timer wall_clock;
294  Perf perf;
295 
296  // Execute the kernel once to warm up
297  run_mat_vec( A, b, c );
298  execution_space().fence();
299 
300  wall_clock.reset();
301  for (size_t l=0; l<nloop; l++) {
302  run_mat_vec( A, b, c );
303  }
304  execution_space().fence();
305 
306  perf.time = wall_clock.seconds() / nloop;
307  perf.flops = m*n*(2+4*p);
308  perf.throughput = perf.flops / perf.time / 1.0e9;
309 
310 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
311  if (check) {
312  typename ViewTypeA::array_type A_flat = A;
313  typename ViewTypeB::array_type b_flat = b;
314  typename ViewTypeC::array_type c_flat = c;
315  check_deriv(A_flat, b_flat, c_flat);
316  }
317 #endif
318 
319  return perf;
320 }
321 
322 template <typename FadType, typename ... ViewArgs>
323 Perf
324 do_time_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop,
325  const bool check)
326 {
327  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
328  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
329  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
330  typedef typename ViewTypeA::execution_space execution_space;
331 
332 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
333  ViewTypeA A("A",m,n,p+1);
334  ViewTypeB b("B",n,p+1);
335  ViewTypeC c("c",m,p+1);
336 #else
337  ViewTypeA A("A",m,n);
338  ViewTypeB b("B",n);
339  ViewTypeC c("c",m);
340 #endif
341 
342  // FadType a(p, 1.0);
343  // for (size_t k=0; k<p; ++k)
344  // a.fastAccessDx(k) = 1.0;
345  Kokkos::deep_copy(typename ViewTypeA::array_type(A), 1.0);
346  Kokkos::deep_copy(typename ViewTypeB::array_type(b), 1.0);
347 
348  Kokkos::Timer wall_clock;
349  Perf perf;
350 
351  // Execute the kernel once to warm up
352  run_mat_vec_scratch( A, b, c );
353  execution_space().fence();
354 
355  wall_clock.reset();
356  for (size_t l=0; l<nloop; l++) {
357  run_mat_vec_scratch( A, b, c );
358  }
359  execution_space().fence();
360 
361  perf.time = wall_clock.seconds() / nloop;
362  perf.flops = m*n*(2+4*p);
363  perf.throughput = perf.flops / perf.time / 1.0e9;
364 
365 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
366  if (check) {
367  typename ViewTypeA::array_type A_flat = A;
368  typename ViewTypeB::array_type b_flat = b;
369  typename ViewTypeC::array_type c_flat = c;
370  check_deriv(A_flat, b_flat, c_flat);
371  }
372 #endif
373 
374  return perf;
375 }
376 
377 template <typename ... ViewArgs>
378 Perf
379 do_time_analytic(const size_t m, const size_t n, const size_t p,
380  const size_t nloop, const bool check)
381 {
382  typedef Kokkos::View<double***, ViewArgs...> ViewTypeA;
383  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
384  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
385  typedef typename ViewTypeA::execution_space execution_space;
386 
387  ViewTypeA A("A",m,n,p+1);
388  ViewTypeB b("B",n,p+1);
389  ViewTypeC c("c",m,p+1);
390 
391  Kokkos::deep_copy(A, 1.0);
392  Kokkos::deep_copy(b, 1.0);
393 
394  Kokkos::Timer wall_clock;
395  Perf perf;
396 
397  // Execute the kernel once to warm up
398  run_mat_vec_deriv( A, b, c );
399  execution_space().fence();
400 
401  for (size_t l=0; l<nloop; l++) {
402  run_mat_vec_deriv( A, b, c );
403  }
404  execution_space().fence();
405 
406  perf.time = wall_clock.seconds() / nloop;
407  perf.flops = m*n*(2+4*p);
408  perf.throughput = perf.flops / perf.time / 1.0e9;
409 
410  if (check)
411  check_deriv(A,b,c);
412 
413  return perf;
414 }
415 
416 template <int MaxP, typename ... ViewArgs>
417 Perf
418 do_time_analytic_sl(const size_t m, const size_t n, const size_t p,
419  const size_t nloop, const bool check)
420 {
421  typedef Kokkos::View<double***, ViewArgs...> ViewTypeA;
422  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
423  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
424  typedef typename ViewTypeA::execution_space execution_space;
425 
426  ViewTypeA A("A",m,n,p+1);
427  ViewTypeB b("B",n,p+1);
428  ViewTypeC c("c",m,p+1);
429 
430  Kokkos::deep_copy(A, 1.0);
431  Kokkos::deep_copy(b, 1.0);
432 
433  Kokkos::Timer wall_clock;
434  Perf perf;
435 
436  // Execute the kernel once to warm up
437  run_mat_vec_deriv_sl<MaxP>( A, b, c );
438  execution_space().fence();
439 
440  for (size_t l=0; l<nloop; l++) {
441  run_mat_vec_deriv_sl<MaxP>( A, b, c );
442  }
443  execution_space().fence();
444 
445  perf.time = wall_clock.seconds() / nloop;
446  perf.flops = m*n*(2+4*p);
447  perf.throughput = perf.flops / perf.time / 1.0e9;
448 
449  if (check)
450  check_deriv(A,b,c);
451 
452  return perf;
453 }
454 
455 template <int p, typename ... ViewArgs>
456 Perf
457 do_time_analytic_s(const size_t m, const size_t n,
458  const size_t nloop, const bool check)
459 {
460  typedef Kokkos::View<double**[p+1], ViewArgs...> ViewTypeA;
461  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
462  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
463  typedef typename ViewTypeA::execution_space execution_space;
464 
465  ViewTypeA A("A",m,n);
466  ViewTypeB b("B",n,p+1);
467  ViewTypeC c("c",m,p+1);
468 
469  Kokkos::deep_copy(A, 1.0);
470  Kokkos::deep_copy(b, 1.0);
471 
472  Kokkos::Timer wall_clock;
473  Perf perf;
474 
475  // Execute the kernel once to warm up
476  run_mat_vec_deriv_s<p>( A, b, c );
477  execution_space().fence();
478 
479  for (size_t l=0; l<nloop; l++) {
480  run_mat_vec_deriv_s<p>( A, b, c );
481  }
482  execution_space().fence();
483 
484  perf.time = wall_clock.seconds() / nloop;
485  perf.flops = m*n*(2+4*p);
486  perf.throughput = perf.flops / perf.time / 1.0e9;
487 
488  if (check)
489  check_deriv(A,b,c);
490 
491  return perf;
492 }
493 
497 
// Explicitly instantiates the double-valued and hand-coded-derivative timing
// drivers for execution space DEV with LayoutLeft, LayoutRight and the
// default layout. Argument order: do_time_val and do_time_analytic_s take
// (m, n, nloop, check); do_time_analytic and do_time_analytic_sl take
// (m, n, p, nloop, check).
#define INST_FUNC_VAL_DEV(DEV) \
  template Perf do_time_val< Kokkos::LayoutLeft, DEV >( \
    const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_val< Kokkos::LayoutRight, DEV >( \
    const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_val< DEV >( \
    const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic< Kokkos::LayoutLeft, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic< Kokkos::LayoutRight, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic< DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutLeft, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutRight, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic_sl< SLFadSize, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutLeft, DEV >( \
    const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutRight, DEV >( \
    const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_analytic_s< SFadSize, DEV >( \
    const size_t, const size_t, const size_t, const bool );
511 
// Explicitly instantiates do_time_fad and do_time_scratch for Fad type FAD on
// execution space DEV with LayoutLeft, LayoutRight and the default layout.
// Argument order for both drivers: (m, n, p, nloop, check).
#define INST_FUNC_FAD_DEV(FAD,DEV) \
  template Perf do_time_fad< FAD, Kokkos::LayoutLeft, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_fad< FAD, Kokkos::LayoutRight, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_fad< FAD, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_scratch< FAD, Kokkos::LayoutLeft, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_scratch< FAD, Kokkos::LayoutRight, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool ); \
  template Perf do_time_scratch< FAD, DEV >( \
    const size_t, const size_t, const size_t, const size_t, const bool );
519 
520 #define INST_FUNC_DEV(DEV) \
521  INST_FUNC_VAL_DEV( DEV ) \
522  INST_FUNC_FAD_DEV( SFad_type, DEV ) \
523  INST_FUNC_FAD_DEV( SLFad_type, DEV ) \
524  INST_FUNC_FAD_DEV( DFad_type, DEV )
525 
// Emit the explicit instantiations once for each Kokkos backend whose
// KOKKOS_ENABLE_* macro is defined.
#if defined(KOKKOS_ENABLE_SERIAL)
INST_FUNC_DEV(Kokkos::Serial)
#endif

#if defined(KOKKOS_ENABLE_OPENMP)
INST_FUNC_DEV(Kokkos::OpenMP)
#endif

#if defined(KOKKOS_ENABLE_THREADS)
INST_FUNC_DEV(Kokkos::Threads)
#endif

#if defined(KOKKOS_ENABLE_CUDA)
INST_FUNC_DEV(Kokkos::Cuda)
#endif

#if defined(KOKKOS_ENABLE_HIP)
INST_FUNC_DEV(Kokkos::HIP)
#endif
Sacado::Fad::DFad< double > DFad_type
Definition: mat_vec.cpp:496
const char * p
double do_time_analytic(int nderiv, int nloop)
Definition: fad_expr.cpp:72
abs(expr.val())
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
double flops
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void run_mat_vec_scratch(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:38
double time
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
Definition: mat_vec.cpp:457
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Definition: mat_vec.cpp:418
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
#define A
Definition: Sacado_rad.hpp:552
SimpleFad< ValueT > min(const SimpleFad< ValueT > &a, const SimpleFad< ValueT > &b)
Sacado::Fad::SFad< double, SFadSize > SFad_type
Definition: mat_vec.cpp:494
Perf do_time_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Definition: mat_vec.cpp:324
#define INST_FUNC_DEV(DEV)
Definition: mat_vec.cpp:520
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
const int N
void run_mat_vec_deriv_sl(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:103
void run_mat_vec_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:78
int value
double throughput
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
const double tol
void run_mat_vec_deriv_s(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:137
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Sacado::Fad::SLFad< double, SLFadSize > SLFad_type
Definition: mat_vec.cpp:495
int n