Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
mat_vec.cpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Sacado Package
5 // Copyright (2006) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // This library is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU Lesser General Public License as
12 // published by the Free Software Foundation; either version 2.1 of the
13 // License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 // USA
24 // Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
25 // (etphipp@sandia.gov).
26 //
27 // ***********************************************************************
28 // @HEADER
29 
30 //#define SACADO_DISABLE_FAD_VIEW_SPEC
31 
32 #include "Sacado.hpp"
33 
34 #include "mat_vec.hpp"
35 
36 #include "Kokkos_Timer.hpp"
37 
38 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
39 void run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {
40  typedef typename ViewTypeC::value_type scalar_type;
41  typedef typename ViewTypeC::execution_space execution_space;
42 
43  const int m = A.extent(0);
44  const int n = A.extent(1);
45  Kokkos::parallel_for(
46  Kokkos::RangePolicy<execution_space>( 0,m ),
47  KOKKOS_LAMBDA (const int i) {
48  scalar_type t = 0.0;
49  for (int j=0; j<n; ++j)
50  t += A(i,j)*b(j);
51  c(i) = t;
52  }
53  );
54 }
55 
56 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
57 void
58 run_mat_vec_scratch(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
59 {
60  typedef typename ViewTypeC::value_type scalar_type;
61  typedef typename ViewTypeC::execution_space execution_space;
62  typedef Kokkos::TeamPolicy<execution_space> Policy;
63  typedef typename Policy::member_type team_member;
64  typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
65 
66  const int m = A.extent(0);
67  const int n = A.extent(1);
68  const int p = dimension_scalar(A);
69 
70 #ifdef KOKKOS_ENABLE_CUDA
72 #else
73  const bool is_cuda = false;
74 #endif
75  const int TeamSize = is_cuda ? 128 : 1;
76  const int N = (m+TeamSize-1)/TeamSize;
77  Policy policy(N, TeamSize, 1);
78  const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
79  Kokkos::parallel_for(
80  policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
81  KOKKOS_LAMBDA (const team_member& team) {
82  const int team_rank = team.team_rank();
83  const int team_size = team.team_size();
84  TmpScratchSpace t(team.team_scratch(0), team_size, p);
85  const int i = team.league_rank()*team_size + team_rank;
86  if (i < m) {
87  t(team_rank) = 0.0;
88  for (int j=0; j<n; ++j)
89  t(team_rank) += A(i,j)*b(j);
90  c(i) = t(team_rank);
91  }
92  }
93  );
94 }
95 
96 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
97 void
98 run_mat_vec_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
99 {
100  typedef typename ViewTypeC::execution_space execution_space;
101 
102  const int m = A.extent(0);
103  const int n = A.extent(1);
104  const int p = A.extent(2)-1;
105  Kokkos::parallel_for(
106  Kokkos::RangePolicy<execution_space>( 0,m ),
107  KOKKOS_LAMBDA (const int i) {
108  c(i,p) = 0.0;
109  for (int k=0; k<p; ++k)
110  c(i,k) = 0.0;
111  for (int j=0; j<n; ++j) {
112  c(i,p) += A(i,j,p)*b(j,p);
113  for (int k=0; k<p; ++k) {
114  c(i,k) += A(i,j,k)*b(j,p) + A(i,j,p)*b(j,k);
115  }
116  }
117  }
118  );
119 }
120 
121 template <int MaxP, typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
122 void
123 run_mat_vec_deriv_sl(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
124 {
125  typedef typename ViewTypeC::value_type scalar_type;
126  typedef typename ViewTypeC::execution_space execution_space;
127 
128  const int m = A.extent(0);
129  const int n = A.extent(1);
130  const int p = A.extent(2)-1;
131  Kokkos::parallel_for(
132  Kokkos::RangePolicy<execution_space>( 0,m ),
133  KOKKOS_LAMBDA (const int i) {
134  scalar_type cv = 0.0;
135  scalar_type t[MaxP];
136  for (int k=0; k<p; ++k)
137  t[k] = 0.0;
138 
139  for (int j=0; j<n; ++j) {
140  scalar_type av = A(i,j,p);
141  scalar_type bv = b(j,p);
142  cv += av*bv;
143  for (int k=0; k<p; ++k) {
144  t[k] += A(i,j,k)*bv + av*b(j,k);
145  }
146  }
147 
148  for (int k=0; k<p; ++k)
149  c(i,k) = t[k];
150  c(i,p) = cv;
151  }
152  );
153 }
154 
155 template <int p, typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
156 void
157 run_mat_vec_deriv_s(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
158 {
159  typedef typename ViewTypeC::value_type scalar_type;
160  typedef typename ViewTypeC::execution_space execution_space;
161 
162  const int m = A.extent(0);
163  const int n = A.extent(1);
164  Kokkos::parallel_for(
165  Kokkos::RangePolicy<execution_space>( 0,m ),
166  KOKKOS_LAMBDA (const int i) {
167  scalar_type cv = 0.0;
168  scalar_type t[p];
169  for (int k=0; k<p; ++k)
170  t[k] = 0.0;
171 
172  for (int j=0; j<n; ++j) {
173  const scalar_type av = A(i,j,p);
174  const scalar_type bv = b(j,p);
175  cv += av*bv;
176 
177 // Using simd here results in much better performance. Othewise the compiler
178 // appears to try and vectorize the j loop with gather instructions, which
179 // doesn't work very well.
180 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
181 #pragma simd
182 #endif
183  for (int k=0; k<p; ++k) {
184  t[k] += A(i,j,k)*bv + av*b(j,k);
185  }
186  }
187 
188  for (int k=0; k<p; ++k)
189  c(i,k) = t[k];
190  c(i,p) = cv;
191  }
192  );
193 }
194 
195 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
196 void
197 check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
198 {
199  const double tol = 1.0e-14;
200  typedef typename ViewTypeC::value_type value_type;
201  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
202  Kokkos::deep_copy(h_c, c);
203  const size_t m = A.extent(0);
204  const size_t n = A.extent(1);
205  for (size_t i=0; i<m; ++i) {
206  value_type t = n;
207  if (std::abs(h_c(i)- t) > tol) {
208  std::cout << "Comparison failed! " << i << " : " << h_c(i) << " , " << t
209  << std::endl;
210  }
211  }
212 }
213 
214 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
215 void
216 check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
217 {
218  const double tol = 1.0e-14;
219  typedef typename ViewTypeC::value_type value_type;
220  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
221  Kokkos::deep_copy(h_c, c);
222  const size_t m = A.extent(0);
223  const size_t n = A.extent(1);
224  const size_t p = A.extent(2);
225  for (size_t i=0; i<m; ++i) {
226  for (size_t j=0; j<p; ++j) {
227  value_type t = (j == p-1 ? n : 2*n);
228  if (std::abs(h_c(i,j)- t) > tol) {
229  std::cout << "Comparison failed! " << i << "," << j << " : "
230  << h_c(i,j) << " , " << t << std::endl;
231  }
232  }
233  }
234 }
235 
236 template <typename ... ViewArgs>
237 Perf
238 do_time_val(const size_t m, const size_t n, const size_t nloop,
239  const bool check)
240 {
241  typedef Kokkos::View<double**, ViewArgs...> ViewTypeA;
242  typedef Kokkos::View<double*, ViewArgs...> ViewTypeB;
243  typedef Kokkos::View<double*, ViewArgs...> ViewTypeC;
244  typedef typename ViewTypeA::execution_space execution_space;
245 
246  ViewTypeA A("A",m,n);
247  ViewTypeB b("B",n);
248  ViewTypeC c("c",m);
249 
250  Kokkos::deep_copy(A, 1.0);
251  Kokkos::deep_copy(b, 1.0);
252 
253  Kokkos::Timer wall_clock;
254  Perf perf;
255 
256  // Execute the kernel once to warm up
257  run_mat_vec( A, b, c );
258  execution_space().fence();
259 
260  wall_clock.reset();
261  for (size_t l=0; l<nloop; l++) {
262  run_mat_vec( A, b, c );
263  }
264  execution_space().fence();
265 
266  perf.time = wall_clock.seconds() / nloop;
267  perf.flops = m*n*2;
268  perf.throughput = perf.flops / perf.time / 1.0e9;
269 
270  if (check)
271  check_val(A,b,c);
272 
273  return perf;
274 }
275 
276 template <typename FadType, typename ... ViewArgs>
277 Perf
278 do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop,
279  const bool check)
280 {
281  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
282  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
283  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
284  typedef typename ViewTypeA::execution_space execution_space;
285 
286  // Set amount of memory available for dynamic memory allocation on GPU
287 #ifdef KOKKOS_ENABLE_CUDA
289  std::is_same<FadType,Sacado::Fad::DFad<double> >::value) {
290  const size_t concurrency = execution_space().concurrency();
291  const size_t mem = std::min(m,concurrency) * p * sizeof(double);
292  //std::cout << "mem = " << mem / (1024*1024) << " MB" << std::endl;
293  cudaDeviceSetLimit(cudaLimitMallocHeapSize, mem);
294  }
295 #endif
296 
297 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
298  ViewTypeA A("A",m,n,p+1);
299  ViewTypeB b("B",n,p+1);
300  ViewTypeC c("c",m,p+1);
301 #else
302  ViewTypeA A("A",m,n);
303  ViewTypeB b("B",n);
304  ViewTypeC c("c",m);
305 #endif
306 
307  // FadType a(p, 1.0);
308  // for (size_t k=0; k<p; ++k)
309  // a.fastAccessDx(k) = 1.0;
310  Kokkos::deep_copy(typename ViewTypeA::array_type(A), 1.0);
311  Kokkos::deep_copy(typename ViewTypeB::array_type(b), 1.0);
312 
313  Kokkos::Timer wall_clock;
314  Perf perf;
315 
316  // Execute the kernel once to warm up
317  run_mat_vec( A, b, c );
318  execution_space().fence();
319 
320  wall_clock.reset();
321  for (size_t l=0; l<nloop; l++) {
322  run_mat_vec( A, b, c );
323  }
324  execution_space().fence();
325 
326  perf.time = wall_clock.seconds() / nloop;
327  perf.flops = m*n*(2+4*p);
328  perf.throughput = perf.flops / perf.time / 1.0e9;
329 
330 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
331  if (check) {
332  typename ViewTypeA::array_type A_flat = A;
333  typename ViewTypeB::array_type b_flat = b;
334  typename ViewTypeC::array_type c_flat = c;
335  check_deriv(A_flat, b_flat, c_flat);
336  }
337 #endif
338 
339  return perf;
340 }
341 
342 template <typename FadType, typename ... ViewArgs>
343 Perf
344 do_time_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop,
345  const bool check)
346 {
347  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
348  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
349  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
350  typedef typename ViewTypeA::execution_space execution_space;
351 
352 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
353  ViewTypeA A("A",m,n,p+1);
354  ViewTypeB b("B",n,p+1);
355  ViewTypeC c("c",m,p+1);
356 #else
357  ViewTypeA A("A",m,n);
358  ViewTypeB b("B",n);
359  ViewTypeC c("c",m);
360 #endif
361 
362  // FadType a(p, 1.0);
363  // for (size_t k=0; k<p; ++k)
364  // a.fastAccessDx(k) = 1.0;
365  Kokkos::deep_copy(typename ViewTypeA::array_type(A), 1.0);
366  Kokkos::deep_copy(typename ViewTypeB::array_type(b), 1.0);
367 
368  Kokkos::Timer wall_clock;
369  Perf perf;
370 
371  // Execute the kernel once to warm up
372  run_mat_vec_scratch( A, b, c );
373  execution_space().fence();
374 
375  wall_clock.reset();
376  for (size_t l=0; l<nloop; l++) {
377  run_mat_vec_scratch( A, b, c );
378  }
379  execution_space().fence();
380 
381  perf.time = wall_clock.seconds() / nloop;
382  perf.flops = m*n*(2+4*p);
383  perf.throughput = perf.flops / perf.time / 1.0e9;
384 
385 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
386  if (check) {
387  typename ViewTypeA::array_type A_flat = A;
388  typename ViewTypeB::array_type b_flat = b;
389  typename ViewTypeC::array_type c_flat = c;
390  check_deriv(A_flat, b_flat, c_flat);
391  }
392 #endif
393 
394  return perf;
395 }
396 
397 template <typename ... ViewArgs>
398 Perf
399 do_time_analytic(const size_t m, const size_t n, const size_t p,
400  const size_t nloop, const bool check)
401 {
402  typedef Kokkos::View<double***, ViewArgs...> ViewTypeA;
403  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
404  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
405  typedef typename ViewTypeA::execution_space execution_space;
406 
407  ViewTypeA A("A",m,n,p+1);
408  ViewTypeB b("B",n,p+1);
409  ViewTypeC c("c",m,p+1);
410 
411  Kokkos::deep_copy(A, 1.0);
412  Kokkos::deep_copy(b, 1.0);
413 
414  Kokkos::Timer wall_clock;
415  Perf perf;
416 
417  // Execute the kernel once to warm up
418  run_mat_vec_deriv( A, b, c );
419  execution_space().fence();
420 
421  for (size_t l=0; l<nloop; l++) {
422  run_mat_vec_deriv( A, b, c );
423  }
424  execution_space().fence();
425 
426  perf.time = wall_clock.seconds() / nloop;
427  perf.flops = m*n*(2+4*p);
428  perf.throughput = perf.flops / perf.time / 1.0e9;
429 
430  if (check)
431  check_deriv(A,b,c);
432 
433  return perf;
434 }
435 
436 template <int MaxP, typename ... ViewArgs>
437 Perf
438 do_time_analytic_sl(const size_t m, const size_t n, const size_t p,
439  const size_t nloop, const bool check)
440 {
441  typedef Kokkos::View<double***, ViewArgs...> ViewTypeA;
442  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
443  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
444  typedef typename ViewTypeA::execution_space execution_space;
445 
446  ViewTypeA A("A",m,n,p+1);
447  ViewTypeB b("B",n,p+1);
448  ViewTypeC c("c",m,p+1);
449 
450  Kokkos::deep_copy(A, 1.0);
451  Kokkos::deep_copy(b, 1.0);
452 
453  Kokkos::Timer wall_clock;
454  Perf perf;
455 
456  // Execute the kernel once to warm up
457  run_mat_vec_deriv_sl<MaxP>( A, b, c );
458  execution_space().fence();
459 
460  for (size_t l=0; l<nloop; l++) {
461  run_mat_vec_deriv_sl<MaxP>( A, b, c );
462  }
463  execution_space().fence();
464 
465  perf.time = wall_clock.seconds() / nloop;
466  perf.flops = m*n*(2+4*p);
467  perf.throughput = perf.flops / perf.time / 1.0e9;
468 
469  if (check)
470  check_deriv(A,b,c);
471 
472  return perf;
473 }
474 
475 template <int p, typename ... ViewArgs>
476 Perf
477 do_time_analytic_s(const size_t m, const size_t n,
478  const size_t nloop, const bool check)
479 {
480  typedef Kokkos::View<double**[p+1], ViewArgs...> ViewTypeA;
481  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
482  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
483  typedef typename ViewTypeA::execution_space execution_space;
484 
485  ViewTypeA A("A",m,n);
486  ViewTypeB b("B",n,p+1);
487  ViewTypeC c("c",m,p+1);
488 
489  Kokkos::deep_copy(A, 1.0);
490  Kokkos::deep_copy(b, 1.0);
491 
492  Kokkos::Timer wall_clock;
493  Perf perf;
494 
495  // Execute the kernel once to warm up
496  run_mat_vec_deriv_s<p>( A, b, c );
497  execution_space().fence();
498 
499  for (size_t l=0; l<nloop; l++) {
500  run_mat_vec_deriv_s<p>( A, b, c );
501  }
502  execution_space().fence();
503 
504  perf.time = wall_clock.seconds() / nloop;
505  perf.flops = m*n*(2+4*p);
506  perf.throughput = perf.flops / perf.time / 1.0e9;
507 
508  if (check)
509  check_deriv(A,b,c);
510 
511  return perf;
512 }
513 
517 
518 #define INST_FUNC_VAL_DEV(DEV) \
519  template Perf do_time_val< Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
520  template Perf do_time_val< Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
521  template Perf do_time_val< DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
522  template Perf do_time_analytic< Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
523  template Perf do_time_analytic< Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
524  template Perf do_time_analytic< DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
525  template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
526  template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
527  template Perf do_time_analytic_sl< SLFadSize, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
528  template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); \
529  template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); \
530  template Perf do_time_analytic_s< SFadSize, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check);
531 
532 #define INST_FUNC_FAD_DEV(FAD,DEV) \
533  template Perf do_time_fad< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
534  template Perf do_time_fad< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
535  template Perf do_time_fad< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
536  template Perf do_time_scratch< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
537  template Perf do_time_scratch< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
538  template Perf do_time_scratch< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check );
539 
540 #define INST_FUNC_DEV(DEV) \
541  INST_FUNC_VAL_DEV( DEV ) \
542  INST_FUNC_FAD_DEV( SFad_type, DEV ) \
543  INST_FUNC_FAD_DEV( SLFad_type, DEV ) \
544  INST_FUNC_FAD_DEV( DFad_type, DEV )
545 
546 #ifdef KOKKOS_ENABLE_SERIAL
547 INST_FUNC_DEV(Kokkos::Serial)
548 #endif
549 
550 #ifdef KOKKOS_ENABLE_OPENMP
551 INST_FUNC_DEV(Kokkos::OpenMP)
552 #endif
553 
554 #ifdef KOKKOS_ENABLE_THREADS
555 INST_FUNC_DEV(Kokkos::Threads)
556 #endif
557 
558 #ifdef KOKKOS_ENABLE_CUDA
559 INST_FUNC_DEV(Kokkos::Cuda)
560 #endif
561 
562 #ifdef KOKKOS_ENABLE_HIP
563 INST_FUNC_DEV(Kokkos::HIP)
564 #endif
Sacado::Fad::DFad< double > DFad_type
Definition: mat_vec.cpp:516
const char * p
double do_time_analytic(int nderiv, int nloop)
Definition: fad_expr.cpp:94
abs(expr.val())
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
double flops
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void run_mat_vec_scratch(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:58
double time
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
Definition: mat_vec.cpp:477
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Definition: mat_vec.cpp:438
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
#define A
Definition: Sacado_rad.hpp:572
SimpleFad< ValueT > min(const SimpleFad< ValueT > &a, const SimpleFad< ValueT > &b)
Sacado::Fad::SFad< double, SFadSize > SFad_type
Definition: mat_vec.cpp:514
Perf do_time_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Definition: mat_vec.cpp:344
#define INST_FUNC_DEV(DEV)
Definition: mat_vec.cpp:540
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
const int N
void run_mat_vec_deriv_sl(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:123
void run_mat_vec_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:98
int value
double throughput
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
const double tol
void run_mat_vec_deriv_s(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: mat_vec.cpp:157
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Sacado::Fad::SLFad< double, SLFadSize > SLFad_type
Definition: mat_vec.cpp:515
int n