Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
fad_kokkos_mat_vec_perf.cpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Sacado Package
4 //
5 // Copyright 2006 NTESS and the Sacado contributors.
6 // SPDX-License-Identifier: LGPL-2.1-or-later
7 // *****************************************************************************
8 // @HEADER
9 
10 #define SACADO_VIEW_CUDA_HIERARCHICAL 1
11 //#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
12 //#define SACADO_KOKKOS_USE_MEMORY_POOL 1
13 #define SACADO_ALIGN_SFAD 1
14 
15 //#define SACADO_DISABLE_FAD_VIEW_SPEC
#include "Sacado.hpp"

#include "Teuchos_CommandLineProcessor.hpp"
#include "Teuchos_StandardCatchMacros.hpp"
#include "Teuchos_Time.hpp"

#include "Kokkos_Timer.hpp"

// For vtune
#include <sys/types.h>
#include <unistd.h>

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <type_traits>

28 
29 // A performance test that computes the derivative of a simple Kokkos kernel
30 // using various Fad classes
31 
32 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
33 void run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {
34  typedef typename ViewTypeC::value_type scalar_type;
35  typedef typename ViewTypeC::execution_space execution_space;
36 
37  const int m = A.extent(0);
38  const int n = A.extent(1);
39  Kokkos::parallel_for(
40  Kokkos::RangePolicy<execution_space>( 0,m ),
41  KOKKOS_LAMBDA (const int i) {
42  scalar_type t = 0.0;
43  for (int j=0; j<n; ++j)
44  t += A(i,j)*b(j);
45  c(i) = t;
46  }
47  );
48 }
49 
50 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
51 
52 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
53 void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
54  const ViewTypeC& c) {
55  typedef typename ViewTypeC::value_type scalar_type;
56  typedef typename ViewTypeC::execution_space execution_space;
57 
58 #if defined (KOKKOS_ENABLE_CUDA)
60  const unsigned vector_size = is_cuda ? 32 : 1;
61  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
62 #elif defined (KOKKOS_ENABLE_HIP)
64  const unsigned vector_size = is_hip ? 64 : 1;
65  const unsigned team_size = is_hip ? 128 / vector_size : 1;
66 #else
67  const unsigned vector_size = 1;
68  const unsigned team_size = 1;
69 #endif
70 
71  const int m = A.extent(0);
72  const int n = A.extent(1);
73  const int range = (m+team_size-1)/team_size;
74 
75  typedef Kokkos::TeamPolicy<execution_space> Policy;
76  Kokkos::parallel_for(
77  Policy( range,team_size,vector_size ),
78  KOKKOS_LAMBDA (const typename Policy::member_type& team) {
79  const int i = team.league_rank()*team.team_size() + team.team_rank();
80  if (i >= m)
81  return;
82 
83  scalar_type t = 0.0;
84  for (int j=0; j<n; ++j)
85  t += A(i,j)*b(j);
86  c(i) = t;
87  }
88  );
89 }
90 
91 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL)
92 
93 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
94 void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
95  const ViewTypeC& c) {
96  typedef typename Kokkos::ThreadLocalScalarType<ViewTypeC>::type scalar_type;
97  typedef typename ViewTypeC::execution_space execution_space;
98 
99 #if defined (KOKKOS_ENABLE_CUDA)
101  const unsigned vector_size = is_cuda ? 32 : 1;
102  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
103 #elif defined (KOKKOS_ENABLE_HIP)
105  const unsigned vector_size = is_hip ? 64 : 1;
106  const unsigned team_size = is_hip ? 128 / vector_size : 1;
107 #else
108  const unsigned vector_size = 1;
109  const unsigned team_size = 1;
110 #endif
111 
112  const int m = A.extent(0);
113  const int n = A.extent(1);
114  const int range = (m+team_size-1)/team_size;
115 
116  typedef Kokkos::TeamPolicy<execution_space> Policy;
117  Kokkos::parallel_for(
118  Policy( range,team_size,vector_size ),
119  KOKKOS_LAMBDA (const typename Policy::member_type& team) {
120  const int i = team.league_rank()*team.team_size() + team.team_rank();
121  if (i >= m)
122  return;
123 
124  scalar_type t = 0.0;
125  for (int j=0; j<n; ++j)
126  t += A(i,j)*b(j);
127  c(i) = t;
128  }
129  );
130 }
131 
132 #else
133 
134 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
135 void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
136  const ViewTypeC& c) {
137  typedef typename ViewTypeC::value_type scalar_type;
138  typedef typename ViewTypeC::execution_space execution_space;
139 
140 #if defined (KOKKOS_ENABLE_CUDA)
142 #else
143  const bool is_cuda = false;
144 #endif
145  const unsigned vector_size = 1;
146  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
147 
148  const int m = A.extent(0);
149  const int n = A.extent(1);
150  const int range = (m+team_size-1)/team_size;
151 
152  typedef Kokkos::TeamPolicy<execution_space> Policy;
153  Kokkos::parallel_for(
154  Policy( range,team_size,vector_size ),
155  KOKKOS_LAMBDA (const typename Policy::member_type& team) {
156  const int i = team.league_rank()*team.team_size() + team.team_rank();
157  if (i >= m)
158  return;
159 
160  scalar_type t = 0.0;
161  for (int j=0; j<n; ++j)
162  t += A(i,j)*b(j);
163  c(i) = t;
164  }
165  );
166 }
167 
168 #endif
169 
170 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
171 void
172 check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
173 {
174  const double tol = 1.0e-14;
175  typedef typename ViewTypeC::value_type value_type;
176  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
177  Kokkos::deep_copy(h_c, c);
178  const size_t m = A.extent(0);
179  const size_t n = A.extent(1);
180  for (size_t i=0; i<m; ++i) {
181  value_type t = n;
182  if (std::abs(h_c(i)- t) > tol) {
183  std::cout << "Comparison failed! " << i << " : " << h_c(i) << " , " << t
184  << std::endl;
185  }
186  }
187 }
188 
189 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
190 void
191 check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
192 {
193  const double tol = 1.0e-14;
194  typedef typename ViewTypeC::value_type value_type;
195  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
196  Kokkos::deep_copy(h_c, c);
197  const size_t m = A.extent(0);
198  const size_t n = A.extent(1);
199  const size_t p = Kokkos::dimension_scalar(A);
200  for (size_t i=0; i<m; ++i) {
201  for (size_t j=0; j<p; ++j) {
202  value_type t = (j == p-1 ? n : 2*n);
203  if (std::abs(h_c(i).fastAccessDx(j)- t) > tol) {
204  std::cout << "Comparison failed! " << i << "," << j << " : "
205  << h_c(i).fastAccessDx(j) << " , " << t << std::endl;
206  }
207  }
208  }
209 }
210 
// Timing summary for one benchmark configuration.
//
// Members are zero-initialized so that a default-constructed Perf (as used
// for the baseline in do_times) never carries indeterminate values.
struct Perf {
  double time = 0.0;        // average wall-clock seconds per kernel launch
  double flops = 0.0;       // floating-point operation count per launch
  double throughput = 0.0;  // flops / time / 1e9
};
216 
217 template <typename FadType, typename ... ViewArgs>
218 Perf
219 do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop,
220  const bool check)
221 {
222  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
223  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
224  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
225  typedef typename ViewTypeA::execution_space execution_space;
226 
227  ViewTypeA A("A",m,n,p+1);
228  ViewTypeB b("B",n,p+1);
229  ViewTypeC c("c",m,p+1);
230 
231  FadType a(p, 1.0);
232  for (size_t k=0; k<p; ++k)
233  a.fastAccessDx(k) = 1.0;
234  Kokkos::deep_copy(A, a);
235  Kokkos::deep_copy(b, a);
236 
237  Kokkos::Timer wall_clock;
238  Perf perf;
239 
240  // Execute the kernel once to warm up
241  run_mat_vec( A, b, c );
242  execution_space().fence();
243 
244  wall_clock.reset();
245  for (size_t l=0; l<nloop; l++) {
246  run_mat_vec( A, b, c );
247  }
248  execution_space().fence();
249 
250  perf.time = wall_clock.seconds() / nloop;
251  perf.flops = m*n*(2+4*p);
252  perf.throughput = perf.flops / perf.time / 1.0e9;
253 
254  if (check) {
255  check_deriv(A, b, c);
256  }
257 
258  return perf;
259 }
260 
261 template <typename FadType, typename ... ViewArgs>
262 Perf
263 do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p,
264  const size_t nloop, const bool check)
265 {
266  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
267  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
268  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
269  typedef typename ViewTypeA::execution_space execution_space;
270 
271 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL)
272 #if defined (KOKKOS_ENABLE_CUDA)
274  const int FadStride = is_cuda ? 32 : 1;
275 #elif defined(KOKKOS_ENABLE_HIP)
277  const int FadStride = is_hip ? 64 : 1;
278 #else
279  const int FadStride 1;
280 #endif
281 #if defined(SACADO_ALIGN_SFAD)
283  const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
284  const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride : p;
285  typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
286 #else
287  typedef FadType AlignedFadType;
288  const size_t pa = p;
289 #endif
290 #else
291  const int FadStride = 1;
292  typedef FadType AlignedFadType;
293  const size_t pa = p;
294 #endif
295 
296 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL) || defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
300 #else
301  typedef typename ViewTypeA::array_layout ConLayoutA;
302  typedef typename ViewTypeB::array_layout ConLayoutB;
303  typedef typename ViewTypeC::array_layout ConLayoutC;
304  (void) FadStride;
305 #endif
306 
307 
308  typedef Kokkos::View<AlignedFadType**, ConLayoutA, execution_space> ConViewTypeA;
309  typedef Kokkos::View<AlignedFadType*, ConLayoutB, execution_space> ConViewTypeB;
310  typedef Kokkos::View<AlignedFadType*, ConLayoutC, execution_space> ConViewTypeC;
311 
312  ConViewTypeA A("A",m,n,pa+1);
313  ConViewTypeB b("B",n,pa+1);
314  ConViewTypeC c("c",m,pa+1);
315 
316  AlignedFadType a(pa, 1.0);
317  for (size_t k=0; k<pa; ++k)
318  a.fastAccessDx(k) = 1.0;
319  Kokkos::deep_copy(A, a);
320  Kokkos::deep_copy(b, a);
321 
322  Kokkos::Timer wall_clock;
323  Perf perf;
324 
325 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
326  const size_t concurrency = execution_space().concurrency();
327 
328 #if defined (KOKKOS_ENABLE_CUDA)
329  const size_t warp_dim = is_cuda ? 32 : 1;
330 #elif defined (KOKKOS_ENABLE_HIP)
331  const size_t warp_dim = is_hip ? 64 : 1;
332 #else
333  const size_t warp_dim = 1;
334 #endif
335 
336  const size_t block_size = pa*sizeof(double);
337  const size_t nkernels = concurrency / warp_dim;
338  const size_t mem_pool_size =
339  static_cast<size_t>(1.2*nkernels*block_size);
340  const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
341  execution_space space;
342  Sacado::createGlobalMemoryPool(space, mem_pool_size,
343  block_size,
344  block_size,
345  superblock_size
346  );
347 #endif
348 
349  // Execute the kernel once to warm up
350  run_mat_vec_hierarchical( A, b, c );
351  execution_space().fence();
352 
353  wall_clock.reset();
354  for (size_t l=0; l<nloop; l++) {
355  run_mat_vec_hierarchical( A, b, c );
356  }
357  execution_space().fence();
358 
359  perf.time = wall_clock.seconds() / nloop;
360  perf.flops = m*n*(2+4*p);
361  perf.throughput = perf.flops / perf.time / 1.0e9;
362 
363  if (check) {
364  check_deriv(A, b, c);
365  }
366 
367 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
369 #endif
370 
371  return perf;
372 }
373 
374 template <typename ... ViewArgs>
375 Perf
376 do_time_val(const size_t m, const size_t n, const size_t nloop,
377  const bool check)
378 {
379  typedef Kokkos::View<double**, ViewArgs...> ViewTypeA;
380  typedef Kokkos::View<double*, ViewArgs...> ViewTypeB;
381  typedef Kokkos::View<double*, ViewArgs...> ViewTypeC;
382  typedef typename ViewTypeA::execution_space execution_space;
383 
384  ViewTypeA A("A",m,n);
385  ViewTypeB b("B",n);
386  ViewTypeC c("c",m);
387 
388  Kokkos::deep_copy(A, 1.0);
389  Kokkos::deep_copy(b, 1.0);
390 
391  Kokkos::Timer wall_clock;
392  Perf perf;
393 
394  // Execute the kernel once to warm up
395  run_mat_vec( A, b, c );
396  execution_space().fence();
397 
398  wall_clock.reset();
399  for (size_t l=0; l<nloop; l++) {
400  run_mat_vec( A, b, c );
401  }
402  execution_space().fence();
403 
404  perf.time = wall_clock.seconds() / nloop;
405  perf.flops = m*n*2;
406  perf.throughput = perf.flops / perf.time / 1.0e9;
407 
408  if (check)
409  check_val(A,b,c);
410 
411  return perf;
412 }
413 
414 void
415 print_perf(const Perf& perf, const Perf& perf_base, const size_t p,
416  const std::string& name)
417 {
418  std::cout << name << "\t "
419  << perf.time << "\t "
420  << perf.throughput << "\t "
421  << perf.time / (perf_base.time*p)
422  << std::endl;
423 }
424 
425 template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
426  typename ... ViewArgs>
427 void
428 do_times(const size_t m,
429  const size_t n,
430  const size_t p,
431  const size_t ph,
432  const size_t nloop,
433  const bool value,
434  const bool sfad,
435  const bool slfad,
436  const bool dfad,
437  const bool hierarchical,
438  const bool check)
439 {
440  Perf perf_value;
441  perf_value.time = 1.0;
442 
443  // Run value
444  if (value) {
445  Perf perf = do_time_val<ViewArgs...>(m,n,nloop,check);
446  perf_value = perf;
447  print_perf(perf, perf_value, p, "Value ");
448  }
449 
450  // Run SFad
451  if (sfad && p == SFadSize) {
452  Perf perf =
453  do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,check);
454  print_perf(perf, perf_value, p, "SFad ");
455  }
456 
457  // Run SLFad
458  if (slfad && p <= SLFadSize) {
459  Perf perf =
460  do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,check);
461  print_perf(perf, perf_value, p, "SLFad ");
462  }
463 
464  // Run DFad
465  if (dfad) {
466  Perf perf =
467  do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
468  print_perf(perf, perf_value, p, "DFad ");
469  }
470 
471  // Run hierarchical
472  if (hierarchical) {
473  if (sfad && ph == HierSFadSize) {
474  Perf perf =
475  do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,ph,nloop,check);
476  print_perf(perf, perf_value, ph, "Hier SFad ");
477  }
478  if (slfad && ph <= HierSLFadSize) {
479  Perf perf =
480  do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,ph,nloop,check);
481  print_perf(perf, perf_value, ph, "Hier SLFad");
482  }
483  if (dfad) {
484  Perf perf =
485  do_time_fad_hierarchical<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,ph,nloop,check);
486  print_perf(perf, perf_value, ph, "Hier DFad ");
487  }
488  }
489 
490 }
491 
// View layouts selectable from the command line.  The enumerator values
// double as indices into layout_values / layout_names, so all three must
// stay in the same order.
enum LayoutType {
  LAYOUT_LEFT=0,
  LAYOUT_RIGHT,
  LAYOUT_DEFAULT
};
const int num_layout_types = 3;
const LayoutType layout_values[] = {
  LAYOUT_LEFT, LAYOUT_RIGHT, LAYOUT_DEFAULT };
const char *layout_names[] = { "left", "right", "default" };

// Keep the tables and the advertised count in sync.
static_assert(sizeof(layout_values)/sizeof(layout_values[0]) ==
              static_cast<unsigned>(num_layout_types),
              "layout_values must have num_layout_types entries");
static_assert(sizeof(layout_names)/sizeof(layout_names[0]) ==
              static_cast<unsigned>(num_layout_types),
              "layout_names must have num_layout_types entries");
501 
502 template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
503  typename Device>
504 void
505 do_times_layout(const size_t m,
506  const size_t n,
507  const size_t p,
508  const size_t ph,
509  const size_t nloop,
510  const bool value,
511  const bool sfad,
512  const bool slfad,
513  const bool dfad,
514  const bool hierarchical,
515  const bool check,
516  const LayoutType& layout,
517  const std::string& device)
518 {
519  int prec = 2;
520  std::cout.setf(std::ios::scientific);
521  std::cout.precision(prec);
522  std::cout << std::endl
523  << device
524  << " performance for layout "
525  << layout_names[layout]
526  << " m = " << m << " n = " << n << " p = " << p << " ph = " << ph
527  << std::endl << std::endl;
528  std::cout << "Computation \t Time \t Throughput \t Ratio" << std::endl;
529 
530  if (layout == LAYOUT_LEFT)
531  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
532  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
533  else if (layout == LAYOUT_RIGHT)
534  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
535  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
536  else
537  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
538  (m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
539 }
540 
// Connect executable to vtune for profiling.
//
// Builds an `amplxe-cl -collect hotspots` command targeting this process'
// pid with results written to ./vtune, echoes it on std::cout, starts it in
// the background through the shell, and then runs `sleep 10` (presumably to
// give the collector time to attach before the kernels start).  A failing
// shell command is reported on std::cerr; the benchmark continues either way.
void connect_vtune() {
  std::stringstream cmd;
  pid_t my_os_pid=getpid();
  const std::string vtune_loc =
    "amplxe-cl";
  const std::string output_dir = "./vtune";
  cmd << vtune_loc
      << " -collect hotspots -result-dir " << output_dir
      << " -target-pid " << my_os_pid << " &";
  std::cout << cmd.str() << std::endl;
  if (std::system(cmd.str().c_str()) != 0) {
    std::cerr << "Warning: could not launch vtune command" << std::endl;
  }
  if (std::system("sleep 10") != 0) {
    std::cerr << "Warning: wait for vtune attach was interrupted" << std::endl;
  }
}
555 
// Compile-time Fad sizes used to instantiate the benchmarks.  The SLFad
// bounds track the corresponding SFad sizes.
//constexpr int SFadSize = 8;
constexpr int SFadSize = 32;
constexpr int SLFadSize = SFadSize;
//constexpr int HierSFadSize = 50;
constexpr int HierSFadSize = 32;
constexpr int HierSLFadSize = HierSFadSize;
562 
563 int main(int argc, char* argv[]) {
564  bool success = true;
565  try {
566 
567  // Set up command line options
569  clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
570  int m = 100000;
571  clp.setOption("m", &m, "Number of matrix rows");
572  int n = 100;
573  clp.setOption("n", &n, "Number of matrix columns");
574  int p = SFadSize;
575  clp.setOption("p", &p, "Number of derivative components");
576  int ph = HierSFadSize;
577  clp.setOption("ph", &ph, "Number of derivative components for hierarchical");
578  int nloop = 10;
579  clp.setOption("nloop", &nloop, "Number of loops");
580 #ifdef KOKKOS_ENABLE_SERIAL
581  bool serial = 0;
582  clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
583 #endif
584 #ifdef KOKKOS_ENABLE_OPENMP
585  int openmp = 0;
586  clp.setOption("openmp", &openmp, "Number of OpenMP threads");
587 #endif
588 #ifdef KOKKOS_ENABLE_THREADS
589  int threads = 0;
590  clp.setOption("threads", &threads, "Number of pThreads threads");
591 #endif
592 #ifdef KOKKOS_ENABLE_CUDA
593  bool cuda = 0;
594  clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
595 #endif
596 #ifdef KOKKOS_ENABLE_HIP
597  bool hip = 0;
598  clp.setOption("hip", "no-hip", &cuda, "Whether to run HIP");
599 #endif
600  int numa = 0;
601  clp.setOption("numa", &numa,
602  "Number of NUMA domains to use (set to 0 to use all NUMAs");
603  int cores_per_numa = 0;
604  clp.setOption("cores-per-numa", &cores_per_numa,
605  "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
606  bool print_config = false;
607  clp.setOption("print-config", "no-print-config", &print_config,
608  "Whether to print Kokkos device configuration");
609  LayoutType layout = LAYOUT_DEFAULT;
610  clp.setOption("layout", &layout, num_layout_types, layout_values,
611  layout_names, "View layout");
612  bool vtune = false;
613  clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune");
614  bool value = true;
615  clp.setOption("value", "no-value", &value, "Run value calculation");
616  bool sfad = true;
617  clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation");
618  bool slfad = true;
619  clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation");
620  bool dfad = true;
621  clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation");
622  bool hierarchical = true;
623  clp.setOption("hierarchical", "no-hierarchical", &hierarchical, "Run hierarchical Fad derivative calculation");
624  bool check = false;
625  clp.setOption("check", "no-check", &check, "Check calculations are correct");
626 
627  // Parse options
628  switch (clp.parse(argc, argv)) {
630  return 0;
633  return 1;
635  break;
636  }
637 
638  if (vtune)
639  connect_vtune();
640 
641  Kokkos::InitializationSettings init_args;
642  init_args.set_num_threads(cores_per_numa);
643 
644  Kokkos::initialize(init_args);
645 
646  if (print_config)
647  Kokkos::print_configuration(std::cout, true);
648 
649 #ifdef KOKKOS_ENABLE_SERIAL
650  if (serial) {
651  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
652  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Serial");
653  }
654 #endif
655 
656 #ifdef KOKKOS_ENABLE_OPENMP
657  if (openmp) {
658  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
659  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"OpenMP");
660  }
661 #endif
662 
663 #ifdef KOKKOS_ENABLE_THREADS
664  if (threads) {
665  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
666  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Threads");
667  }
668 #endif
669 
670 #ifdef KOKKOS_ENABLE_CUDA
671  if (cuda) {
672  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
673  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Cuda");
674  }
675 #endif
676 
677 #ifdef KOKKOS_ENABLE_HIP
678  if (hip) {
679  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::HIP>(
680  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"HIP");
681  }
682 #endif
683 
684  Kokkos::finalize();
685 
686  }
687  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
688 
689  return !success;
690 }
const char * p
const int SLFadSize
abs(expr.val())
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void do_times(const T x[], int nloop, Teuchos::Array< double > &times)
double flops
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
void run_mat_vec_hierarchical(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
double time
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
Base template specification for static size.
void connect_vtune()
const char * layout_names[]
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
#define A
Definition: Sacado_rad.hpp:552
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool value, const bool analytic, const bool sfad, const bool slfad, const bool dfad, const bool flat, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
Perf do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
int main()
Definition: ad_example.cpp:171
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
const int num_layout_types
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
const int N
const LayoutType layout_values[]
void
Definition: uninit.c:105
int value
double throughput
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
void setDocString(const char doc_string[])
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
const double tol
LayoutType
const int HierSLFadSize
const int HierSFadSize
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
const int SFadSize
void destroyGlobalMemoryPool(const ExecSpace &space)
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)
int n