Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
fad_kokkos_mat_vec_perf.cpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Sacado Package
5 // Copyright (2006) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // This library is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU Lesser General Public License as
12 // published by the Free Software Foundation; either version 2.1 of the
13 // License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 // USA
24 // Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
25 // (etphipp@sandia.gov).
26 //
27 // ***********************************************************************
28 // @HEADER
29 
30 #define SACADO_VIEW_CUDA_HIERARCHICAL 1
31 //#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
32 //#define SACADO_KOKKOS_USE_MEMORY_POOL 1
33 #define SACADO_ALIGN_SFAD 1
34 
35 //#define SACADO_DISABLE_FAD_VIEW_SPEC
#include "Sacado.hpp"

#include "Teuchos_CommandLineProcessor.hpp"
#include "Teuchos_StandardCatchMacros.hpp"
#include "Teuchos_Time.hpp"

#include "Kokkos_Timer.hpp"

// For vtune
#include <sys/types.h>
#include <unistd.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <type_traits>
48 
49 // A performance test that computes the derivative of a simple Kokkos kernel
50 // using various Fad classes
51 
52 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
53 void run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {
54  typedef typename ViewTypeC::value_type scalar_type;
55  typedef typename ViewTypeC::execution_space execution_space;
56 
57  const int m = A.extent(0);
58  const int n = A.extent(1);
59  Kokkos::parallel_for(
60  Kokkos::RangePolicy<execution_space>( 0,m ),
61  KOKKOS_LAMBDA (const int i) {
62  scalar_type t = 0.0;
63  for (int j=0; j<n; ++j)
64  t += A(i,j)*b(j);
65  c(i) = t;
66  }
67  );
68 }
69 
70 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
71 
72 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
73 void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
74  const ViewTypeC& c) {
75  typedef typename ViewTypeC::value_type scalar_type;
76  typedef typename ViewTypeC::execution_space execution_space;
77 
78 #if defined (KOKKOS_ENABLE_CUDA)
80  const unsigned vector_size = is_cuda ? 32 : 1;
81  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
82 #elif defined (KOKKOS_ENABLE_HIP)
84  const unsigned vector_size = is_hip ? 64 : 1;
85  const unsigned team_size = is_hip ? 128 / vector_size : 1;
86 #else
87  const unsigned vector_size = 1;
88  const unsigned team_size = 1;
89 #endif
90 
91  const int m = A.extent(0);
92  const int n = A.extent(1);
93  const int range = (m+team_size-1)/team_size;
94 
95  typedef Kokkos::TeamPolicy<execution_space> Policy;
96  Kokkos::parallel_for(
97  Policy( range,team_size,vector_size ),
98  KOKKOS_LAMBDA (const typename Policy::member_type& team) {
99  const int i = team.league_rank()*team.team_size() + team.team_rank();
100  if (i >= m)
101  return;
102 
103  scalar_type t = 0.0;
104  for (int j=0; j<n; ++j)
105  t += A(i,j)*b(j);
106  c(i) = t;
107  }
108  );
109 }
110 
111 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL)
112 
113 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
114 void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
115  const ViewTypeC& c) {
116  typedef typename Kokkos::ThreadLocalScalarType<ViewTypeC>::type scalar_type;
117  typedef typename ViewTypeC::execution_space execution_space;
118 
119 #if defined (KOKKOS_ENABLE_CUDA)
121  const unsigned vector_size = is_cuda ? 32 : 1;
122  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
123 #elif defined (KOKKOS_ENABLE_HIP)
125  const unsigned vector_size = is_hip ? 64 : 1;
126  const unsigned team_size = is_hip ? 128 / vector_size : 1;
127 #else
128  const unsigned vector_size = 1;
129  const unsigned team_size = 1;
130 #endif
131 
132  const int m = A.extent(0);
133  const int n = A.extent(1);
134  const int range = (m+team_size-1)/team_size;
135 
136  typedef Kokkos::TeamPolicy<execution_space> Policy;
137  Kokkos::parallel_for(
138  Policy( range,team_size,vector_size ),
139  KOKKOS_LAMBDA (const typename Policy::member_type& team) {
140  const int i = team.league_rank()*team.team_size() + team.team_rank();
141  if (i >= m)
142  return;
143 
144  scalar_type t = 0.0;
145  for (int j=0; j<n; ++j)
146  t += A(i,j)*b(j);
147  c(i) = t;
148  }
149  );
150 }
151 
152 #else
153 
154 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
155 void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
156  const ViewTypeC& c) {
157  typedef typename ViewTypeC::value_type scalar_type;
158  typedef typename ViewTypeC::execution_space execution_space;
159 
160 #if defined (KOKKOS_ENABLE_CUDA)
162 #else
163  const bool is_cuda = false;
164 #endif
165  const unsigned vector_size = 1;
166  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
167 
168  const int m = A.extent(0);
169  const int n = A.extent(1);
170  const int range = (m+team_size-1)/team_size;
171 
172  typedef Kokkos::TeamPolicy<execution_space> Policy;
173  Kokkos::parallel_for(
174  Policy( range,team_size,vector_size ),
175  KOKKOS_LAMBDA (const typename Policy::member_type& team) {
176  const int i = team.league_rank()*team.team_size() + team.team_rank();
177  if (i >= m)
178  return;
179 
180  scalar_type t = 0.0;
181  for (int j=0; j<n; ++j)
182  t += A(i,j)*b(j);
183  c(i) = t;
184  }
185  );
186 }
187 
188 #endif
189 
190 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
191 void
192 check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
193 {
194  const double tol = 1.0e-14;
195  typedef typename ViewTypeC::value_type value_type;
196  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
197  Kokkos::deep_copy(h_c, c);
198  const size_t m = A.extent(0);
199  const size_t n = A.extent(1);
200  for (size_t i=0; i<m; ++i) {
201  value_type t = n;
202  if (std::abs(h_c(i)- t) > tol) {
203  std::cout << "Comparison failed! " << i << " : " << h_c(i) << " , " << t
204  << std::endl;
205  }
206  }
207 }
208 
209 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
210 void
211 check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
212 {
213  const double tol = 1.0e-14;
214  typedef typename ViewTypeC::value_type value_type;
215  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
216  Kokkos::deep_copy(h_c, c);
217  const size_t m = A.extent(0);
218  const size_t n = A.extent(1);
219  const size_t p = Kokkos::dimension_scalar(A);
220  for (size_t i=0; i<m; ++i) {
221  for (size_t j=0; j<p; ++j) {
222  value_type t = (j == p-1 ? n : 2*n);
223  if (std::abs(h_c(i).fastAccessDx(j)- t) > tol) {
224  std::cout << "Comparison failed! " << i << "," << j << " : "
225  << h_c(i).fastAccessDx(j) << " , " << t << std::endl;
226  }
227  }
228  }
229 }
230 
// Timing result of one benchmark configuration.
//
// Members are zero-initialized so that a default-constructed Perf never
// carries indeterminate values (callers such as do_times only assign `time`
// on some paths).
struct Perf {
  double time = 0.0;        // average wall-clock seconds per kernel launch
  double flops = 0.0;       // floating-point operation count per launch
  double throughput = 0.0;  // flops / time / 1e9
};
236 
237 template <typename FadType, typename ... ViewArgs>
238 Perf
239 do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop,
240  const bool check)
241 {
242  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
243  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
244  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
245  typedef typename ViewTypeA::execution_space execution_space;
246 
247  ViewTypeA A("A",m,n,p+1);
248  ViewTypeB b("B",n,p+1);
249  ViewTypeC c("c",m,p+1);
250 
251  FadType a(p, 1.0);
252  for (size_t k=0; k<p; ++k)
253  a.fastAccessDx(k) = 1.0;
254  Kokkos::deep_copy(A, a);
255  Kokkos::deep_copy(b, a);
256 
257  Kokkos::Timer wall_clock;
258  Perf perf;
259 
260  // Execute the kernel once to warm up
261  run_mat_vec( A, b, c );
262  execution_space().fence();
263 
264  wall_clock.reset();
265  for (size_t l=0; l<nloop; l++) {
266  run_mat_vec( A, b, c );
267  }
268  execution_space().fence();
269 
270  perf.time = wall_clock.seconds() / nloop;
271  perf.flops = m*n*(2+4*p);
272  perf.throughput = perf.flops / perf.time / 1.0e9;
273 
274  if (check) {
275  check_deriv(A, b, c);
276  }
277 
278  return perf;
279 }
280 
281 template <typename FadType, typename ... ViewArgs>
282 Perf
283 do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p,
284  const size_t nloop, const bool check)
285 {
286  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
287  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
288  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
289  typedef typename ViewTypeA::execution_space execution_space;
290 
291 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL)
292 #if defined (KOKKOS_ENABLE_CUDA)
294  const int FadStride = is_cuda ? 32 : 1;
295 #elif defined(KOKKOS_ENABLE_HIP)
297  const int FadStride = is_hip ? 64 : 1;
298 #else
299  const int FadStride 1;
300 #endif
301 #if defined(SACADO_ALIGN_SFAD)
303  const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
304  const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride : p;
305  typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
306 #else
307  typedef FadType AlignedFadType;
308  const size_t pa = p;
309 #endif
310 #else
311  const int FadStride = 1;
312  typedef FadType AlignedFadType;
313  const size_t pa = p;
314 #endif
315 
316 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL) || defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
320 #else
321  typedef typename ViewTypeA::array_layout ConLayoutA;
322  typedef typename ViewTypeB::array_layout ConLayoutB;
323  typedef typename ViewTypeC::array_layout ConLayoutC;
324  (void) FadStride;
325 #endif
326 
327 
328  typedef Kokkos::View<AlignedFadType**, ConLayoutA, execution_space> ConViewTypeA;
329  typedef Kokkos::View<AlignedFadType*, ConLayoutB, execution_space> ConViewTypeB;
330  typedef Kokkos::View<AlignedFadType*, ConLayoutC, execution_space> ConViewTypeC;
331 
332  ConViewTypeA A("A",m,n,pa+1);
333  ConViewTypeB b("B",n,pa+1);
334  ConViewTypeC c("c",m,pa+1);
335 
336  AlignedFadType a(pa, 1.0);
337  for (size_t k=0; k<pa; ++k)
338  a.fastAccessDx(k) = 1.0;
339  Kokkos::deep_copy(A, a);
340  Kokkos::deep_copy(b, a);
341 
342  Kokkos::Timer wall_clock;
343  Perf perf;
344 
345 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
346  const size_t concurrency = execution_space().concurrency();
347 
348 #if defined (KOKKOS_ENABLE_CUDA)
349  const size_t warp_dim = is_cuda ? 32 : 1;
350 #elif defined (KOKKOS_ENABLE_HIP)
351  const size_t warp_dim = is_hip ? 64 : 1;
352 #else
353  const size_t warp_dim = 1;
354 #endif
355 
356  const size_t block_size = pa*sizeof(double);
357  const size_t nkernels = concurrency / warp_dim;
358  const size_t mem_pool_size =
359  static_cast<size_t>(1.2*nkernels*block_size);
360  const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
361  execution_space space;
362  Sacado::createGlobalMemoryPool(space, mem_pool_size,
363  block_size,
364  block_size,
365  superblock_size
366  );
367 #endif
368 
369  // Execute the kernel once to warm up
370  run_mat_vec_hierarchical( A, b, c );
371  execution_space().fence();
372 
373  wall_clock.reset();
374  for (size_t l=0; l<nloop; l++) {
375  run_mat_vec_hierarchical( A, b, c );
376  }
377  execution_space().fence();
378 
379  perf.time = wall_clock.seconds() / nloop;
380  perf.flops = m*n*(2+4*p);
381  perf.throughput = perf.flops / perf.time / 1.0e9;
382 
383  if (check) {
384  check_deriv(A, b, c);
385  }
386 
387 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
389 #endif
390 
391  return perf;
392 }
393 
394 template <typename ... ViewArgs>
395 Perf
396 do_time_val(const size_t m, const size_t n, const size_t nloop,
397  const bool check)
398 {
399  typedef Kokkos::View<double**, ViewArgs...> ViewTypeA;
400  typedef Kokkos::View<double*, ViewArgs...> ViewTypeB;
401  typedef Kokkos::View<double*, ViewArgs...> ViewTypeC;
402  typedef typename ViewTypeA::execution_space execution_space;
403 
404  ViewTypeA A("A",m,n);
405  ViewTypeB b("B",n);
406  ViewTypeC c("c",m);
407 
408  Kokkos::deep_copy(A, 1.0);
409  Kokkos::deep_copy(b, 1.0);
410 
411  Kokkos::Timer wall_clock;
412  Perf perf;
413 
414  // Execute the kernel once to warm up
415  run_mat_vec( A, b, c );
416  execution_space().fence();
417 
418  wall_clock.reset();
419  for (size_t l=0; l<nloop; l++) {
420  run_mat_vec( A, b, c );
421  }
422  execution_space().fence();
423 
424  perf.time = wall_clock.seconds() / nloop;
425  perf.flops = m*n*2;
426  perf.throughput = perf.flops / perf.time / 1.0e9;
427 
428  if (check)
429  check_val(A,b,c);
430 
431  return perf;
432 }
433 
434 void
435 print_perf(const Perf& perf, const Perf& perf_base, const size_t p,
436  const std::string& name)
437 {
438  std::cout << name << "\t "
439  << perf.time << "\t "
440  << perf.throughput << "\t "
441  << perf.time / (perf_base.time*p)
442  << std::endl;
443 }
444 
445 template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
446  typename ... ViewArgs>
447 void
448 do_times(const size_t m,
449  const size_t n,
450  const size_t p,
451  const size_t ph,
452  const size_t nloop,
453  const bool value,
454  const bool sfad,
455  const bool slfad,
456  const bool dfad,
457  const bool hierarchical,
458  const bool check)
459 {
460  Perf perf_value;
461  perf_value.time = 1.0;
462 
463  // Run value
464  if (value) {
465  Perf perf = do_time_val<ViewArgs...>(m,n,nloop,check);
466  perf_value = perf;
467  print_perf(perf, perf_value, p, "Value ");
468  }
469 
470  // Run SFad
471  if (sfad && p == SFadSize) {
472  Perf perf =
473  do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,check);
474  print_perf(perf, perf_value, p, "SFad ");
475  }
476 
477  // Run SLFad
478  if (slfad && p <= SLFadSize) {
479  Perf perf =
480  do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,check);
481  print_perf(perf, perf_value, p, "SLFad ");
482  }
483 
484  // Run DFad
485  if (dfad) {
486  Perf perf =
487  do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
488  print_perf(perf, perf_value, p, "DFad ");
489  }
490 
491  // Run hierarchical
492  if (hierarchical) {
493  if (sfad && ph == HierSFadSize) {
494  Perf perf =
495  do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,ph,nloop,check);
496  print_perf(perf, perf_value, ph, "Hier SFad ");
497  }
498  if (slfad && ph <= HierSLFadSize) {
499  Perf perf =
500  do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,ph,nloop,check);
501  print_perf(perf, perf_value, ph, "Hier SLFad");
502  }
503  if (dfad) {
504  Perf perf =
505  do_time_fad_hierarchical<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,ph,nloop,check);
506  print_perf(perf, perf_value, ph, "Hier DFad ");
507  }
508  }
509 
510 }
511 
// View layout selectable from the command line. The enumerator values double
// as indices into layout_names, so the order here must match that array.
// NOTE(review): the enum body and layout_values were absent from the listing
// (only the closing brace survived); restored from their uses in
// do_times_layout and main.
enum LayoutType {
  LAYOUT_LEFT=0,
  LAYOUT_RIGHT,
  LAYOUT_DEFAULT
};
const int num_layout_types = 3;
// Parallel arrays handed to the command-line parser: value i is spelled
// layout_names[i]
const LayoutType layout_values[] = {
  LAYOUT_LEFT, LAYOUT_RIGHT, LAYOUT_DEFAULT };
const char *layout_names[] = { "left", "right", "default" };
521 
522 template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
523  typename Device>
524 void
525 do_times_layout(const size_t m,
526  const size_t n,
527  const size_t p,
528  const size_t ph,
529  const size_t nloop,
530  const bool value,
531  const bool sfad,
532  const bool slfad,
533  const bool dfad,
534  const bool hierarchical,
535  const bool check,
536  const LayoutType& layout,
537  const std::string& device)
538 {
539  int prec = 2;
540  std::cout.setf(std::ios::scientific);
541  std::cout.precision(prec);
542  std::cout << std::endl
543  << device
544  << " performance for layout "
545  << layout_names[layout]
546  << " m = " << m << " n = " << n << " p = " << p << " ph = " << ph
547  << std::endl << std::endl;
548  std::cout << "Computation \t Time \t Throughput \t Ratio" << std::endl;
549 
550  if (layout == LAYOUT_LEFT)
551  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
552  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
553  else if (layout == LAYOUT_RIGHT)
554  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
555  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
556  else
557  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
558  (m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
559 }
560 
// Connect executable to vtune for profiling
//
// Builds an `amplxe-cl -collect hotspots` command targeting this process's
// PID, echoes it, launches it in the background through the shell, and then
// sleeps for 10 seconds before returning.
// NOTE(review): the function header line was absent from the listing;
// restored from the connect_vtune() call in main.
void connect_vtune() {
  std::stringstream cmd;
  pid_t my_os_pid=getpid();
  const std::string vtune_loc =
    "amplxe-cl";
  const std::string output_dir = "./vtune";
  cmd << vtune_loc
      << " -collect hotspots -result-dir " << output_dir
      << " -target-pid " << my_os_pid << " &";
  std::cout << cmd.str() << std::endl;
  system(cmd.str().c_str());
  system("sleep 10");
}
575 
// Compile-time Fad sizes used to instantiate the benchmarks. The SLFad
// maximum tracks the SFad size for both the flat and hierarchical runs.
//constexpr int SFadSize = 8;
constexpr int SFadSize = 32;
constexpr int SLFadSize = SFadSize;
//constexpr int HierSFadSize = 50;
constexpr int HierSFadSize = 32;
constexpr int HierSLFadSize = HierSFadSize;
582 
583 int main(int argc, char* argv[]) {
584  bool success = true;
585  try {
586 
587  // Set up command line options
589  clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
590  int m = 100000;
591  clp.setOption("m", &m, "Number of matrix rows");
592  int n = 100;
593  clp.setOption("n", &n, "Number of matrix columns");
594  int p = SFadSize;
595  clp.setOption("p", &p, "Number of derivative components");
596  int ph = HierSFadSize;
597  clp.setOption("ph", &ph, "Number of derivative components for hierarchical");
598  int nloop = 10;
599  clp.setOption("nloop", &nloop, "Number of loops");
600 #ifdef KOKKOS_ENABLE_SERIAL
601  bool serial = 0;
602  clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
603 #endif
604 #ifdef KOKKOS_ENABLE_OPENMP
605  int openmp = 0;
606  clp.setOption("openmp", &openmp, "Number of OpenMP threads");
607 #endif
608 #ifdef KOKKOS_ENABLE_THREADS
609  int threads = 0;
610  clp.setOption("threads", &threads, "Number of pThreads threads");
611 #endif
612 #ifdef KOKKOS_ENABLE_CUDA
613  bool cuda = 0;
614  clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
615 #endif
616 #ifdef KOKKOS_ENABLE_HIP
617  bool hip = 0;
618  clp.setOption("hip", "no-hip", &cuda, "Whether to run HIP");
619 #endif
620  int numa = 0;
621  clp.setOption("numa", &numa,
622  "Number of NUMA domains to use (set to 0 to use all NUMAs");
623  int cores_per_numa = 0;
624  clp.setOption("cores-per-numa", &cores_per_numa,
625  "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
626  bool print_config = false;
627  clp.setOption("print-config", "no-print-config", &print_config,
628  "Whether to print Kokkos device configuration");
629  LayoutType layout = LAYOUT_DEFAULT;
630  clp.setOption("layout", &layout, num_layout_types, layout_values,
631  layout_names, "View layout");
632  bool vtune = false;
633  clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune");
634  bool value = true;
635  clp.setOption("value", "no-value", &value, "Run value calculation");
636  bool sfad = true;
637  clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation");
638  bool slfad = true;
639  clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation");
640  bool dfad = true;
641  clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation");
642  bool hierarchical = true;
643  clp.setOption("hierarchical", "no-hierarchical", &hierarchical, "Run hierarchical Fad derivative calculation");
644  bool check = false;
645  clp.setOption("check", "no-check", &check, "Check calculations are correct");
646 
647  // Parse options
648  switch (clp.parse(argc, argv)) {
650  return 0;
653  return 1;
655  break;
656  }
657 
658  if (vtune)
659  connect_vtune();
660 
661  Kokkos::InitializationSettings init_args;
662  init_args.set_num_threads(cores_per_numa);
663 
664  Kokkos::initialize(init_args);
665 
666  if (print_config)
667  Kokkos::print_configuration(std::cout, true);
668 
669 #ifdef KOKKOS_ENABLE_SERIAL
670  if (serial) {
671  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
672  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Serial");
673  }
674 #endif
675 
676 #ifdef KOKKOS_ENABLE_OPENMP
677  if (openmp) {
678  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
679  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"OpenMP");
680  }
681 #endif
682 
683 #ifdef KOKKOS_ENABLE_THREADS
684  if (threads) {
685  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
686  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Threads");
687  }
688 #endif
689 
690 #ifdef KOKKOS_ENABLE_CUDA
691  if (cuda) {
692  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
693  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Cuda");
694  }
695 #endif
696 
697 #ifdef KOKKOS_ENABLE_HIP
698  if (hip) {
699  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::HIP>(
700  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"HIP");
701  }
702 #endif
703 
704  Kokkos::finalize();
705 
706  }
707  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
708 
709  return !success;
710 }
const char * p
const int SLFadSize
abs(expr.val())
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void do_times(const T x[], int nloop, Teuchos::Array< double > &times)
double flops
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
void run_mat_vec_hierarchical(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
double time
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
Base template specification for static size.
void connect_vtune()
const char * layout_names[]
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
#define A
Definition: Sacado_rad.hpp:572
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool value, const bool analytic, const bool sfad, const bool slfad, const bool dfad, const bool flat, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
Perf do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
int main()
Definition: ad_example.cpp:191
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
const int num_layout_types
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
const int N
const LayoutType layout_values[]
void
Definition: uninit.c:96
int value
double throughput
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
void setDocString(const char doc_string[])
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
const double tol
LayoutType
const int HierSLFadSize
const int HierSFadSize
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
const int SFadSize
void destroyGlobalMemoryPool(const ExecSpace &space)
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)
int n