Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
fad_kokkos_mat_vec_perf.cpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Sacado Package
5 // Copyright (2006) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // This library is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU Lesser General Public License as
12 // published by the Free Software Foundation; either version 2.1 of the
13 // License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 // USA
24 // Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
25 // (etphipp@sandia.gov).
26 //
27 // ***********************************************************************
28 // @HEADER
29 
30 //#define SACADO_VIEW_CUDA_HIERARCHICAL 1
31 #define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
32 #define SACADO_KOKKOS_USE_MEMORY_POOL 1
33 //#define SACADO_ALIGN_SFAD 1
34 
35 //#define SACADO_DISABLE_FAD_VIEW_SPEC
36 #include "Sacado.hpp"
37 
40 #include "Teuchos_Time.hpp"
41 
42 #include "impl/Kokkos_Timer.hpp"
43 
44 // For vtune
45 #include <sys/types.h>
46 #include <unistd.h>
47 #include <algorithm>
48 
49 // A performance test that computes the derivative of a simple Kokkos kernel
50 // using various Fad classes
51 
52 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
53 void run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {
54  typedef typename ViewTypeC::value_type scalar_type;
55  typedef typename ViewTypeC::execution_space execution_space;
56 
57  const int m = A.extent(0);
58  const int n = A.extent(1);
59  Kokkos::parallel_for(
60  Kokkos::RangePolicy<execution_space>( 0,m ),
61  KOKKOS_LAMBDA (const int i) {
62  scalar_type t = 0.0;
63  for (int j=0; j<n; ++j)
64  t += A(i,j)*b(j);
65  c(i) = t;
66  }
67  );
68 }
69 
70 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
71 
72 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
73 void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
74  const ViewTypeC& c) {
75  typedef typename ViewTypeC::value_type scalar_type;
76  typedef typename ViewTypeC::execution_space execution_space;
77 
78 #if defined (KOKKOS_ENABLE_CUDA)
79  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
80 #else
81  const bool is_cuda = false;
82 #endif
83  const unsigned vector_size = is_cuda ? 32 : 1;
84  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
85 
86  const int m = A.extent(0);
87  const int n = A.extent(1);
88  const int range = (m+team_size-1)/team_size;
89 
90  typedef Kokkos::TeamPolicy<execution_space> Policy;
91  Kokkos::parallel_for(
92  Policy( range,team_size,vector_size ),
93  KOKKOS_LAMBDA (const typename Policy::member_type& team) {
94  const int i = team.league_rank()*team.team_size() + team.team_rank();
95  if (i >= m)
96  return;
97 
98  scalar_type t = 0.0;
99  for (int j=0; j<n; ++j)
100  t += A(i,j)*b(j);
101  c(i) = t;
102  }
103  );
104 }
105 
106 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL)
107 
108 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
109 void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
110  const ViewTypeC& c) {
111  typedef typename Kokkos::ThreadLocalScalarType<ViewTypeC>::type scalar_type;
112  typedef typename ViewTypeC::execution_space execution_space;
113 
114 #if defined (KOKKOS_ENABLE_CUDA)
115  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
116 #else
117  const bool is_cuda = false;
118 #endif
119  const unsigned vector_size = is_cuda ? 32 : 1;
120  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
121 
122  const int m = A.extent(0);
123  const int n = A.extent(1);
124  const int range = (m+team_size-1)/team_size;
125 
126  typedef Kokkos::TeamPolicy<execution_space> Policy;
127  Kokkos::parallel_for(
128  Policy( range,team_size,vector_size ),
129  KOKKOS_LAMBDA (const typename Policy::member_type& team) {
130  const int i = team.league_rank()*team.team_size() + team.team_rank();
131  if (i >= m)
132  return;
133 
134  scalar_type t = 0.0;
135  for (int j=0; j<n; ++j)
136  t += A(i,j)*b(j);
137  c(i) = t;
138  }
139  );
140 }
141 
142 #else
143 
144 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
145 void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
146  const ViewTypeC& c) {
147  typedef typename ViewTypeC::value_type scalar_type;
148  typedef typename ViewTypeC::execution_space execution_space;
149 
150 #if defined (KOKKOS_ENABLE_CUDA)
151  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
152 #else
153  const bool is_cuda = false;
154 #endif
155  const unsigned vector_size = 1;
156  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
157 
158  const int m = A.extent(0);
159  const int n = A.extent(1);
160  const int range = (m+team_size-1)/team_size;
161 
162  typedef Kokkos::TeamPolicy<execution_space> Policy;
163  Kokkos::parallel_for(
164  Policy( range,team_size,vector_size ),
165  KOKKOS_LAMBDA (const typename Policy::member_type& team) {
166  const int i = team.league_rank()*team.team_size() + team.team_rank();
167  if (i >= m)
168  return;
169 
170  scalar_type t = 0.0;
171  for (int j=0; j<n; ++j)
172  t += A(i,j)*b(j);
173  c(i) = t;
174  }
175  );
176 }
177 
178 #endif
179 
180 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
181 void
182 check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
183 {
184  const double tol = 1.0e-14;
185  typedef typename ViewTypeC::value_type value_type;
186  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
187  Kokkos::deep_copy(h_c, c);
188  const size_t m = A.extent(0);
189  const size_t n = A.extent(1);
190  for (size_t i=0; i<m; ++i) {
191  value_type t = n;
192  if (std::abs(h_c(i)- t) > tol) {
193  std::cout << "Comparison failed! " << i << " : " << h_c(i) << " , " << t
194  << std::endl;
195  }
196  }
197 }
198 
199 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
200 void
201 check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
202 {
203  const double tol = 1.0e-14;
204  typedef typename ViewTypeC::value_type value_type;
205  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
206  Kokkos::deep_copy(h_c, c);
207  const size_t m = A.extent(0);
208  const size_t n = A.extent(1);
209  const size_t p = Kokkos::dimension_scalar(A);
210  for (size_t i=0; i<m; ++i) {
211  for (size_t j=0; j<p; ++j) {
212  value_type t = (j == p-1 ? n : 2*n);
213  if (std::abs(h_c(i).fastAccessDx(j)- t) > tol) {
214  std::cout << "Comparison failed! " << i << "," << j << " : "
215  << h_c(i).fastAccessDx(j) << " , " << t << std::endl;
216  }
217  }
218  }
219 }
220 
221 struct Perf {
222  double time;
223  double flops;
224  double throughput;
225 };
226 
227 template <typename FadType, typename ... ViewArgs>
228 Perf
229 do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop,
230  const bool check)
231 {
232  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
233  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
234  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
235  typedef typename ViewTypeA::execution_space execution_space;
236 
237  ViewTypeA A("A",m,n,p+1);
238  ViewTypeB b("B",n,p+1);
239  ViewTypeC c("c",m,p+1);
240 
241  FadType a(p, 1.0);
242  for (size_t k=0; k<p; ++k)
243  a.fastAccessDx(k) = 1.0;
244  Kokkos::deep_copy(A, a);
245  Kokkos::deep_copy(b, a);
246 
247  Kokkos::Impl::Timer wall_clock;
248  Perf perf;
249 
250  // Execute the kernel once to warm up
251  run_mat_vec( A, b, c );
252  execution_space::fence();
253 
254  wall_clock.reset();
255  for (size_t l=0; l<nloop; l++) {
256  run_mat_vec( A, b, c );
257  }
258  execution_space::fence();
259 
260  perf.time = wall_clock.seconds() / nloop;
261  perf.flops = m*n*(2+4*p);
262  perf.throughput = perf.flops / perf.time / 1.0e9;
263 
264  if (check) {
265  check_deriv(A, b, c);
266  }
267 
268  return perf;
269 }
270 
271 template <typename FadType, typename ... ViewArgs>
272 Perf
273 do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p,
274  const size_t nloop, const bool check)
275 {
276  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
277  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
278  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
279  typedef typename ViewTypeA::execution_space execution_space;
280 
281 #if defined (KOKKOS_ENABLE_CUDA)
282  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
283 #else
284  const bool is_cuda = false;
285 #endif
286 
287 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL)
288  const int FadStride = is_cuda ? 32 : 1;
289 #if defined(SACADO_ALIGN_SFAD)
291  const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
292  const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride : p;
293  typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
294 #else
295  typedef FadType AlignedFadType;
296  const size_t pa = p;
297 #endif
298 #else
299  const int FadStride = 1;
300  typedef FadType AlignedFadType;
301  const size_t pa = p;
302 #endif
303 
304 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL) || defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
308 #else
309  typedef typename ViewTypeA::array_layout ConLayoutA;
310  typedef typename ViewTypeB::array_layout ConLayoutB;
311  typedef typename ViewTypeC::array_layout ConLayoutC;
312  (void) FadStride;
313 #endif
314 
315 
316  typedef Kokkos::View<AlignedFadType**, ConLayoutA, execution_space> ConViewTypeA;
317  typedef Kokkos::View<AlignedFadType*, ConLayoutB, execution_space> ConViewTypeB;
318  typedef Kokkos::View<AlignedFadType*, ConLayoutC, execution_space> ConViewTypeC;
319 
320  ConViewTypeA A("A",m,n,pa+1);
321  ConViewTypeB b("B",n,pa+1);
322  ConViewTypeC c("c",m,pa+1);
323 
324  AlignedFadType a(pa, 1.0);
325  for (size_t k=0; k<pa; ++k)
326  a.fastAccessDx(k) = 1.0;
327  Kokkos::deep_copy(A, a);
328  Kokkos::deep_copy(b, a);
329 
330  Kokkos::Impl::Timer wall_clock;
331  Perf perf;
332 
333 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
334  const size_t concurrency = execution_space::concurrency();
335  const size_t warp_dim = is_cuda ? 32 : 1;
336  const size_t block_size = pa*sizeof(double);
337  const size_t nkernels = concurrency / warp_dim;
338  const size_t mem_pool_size =
339  static_cast<size_t>(1.2*nkernels*block_size);
340  const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
341  execution_space space;
342  Sacado::createGlobalMemoryPool(space, mem_pool_size,
343  block_size,
344  block_size,
345  superblock_size
346  );
347 #endif
348 
349  // Execute the kernel once to warm up
350  run_mat_vec_hierarchical( A, b, c );
351  execution_space::fence();
352 
353  wall_clock.reset();
354  for (size_t l=0; l<nloop; l++) {
355  run_mat_vec_hierarchical( A, b, c );
356  }
357  execution_space::fence();
358 
359  perf.time = wall_clock.seconds() / nloop;
360  perf.flops = m*n*(2+4*p);
361  perf.throughput = perf.flops / perf.time / 1.0e9;
362 
363  if (check) {
364  check_deriv(A, b, c);
365  }
366 
367 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
369 #endif
370 
371  return perf;
372 }
373 
374 template <typename ... ViewArgs>
375 Perf
376 do_time_val(const size_t m, const size_t n, const size_t nloop,
377  const bool check)
378 {
379  typedef Kokkos::View<double**, ViewArgs...> ViewTypeA;
380  typedef Kokkos::View<double*, ViewArgs...> ViewTypeB;
381  typedef Kokkos::View<double*, ViewArgs...> ViewTypeC;
382  typedef typename ViewTypeA::execution_space execution_space;
383 
384  ViewTypeA A("A",m,n);
385  ViewTypeB b("B",n);
386  ViewTypeC c("c",m);
387 
388  Kokkos::deep_copy(A, 1.0);
389  Kokkos::deep_copy(b, 1.0);
390 
391  Kokkos::Impl::Timer wall_clock;
392  Perf perf;
393 
394  // Execute the kernel once to warm up
395  run_mat_vec( A, b, c );
396  execution_space::fence();
397 
398  wall_clock.reset();
399  for (size_t l=0; l<nloop; l++) {
400  run_mat_vec( A, b, c );
401  }
402  execution_space::fence();
403 
404  perf.time = wall_clock.seconds() / nloop;
405  perf.flops = m*n*2;
406  perf.throughput = perf.flops / perf.time / 1.0e9;
407 
408  if (check)
409  check_val(A,b,c);
410 
411  return perf;
412 }
413 
414 void
415 print_perf(const Perf& perf, const Perf& perf_base, const size_t p,
416  const std::string& name)
417 {
418  std::cout << name << "\t "
419  << perf.time << "\t "
420  << perf.throughput << "\t "
421  << perf.time / (perf_base.time*p)
422  << std::endl;
423 }
424 
425 template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
426  typename ... ViewArgs>
427 void
428 do_times(const size_t m,
429  const size_t n,
430  const size_t p,
431  const size_t ph,
432  const size_t nloop,
433  const bool value,
434  const bool sfad,
435  const bool slfad,
436  const bool dfad,
437  const bool hierarchical,
438  const bool check)
439 {
440  Perf perf_value;
441  perf_value.time = 1.0;
442 
443  // Run value
444  if (value) {
445  Perf perf = do_time_val<ViewArgs...>(m,n,nloop,check);
446  perf_value = perf;
447  print_perf(perf, perf_value, p, "Value ");
448  }
449 
450  // Run SFad
451  if (sfad && p == SFadSize) {
452  Perf perf =
453  do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,check);
454  print_perf(perf, perf_value, p, "SFad ");
455  }
456 
457  // Run SLFad
458  if (slfad && p <= SLFadSize) {
459  Perf perf =
460  do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,check);
461  print_perf(perf, perf_value, p, "SLFad ");
462  }
463 
464  // Run DFad
465  if (dfad) {
466  Perf perf =
467  do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
468  print_perf(perf, perf_value, p, "DFad ");
469  }
470 
471  // Run hierarchical
472  if (hierarchical) {
473  if (sfad && ph == HierSFadSize) {
474  Perf perf =
475  do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,ph,nloop,check);
476  print_perf(perf, perf_value, ph, "Hier SFad ");
477  }
478  if (slfad && ph <= HierSLFadSize) {
479  Perf perf =
480  do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,ph,nloop,check);
481  print_perf(perf, perf_value, ph, "Hier SLFad");
482  }
483  if (dfad) {
484  Perf perf =
485  do_time_fad_hierarchical<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,ph,nloop,check);
486  print_perf(perf, perf_value, ph, "Hier DFad ");
487  }
488  }
489 
490 }
491 
496 };
497 const int num_layout_types = 3;
500 const char *layout_names[] = { "left", "right", "default" };
501 
502 template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
503  typename Device>
504 void
505 do_times_layout(const size_t m,
506  const size_t n,
507  const size_t p,
508  const size_t ph,
509  const size_t nloop,
510  const bool value,
511  const bool sfad,
512  const bool slfad,
513  const bool dfad,
514  const bool hierarchical,
515  const bool check,
516  const LayoutType& layout,
517  const std::string& device)
518 {
519  int prec = 2;
520  std::cout.setf(std::ios::scientific);
521  std::cout.precision(prec);
522  std::cout << std::endl
523  << device
524  << " performance for layout "
525  << layout_names[layout]
526  << " m = " << m << " n = " << n << " p = " << p << " ph = " << ph
527  << std::endl << std::endl;
528  std::cout << "Computation \t Time \t Throughput \t Ratio" << std::endl;
529 
530  if (layout == LAYOUT_LEFT)
531  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
532  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
533  else if (layout == LAYOUT_RIGHT)
534  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
535  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
536  else
537  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
538  (m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
539 }
540 
541 // Connect executable to vtune for profiling
543  std::stringstream cmd;
544  pid_t my_os_pid=getpid();
545  const std::string vtune_loc =
546  "amplxe-cl";
547  const std::string output_dir = "./vtune";
548  cmd << vtune_loc
549  << " -collect hotspots -result-dir " << output_dir
550  << " -target-pid " << my_os_pid << " &";
551  std::cout << cmd.str() << std::endl;
552  system(cmd.str().c_str());
553  system("sleep 10");
554 }
555 
556 const int SFadSize = 8;
557 const int SLFadSize = SFadSize;
558 const int HierSFadSize = 50;
559 const int HierSLFadSize = HierSFadSize;
560 
561 int main(int argc, char* argv[]) {
562  bool success = true;
563  try {
564 
565  // Set up command line options
567  clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
568  int m = 100000;
569  clp.setOption("m", &m, "Number of matrix rows");
570  int n = 100;
571  clp.setOption("n", &n, "Number of matrix columns");
572  int p = SFadSize;
573  clp.setOption("p", &p, "Number of derivative components");
574  int ph = HierSFadSize;
575  clp.setOption("ph", &ph, "Number of derivative components for hierarchical");
576  int nloop = 10;
577  clp.setOption("nloop", &nloop, "Number of loops");
578 #ifdef KOKKOS_ENABLE_SERIAL
579  bool serial = 0;
580  clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
581 #endif
582 #ifdef KOKKOS_ENABLE_OPENMP
583  int openmp = 0;
584  clp.setOption("openmp", &openmp, "Number of OpenMP threads");
585 #endif
586 #ifdef KOKKOS_ENABLE_THREADS
587  int threads = 0;
588  clp.setOption("threads", &threads, "Number of pThreads threads");
589 #endif
590 #ifdef KOKKOS_ENABLE_CUDA
591  bool cuda = 0;
592  clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
593 #endif
594  int numa = 0;
595  clp.setOption("numa", &numa,
596  "Number of NUMA domains to use (set to 0 to use all NUMAs");
597  int cores_per_numa = 0;
598  clp.setOption("cores-per-numa", &cores_per_numa,
599  "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
600  bool print_config = false;
601  clp.setOption("print-config", "no-print-config", &print_config,
602  "Whether to print Kokkos device configuration");
603  LayoutType layout = LAYOUT_DEFAULT;
604  clp.setOption("layout", &layout, num_layout_types, layout_values,
605  layout_names, "View layout");
606  bool vtune = false;
607  clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune");
608  bool value = true;
609  clp.setOption("value", "no-value", &value, "Run value calculation");
610  bool sfad = true;
611  clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation");
612  bool slfad = true;
613  clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation");
614  bool dfad = true;
615  clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation");
616  bool hierarchical = true;
617  clp.setOption("hierarchical", "no-hierarchical", &hierarchical, "Run hierarchical Fad derivative calculation");
618  bool check = false;
619  clp.setOption("check", "no-check", &check, "Check calculations are correct");
620 
621  // Parse options
622  switch (clp.parse(argc, argv)) {
624  return 0;
627  return 1;
629  break;
630  }
631 
632  if (vtune)
633  connect_vtune();
634 
635 // #ifdef KOKKOS_ENABLE_SERIAL
636 // if (serial) {
637 // Kokkos::initialize();
638 // if (print_config)
639 // Kokkos::print_configuration(std::cout, true);
640 // do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
641 // m,n,p,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Serial");
642 // Kokkos::finalize();
643 // }
644 // #endif
645 
646 #ifdef KOKKOS_ENABLE_OPENMP
647  if (openmp) {
648  Kokkos::InitArgs init_args;
649  init_args.num_threads = openmp;
650  init_args.num_numa = numa;
651  Kokkos::initialize( init_args );
652  if (print_config)
653  Kokkos::print_configuration(std::cout, true);
654  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
655  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"OpenMP");
656  Kokkos::finalize();
657  }
658 #endif
659 
660 #ifdef KOKKOS_ENABLE_THREADS
661  if (threads) {
662  Kokkos::InitArgs init_args;
663  init_args.num_threads = threads;
664  init_args.num_numa = numa;
665  Kokkos::initialize( init_args );
666  if (print_config)
667  Kokkos::print_configuration(std::cout, true);
668  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
669  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Threads");
670  Kokkos::finalize();
671  }
672 #endif
673 
674 #ifdef KOKKOS_ENABLE_CUDA
675  if (cuda) {
676  Kokkos::initialize();
677  if (print_config)
678  Kokkos::print_configuration(std::cout, true);
679  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
680  m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Cuda");
681  Kokkos::finalize();
682  }
683 #endif
684 
685  }
686  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
687 
688  return !success;
689 }
const char * layout_names[]
abs(expr.val())
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void do_times(const T x[], int nloop, Teuchos::Array< double > &times)
const int HierSLFadSize
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
void run_mat_vec_hierarchical(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)
const int SFadSize
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t ph, const size_t nloop, const bool value, const bool sfad, const bool slfad, const bool dfad, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
Base template specification for static size.
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
void connect_vtune()
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
#define A
Definition: Sacado_rad.hpp:572
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
Perf do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
const LayoutType layout_values[]
int main()
Definition: ad_example.cpp:191
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
void
Definition: uninit.c:96
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
void setDocString(const char doc_string[])
const int HierSFadSize
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
const double tol
const int SLFadSize
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void destroyGlobalMemoryPool(const ExecSpace &space)
const int num_layout_types
int n