Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
fad_kokkos_view.cpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Sacado Package
5 // Copyright (2006) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // This library is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU Lesser General Public License as
12 // published by the Free Software Foundation; either version 2.1 of the
13 // License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 // USA
24 // Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
25 // (etphipp@sandia.gov).
26 //
27 // ***********************************************************************
28 // @HEADER
29 
30 //#define SACADO_DISABLE_FAD_VIEW_SPEC
31 #include "Sacado.hpp"
32 
35 #include "Teuchos_Time.hpp"
36 
37 #include "impl/Kokkos_Timer.hpp"
38 
39 // For vtune
40 #include <sys/types.h>
41 #include <unistd.h>
42 
43 // A performance test that computes the derivative of a simple Kokkos kernel
44 // using various Fad classes
45 
46 // Our Kokkos kernel: computes c = A*b for A mxn and b nx1
47 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
48 struct MatVecFunctor {
49 
50  // The scalar type used in this calculation, e.g., double
51  typedef typename ViewTypeC::value_type scalar_type;
52 
53  // The best ordinal type for the architecture we are running on,
54  // e.g., int or size_t
55  //typedef typename ViewTypeC::size_type size_type;
56  typedef int size_type;
57 
58  // The execution space where this functor will run
59  typedef typename ViewTypeC::execution_space execution_space;
60 
61  // Data needed by functor
62  const ViewTypeA A;
63  const ViewTypeB b;
64  const ViewTypeC c;
65  const size_type n;
66 
67  // Constructor
68  MatVecFunctor(const ViewTypeA& A_arg,
69  const ViewTypeB& b_arg,
70  const ViewTypeC& c_arg) :
71  A(A_arg), b(b_arg), c(c_arg), n(A.extent(1))
72  {}
73 
74  // Function to compute matrix-vector product for a given row i
76  void operator() (const size_type i) const
77  {
78  scalar_type t = 0.0;
79  for (size_type j=0; j<n; ++j)
80  t += A(i,j)*b(j);
81  c(i) = t;
82  }
83 
84 };
85 
86 // Computes the derivative of c = A*b for A mxnx(p+1) and b nx1x(p+1)
87 // where p is the number of derivatives
88 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
90 
91  // The scalar type used in this calculation, e.g., double
92  typedef typename ViewTypeC::value_type scalar_type;
93 
94  // The best ordinal type for the architecture we are running on,
95  // e.g., int or size_t
96  //typedef typename ViewTypeC::size_type size_type;
97  typedef int size_type;
98 
99  // The execution space where this functor will run
100  typedef typename ViewTypeC::execution_space execution_space;
101 
102  // Data needed by functor
103  const ViewTypeA A;
104  const ViewTypeB b;
105  const ViewTypeC c;
106  const size_type n;
107  const size_type p;
108 
109  // Constructor
110  MatVecDerivFunctor(const ViewTypeA& A_arg,
111  const ViewTypeB& b_arg,
112  const ViewTypeC& c_arg) :
113  A(A_arg), b(b_arg), c(c_arg), n(A.extent(1)), p(A.extent(2)-1)
114  {}
115 
117  void operator() (const size_type i) const
118  {
119  c(i,p) = 0.0;
120  for (size_type k=0; k<p; ++k)
121  c(i,k) = 0.0;
122  for (size_type j=0; j<n; ++j) {
123  c(i,p) += A(i,j,p)*b(j,p);
124  for (size_type k=0; k<p; ++k) {
125  c(i,k) += A(i,j,k)*b(j,p) + A(i,j,p)*b(j,k);
126  }
127  }
128  }
129 
130 };
131 
132 // Computes the derivative of c = A*b for A mxnx(p+1) and b nx1x(p+1)
133 // where p is the number of derivatives
134 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC,
135  int MaxP>
137 
138  // The scalar type used in this calculation, e.g., double
139  typedef typename ViewTypeC::value_type scalar_type;
140 
141  // The best ordinal type for the architecture we are running on,
142  // e.g., int or size_t
143  //typedef typename ViewTypeC::size_type size_type;
144  typedef int size_type;
145 
146  // The execution space where this functor will run
147  typedef typename ViewTypeC::execution_space execution_space;
148 
149  // Data needed by functor
150  const ViewTypeA A;
151  const ViewTypeB b;
152  const ViewTypeC c;
153  const size_type n;
154  const size_type p;
155 
156  // Constructor
157  SLMatVecDerivFunctor(const ViewTypeA& A_arg,
158  const ViewTypeB& b_arg,
159  const ViewTypeC& c_arg) :
160  A(A_arg), b(b_arg), c(c_arg), n(A.extent(1)), p(A.extent(2)-1)
161  {}
162 
164  void operator() (const size_type i) const
165  {
166  scalar_type cv = 0.0;
167  scalar_type t[MaxP];
168  for (size_type k=0; k<p; ++k)
169  t[k] = 0.0;
170 
171  for (size_type j=0; j<n; ++j) {
172  scalar_type av = A(i,j,p);
173  scalar_type bv = b(j,p);
174  cv += av*bv;
175  for (size_type k=0; k<p; ++k) {
176  t[k] += A(i,j,k)*bv + av*b(j,k);
177  }
178  }
179 
180  for (size_type k=0; k<p; ++k)
181  c(i,k) = t[k];
182  c(i,p) = cv;
183  }
184 
185 };
186 
187 // Computes the derivative of c = A*b for A mxnx(p+1) and b nx1x(p+1)
188 // where p is the number of derivatives
189 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC,
190  int p>
192 
193  // The scalar type used in this calculation, e.g., double
194  typedef typename ViewTypeC::value_type scalar_type;
195 
196  // The best ordinal type for the architecture we are running on,
197  // e.g., int or size_t
198  //typedef typename ViewTypeC::size_type size_type;
199  typedef int size_type;
200 
201  // The execution space where this functor will run
202  typedef typename ViewTypeC::execution_space execution_space;
203 
204  // Data needed by functor
205  const ViewTypeA A;
206  const ViewTypeB b;
207  const ViewTypeC c;
208  const size_type n;
209 
210  // Constructor
211  SMatVecDerivFunctor(const ViewTypeA& A_arg,
212  const ViewTypeB& b_arg,
213  const ViewTypeC& c_arg) :
214  A(A_arg), b(b_arg), c(c_arg), n(A.extent(1))
215  {}
216 
218  void operator() (const size_type i) const
219  {
220  scalar_type cv = 0.0;
221  scalar_type t[p];
222  for (size_type k=0; k<p; ++k)
223  t[k] = 0.0;
224 
225  for (size_type j=0; j<n; ++j) {
226  const scalar_type av = A(i,j,p);
227  const scalar_type bv = b(j,p);
228  cv += av*bv;
229 
230 // Using simd here results in much better performance. Othewise the compiler
231 // appears to try and vectorize the j loop with gather instructions, which
232 // doesn't work very well.
233 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
234 #pragma simd
235 #endif
236  for (size_type k=0; k<p; ++k) {
237  t[k] += A(i,j,k)*bv + av*b(j,k);
238  }
239  }
240 
241  for (size_type k=0; k<p; ++k)
242  c(i,k) = t[k];
243  c(i,p) = cv;
244  }
245 
246 };
247 
248 // Create a mat-vec functor from given A, b, c
249 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
250 void
251 run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
252 {
254  Kokkos::parallel_for( A.extent(0), f );
255 }
256 
257 // Create a mat-vec derivative functor from given A, b, c
258 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
259 void
260 run_mat_vec_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
261 {
263  Kokkos::parallel_for( A.extent(0), f );
264 }
265 
266 // Create a mat-vec derivative functor from given A, b, c
267 template <int MaxP, typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
268 void
269 run_mat_vec_deriv_sl(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
270 {
272  Kokkos::parallel_for( A.extent(0), f );
273 }
274 
275 // Create a mat-vec derivative functor from given A, b, c
276 template <int p, typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
277 void
278 run_mat_vec_deriv_s(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
279 {
281  Kokkos::parallel_for( A.extent(0), f );
282 }
283 
284 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
285 void
286 check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
287 {
288  const double tol = 1.0e-14;
289  typedef typename ViewTypeC::value_type value_type;
290  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
291  Kokkos::deep_copy(h_c, c);
292  const size_t m = A.extent(0);
293  const size_t n = A.extent(1);
294  for (size_t i=0; i<m; ++i) {
295  value_type t = n;
296  if (std::abs(h_c(i)- t) > tol) {
297  std::cout << "Comparison failed! " << i << " : " << h_c(i) << " , " << t
298  << std::endl;
299  }
300  }
301 }
302 
303 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
304 void
305 check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
306 {
307  const double tol = 1.0e-14;
308  typedef typename ViewTypeC::value_type value_type;
309  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
310  Kokkos::deep_copy(h_c, c);
311  const size_t m = A.extent(0);
312  const size_t n = A.extent(1);
313  const size_t p = A.extent(2);
314  for (size_t i=0; i<m; ++i) {
315  for (size_t j=0; j<p; ++j) {
316  value_type t = (j == p-1 ? n : 2*n);
317  if (std::abs(h_c(i,j)- t) > tol) {
318  std::cout << "Comparison failed! " << i << "," << j << " : "
319  << h_c(i,j) << " , " << t << std::endl;
320  }
321  }
322  }
323 }
324 
325 struct Perf {
326  double time;
327  double flops;
328  double throughput;
329 };
330 
331 template <typename FadType, typename ... ViewArgs>
332 Perf
333 do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop,
334  const bool check)
335 {
336  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
337  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
338  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
339  typedef typename ViewTypeA::execution_space execution_space;
340 
341  ViewTypeA A("A",m,n,p+1);
342  ViewTypeB b("B",n,p+1);
343  ViewTypeC c("c",m,p+1);
344 
345  FadType a(p, 1.0);
346  for (size_t k=0; k<p; ++k)
347  a.fastAccessDx(k) = 1.0;
348  Kokkos::deep_copy(A, a);
349  Kokkos::deep_copy(b, a);
350 
351  Kokkos::Impl::Timer wall_clock;
352  Perf perf;
353 
354  // Execute the kernel once to warm up
355  run_mat_vec( A, b, c );
356  execution_space::fence();
357 
358  wall_clock.reset();
359  for (size_t l=0; l<nloop; l++) {
360  run_mat_vec( A, b, c );
361  }
362  execution_space::fence();
363 
364  perf.time = wall_clock.seconds() / nloop;
365  perf.flops = m*n*(2+4*p);
366  perf.throughput = perf.flops / perf.time / 1.0e9;
367 
368  if (check) {
369  typename ViewTypeA::array_type A_flat = A;
370  typename ViewTypeB::array_type b_flat = b;
371  typename ViewTypeC::array_type c_flat = c;
372  check_deriv(A_flat, b_flat, c_flat);
373  }
374 
375  return perf;
376 }
377 
378 template <typename ... ViewArgs>
379 Perf
380 do_time_analytic(const size_t m, const size_t n, const size_t p,
381  const size_t nloop, const bool check)
382 {
383  typedef Kokkos::View<double***, ViewArgs...> ViewTypeA;
384  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
385  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
386  typedef typename ViewTypeA::execution_space execution_space;
387 
388  ViewTypeA A("A",m,n,p+1);
389  ViewTypeB b("B",n,p+1);
390  ViewTypeC c("c",m,p+1);
391 
392  Kokkos::deep_copy(A, 1.0);
393  Kokkos::deep_copy(b, 1.0);
394 
395  Kokkos::Impl::Timer wall_clock;
396  Perf perf;
397 
398  // Execute the kernel once to warm up
399  run_mat_vec_deriv( A, b, c );
400  execution_space::fence();
401 
402  Teuchos::Time timer("mult", false);
403  timer.start(true);
404  for (size_t l=0; l<nloop; l++) {
405  run_mat_vec_deriv( A, b, c );
406  }
407  execution_space::fence();
408  timer.stop();
409 
410  perf.time = wall_clock.seconds() / nloop;
411  perf.flops = m*n*(2+4*p);
412  perf.throughput = perf.flops / perf.time / 1.0e9;
413 
414  if (check)
415  check_deriv(A,b,c);
416 
417  return perf;
418 }
419 
420 template <int MaxP, typename ... ViewArgs>
421 Perf
422 do_time_analytic_sl(const size_t m, const size_t n, const size_t p,
423  const size_t nloop, const bool check)
424 {
425  typedef Kokkos::View<double***, ViewArgs...> ViewTypeA;
426  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
427  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
428  typedef typename ViewTypeA::execution_space execution_space;
429 
430  ViewTypeA A("A",m,n,p+1);
431  ViewTypeB b("B",n,p+1);
432  ViewTypeC c("c",m,p+1);
433 
434  Kokkos::deep_copy(A, 1.0);
435  Kokkos::deep_copy(b, 1.0);
436 
437  Kokkos::Impl::Timer wall_clock;
438  Perf perf;
439 
440  // Execute the kernel once to warm up
441  run_mat_vec_deriv_sl<MaxP>( A, b, c );
442  execution_space::fence();
443 
444  Teuchos::Time timer("mult", false);
445  timer.start(true);
446  for (size_t l=0; l<nloop; l++) {
447  run_mat_vec_deriv_sl<MaxP>( A, b, c );
448  }
449  execution_space::fence();
450  timer.stop();
451 
452  perf.time = wall_clock.seconds() / nloop;
453  perf.flops = m*n*(2+4*p);
454  perf.throughput = perf.flops / perf.time / 1.0e9;
455 
456  if (check)
457  check_deriv(A,b,c);
458 
459  return perf;
460 }
461 
462 template <int p, typename ... ViewArgs>
463 Perf
464 do_time_analytic_s(const size_t m, const size_t n,
465  const size_t nloop, const bool check)
466 {
467  typedef Kokkos::View<double**[p+1], ViewArgs...> ViewTypeA;
468  typedef Kokkos::View<double**, ViewArgs...> ViewTypeB;
469  typedef Kokkos::View<double**, ViewArgs...> ViewTypeC;
470  typedef typename ViewTypeA::execution_space execution_space;
471 
472  ViewTypeA A("A",m,n,p+1);
473  ViewTypeB b("B",n,p+1);
474  ViewTypeC c("c",m,p+1);
475 
476  Kokkos::deep_copy(A, 1.0);
477  Kokkos::deep_copy(b, 1.0);
478 
479  Kokkos::Impl::Timer wall_clock;
480  Perf perf;
481 
482  // Execute the kernel once to warm up
483  run_mat_vec_deriv_s<p>( A, b, c );
484  execution_space::fence();
485 
486  Teuchos::Time timer("mult", false);
487  timer.start(true);
488  for (size_t l=0; l<nloop; l++) {
489  run_mat_vec_deriv_s<p>( A, b, c );
490  }
491  execution_space::fence();
492  timer.stop();
493 
494  perf.time = wall_clock.seconds() / nloop;
495  perf.flops = m*n*(2+4*p);
496  perf.throughput = perf.flops / perf.time / 1.0e9;
497 
498  if (check)
499  check_deriv(A,b,c);
500 
501  return perf;
502 }
503 
504 template <typename ... ViewArgs>
505 Perf
506 do_time_val(const size_t m, const size_t n, const size_t nloop,
507  const bool check)
508 {
509  typedef Kokkos::View<double**, ViewArgs...> ViewTypeA;
510  typedef Kokkos::View<double*, ViewArgs...> ViewTypeB;
511  typedef Kokkos::View<double*, ViewArgs...> ViewTypeC;
512  typedef typename ViewTypeA::execution_space execution_space;
513 
514  ViewTypeA A("A",m,n);
515  ViewTypeB b("B",n);
516  ViewTypeC c("c",m);
517 
518  Kokkos::deep_copy(A, 1.0);
519  Kokkos::deep_copy(b, 1.0);
520 
521  Kokkos::Impl::Timer wall_clock;
522  Perf perf;
523 
524  // Execute the kernel once to warm up
525  run_mat_vec( A, b, c );
526  execution_space::fence();
527 
528  wall_clock.reset();
529  for (size_t l=0; l<nloop; l++) {
530  run_mat_vec( A, b, c );
531  }
532  execution_space::fence();
533 
534  perf.time = wall_clock.seconds() / nloop;
535  perf.flops = m*n*2;
536  perf.throughput = perf.flops / perf.time / 1.0e9;
537 
538  if (check)
539  check_val(A,b,c);
540 
541  return perf;
542 }
543 
544 void
545 print_perf(const Perf& perf, const Perf& perf_base, const std::string& name)
546 {
547  std::cout << name << "\t "
548  << perf.time << "\t "
549  << perf.throughput << "\t "
550  << perf.time / perf_base.time
551  << std::endl;
552 }
553 
554 template <int SFadSize, int SLFadSize, typename ... ViewArgs>
555 void
556 do_times(const size_t m,
557  const size_t n,
558  const size_t p,
559  const size_t nloop,
560  const bool value,
561  const bool analytic,
562  const bool sfad,
563  const bool slfad,
564  const bool dfad,
565  const bool check)
566 {
567  Perf perf_analytic;
568  perf_analytic.time = 1.0;
569 
570  // Run analytic
571  if (analytic) {
572  perf_analytic = do_time_analytic<ViewArgs...>(m,n,p,nloop,check);
573  }
574 
575  // Run value
576  if (value) {
577  Perf perf = do_time_val<ViewArgs...>(m,n,nloop,check);
578  print_perf(perf, perf_analytic, "Value ");
579  }
580 
581  if (analytic) {
582  print_perf(perf_analytic, perf_analytic, "Analytic ");
583  }
584 
585  if(analytic && p == SFadSize) {
586  Perf perf =
587  do_time_analytic_s<SFadSize, ViewArgs...>(m,n,nloop,check);
588  print_perf(perf, perf_analytic, "Analytic-s");
589  }
590 
591  if(analytic && p <= SLFadSize) {
592  Perf perf =
593  do_time_analytic_sl<SLFadSize, ViewArgs...>(m,n,p,nloop,check);
594  print_perf(perf, perf_analytic, "Analytic-sl");
595  }
596 
597  // Run SFad
598  if (sfad && p == SFadSize) {
599  Perf perf =
600  do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,check);
601  print_perf(perf, perf_analytic, "SFad ");
602  }
603 
604  // Run SLFad
605  if (slfad && p <= SLFadSize) {
606  Perf perf =
607  do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,check);
608  print_perf(perf, perf_analytic, "SLFad ");
609  }
610 
611  // Run DFad
612  if (dfad) {
613  Perf perf =
614  do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
615  print_perf(perf, perf_analytic, "DFad ");
616  }
617 
618 }
619 
624 };
625 const int num_layout_types = 3;
628 const char *layout_names[] = { "left", "right", "default" };
629 
630 template <int SFadSize, int SLFadSize, typename Device>
631 void
632 do_times_layout(const size_t m,
633  const size_t n,
634  const size_t p,
635  const size_t nloop,
636  const bool value,
637  const bool analytic,
638  const bool sfad,
639  const bool slfad,
640  const bool dfad,
641  const bool check,
642  const LayoutType& layout,
643  const std::string& device)
644 {
645  int prec = 2;
646  std::cout.setf(std::ios::scientific);
647  std::cout.precision(prec);
648  std::cout << std::endl
649  << device
650  << " performance for layout "
651  << layout_names[layout]
652  << " m = " << m << " n = " << n << " p = " << p
653  << std::endl << std::endl;
654  std::cout << "Computation \t Time \t Throughput \t Ratio" << std::endl;
655 
656  if (layout == LAYOUT_LEFT)
657  do_times<SFadSize,SLFadSize,Kokkos::LayoutLeft,Device>(
658  m,n,p,nloop,value,analytic,sfad,slfad,dfad,check);
659  else if (layout == LAYOUT_RIGHT)
660  do_times<SFadSize,SLFadSize,Kokkos::LayoutRight,Device>(
661  m,n,p,nloop,value,analytic,sfad,slfad,dfad,check);
662  else
663  do_times<SFadSize,SLFadSize,Device>
664  (m,n,p,nloop,value,analytic,sfad,slfad,dfad,check);
665 }
666 
667 // Connect executable to vtune for profiling
669  std::stringstream cmd;
670  pid_t my_os_pid=getpid();
671  const std::string vtune_loc =
672  "amplxe-cl";
673  const std::string output_dir = "./vtune";
674  cmd << vtune_loc
675  << " -collect hotspots -result-dir " << output_dir
676  << " -target-pid " << my_os_pid << " &";
677  std::cout << cmd.str() << std::endl;
678  system(cmd.str().c_str());
679  system("sleep 10");
680 }
681 
682 const int SFadSize = 8;
683 const int SLFadSize = SFadSize;
684 
685 int main(int argc, char* argv[]) {
686  bool success = true;
687  try {
688 
689  // Set up command line options
691  clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
692  int m = 100000;
693  clp.setOption("m", &m, "Number of matrix rows");
694  int n = 100;
695  clp.setOption("n", &n, "Number of matrix columns");
696  int p = SFadSize;
697  clp.setOption("p", &p, "Number of derivative components");
698  int nloop = 10;
699  clp.setOption("nloop", &nloop, "Number of loops");
700 #ifdef KOKKOS_ENABLE_SERIAL
701  bool serial = 0;
702  clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
703 #endif
704 #ifdef KOKKOS_ENABLE_OPENMP
705  int openmp = 0;
706  clp.setOption("openmp", &openmp, "Number of OpenMP threads");
707 #endif
708 #ifdef KOKKOS_ENABLE_THREADS
709  int threads = 0;
710  clp.setOption("threads", &threads, "Number of pThreads threads");
711 #endif
712 #ifdef KOKKOS_ENABLE_CUDA
713  bool cuda = 0;
714  clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
715 #endif
716  int numa = 0;
717  clp.setOption("numa", &numa,
718  "Number of NUMA domains to use (set to 0 to use all NUMAs");
719  int cores_per_numa = 0;
720  clp.setOption("cores-per-numa", &cores_per_numa,
721  "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
722  bool print_config = false;
723  clp.setOption("print-config", "no-print-config", &print_config,
724  "Whether to print Kokkos device configuration");
725  LayoutType layout = LAYOUT_DEFAULT;
726  clp.setOption("layout", &layout, num_layout_types, layout_values,
727  layout_names, "View layout");
728  bool vtune = false;
729  clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune");
730  bool value = true;
731  clp.setOption("value", "no-value", &value, "Run value calculation");
732  bool analytic = true;
733  clp.setOption("analytic", "no-analytic", &analytic,
734  "Run analytic derivative calculation");
735  bool sfad = true;
736  clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation");
737  bool slfad = true;
738  clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation");
739 #if defined(KOKKOS_ENABLE_CUDA_UVM)
740  bool dfad = true;
741  clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation");
742 #else
743  bool dfad = false;
744 #endif
745  bool check = false;
746  clp.setOption("check", "no-check", &check, "Check calculations are correct");
747 
748  // Parse options
749  switch (clp.parse(argc, argv)) {
751  return 0;
754  return 1;
756  break;
757  }
758 
759  if (vtune)
760  connect_vtune();
761 
762  Kokkos::InitArguments init_args;
763  init_args.num_threads = cores_per_numa;
764  init_args.num_numa = numa;
765 
766  Kokkos::initialize(init_args);
767 
768  if (print_config)
769  Kokkos::print_configuration(std::cout, true);
770 
771 #ifdef KOKKOS_ENABLE_SERIAL
772  if (serial) {
773  do_times_layout<SFadSize,SLFadSize,Kokkos::Serial>(
774  m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,"Serial");
775  }
776 #endif
777 
778 #ifdef KOKKOS_ENABLE_OPENMP
779  if (openmp) {
780  do_times_layout<SFadSize,SLFadSize,Kokkos::OpenMP>(
781  m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,"OpenMP");
782  }
783 #endif
784 
785 #ifdef KOKKOS_ENABLE_THREADS
786  if (threads) {
787  do_times_layout<SFadSize,SLFadSize,Kokkos::Threads>(
788  m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,"Threads");
789  }
790 #endif
791 
792 #ifdef KOKKOS_ENABLE_CUDA
793  if (cuda) {
794  do_times_layout<SFadSize,SLFadSize,Kokkos::Cuda>(
795  m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,"Cuda");
796  }
797 #endif
798  Kokkos::finalize();
799 
800  }
801  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
802 
803  return !success;
804 }
double do_time_analytic(int nderiv, int nloop)
Definition: fad_expr.cpp:97
const char * layout_names[]
void f()
abs(expr.val())
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void do_times(const T x[], int nloop, Teuchos::Array< double > &times)
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)
const int SFadSize
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t ph, const size_t nloop, const bool value, const bool sfad, const bool slfad, const bool dfad, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
void run_mat_vec_deriv_sl(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
const ViewTypeC c
void connect_vtune()
#define KOKKOS_INLINE_FUNCTION
ViewTypeC::value_type scalar_type
KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const
void start(bool reset=false)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
#define A
Definition: Sacado_rad.hpp:572
double stop()
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
MatVecFunctor(const ViewTypeA &A_arg, const ViewTypeB &b_arg, const ViewTypeC &c_arg)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
const LayoutType layout_values[]
int main()
Definition: ad_example.cpp:191
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
ViewTypeC::value_type scalar_type
const ViewTypeB b
MatVecDerivFunctor(const ViewTypeA &A_arg, const ViewTypeB &b_arg, const ViewTypeC &c_arg)
SLMatVecDerivFunctor(const ViewTypeA &A_arg, const ViewTypeB &b_arg, const ViewTypeC &c_arg)
SMatVecDerivFunctor(const ViewTypeA &A_arg, const ViewTypeB &b_arg, const ViewTypeC &c_arg)
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
void setDocString(const char doc_string[])
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
ViewTypeC::value_type scalar_type
const double tol
KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const
ViewTypeC::value_type scalar_type
ViewTypeC::execution_space execution_space
const int SLFadSize
void run_mat_vec_deriv_s(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
ViewTypeC::execution_space execution_space
ViewTypeC::execution_space execution_space
const int num_layout_types
int n
ViewTypeC::execution_space execution_space
const size_type n
const ViewTypeA A
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
void run_mat_vec_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)