Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
mat_vec/driver.cpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Sacado Package
4 //
5 // Copyright 2006 NTESS and the Sacado contributors.
6 // SPDX-License-Identifier: LGPL-2.1-or-later
7 // *****************************************************************************
8 // @HEADER
9 
10 // A performance test that computes the derivative of a simple Kokkos kernel
11 // using various Fad classes
12 
13 #include "mat_vec.hpp"
14 #include "mat_vec_hierarchical.hpp"
16 
17 #include "Sacado.hpp"
18 
21 
22 // For vtune
23 #include <sys/types.h>
24 #include <unistd.h>
25 #include <algorithm>
26 
27 void
28 print_perf(const Perf& perf, const Perf& perf_base, const size_t p,
29  const std::string& name)
30 {
31  std::cout << name << "\t "
32  << perf.time << "\t "
33  << perf.throughput << "\t "
34  << perf.time / perf_base.time
35  << std::endl;
36 }
37 
38 template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
39  typename ... ViewArgs>
40 void
41 do_times(const size_t m,
42  const size_t n,
43  const size_t p,
44  const size_t nloop,
45  const bool value,
46  const bool analytic,
47  const bool sfad,
48  const bool slfad,
49  const bool dfad,
50  const bool flat,
51  const bool hierarchical,
52  const bool check)
53 {
54  Perf perf_value;
55  perf_value.time = 1.0;
56 
57  // Run value
58  if (value) {
59  try {
60  Perf perf = do_time_val<ViewArgs...>(m,n,nloop,check);
61  perf_value = perf;
62  print_perf(perf, perf_value, p, "Value ");
63  }
64  catch(std::exception& e) {
65  std::cout << e.what() << std::endl;
66  }
67  }
68 
69  // Run analytic
70  if (analytic) {
71  try {
72  Perf perf =
73  do_time_analytic<ViewArgs...>(m,n,p,nloop,check);
74  print_perf(perf, perf_value, p, "Analytic ");
75  }
76  catch(std::exception& e) {
77  std::cout << e.what() << std::endl;
78  }
79  }
80  if(analytic && p == SFadSize) {
81  try {
82  Perf perf =
83  do_time_analytic_s<SFadSize, ViewArgs...>(m,n,nloop,check);
84  print_perf(perf, perf_value, p, "Analytic-s");
85  }
86  catch(std::exception& e) {
87  std::cout << e.what() << std::endl;
88  }
89  }
90  if(analytic && p <= SLFadSize) {
91  try {
92  Perf perf =
93  do_time_analytic_sl<SLFadSize, ViewArgs...>(m,n,p,nloop,check);
94  print_perf(perf, perf_value, p, "Analytic-sl");
95  }
96  catch(std::exception& e) {
97  std::cout << e.what() << std::endl;
98  }
99  }
100 
101  // Run flat SFad
102  if (flat && sfad && p == SFadSize) {
103  try {
104  Perf perf =
105  do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,check);
106  print_perf(perf, perf_value, p, "SFad ");
107  }
108  catch(std::exception& e) {
109  std::cout << e.what() << std::endl;
110  }
111  }
112 
113  // Run flat SLFad
114  if (flat && slfad && p <= SLFadSize) {
115  try {
116  Perf perf =
117  do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,check);
118  print_perf(perf, perf_value, p, "SLFad ");
119  }
120  catch(std::exception& e) {
121  std::cout << e.what() << std::endl;
122  }
123  }
124 
125  // Run flat DFad
126  if (flat && dfad) {
127  try {
128  Perf perf =
129  do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
130  print_perf(perf, perf_value, p, "DFad ");
131  }
132  catch(std::exception& e) {
133  std::cout << e.what() << std::endl;
134  }
135  try {
136  Perf perf_scratch =
137  do_time_scratch<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
138  print_perf(perf_scratch, perf_value, p, "DFad Scratch");
139  }
140  catch(std::exception& e) {
141  std::cout << e.what() << std::endl;
142  }
143  }
144 
145  // Run hierarchical SFad
146  if (hierarchical && sfad && p == HierSFadSize) {
147  try {
148  Perf perf =
149  do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,p,nloop,check);
150  print_perf(perf, perf_value, p, "H. SFad ");
151  }
152  catch(std::exception& e) {
153  std::cout << e.what() << std::endl;
154  }
155  }
156 
157  // Run hierarchical SLFad
158  if (hierarchical && slfad && p <= HierSLFadSize) {
159  try {
160  Perf perf =
161  do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,p,nloop,check);
162  print_perf(perf, perf_value, p, "H. SLFad ");
163  }
164  catch(std::exception& e) {
165  std::cout << e.what() << std::endl;
166  }
167  }
168 
169  // Run hierarchical DFad
170  if (hierarchical && dfad) {
171  try {
172  Perf perf =
173  do_time_fad_hierarchical_dfad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
174  print_perf(perf, perf_value, p, "H. DFad ");
175  }
176  catch(std::exception& e) {
177  std::cout << e.what() << std::endl;
178  }
179  try {
180  Perf perf_scratch =
181  do_time_fad_hierarchical_dfad_scratch<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
182  print_perf(perf_scratch, perf_value, p, "H. DFad Scratch");
183  }
184  catch(std::exception& e) {
185  std::cout << e.what() << std::endl;
186  }
187  }
188 
189 }
190 
195 };
// Number of selectable View layouts; passed to clp.setOption("layout", ...)
// in main() together with layout_values and layout_names.
// NOTE(review): the LayoutType enum and layout_values (listing lines 191-194
// and 197-198) are not visible in this listing.
196 const int num_layout_types = 3;
// Command-line spelling of each layout.  do_times_layout() indexes this array
// directly with a LayoutType value, so the order here presumably has to match
// the enumerator order -- confirm against the enum.
199 const char *layout_names[] = { "left", "right", "default" };
200 
201 template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
202  typename Device>
203 void
204 do_times_layout(const size_t m,
205  const size_t n,
206  const size_t p,
207  const size_t nloop,
208  const bool value,
209  const bool analytic,
210  const bool sfad,
211  const bool slfad,
212  const bool dfad,
213  const bool flat,
214  const bool hierarchical,
215  const bool check,
216  const LayoutType& layout,
217  const std::string& device)
218 {
219  int prec = 2;
220  std::cout.setf(std::ios::scientific);
221  std::cout.precision(prec);
222  std::cout << std::endl
223  << device
224  << " performance for layout "
225  << layout_names[layout]
226  << " m = " << m << " n = " << n << " p = " << p
227  << std::endl << std::endl;
228  std::cout << "Computation \t Time \t Throughput \t Ratio" << std::endl;
229 
230  if (layout == LAYOUT_LEFT)
231  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
232  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
233  else if (layout == LAYOUT_RIGHT)
234  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
235  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
236  else
237  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
238  (m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
239 }
240 
241 // Connect executable to vtune for profiling
// NOTE(review): the function header (listing line 242) is not visible in this
// listing; main() calls this as connect_vtune() with no arguments.
//
// Builds the shell command
//   amplxe-cl -collect hotspots -result-dir ./vtune -target-pid <pid> &
// for the current process id, echoes it to std::cout, and runs it through
// system().
243  std::stringstream cmd;
244  pid_t my_os_pid=getpid();
245  const std::string vtune_loc =
246  "amplxe-cl";
247  const std::string output_dir = "./vtune";
// The trailing " &" backgrounds the collector so system() returns immediately
248  cmd << vtune_loc
249  << " -collect hotspots -result-dir " << output_dir
250  << " -target-pid " << my_os_pid << " &";
251  std::cout << cmd.str() << std::endl;
// NOTE(review): the return value of both system() calls is ignored, so a
// missing amplxe-cl binary goes unnoticed here.
252  system(cmd.str().c_str());
// Pause for 10 seconds -- presumably to give the collector time to attach
// before the timed runs start; confirm.
253  system("sleep 10");
254 }
255 
// Driver entry point: initializes Kokkos, registers and parses the
// command-line options, optionally attaches VTune and prints the Kokkos
// configuration, then runs do_times_layout() for every execution space that
// is both compiled in and requested on the command line.  Returns 0 when
// `success` is still true at the end, 1 otherwise.
256 int main(int argc, char* argv[]) {
257  Kokkos::initialize(argc,argv);
258 
259  bool success = true;
260  try {
261 
262  // Set up command line options
// NOTE(review): the declaration of `clp` (listing line 263) is not visible in
// this listing.
264  clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
265  int m = 100000;
266  clp.setOption("m", &m, "Number of matrix rows");
267  int n = 100;
268  clp.setOption("n", &n, "Number of matrix columns");
// Defaulting p to SFadSize means the runs in do_times() that are gated on
// p == SFadSize are eligible by default.
269  int p = SFadSize;
270  clp.setOption("p", &p, "Number of derivative components");
271  int nloop = 10;
272  clp.setOption("nloop", &nloop, "Number of loops");
// One opt-in flag per execution space; each flag exists only when the
// matching KOKKOS_ENABLE_* macro is defined and starts out false.
273 #ifdef KOKKOS_ENABLE_SERIAL
274  bool serial = 0;
275  clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
276 #endif
277 #ifdef KOKKOS_ENABLE_OPENMP
278  bool openmp = 0;
279  clp.setOption("openmp", "no-openmp", &openmp, "Whether to run OpenMP");
280 #endif
281 #ifdef KOKKOS_ENABLE_THREADS
282  bool threads = 0;
283  clp.setOption("threads", "no-threads", &threads, "Whether to run Threads");
284 #endif
285 #ifdef KOKKOS_ENABLE_CUDA
286  bool cuda = 0;
287  clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
288 #endif
289 #ifdef KOKKOS_ENABLE_HIP
290  bool hip = 0;
291  clp.setOption("hip", "no-hip", &hip, "Whether to run HIP");
292 #endif
293  bool print_config = false;
294  clp.setOption("print-config", "no-print-config", &print_config,
295  "Whether to print Kokkos device configuration");
296  LayoutType layout = LAYOUT_DEFAULT;
297  clp.setOption("layout", &layout, num_layout_types, layout_values,
298  layout_names, "View layout");
299  bool vtune = false;
300  clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune");
// Selection switches for the individual computations; all default to on
// except `check`.
301  bool value = true;
302  clp.setOption("value", "no-value", &value, "Run value calculation");
303  bool analytic = true;
304  clp.setOption("analytic", "no-analytic", &analytic,
305  "Run analytic derivative calculation");
306  bool sfad = true;
307  clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation");
308  bool slfad = true;
309  clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation");
310  bool dfad = true;
311  clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation");
312  bool flat = true;
313  clp.setOption("flat", "no-flat", &flat, "Run flat Fad derivative calculation");
314  bool hierarchical = true;
315  clp.setOption("hierarchical", "no-hierarchical", &hierarchical, "Run hierarchical Fad derivative calculation");
316  bool check = false;
317  clp.setOption("check", "no-check", &check, "Check calculations are correct");
318 
319  // Parse options
// NOTE(review): the case labels of this switch (listing lines 321, 323-326)
// are not visible in this listing.  The two `return` statements below leave
// main() before Kokkos::finalize() at listing line 374 is reached -- confirm
// whether that is intended.
320  switch (clp.parse(argc, argv)) {
322  return 0;
325  return 1;
327  break;
328  }
329 
330  if (vtune)
331  connect_vtune();
332 
333  if (print_config)
334  Kokkos::print_configuration(std::cout, true);
335 
// Run the timing suite once per requested execution space.  m, n, p and nloop
// are ints here and are converted to the size_t parameters of
// do_times_layout() at the call.
336 #ifdef KOKKOS_ENABLE_SERIAL
337  if (serial) {
338  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
339  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Serial");
340  }
341 #endif
342 
343 #ifdef KOKKOS_ENABLE_OPENMP
344  if (openmp) {
345  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
346  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"OpenMP");
347  }
348 #endif
349 
350 #ifdef KOKKOS_ENABLE_THREADS
351  if (threads) {
352  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
353  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Threads");
354  }
355 #endif
356 
357 #ifdef KOKKOS_ENABLE_CUDA
358  if (cuda) {
359  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
360  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Cuda");
361  }
362 #endif
363 
364 #ifdef KOKKOS_ENABLE_HIP
365  if (hip) {
366  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::HIP>(
367  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"HIP");
368  }
369 #endif
370 
371  }
// Presumably supplies the catch clauses for the try block above and clears
// `success` when an exception is caught -- confirm against the Teuchos macro.
372  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
373 
374  Kokkos::finalize();
375 
// Exit status 0 when success is true, 1 otherwise
376  return !success;
377 }
const char * p
double do_time_analytic(int nderiv, int nloop)
Definition: fad_expr.cpp:72
const int SLFadSize
void do_times(const T x[], int nloop, Teuchos::Array< double > &times)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
double time
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
Definition: mat_vec.cpp:457
void connect_vtune()
const char * layout_names[]
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Definition: mat_vec.cpp:418
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool value, const bool analytic, const bool sfad, const bool slfad, const bool dfad, const bool flat, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
int main()
Definition: ad_example.cpp:171
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
const int num_layout_types
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
const LayoutType layout_values[]
int value
double throughput
void setDocString(const char doc_string[])
LayoutType
const int HierSLFadSize
const int HierSFadSize
const int SFadSize
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)
int n