Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
mat_vec/driver.cpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Sacado Package
5 // Copyright (2006) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // This library is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU Lesser General Public License as
12 // published by the Free Software Foundation; either version 2.1 of the
13 // License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 // USA
24 // Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
25 // (etphipp@sandia.gov).
26 //
27 // ***********************************************************************
28 // @HEADER
29 
30 // A performance test that computes the derivative of a simple Kokkos kernel
31 // using various Fad classes
32 
33 #include "mat_vec.hpp"
34 #include "mat_vec_hierarchical.hpp"
36 
37 #include "Sacado.hpp"
38 
41 
42 // For vtune
43 #include <sys/types.h>
44 #include <unistd.h>
45 #include <algorithm>
46 
47 void
48 print_perf(const Perf& perf, const Perf& perf_base, const size_t p,
49  const std::string& name)
50 {
51  std::cout << name << "\t "
52  << perf.time << "\t "
53  << perf.throughput << "\t "
54  << perf.time / perf_base.time
55  << std::endl;
56 }
57 
58 template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
59  typename ... ViewArgs>
60 void
61 do_times(const size_t m,
62  const size_t n,
63  const size_t p,
64  const size_t nloop,
65  const bool value,
66  const bool analytic,
67  const bool sfad,
68  const bool slfad,
69  const bool dfad,
70  const bool flat,
71  const bool hierarchical,
72  const bool check)
73 {
74  Perf perf_value;
75  perf_value.time = 1.0;
76 
77  // Run value
78  if (value) {
79  try {
80  Perf perf = do_time_val<ViewArgs...>(m,n,nloop,check);
81  perf_value = perf;
82  print_perf(perf, perf_value, p, "Value ");
83  }
84  catch(std::exception& e) {
85  std::cout << e.what() << std::endl;
86  }
87  }
88 
89  // Run analytic
90  if (analytic) {
91  try {
92  Perf perf =
93  do_time_analytic<ViewArgs...>(m,n,p,nloop,check);
94  print_perf(perf, perf_value, p, "Analytic ");
95  }
96  catch(std::exception& e) {
97  std::cout << e.what() << std::endl;
98  }
99  }
100  if(analytic && p == SFadSize) {
101  try {
102  Perf perf =
103  do_time_analytic_s<SFadSize, ViewArgs...>(m,n,nloop,check);
104  print_perf(perf, perf_value, p, "Analytic-s");
105  }
106  catch(std::exception& e) {
107  std::cout << e.what() << std::endl;
108  }
109  }
110  if(analytic && p <= SLFadSize) {
111  try {
112  Perf perf =
113  do_time_analytic_sl<SLFadSize, ViewArgs...>(m,n,p,nloop,check);
114  print_perf(perf, perf_value, p, "Analytic-sl");
115  }
116  catch(std::exception& e) {
117  std::cout << e.what() << std::endl;
118  }
119  }
120 
121  // Run flat SFad
122  if (flat && sfad && p == SFadSize) {
123  try {
124  Perf perf =
125  do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,check);
126  print_perf(perf, perf_value, p, "SFad ");
127  }
128  catch(std::exception& e) {
129  std::cout << e.what() << std::endl;
130  }
131  }
132 
133  // Run flat SLFad
134  if (flat && slfad && p <= SLFadSize) {
135  try {
136  Perf perf =
137  do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,check);
138  print_perf(perf, perf_value, p, "SLFad ");
139  }
140  catch(std::exception& e) {
141  std::cout << e.what() << std::endl;
142  }
143  }
144 
145  // Run flat DFad
146  if (flat && dfad) {
147  try {
148  Perf perf =
149  do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
150  print_perf(perf, perf_value, p, "DFad ");
151  }
152  catch(std::exception& e) {
153  std::cout << e.what() << std::endl;
154  }
155  try {
156  Perf perf_scratch =
157  do_time_scratch<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
158  print_perf(perf_scratch, perf_value, p, "DFad Scratch");
159  }
160  catch(std::exception& e) {
161  std::cout << e.what() << std::endl;
162  }
163  }
164 
165  // Run hierarchical SFad
166  if (hierarchical && sfad && p == HierSFadSize) {
167  try {
168  Perf perf =
169  do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,p,nloop,check);
170  print_perf(perf, perf_value, p, "H. SFad ");
171  }
172  catch(std::exception& e) {
173  std::cout << e.what() << std::endl;
174  }
175  }
176 
177  // Run hierarchical SLFad
178  if (hierarchical && slfad && p <= HierSLFadSize) {
179  try {
180  Perf perf =
181  do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,p,nloop,check);
182  print_perf(perf, perf_value, p, "H. SLFad ");
183  }
184  catch(std::exception& e) {
185  std::cout << e.what() << std::endl;
186  }
187  }
188 
189  // Run hierarchical DFad
190  if (hierarchical && dfad) {
191  try {
192  Perf perf =
193  do_time_fad_hierarchical_dfad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
194  print_perf(perf, perf_value, p, "H. DFad ");
195  }
196  catch(std::exception& e) {
197  std::cout << e.what() << std::endl;
198  }
199  try {
200  Perf perf_scratch =
201  do_time_fad_hierarchical_dfad_scratch<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
202  print_perf(perf_scratch, perf_value, p, "H. DFad Scratch");
203  }
204  catch(std::exception& e) {
205  std::cout << e.what() << std::endl;
206  }
207  }
208 
209 }
210 
215 };
// Number of selectable view layouts; handed to clp.setOption() in main()
// together with layout_values and layout_names.
216 const int num_layout_types = 3;
// Command-line spellings of the layouts.  do_times_layout() also indexes this
// array with a LayoutType value to print the active layout.
// NOTE(review): the order must match the LayoutType enumerators and
// layout_values, neither of which is visible in this listing -- confirm.
219 const char *layout_names[] = { "left", "right", "default" };
220 
221 template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
222  typename Device>
223 void
224 do_times_layout(const size_t m,
225  const size_t n,
226  const size_t p,
227  const size_t nloop,
228  const bool value,
229  const bool analytic,
230  const bool sfad,
231  const bool slfad,
232  const bool dfad,
233  const bool flat,
234  const bool hierarchical,
235  const bool check,
236  const LayoutType& layout,
237  const std::string& device)
238 {
239  int prec = 2;
240  std::cout.setf(std::ios::scientific);
241  std::cout.precision(prec);
242  std::cout << std::endl
243  << device
244  << " performance for layout "
245  << layout_names[layout]
246  << " m = " << m << " n = " << n << " p = " << p
247  << std::endl << std::endl;
248  std::cout << "Computation \t Time \t Throughput \t Ratio" << std::endl;
249 
250  if (layout == LAYOUT_LEFT)
251  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
252  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
253  else if (layout == LAYOUT_RIGHT)
254  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
255  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
256  else
257  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
258  (m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
259 }
260 
261 // Connect executable to vtune for profiling
// NOTE(review): the signature line (original line 262) is missing from this
// listing; main() calls this as connect_vtune() with no arguments.
//
// Builds the command
//   amplxe-cl -collect hotspots -result-dir ./vtune -target-pid <pid> &
// for the current process, echoes it to std::cout, and runs it via system().
263  std::stringstream cmd;
264  pid_t my_os_pid=getpid();
265  const std::string vtune_loc =
266  "amplxe-cl";
267  const std::string output_dir = "./vtune";
// The trailing '&' asks the shell to start the collector in the background.
268  cmd << vtune_loc
269  << " -collect hotspots -result-dir " << output_dir
270  << " -target-pid " << my_os_pid << " &";
271  std::cout << cmd.str() << std::endl;
// The return values of both system() calls are ignored, so a failure to
// launch the collector is not detected here.
272  system(cmd.str().c_str());
// Pause for 10 seconds before returning -- presumably to give the collector
// time to attach; confirm.
273  system("sleep 10");
274 }
275 
// Benchmark driver entry point.
//
// Initializes Kokkos, registers and parses the command-line options, runs
// do_times_layout() for each execution space that was both compiled in and
// requested on the command line, finalizes Kokkos, and returns !success
// (0 when the success flag is still true).
//
// NOTE(review): several lines of this function are missing from this listing
// (the declaration of clp and the case labels of the parse switch).
276 int main(int argc, char* argv[]) {
277  Kokkos::initialize(argc,argv);
278 
279  bool success = true;
280  try {
281 
282  // Set up command line options
// NOTE(review): the declaration of clp (original line 283) is missing here.
284  clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
285  int m = 100000;
286  clp.setOption("m", &m, "Number of matrix rows");
287  int n = 100;
288  clp.setOption("n", &n, "Number of matrix columns");
// Default derivative count is SFadSize, so the "p == SFadSize" variants in
// do_times run unless -p is overridden.
289  int p = SFadSize;
290  clp.setOption("p", &p, "Number of derivative components");
291  int nloop = 10;
292  clp.setOption("nloop", &nloop, "Number of loops");
// Every execution-space flag defaults to off: nothing is benchmarked unless
// at least one of them is enabled on the command line.
293 #ifdef KOKKOS_ENABLE_SERIAL
294  bool serial = 0;
295  clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
296 #endif
297 #ifdef KOKKOS_ENABLE_OPENMP
298  bool openmp = 0;
299  clp.setOption("openmp", "no-openmp", &openmp, "Whether to run OpenMP");
300 #endif
301 #ifdef KOKKOS_ENABLE_THREADS
302  bool threads = 0;
303  clp.setOption("threads", "no-threads", &threads, "Whether to run Threads");
304 #endif
305 #ifdef KOKKOS_ENABLE_CUDA
306  bool cuda = 0;
307  clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
308 #endif
309 #ifdef KOKKOS_ENABLE_HIP
310  bool hip = 0;
311  clp.setOption("hip", "no-hip", &hip, "Whether to run HIP");
312 #endif
313  bool print_config = false;
314  clp.setOption("print-config", "no-print-config", &print_config,
315  "Whether to print Kokkos device configuration");
316  LayoutType layout = LAYOUT_DEFAULT;
317  clp.setOption("layout", &layout, num_layout_types, layout_values,
318  layout_names, "View layout");
319  bool vtune = false;
320  clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune");
// The remaining switches select which benchmark variants do_times runs;
// all but "check" default to on.
321  bool value = true;
322  clp.setOption("value", "no-value", &value, "Run value calculation");
323  bool analytic = true;
324  clp.setOption("analytic", "no-analytic", &analytic,
325  "Run analytic derivative calculation");
326  bool sfad = true;
327  clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation");
328  bool slfad = true;
329  clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation");
330  bool dfad = true;
331  clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation");
332  bool flat = true;
333  clp.setOption("flat", "no-flat", &flat, "Run flat Fad derivative calculation");
334  bool hierarchical = true;
335  clp.setOption("hierarchical", "no-hierarchical", &hierarchical, "Run hierarchical Fad derivative calculation");
336  bool check = false;
337  clp.setOption("check", "no-check", &check, "Check calculations are correct");
338 
339  // Parse options
// NOTE(review): the case labels (original lines 341 and 343-346) are missing
// from this listing, so which parse results map to which exit is not shown.
// NOTE(review): both returns below leave main() after Kokkos::initialize()
// without reaching the Kokkos::finalize() call at the end -- confirm that is
// acceptable.
340  switch (clp.parse(argc, argv)) {
342  return 0;
345  return 1;
347  break;
348  }
349 
350  if (vtune)
351  connect_vtune();
352 
353  if (print_config)
354  Kokkos::print_configuration(std::cout, true);
355 
// One benchmark pass per execution space that is both compiled in and
// requested; the last argument is the name printed in the banner.
356 #ifdef KOKKOS_ENABLE_SERIAL
357  if (serial) {
358  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
359  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Serial");
360  }
361 #endif
362 
363 #ifdef KOKKOS_ENABLE_OPENMP
364  if (openmp) {
365  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
366  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"OpenMP");
367  }
368 #endif
369 
370 #ifdef KOKKOS_ENABLE_THREADS
371  if (threads) {
372  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
373  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Threads");
374  }
375 #endif
376 
377 #ifdef KOKKOS_ENABLE_CUDA
378  if (cuda) {
379  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
380  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Cuda");
381  }
382 #endif
383 
384 #ifdef KOKKOS_ENABLE_HIP
385  if (hip) {
386  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::HIP>(
387  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"HIP");
388  }
389 #endif
390 
391  }
// Handles exceptions escaping the try block; the macro is given std::cerr
// and the success flag -- presumably it reports there and clears the flag
// (macro definition not visible here).
392  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
393 
394  Kokkos::finalize();
395 
// Exit status: 0 when success is still true, 1 otherwise.
396  return !success;
397 }
const char * p
double do_time_analytic(int nderiv, int nloop)
Definition: fad_expr.cpp:94
const int SLFadSize
void do_times(const T x[], int nloop, Teuchos::Array< double > &times)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
double time
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
Definition: mat_vec.cpp:477
void connect_vtune()
const char * layout_names[]
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Definition: mat_vec.cpp:438
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool value, const bool analytic, const bool sfad, const bool slfad, const bool dfad, const bool flat, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
int main()
Definition: ad_example.cpp:191
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
const int num_layout_types
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
const LayoutType layout_values[]
int value
double throughput
void setDocString(const char doc_string[])
LayoutType
const int HierSLFadSize
const int HierSFadSize
const int SFadSize
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)
int n