Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
mat_vec/driver.cpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Sacado Package
5 // Copyright (2006) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // This library is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU Lesser General Public License as
12 // published by the Free Software Foundation; either version 2.1 of the
13 // License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 // USA
24 // Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
25 // (etphipp@sandia.gov).
26 //
27 // ***********************************************************************
28 // @HEADER
29 
30 // A performance test that computes the derivative of a simple Kokkos kernel
31 // using various Fad classes
32 
33 #include "mat_vec.hpp"
34 #include "mat_vec_hierarchical.hpp"
36 
37 #include "Sacado.hpp"
38 
41 
42 // For vtune
43 #include <sys/types.h>
44 #include <unistd.h>
45 #include <algorithm>
46 
47 void
48 print_perf(const Perf& perf, const Perf& perf_base, const size_t p,
49  const std::string& name)
50 {
51  std::cout << name << "\t "
52  << perf.time << "\t "
53  << perf.throughput << "\t "
54  << perf.time / perf_base.time
55  << std::endl;
56 }
57 
58 template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
59  typename ... ViewArgs>
60 void
61 do_times(const size_t m,
62  const size_t n,
63  const size_t p,
64  const size_t nloop,
65  const bool value,
66  const bool analytic,
67  const bool sfad,
68  const bool slfad,
69  const bool dfad,
70  const bool flat,
71  const bool hierarchical,
72  const bool check)
73 {
74  Perf perf_value;
75  perf_value.time = 1.0;
76 
77  // Run value
78  if (value) {
79  try {
80  Perf perf = do_time_val<ViewArgs...>(m,n,nloop,check);
81  perf_value = perf;
82  print_perf(perf, perf_value, p, "Value ");
83  }
84  catch(std::exception& e) {
85  std::cout << e.what() << std::endl;
86  }
87  }
88 
89  // Run analytic
90  if (analytic) {
91  try {
92  Perf perf =
93  do_time_analytic<ViewArgs...>(m,n,p,nloop,check);
94  print_perf(perf, perf_value, p, "Analytic ");
95  }
96  catch(std::exception& e) {
97  std::cout << e.what() << std::endl;
98  }
99  }
100  if(analytic && p == SFadSize) {
101  try {
102  Perf perf =
103  do_time_analytic_s<SFadSize, ViewArgs...>(m,n,nloop,check);
104  print_perf(perf, perf_value, p, "Analytic-s");
105  }
106  catch(std::exception& e) {
107  std::cout << e.what() << std::endl;
108  }
109  }
110  if(analytic && p <= SLFadSize) {
111  try {
112  Perf perf =
113  do_time_analytic_sl<SLFadSize, ViewArgs...>(m,n,p,nloop,check);
114  print_perf(perf, perf_value, p, "Analytic-sl");
115  }
116  catch(std::exception& e) {
117  std::cout << e.what() << std::endl;
118  }
119  }
120 
121  // Run flat SFad
122  if (flat && sfad && p == SFadSize) {
123  try {
124  Perf perf =
125  do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,check);
126  print_perf(perf, perf_value, p, "SFad ");
127  }
128  catch(std::exception& e) {
129  std::cout << e.what() << std::endl;
130  }
131  }
132 
133  // Run flat SLFad
134  if (flat && slfad && p <= SLFadSize) {
135  try {
136  Perf perf =
137  do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,check);
138  print_perf(perf, perf_value, p, "SLFad ");
139  }
140  catch(std::exception& e) {
141  std::cout << e.what() << std::endl;
142  }
143  }
144 
145  // Run flat DFad
146  if (flat && dfad) {
147  try {
148  Perf perf =
149  do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
150  print_perf(perf, perf_value, p, "DFad ");
151  }
152  catch(std::exception& e) {
153  std::cout << e.what() << std::endl;
154  }
155  try {
156  Perf perf_scratch =
157  do_time_scratch<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
158  print_perf(perf_scratch, perf_value, p, "DFad Scratch");
159  }
160  catch(std::exception& e) {
161  std::cout << e.what() << std::endl;
162  }
163  }
164 
165  // Run hierarchical SFad
166  if (hierarchical && sfad && p == HierSFadSize) {
167  try {
168  Perf perf =
169  do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,p,nloop,check);
170  print_perf(perf, perf_value, p, "H. SFad ");
171  }
172  catch(std::exception& e) {
173  std::cout << e.what() << std::endl;
174  }
175  }
176 
177  // Run hierarchical SLFad
178  if (hierarchical && slfad && p <= HierSLFadSize) {
179  try {
180  Perf perf =
181  do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,p,nloop,check);
182  print_perf(perf, perf_value, p, "H. SLFad ");
183  }
184  catch(std::exception& e) {
185  std::cout << e.what() << std::endl;
186  }
187  }
188 
189  // Run hierarchical DFad
190  if (hierarchical && dfad) {
191  try {
192  Perf perf =
193  do_time_fad_hierarchical_dfad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
194  print_perf(perf, perf_value, p, "H. DFad ");
195  }
196  catch(std::exception& e) {
197  std::cout << e.what() << std::endl;
198  }
199  try {
200  Perf perf_scratch =
201  do_time_fad_hierarchical_dfad_scratch<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
202  print_perf(perf_scratch, perf_value, p, "H. DFad Scratch");
203  }
204  catch(std::exception& e) {
205  std::cout << e.what() << std::endl;
206  }
207  }
208 
209 }
210 
215 };
// Number of layout choices; passed to clp.setOption("layout", ...) in main
// as the length of the layout tables.
216 const int num_layout_types = 3;
// Command-line spellings of the layouts. Also indexed by a LayoutType value
// in do_times_layout to print the layout name, so the order here has to match
// the enumerator values (presumably left, right, default — confirm against
// the LayoutType definition).
219 const char *layout_names[] = { "left", "right", "default" };
220 
221 template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
222  typename Device>
223 void
224 do_times_layout(const size_t m,
225  const size_t n,
226  const size_t p,
227  const size_t nloop,
228  const bool value,
229  const bool analytic,
230  const bool sfad,
231  const bool slfad,
232  const bool dfad,
233  const bool flat,
234  const bool hierarchical,
235  const bool check,
236  const LayoutType& layout,
237  const std::string& device)
238 {
239  int prec = 2;
240  std::cout.setf(std::ios::scientific);
241  std::cout.precision(prec);
242  std::cout << std::endl
243  << device
244  << " performance for layout "
245  << layout_names[layout]
246  << " m = " << m << " n = " << n << " p = " << p
247  << std::endl << std::endl;
248  std::cout << "Computation \t Time \t Throughput \t Ratio" << std::endl;
249 
250  if (layout == LAYOUT_LEFT)
251  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
252  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
253  else if (layout == LAYOUT_RIGHT)
254  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
255  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
256  else
257  do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
258  (m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
259 }
260 
// Connect executable to vtune for profiling.
//
// Builds an `amplxe-cl -collect hotspots` command line that targets this
// process's pid and writes results to ./vtune, echoes the command on
// std::cout, launches it in the background through the shell, and then runs
// `sleep 10` before returning (presumably to give the collector time to
// attach — confirm).
//
// This is best-effort: if either shell command reports failure, a warning is
// written to std::cerr and execution continues.
void connect_vtune() {
  std::stringstream cmd;
  const pid_t my_os_pid = getpid();
  const std::string vtune_loc =
    "amplxe-cl";
  const std::string output_dir = "./vtune";
  cmd << vtune_loc
      << " -collect hotspots -result-dir " << output_dir
      << " -target-pid " << my_os_pid << " &";

  const std::string cmd_line = cmd.str();
  std::cout << cmd_line << std::endl;

  // The trailing '&' backgrounds the collector, so a non-zero status here
  // means the shell itself could not be started or could not launch it.
  if (system(cmd_line.c_str()) != 0)
    std::cerr << "Warning: could not launch vtune collector" << std::endl;

  if (system("sleep 10") != 0)
    std::cerr << "Warning: \"sleep 10\" did not complete successfully"
              << std::endl;
}
275 
// Program entry point: initializes Kokkos, reads the benchmark configuration
// from the command line, optionally attaches vtune and prints the Kokkos
// configuration, runs do_times_layout for every execution space that was both
// compiled in and requested, and finalizes Kokkos.
// Returns 0 when `success` is still true at the end, 1 otherwise.
276 int main(int argc, char* argv[]) {
277  Kokkos::initialize(argc,argv);
278 
279  bool success = true;
280  try {
281 
282  // Set up command line options
// NOTE(review): the declaration of `clp` is not visible in this listing.
284  clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
// Problem sizes and repetition count, forwarded to do_times_layout below.
285  int m = 100000;
286  clp.setOption("m", &m, "Number of matrix rows");
287  int n = 100;
288  clp.setOption("n", &n, "Number of matrix columns");
// Default number of derivative components is SFadSize, so the fixed-size
// SFad runs (which require p == SFadSize) are applicable by default.
289  int p = SFadSize;
290  clp.setOption("p", &p, "Number of derivative components");
291  int nloop = 10;
292  clp.setOption("nloop", &nloop, "Number of loops");
// One switch per execution space that was compiled in; all default to off,
// so nothing is timed unless at least one is requested.
293 #ifdef KOKKOS_ENABLE_SERIAL
294  bool serial = 0;
295  clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
296 #endif
297 #ifdef KOKKOS_ENABLE_OPENMP
298  bool openmp = 0;
299  clp.setOption("openmp", "no-openmp", &openmp, "Whether to run OpenMP");
300 #endif
301 #ifdef KOKKOS_ENABLE_THREADS
302  bool threads = 0;
303  clp.setOption("threads", "no-threads", &threads, "Whether to run Threads");
304 #endif
305 #ifdef KOKKOS_ENABLE_CUDA
306  bool cuda = 0;
307  clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
308 #endif
309  bool print_config = false;
310  clp.setOption("print-config", "no-print-config", &print_config,
311  "Whether to print Kokkos device configuration");
// Enum-valued option: the accepted spellings come from layout_names and the
// corresponding values from layout_values.
312  LayoutType layout = LAYOUT_DEFAULT;
313  clp.setOption("layout", &layout, num_layout_types, layout_values,
314  layout_names, "View layout");
315  bool vtune = false;
316  clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune");
// Switches selecting which variants do_times runs; all default to on.
317  bool value = true;
318  clp.setOption("value", "no-value", &value, "Run value calculation");
319  bool analytic = true;
320  clp.setOption("analytic", "no-analytic", &analytic,
321  "Run analytic derivative calculation");
322  bool sfad = true;
323  clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation");
324  bool slfad = true;
325  clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation");
326  bool dfad = true;
327  clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation");
328  bool flat = true;
329  clp.setOption("flat", "no-flat", &flat, "Run flat Fad derivative calculation");
330  bool hierarchical = true;
331  clp.setOption("hierarchical", "no-hierarchical", &hierarchical, "Run hierarchical Fad derivative calculation");
332  bool check = false;
333  clp.setOption("check", "no-check", &check, "Check calculations are correct");
334 
335  // Parse options
// NOTE(review): the case labels of this switch are not visible in this
// listing. Both `return` statements leave main after Kokkos::initialize()
// but before the Kokkos::finalize() call at the end — confirm that skipping
// finalize on these paths is intended.
336  switch (clp.parse(argc, argv)) {
338  return 0;
341  return 1;
343  break;
344  }
345 
346  if (vtune)
347  connect_vtune();
348 
349  if (print_config)
350  Kokkos::print_configuration(std::cout, true);
351 
// Run the benchmark table once per requested execution space. The string
// argument is the device name printed in the table banner.
352 #ifdef KOKKOS_ENABLE_SERIAL
353  if (serial) {
354  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
355  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Serial");
356  }
357 #endif
358 
359 #ifdef KOKKOS_ENABLE_OPENMP
360  if (openmp) {
361  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
362  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"OpenMP");
363  }
364 #endif
365 
366 #ifdef KOKKOS_ENABLE_THREADS
367  if (threads) {
368  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
369  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Threads");
370  }
371 #endif
372 
373 #ifdef KOKKOS_ENABLE_CUDA
374  if (cuda) {
375  do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
376  m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Cuda");
377  }
378 #endif
379 
380  }
// Closes the try block above; presumably reports any exception on std::cerr
// and sets `success` to false (see the Teuchos macro documentation).
381  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
382 
383  Kokkos::finalize();
384 
// Process exit status: 0 if success is true, 1 otherwise.
385  return !success;
386 }
double do_time_analytic(int nderiv, int nloop)
Definition: fad_expr.cpp:94
const int SLFadSize
void do_times(const T x[], int nloop, Teuchos::Array< double > &times)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
double time
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
Definition: mat_vec.cpp:477
void connect_vtune()
const char * layout_names[]
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Definition: mat_vec.cpp:438
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool value, const bool analytic, const bool sfad, const bool slfad, const bool dfad, const bool flat, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
int main()
Definition: ad_example.cpp:191
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
const int num_layout_types
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
const LayoutType layout_values[]
double throughput
void setDocString(const char doc_string[])
LayoutType
const int HierSLFadSize
const int HierSFadSize
const int SFadSize
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)
int n