43 #include <sys/types.h>
49 const std::string& name)
51 std::cout << name <<
"\t "
59 typename ... ViewArgs>
71 const bool hierarchical,
75 perf_value.
time = 1.0;
84 catch(std::exception& e) {
85 std::cout << e.what() << std::endl;
96 catch(std::exception& e) {
97 std::cout << e.what() << std::endl;
104 print_perf(perf, perf_value, p,
"Analytic-s");
106 catch(std::exception& e) {
107 std::cout << e.what() << std::endl;
114 print_perf(perf, perf_value, p,
"Analytic-sl");
116 catch(std::exception& e) {
117 std::cout << e.what() << std::endl;
122 if (flat && sfad && p ==
SFadSize) {
125 do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,
check);
128 catch(std::exception& e) {
129 std::cout << e.what() << std::endl;
137 do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,
check);
140 catch(std::exception& e) {
141 std::cout << e.what() << std::endl;
149 do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,
check);
152 catch(std::exception& e) {
153 std::cout << e.what() << std::endl;
157 do_time_scratch<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,
check);
158 print_perf(perf_scratch, perf_value, p,
"DFad Scratch");
160 catch(std::exception& e) {
161 std::cout << e.what() << std::endl;
169 do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,p,nloop,
check);
172 catch(std::exception& e) {
173 std::cout << e.what() << std::endl;
181 do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,p,nloop,
check);
184 catch(std::exception& e) {
185 std::cout << e.what() << std::endl;
190 if (hierarchical && dfad) {
193 do_time_fad_hierarchical_dfad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,
check);
196 catch(std::exception& e) {
197 std::cout << e.what() << std::endl;
201 do_time_fad_hierarchical_dfad_scratch<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,
check);
202 print_perf(perf_scratch, perf_value, p,
"H. DFad Scratch");
204 catch(std::exception& e) {
205 std::cout << e.what() << std::endl;
234 const bool hierarchical,
237 const std::string& device)
240 std::cout.setf(std::ios::scientific);
241 std::cout.precision(prec);
242 std::cout << std::endl
244 <<
" performance for layout "
246 <<
" m = " << m <<
" n = " << n <<
" p = " << p
247 << std::endl << std::endl;
248 std::cout <<
"Computation \t Time \t Throughput \t Ratio" << std::endl;
251 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
252 m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,
check);
254 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
255 m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,
check);
257 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
258 (m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,
check);
263 std::stringstream cmd;
264 pid_t my_os_pid=getpid();
265 const std::string vtune_loc =
267 const std::string output_dir =
"./vtune";
269 <<
" -collect hotspots -result-dir " << output_dir
270 <<
" -target-pid " << my_os_pid <<
" &";
271 std::cout << cmd.str() << std::endl;
272 system(cmd.str().c_str());
276 int main(
int argc,
char* argv[]) {
277 Kokkos::initialize(argc,argv);
284 clp.
setDocString(
"This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
286 clp.
setOption(
"m", &m,
"Number of matrix rows");
288 clp.
setOption(
"n", &n,
"Number of matrix columns");
290 clp.
setOption(
"p", &p,
"Number of derivative components");
292 clp.
setOption(
"nloop", &nloop,
"Number of loops");
293 #ifdef KOKKOS_ENABLE_SERIAL
295 clp.
setOption(
"serial",
"no-serial", &serial,
"Whether to run Serial");
297 #ifdef KOKKOS_ENABLE_OPENMP
299 clp.
setOption(
"openmp",
"no-openmp", &openmp,
"Whether to run OpenMP");
301 #ifdef KOKKOS_ENABLE_THREADS
303 clp.
setOption(
"threads",
"no-threads", &threads,
"Whether to run Threads");
305 #ifdef KOKKOS_ENABLE_CUDA
307 clp.
setOption(
"cuda",
"no-cuda", &cuda,
"Whether to run CUDA");
309 bool print_config =
false;
310 clp.
setOption(
"print-config",
"no-print-config", &print_config,
311 "Whether to print Kokkos device configuration");
316 clp.
setOption(
"vtune",
"no-vtune", &vtune,
"Profile with vtune");
318 clp.
setOption(
"value",
"no-value", &value,
"Run value calculation");
319 bool analytic =
true;
320 clp.
setOption(
"analytic",
"no-analytic", &analytic,
321 "Run analytic derivative calculation");
323 clp.
setOption(
"sfad",
"no-sfad", &sfad,
"Run SFad derivative calculation");
325 clp.
setOption(
"slfad",
"no-slfad", &slfad,
"Run SLFad derivative calculation");
327 clp.
setOption(
"dfad",
"no-dfad", &dfad,
"Run DFad derivative calculation");
329 clp.
setOption(
"flat",
"no-flat", &flat,
"Run flat Fad derivative calculation");
330 bool hierarchical =
true;
331 clp.
setOption(
"hierarchical",
"no-hierarchical", &hierarchical,
"Run hierarchical Fad derivative calculation");
333 clp.
setOption(
"check",
"no-check", &check,
"Check calculations are correct");
336 switch (clp.
parse(argc, argv)) {
350 Kokkos::print_configuration(std::cout,
true);
352 #ifdef KOKKOS_ENABLE_SERIAL
354 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
355 m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,
check,layout,
"Serial");
359 #ifdef KOKKOS_ENABLE_OPENMP
361 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
362 m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,
check,layout,
"OpenMP");
366 #ifdef KOKKOS_ENABLE_THREADS
368 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
369 m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,
check,layout,
"Threads");
373 #ifdef KOKKOS_ENABLE_CUDA
375 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
376 m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,
check,layout,
"Cuda");
double do_time_analytic(int nderiv, int nloop)
void do_times(const T x[], int nloop, Teuchos::Array< double > &times)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
const char * layout_names[]
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool value, const bool analytic, const bool sfad, const bool slfad, const bool dfad, const bool flat, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
const int num_layout_types
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
const LayoutType layout_values[]
void setDocString(const char doc_string[])
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)