23 #include <sys/types.h>
29 const std::string& name)
31 std::cout << name <<
"\t "
39 typename ... ViewArgs>
51 const bool hierarchical,
55 perf_value.
time = 1.0;
64 catch(std::exception& e) {
65 std::cout << e.what() << std::endl;
76 catch(std::exception& e) {
77 std::cout << e.what() << std::endl;
86 catch(std::exception& e) {
87 std::cout << e.what() << std::endl;
94 print_perf(perf, perf_value, p,
"Analytic-sl");
96 catch(std::exception& e) {
97 std::cout << e.what() << std::endl;
102 if (flat && sfad && p ==
SFadSize) {
105 do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,
p,nloop,
check);
108 catch(std::exception& e) {
109 std::cout << e.what() << std::endl;
117 do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,
p,nloop,
check);
120 catch(std::exception& e) {
121 std::cout << e.what() << std::endl;
129 do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,
p,nloop,
check);
132 catch(std::exception& e) {
133 std::cout << e.what() << std::endl;
137 do_time_scratch<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,
p,nloop,
check);
138 print_perf(perf_scratch, perf_value, p,
"DFad Scratch");
140 catch(std::exception& e) {
141 std::cout << e.what() << std::endl;
149 do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,
p,nloop,
check);
152 catch(std::exception& e) {
153 std::cout << e.what() << std::endl;
161 do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,
p,nloop,
check);
164 catch(std::exception& e) {
165 std::cout << e.what() << std::endl;
170 if (hierarchical && dfad) {
173 do_time_fad_hierarchical_dfad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,
p,nloop,
check);
176 catch(std::exception& e) {
177 std::cout << e.what() << std::endl;
181 do_time_fad_hierarchical_dfad_scratch<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,
p,nloop,
check);
182 print_perf(perf_scratch, perf_value, p,
"H. DFad Scratch");
184 catch(std::exception& e) {
185 std::cout << e.what() << std::endl;
214 const bool hierarchical,
217 const std::string& device)
220 std::cout.setf(std::ios::scientific);
221 std::cout.precision(prec);
222 std::cout << std::endl
224 <<
" performance for layout "
226 <<
" m = " << m <<
" n = " << n <<
" p = " << p
227 << std::endl << std::endl;
228 std::cout <<
"Computation \t Time \t Throughput \t Ratio" << std::endl;
231 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
232 m,n,
p,nloop,
value,analytic,sfad,slfad,dfad,flat,hierarchical,
check);
234 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
235 m,n,
p,nloop,
value,analytic,sfad,slfad,dfad,flat,hierarchical,
check);
237 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
238 (m,n,
p,nloop,
value,analytic,sfad,slfad,dfad,flat,hierarchical,
check);
243 std::stringstream cmd;
244 pid_t my_os_pid=getpid();
245 const std::string vtune_loc =
247 const std::string output_dir =
"./vtune";
249 <<
" -collect hotspots -result-dir " << output_dir
250 <<
" -target-pid " << my_os_pid <<
" &";
251 std::cout << cmd.str() << std::endl;
252 system(cmd.str().c_str());
256 int main(
int argc,
char* argv[]) {
257 Kokkos::initialize(argc,argv);
264 clp.
setDocString(
"This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
266 clp.
setOption(
"m", &m,
"Number of matrix rows");
268 clp.
setOption(
"n", &n,
"Number of matrix columns");
270 clp.
setOption(
"p", &p,
"Number of derivative components");
272 clp.
setOption(
"nloop", &nloop,
"Number of loops");
273 #ifdef KOKKOS_ENABLE_SERIAL
275 clp.
setOption(
"serial",
"no-serial", &serial,
"Whether to run Serial");
277 #ifdef KOKKOS_ENABLE_OPENMP
279 clp.
setOption(
"openmp",
"no-openmp", &openmp,
"Whether to run OpenMP");
281 #ifdef KOKKOS_ENABLE_THREADS
283 clp.
setOption(
"threads",
"no-threads", &threads,
"Whether to run Threads");
285 #ifdef KOKKOS_ENABLE_CUDA
287 clp.
setOption(
"cuda",
"no-cuda", &cuda,
"Whether to run CUDA");
289 #ifdef KOKKOS_ENABLE_HIP
291 clp.
setOption(
"hip",
"no-hip", &hip,
"Whether to run HIP");
293 bool print_config =
false;
294 clp.
setOption(
"print-config",
"no-print-config", &print_config,
295 "Whether to print Kokkos device configuration");
300 clp.
setOption(
"vtune",
"no-vtune", &vtune,
"Profile with vtune");
302 clp.
setOption(
"value",
"no-value", &value,
"Run value calculation");
303 bool analytic =
true;
304 clp.
setOption(
"analytic",
"no-analytic", &analytic,
305 "Run analytic derivative calculation");
307 clp.
setOption(
"sfad",
"no-sfad", &sfad,
"Run SFad derivative calculation");
309 clp.
setOption(
"slfad",
"no-slfad", &slfad,
"Run SLFad derivative calculation");
311 clp.
setOption(
"dfad",
"no-dfad", &dfad,
"Run DFad derivative calculation");
313 clp.
setOption(
"flat",
"no-flat", &flat,
"Run flat Fad derivative calculation");
314 bool hierarchical =
true;
315 clp.
setOption(
"hierarchical",
"no-hierarchical", &hierarchical,
"Run hierarchical Fad derivative calculation");
317 clp.
setOption(
"check",
"no-check", &check,
"Check calculations are correct");
320 switch (clp.
parse(argc, argv)) {
334 Kokkos::print_configuration(std::cout,
true);
336 #ifdef KOKKOS_ENABLE_SERIAL
338 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
339 m,n,
p,nloop,
value,analytic,sfad,slfad,dfad,flat,hierarchical,
check,layout,
"Serial");
343 #ifdef KOKKOS_ENABLE_OPENMP
345 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
346 m,n,
p,nloop,
value,analytic,sfad,slfad,dfad,flat,hierarchical,
check,layout,
"OpenMP");
350 #ifdef KOKKOS_ENABLE_THREADS
352 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
353 m,n,
p,nloop,
value,analytic,sfad,slfad,dfad,flat,hierarchical,
check,layout,
"Threads");
357 #ifdef KOKKOS_ENABLE_CUDA
359 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
360 m,n,
p,nloop,
value,analytic,sfad,slfad,dfad,flat,hierarchical,
check,layout,
"Cuda");
364 #ifdef KOKKOS_ENABLE_HIP
366 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::HIP>(
367 m,n,
p,nloop,
value,analytic,sfad,slfad,dfad,flat,hierarchical,
check,layout,
"HIP");
double do_time_analytic(int nderiv, int nloop)
void do_times(const T x[], int nloop, Teuchos::Array< double > &times)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
const char * layout_names[]
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool value, const bool analytic, const bool sfad, const bool slfad, const bool dfad, const bool flat, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
const int num_layout_types
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
const LayoutType layout_values[]
void setDocString(const char doc_string[])
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)