37 #include "impl/Kokkos_Timer.hpp"
40 #include <sys/types.h>
47 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
69 const ViewTypeB& b_arg,
70 const ViewTypeC& c_arg) :
71 A(A_arg),
b(b_arg),
c(c_arg),
n(
A.extent(1))
88 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
111 const ViewTypeB& b_arg,
112 const ViewTypeC& c_arg) :
113 A(A_arg),
b(b_arg),
c(c_arg),
n(
A.extent(1)),
p(
A.extent(2)-1)
123 c(i,p) +=
A(i,j,p)*
b(j,p);
125 c(i,k) +=
A(i,j,k)*
b(j,p) +
A(i,j,p)*
b(j,k);
134 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC,
158 const ViewTypeB& b_arg,
159 const ViewTypeC& c_arg) :
160 A(A_arg),
b(b_arg),
c(c_arg),
n(
A.extent(1)),
p(
A.extent(2)-1)
176 t[k] +=
A(i,j,k)*bv + av*
b(j,k);
189 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC,
212 const ViewTypeB& b_arg,
213 const ViewTypeC& c_arg) :
214 A(A_arg),
b(b_arg),
c(c_arg),
n(
A.extent(1))
233 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
237 t[k] +=
A(i,j,k)*bv + av*
b(j,k);
249 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
254 Kokkos::parallel_for( A.extent(0), f );
258 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
263 Kokkos::parallel_for( A.extent(0), f );
267 template <
int MaxP,
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
272 Kokkos::parallel_for( A.extent(0), f );
276 template <
int p,
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
281 Kokkos::parallel_for( A.extent(0), f );
284 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
286 check_val(
const ViewTypeA&
A,
const ViewTypeB& b,
const ViewTypeC&
c)
288 const double tol = 1.0e-14;
289 typedef typename ViewTypeC::value_type value_type;
290 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
291 Kokkos::deep_copy(h_c, c);
292 const size_t m = A.extent(0);
293 const size_t n = A.extent(1);
294 for (
size_t i=0; i<m; ++i) {
297 std::cout <<
"Comparison failed! " << i <<
" : " << h_c(i) <<
" , " << t
303 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
307 const double tol = 1.0e-14;
308 typedef typename ViewTypeC::value_type value_type;
309 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
310 Kokkos::deep_copy(h_c, c);
311 const size_t m = A.extent(0);
312 const size_t n = A.extent(1);
313 const size_t p = A.extent(2);
314 for (
size_t i=0; i<m; ++i) {
315 for (
size_t j=0; j<p; ++j) {
316 value_type t = (j == p-1 ? n : 2*n);
318 std::cout <<
"Comparison failed! " << i <<
"," << j <<
" : "
319 << h_c(i,j) <<
" , " << t << std::endl;
331 template <
typename FadType,
typename ... ViewArgs>
333 do_time_fad(
const size_t m,
const size_t n,
const size_t p,
const size_t nloop,
336 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
337 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
338 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
339 typedef typename ViewTypeA::execution_space execution_space;
341 ViewTypeA
A(
"A",m,n,p+1);
342 ViewTypeB b(
"B",n,p+1);
343 ViewTypeC
c(
"c",m,p+1);
346 for (
size_t k=0; k<p; ++k)
347 a.fastAccessDx(k) = 1.0;
348 Kokkos::deep_copy(A, a);
349 Kokkos::deep_copy(b, a);
351 Kokkos::Impl::Timer wall_clock;
356 execution_space::fence();
359 for (
size_t l=0; l<nloop; l++) {
362 execution_space::fence();
364 perf.
time = wall_clock.seconds() / nloop;
365 perf.
flops = m*n*(2+4*p);
369 typename ViewTypeA::array_type A_flat =
A;
370 typename ViewTypeB::array_type b_flat = b;
371 typename ViewTypeC::array_type c_flat =
c;
378 template <
typename ... ViewArgs>
381 const size_t nloop,
const bool check)
383 typedef Kokkos::View<
double***, ViewArgs...> ViewTypeA;
384 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeB;
385 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeC;
386 typedef typename ViewTypeA::execution_space execution_space;
388 ViewTypeA
A(
"A",m,n,p+1);
389 ViewTypeB b(
"B",n,p+1);
390 ViewTypeC
c(
"c",m,p+1);
392 Kokkos::deep_copy(A, 1.0);
393 Kokkos::deep_copy(b, 1.0);
395 Kokkos::Impl::Timer wall_clock;
400 execution_space::fence();
404 for (
size_t l=0; l<nloop; l++) {
407 execution_space::fence();
410 perf.
time = wall_clock.seconds() / nloop;
411 perf.
flops = m*n*(2+4*p);
420 template <
int MaxP,
typename ... ViewArgs>
423 const size_t nloop,
const bool check)
425 typedef Kokkos::View<
double***, ViewArgs...> ViewTypeA;
426 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeB;
427 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeC;
428 typedef typename ViewTypeA::execution_space execution_space;
430 ViewTypeA
A(
"A",m,n,p+1);
431 ViewTypeB b(
"B",n,p+1);
432 ViewTypeC
c(
"c",m,p+1);
434 Kokkos::deep_copy(A, 1.0);
435 Kokkos::deep_copy(b, 1.0);
437 Kokkos::Impl::Timer wall_clock;
441 run_mat_vec_deriv_sl<MaxP>(
A, b,
c );
442 execution_space::fence();
446 for (
size_t l=0; l<nloop; l++) {
447 run_mat_vec_deriv_sl<MaxP>(
A, b,
c );
449 execution_space::fence();
452 perf.
time = wall_clock.seconds() / nloop;
453 perf.
flops = m*n*(2+4*p);
462 template <
int p,
typename ... ViewArgs>
465 const size_t nloop,
const bool check)
467 typedef Kokkos::View<
double**[p+1], ViewArgs...> ViewTypeA;
468 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeB;
469 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeC;
470 typedef typename ViewTypeA::execution_space execution_space;
472 ViewTypeA
A(
"A",m,n,p+1);
473 ViewTypeB b(
"B",n,p+1);
474 ViewTypeC
c(
"c",m,p+1);
476 Kokkos::deep_copy(A, 1.0);
477 Kokkos::deep_copy(b, 1.0);
479 Kokkos::Impl::Timer wall_clock;
483 run_mat_vec_deriv_s<p>(
A, b,
c );
484 execution_space::fence();
488 for (
size_t l=0; l<nloop; l++) {
489 run_mat_vec_deriv_s<p>(
A, b,
c );
491 execution_space::fence();
494 perf.
time = wall_clock.seconds() / nloop;
495 perf.
flops = m*n*(2+4*p);
504 template <
typename ... ViewArgs>
509 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeA;
510 typedef Kokkos::View<
double*, ViewArgs...> ViewTypeB;
511 typedef Kokkos::View<
double*, ViewArgs...> ViewTypeC;
512 typedef typename ViewTypeA::execution_space execution_space;
514 ViewTypeA
A(
"A",m,n);
518 Kokkos::deep_copy(A, 1.0);
519 Kokkos::deep_copy(b, 1.0);
521 Kokkos::Impl::Timer wall_clock;
526 execution_space::fence();
529 for (
size_t l=0; l<nloop; l++) {
532 execution_space::fence();
534 perf.
time = wall_clock.seconds() / nloop;
547 std::cout << name <<
"\t "
548 << perf.
time <<
"\t "
568 perf_analytic.
time = 1.0;
582 print_perf(perf_analytic, perf_analytic,
"Analytic ");
588 print_perf(perf, perf_analytic,
"Analytic-s");
594 print_perf(perf, perf_analytic,
"Analytic-sl");
600 do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,check);
607 do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,check);
614 do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
630 template <
int SFadSize,
int SLFadSize,
typename Device>
643 const std::string& device)
646 std::cout.setf(std::ios::scientific);
647 std::cout.precision(prec);
648 std::cout << std::endl
650 <<
" performance for layout "
652 <<
" m = " << m <<
" n = " << n <<
" p = " << p
653 << std::endl << std::endl;
654 std::cout <<
"Computation \t Time \t Throughput \t Ratio" << std::endl;
657 do_times<SFadSize,SLFadSize,Kokkos::LayoutLeft,Device>(
658 m,n,p,nloop,value,analytic,sfad,slfad,dfad,check);
660 do_times<SFadSize,SLFadSize,Kokkos::LayoutRight,Device>(
661 m,n,p,nloop,value,analytic,sfad,slfad,dfad,check);
663 do_times<SFadSize,SLFadSize,Device>
664 (m,n,p,nloop,value,analytic,sfad,slfad,dfad,check);
669 std::stringstream cmd;
670 pid_t my_os_pid=getpid();
671 const std::string vtune_loc =
673 const std::string output_dir =
"./vtune";
675 <<
" -collect hotspots -result-dir " << output_dir
676 <<
" -target-pid " << my_os_pid <<
" &";
677 std::cout << cmd.str() << std::endl;
678 system(cmd.str().c_str());
685 int main(
int argc,
char* argv[]) {
691 clp.
setDocString(
"This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
693 clp.
setOption(
"m", &m,
"Number of matrix rows");
695 clp.
setOption(
"n", &n,
"Number of matrix columns");
697 clp.
setOption(
"p", &p,
"Number of derivative components");
699 clp.
setOption(
"nloop", &nloop,
"Number of loops");
700 #ifdef KOKKOS_ENABLE_SERIAL
702 clp.
setOption(
"serial",
"no-serial", &serial,
"Whether to run Serial");
704 #ifdef KOKKOS_ENABLE_OPENMP
706 clp.
setOption(
"openmp", &openmp,
"Number of OpenMP threads");
708 #ifdef KOKKOS_ENABLE_THREADS
710 clp.
setOption(
"threads", &threads,
"Number of pThreads threads");
712 #ifdef KOKKOS_ENABLE_CUDA
714 clp.
setOption(
"cuda",
"no-cuda", &cuda,
"Whether to run CUDA");
718 "Number of NUMA domains to use (set to 0 to use all NUMAs");
719 int cores_per_numa = 0;
720 clp.
setOption(
"cores-per-numa", &cores_per_numa,
721 "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
722 bool print_config =
false;
723 clp.
setOption(
"print-config",
"no-print-config", &print_config,
724 "Whether to print Kokkos device configuration");
729 clp.
setOption(
"vtune",
"no-vtune", &vtune,
"Profile with vtune");
731 clp.
setOption(
"value",
"no-value", &value,
"Run value calculation");
732 bool analytic =
true;
733 clp.
setOption(
"analytic",
"no-analytic", &analytic,
734 "Run analytic derivative calculation");
736 clp.
setOption(
"sfad",
"no-sfad", &sfad,
"Run SFad derivative calculation");
738 clp.
setOption(
"slfad",
"no-slfad", &slfad,
"Run SLFad derivative calculation");
739 #if defined(KOKKOS_ENABLE_CUDA_UVM)
741 clp.
setOption(
"dfad",
"no-dfad", &dfad,
"Run DFad derivative calculation");
746 clp.
setOption(
"check",
"no-check", &check,
"Check calculations are correct");
749 switch (clp.
parse(argc, argv)) {
762 Kokkos::InitArguments init_args;
763 init_args.num_threads = cores_per_numa;
764 init_args.num_numa = numa;
766 Kokkos::initialize(init_args);
769 Kokkos::print_configuration(std::cout,
true);
771 #ifdef KOKKOS_ENABLE_SERIAL
773 do_times_layout<SFadSize,SLFadSize,Kokkos::Serial>(
774 m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,
"Serial");
778 #ifdef KOKKOS_ENABLE_OPENMP
780 do_times_layout<SFadSize,SLFadSize,Kokkos::OpenMP>(
781 m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,
"OpenMP");
785 #ifdef KOKKOS_ENABLE_THREADS
787 do_times_layout<SFadSize,SLFadSize,Kokkos::Threads>(
788 m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,
"Threads");
792 #ifdef KOKKOS_ENABLE_CUDA
794 do_times_layout<SFadSize,SLFadSize,Kokkos::Cuda>(
795 m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,
"Cuda");
double do_time_analytic(int nderiv, int nloop)
const char * layout_names[]
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void do_times(const T x[], int nloop, Teuchos::Array< double > ×)
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t ph, const size_t nloop, const bool value, const bool sfad, const bool slfad, const bool dfad, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
void run_mat_vec_deriv_sl(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
#define KOKKOS_INLINE_FUNCTION
ViewTypeC::value_type scalar_type
KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const
void start(bool reset=false)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
MatVecFunctor(const ViewTypeA &A_arg, const ViewTypeB &b_arg, const ViewTypeC &c_arg)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
const LayoutType layout_values[]
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
ViewTypeC::value_type scalar_type
MatVecDerivFunctor(const ViewTypeA &A_arg, const ViewTypeB &b_arg, const ViewTypeC &c_arg)
SLMatVecDerivFunctor(const ViewTypeA &A_arg, const ViewTypeB &b_arg, const ViewTypeC &c_arg)
SMatVecDerivFunctor(const ViewTypeA &A_arg, const ViewTypeB &b_arg, const ViewTypeC &c_arg)
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
void setDocString(const char doc_string[])
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
ViewTypeC::value_type scalar_type
KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const
ViewTypeC::value_type scalar_type
ViewTypeC::execution_space execution_space
void run_mat_vec_deriv_s(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
ViewTypeC::execution_space execution_space
ViewTypeC::execution_space execution_space
const int num_layout_types
ViewTypeC::execution_space execution_space
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
void run_mat_vec_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)