10 #define SACADO_VIEW_CUDA_HIERARCHICAL 1
13 #define SACADO_ALIGN_SFAD 1
22 #include "Kokkos_Timer.hpp"
25 #include <sys/types.h>
32 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
33 void run_mat_vec(
const ViewTypeA&
A,
const ViewTypeB& b,
const ViewTypeC&
c) {
34 typedef typename ViewTypeC::value_type scalar_type;
35 typedef typename ViewTypeC::execution_space execution_space;
37 const int m = A.extent(0);
38 const int n = A.extent(1);
40 Kokkos::RangePolicy<execution_space>( 0,m ),
41 KOKKOS_LAMBDA (
const int i) {
43 for (
int j=0; j<n; ++j)
50 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
52 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
55 typedef typename ViewTypeC::value_type scalar_type;
56 typedef typename ViewTypeC::execution_space execution_space;
58 #if defined (KOKKOS_ENABLE_CUDA)
60 const unsigned vector_size = is_cuda ? 32 : 1;
61 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
62 #elif defined (KOKKOS_ENABLE_HIP)
64 const unsigned vector_size = is_hip ? 64 : 1;
65 const unsigned team_size = is_hip ? 128 / vector_size : 1;
67 const unsigned vector_size = 1;
68 const unsigned team_size = 1;
71 const int m = A.extent(0);
72 const int n = A.extent(1);
73 const int range = (m+team_size-1)/team_size;
75 typedef Kokkos::TeamPolicy<execution_space> Policy;
77 Policy( range,team_size,vector_size ),
78 KOKKOS_LAMBDA (
const typename Policy::member_type& team) {
79 const int i = team.league_rank()*team.team_size() + team.team_rank();
84 for (
int j=0; j<n; ++j)
91 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL)
93 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
96 typedef typename Kokkos::ThreadLocalScalarType<ViewTypeC>::type scalar_type;
97 typedef typename ViewTypeC::execution_space execution_space;
99 #if defined (KOKKOS_ENABLE_CUDA)
101 const unsigned vector_size = is_cuda ? 32 : 1;
102 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
103 #elif defined (KOKKOS_ENABLE_HIP)
105 const unsigned vector_size = is_hip ? 64 : 1;
106 const unsigned team_size = is_hip ? 128 / vector_size : 1;
108 const unsigned vector_size = 1;
109 const unsigned team_size = 1;
112 const int m = A.extent(0);
113 const int n = A.extent(1);
114 const int range = (m+team_size-1)/team_size;
116 typedef Kokkos::TeamPolicy<execution_space> Policy;
117 Kokkos::parallel_for(
118 Policy( range,team_size,vector_size ),
119 KOKKOS_LAMBDA (
const typename Policy::member_type& team) {
120 const int i = team.league_rank()*team.team_size() + team.team_rank();
125 for (
int j=0; j<n; ++j)
134 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
136 const ViewTypeC& c) {
137 typedef typename ViewTypeC::value_type scalar_type;
138 typedef typename ViewTypeC::execution_space execution_space;
140 #if defined (KOKKOS_ENABLE_CUDA)
143 const bool is_cuda =
false;
145 const unsigned vector_size = 1;
146 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
148 const int m = A.extent(0);
149 const int n = A.extent(1);
150 const int range = (m+team_size-1)/team_size;
152 typedef Kokkos::TeamPolicy<execution_space> Policy;
153 Kokkos::parallel_for(
154 Policy( range,team_size,vector_size ),
155 KOKKOS_LAMBDA (
const typename Policy::member_type& team) {
156 const int i = team.league_rank()*team.team_size() + team.team_rank();
161 for (
int j=0; j<n; ++j)
170 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
172 check_val(
const ViewTypeA& A,
const ViewTypeB& b,
const ViewTypeC& c)
174 const double tol = 1.0e-14;
175 typedef typename ViewTypeC::value_type value_type;
176 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
177 Kokkos::deep_copy(h_c, c);
178 const size_t m = A.extent(0);
179 const size_t n = A.extent(1);
180 for (
size_t i=0; i<m; ++
i) {
183 std::cout <<
"Comparison failed! " << i <<
" : " << h_c(i) <<
" , " << t
189 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
191 check_deriv(
const ViewTypeA& A,
const ViewTypeB& b,
const ViewTypeC& c)
193 const double tol = 1.0e-14;
194 typedef typename ViewTypeC::value_type value_type;
195 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
196 Kokkos::deep_copy(h_c, c);
197 const size_t m = A.extent(0);
198 const size_t n = A.extent(1);
199 const size_t p = Kokkos::dimension_scalar(A);
200 for (
size_t i=0; i<m; ++
i) {
201 for (
size_t j=0; j<
p; ++j) {
202 value_type t = (j == p-1 ? n : 2*n);
204 std::cout <<
"Comparison failed! " << i <<
"," << j <<
" : "
205 << h_c(i).fastAccessDx(j) <<
" , " << t << std::endl;
217 template <
typename FadType,
typename ... ViewArgs>
219 do_time_fad(
const size_t m,
const size_t n,
const size_t p,
const size_t nloop,
222 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
223 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
224 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
225 typedef typename ViewTypeA::execution_space execution_space;
227 ViewTypeA
A(
"A",m,n,p+1);
228 ViewTypeB b(
"B",n,p+1);
229 ViewTypeC
c(
"c",m,p+1);
232 for (
size_t k=0; k<
p; ++k)
233 a.fastAccessDx(k) = 1.0;
234 Kokkos::deep_copy(A, a);
235 Kokkos::deep_copy(b, a);
237 Kokkos::Timer wall_clock;
242 execution_space().fence();
245 for (
size_t l=0; l<nloop; l++) {
248 execution_space().fence();
250 perf.
time = wall_clock.seconds() / nloop;
261 template <
typename FadType,
typename ... ViewArgs>
264 const size_t nloop,
const bool check)
266 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
267 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
268 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
269 typedef typename ViewTypeA::execution_space execution_space;
// Select the Fad stride per backend: 32 when the execution space is CUDA,
// 64 when it is HIP, and 1 otherwise.
271 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL)
272 #if defined (KOKKOS_ENABLE_CUDA)
274 const int FadStride = is_cuda ? 32 : 1;
275 #elif defined(KOKKOS_ENABLE_HIP)
277 const int FadStride = is_hip ? 64 : 1;
// Fallback when neither backend branch is compiled (presumably under the #else
// on omitted listing line 278). The initializer requires '='.
279 const int FadStride = 1;
281 #if defined(SACADO_ALIGN_SFAD)
283 const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
284 const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride :
p;
285 typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
287 typedef FadType AlignedFadType;
291 const int FadStride = 1;
292 typedef FadType AlignedFadType;
296 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL) || defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
301 typedef typename ViewTypeA::array_layout ConLayoutA;
302 typedef typename ViewTypeB::array_layout ConLayoutB;
303 typedef typename ViewTypeC::array_layout ConLayoutC;
308 typedef Kokkos::View<AlignedFadType**, ConLayoutA, execution_space> ConViewTypeA;
309 typedef Kokkos::View<AlignedFadType*, ConLayoutB, execution_space> ConViewTypeB;
310 typedef Kokkos::View<AlignedFadType*, ConLayoutC, execution_space> ConViewTypeC;
312 ConViewTypeA
A(
"A",m,n,pa+1);
313 ConViewTypeB b(
"B",n,pa+1);
314 ConViewTypeC
c(
"c",m,pa+1);
316 AlignedFadType
a(pa, 1.0);
317 for (
size_t k=0; k<pa; ++k)
318 a.fastAccessDx(k) = 1.0;
319 Kokkos::deep_copy(A, a);
320 Kokkos::deep_copy(b, a);
322 Kokkos::Timer wall_clock;
325 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
326 const size_t concurrency = execution_space().concurrency();
328 #if defined (KOKKOS_ENABLE_CUDA)
329 const size_t warp_dim = is_cuda ? 32 : 1;
330 #elif defined (KOKKOS_ENABLE_HIP)
331 const size_t warp_dim = is_hip ? 64 : 1;
333 const size_t warp_dim = 1;
336 const size_t block_size = pa*
sizeof(double);
337 const size_t nkernels = concurrency / warp_dim;
338 const size_t mem_pool_size =
339 static_cast<size_t>(1.2*nkernels*block_size);
340 const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
341 execution_space space;
351 execution_space().fence();
354 for (
size_t l=0; l<nloop; l++) {
357 execution_space().fence();
359 perf.
time = wall_clock.seconds() / nloop;
367 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
374 template <
typename ... ViewArgs>
379 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeA;
380 typedef Kokkos::View<
double*, ViewArgs...> ViewTypeB;
381 typedef Kokkos::View<
double*, ViewArgs...> ViewTypeC;
382 typedef typename ViewTypeA::execution_space execution_space;
384 ViewTypeA
A(
"A",m,n);
388 Kokkos::deep_copy(A, 1.0);
389 Kokkos::deep_copy(b, 1.0);
391 Kokkos::Timer wall_clock;
396 execution_space().fence();
399 for (
size_t l=0; l<nloop; l++) {
402 execution_space().fence();
404 perf.
time = wall_clock.seconds() / nloop;
416 const std::string& name)
418 std::cout << name <<
"\t "
419 << perf.
time <<
"\t "
426 typename ... ViewArgs>
437 const bool hierarchical,
441 perf_value.
time = 1.0;
451 if (sfad && p == SFadSize) {
453 do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,
p,nloop,
check);
458 if (slfad && p <= SLFadSize) {
460 do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,
p,nloop,
check);
467 do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,
p,nloop,
check);
473 if (sfad && ph == HierSFadSize) {
475 do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,ph,nloop,
check);
476 print_perf(perf, perf_value, ph,
"Hier SFad ");
478 if (slfad && ph <= HierSLFadSize) {
480 do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,ph,nloop,
check);
481 print_perf(perf, perf_value, ph,
"Hier SLFad");
485 do_time_fad_hierarchical<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,ph,nloop,
check);
486 print_perf(perf, perf_value, ph,
"Hier DFad ");
514 const bool hierarchical,
517 const std::string& device)
520 std::cout.setf(std::ios::scientific);
521 std::cout.precision(prec);
522 std::cout << std::endl
524 <<
" performance for layout "
525 << layout_names[layout]
526 <<
" m = " << m <<
" n = " << n <<
" p = " << p <<
" ph = " << ph
527 << std::endl << std::endl;
528 std::cout <<
"Computation \t Time \t Throughput \t Ratio" << std::endl;
531 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
532 m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check);
534 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
535 m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check);
537 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
538 (m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check);
543 std::stringstream cmd;
544 pid_t my_os_pid=getpid();
545 const std::string vtune_loc =
547 const std::string output_dir =
"./vtune";
549 <<
" -collect hotspots -result-dir " << output_dir
550 <<
" -target-pid " << my_os_pid <<
" &";
551 std::cout << cmd.str() << std::endl;
552 system(cmd.str().c_str());
557 const int SFadSize = 32;
560 const int HierSFadSize = 32;
563 int main(
int argc,
char* argv[]) {
569 clp.
setDocString(
"This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
571 clp.
setOption(
"m", &m,
"Number of matrix rows");
573 clp.
setOption(
"n", &n,
"Number of matrix columns");
575 clp.
setOption(
"p", &p,
"Number of derivative components");
577 clp.
setOption(
"ph", &ph,
"Number of derivative components for hierarchical");
579 clp.
setOption(
"nloop", &nloop,
"Number of loops");
580 #ifdef KOKKOS_ENABLE_SERIAL
582 clp.
setOption(
"serial",
"no-serial", &serial,
"Whether to run Serial");
584 #ifdef KOKKOS_ENABLE_OPENMP
586 clp.
setOption(
"openmp", &openmp,
"Number of OpenMP threads");
588 #ifdef KOKKOS_ENABLE_THREADS
590 clp.
setOption(
"threads", &threads,
"Number of pThreads threads");
592 #ifdef KOKKOS_ENABLE_CUDA
594 clp.
setOption(
"cuda",
"no-cuda", &cuda,
"Whether to run CUDA");
596 #ifdef KOKKOS_ENABLE_HIP
// Bind --hip/--no-hip to the HIP flag, matching how the serial/openmp/threads/
// cuda options each bind their own variable. Binding &cuda here would toggle the
// CUDA flag instead (and presumably fail to compile when only HIP is enabled).
// Assumes `hip` is declared on omitted listing line 597 — TODO confirm.
598 clp.
setOption(
"hip",
"no-hip", &hip,
"Whether to run HIP");
602 "Number of NUMA domains to use (set to 0 to use all NUMAs");
603 int cores_per_numa = 0;
604 clp.
setOption(
"cores-per-numa", &cores_per_numa,
605 "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
606 bool print_config =
false;
607 clp.
setOption(
"print-config",
"no-print-config", &print_config,
608 "Whether to print Kokkos device configuration");
610 clp.
setOption(
"layout", &layout, num_layout_types, layout_values,
611 layout_names,
"View layout");
613 clp.
setOption(
"vtune",
"no-vtune", &vtune,
"Profile with vtune");
615 clp.
setOption(
"value",
"no-value", &value,
"Run value calculation");
617 clp.
setOption(
"sfad",
"no-sfad", &sfad,
"Run SFad derivative calculation");
619 clp.
setOption(
"slfad",
"no-slfad", &slfad,
"Run SLFad derivative calculation");
621 clp.
setOption(
"dfad",
"no-dfad", &dfad,
"Run DFad derivative calculation");
622 bool hierarchical =
true;
623 clp.
setOption(
"hierarchical",
"no-hierarchical", &hierarchical,
"Run hierarchical Fad derivative calculation");
625 clp.
setOption(
"check",
"no-check", &check,
"Check calculations are correct");
628 switch (clp.
parse(argc, argv)) {
641 Kokkos::InitializationSettings init_args;
642 init_args.set_num_threads(cores_per_numa);
644 Kokkos::initialize(init_args);
647 Kokkos::print_configuration(std::cout,
true);
649 #ifdef KOKKOS_ENABLE_SERIAL
651 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
652 m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check,layout,
"Serial");
656 #ifdef KOKKOS_ENABLE_OPENMP
658 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
659 m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check,layout,
"OpenMP");
663 #ifdef KOKKOS_ENABLE_THREADS
665 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
666 m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check,layout,
"Threads");
670 #ifdef KOKKOS_ENABLE_CUDA
672 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
673 m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check,layout,
"Cuda");
677 #ifdef KOKKOS_ENABLE_HIP
679 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::HIP>(
680 m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check,layout,
"HIP");
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void do_times(const T x[], int nloop, Teuchos::Array< double > &times)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
void run_mat_vec_hierarchical(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
Base template specification for static size.
const char * layout_names[]
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool value, const bool analytic, const bool sfad, const bool slfad, const bool dfad, const bool flat, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
Perf do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
const int num_layout_types
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
const LayoutType layout_values[]
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
void setDocString(const char doc_string[])
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void destroyGlobalMemoryPool(const ExecSpace &space)
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)