30 #define SACADO_VIEW_CUDA_HIERARCHICAL 1
33 #define SACADO_ALIGN_SFAD 1
42 #include "Kokkos_Timer.hpp"
45 #include <sys/types.h>
52 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
53 void run_mat_vec(
const ViewTypeA&
A,
const ViewTypeB& b,
const ViewTypeC&
c) {
54 typedef typename ViewTypeC::value_type scalar_type;
55 typedef typename ViewTypeC::execution_space execution_space;
57 const int m = A.extent(0);
58 const int n = A.extent(1);
60 Kokkos::RangePolicy<execution_space>( 0,m ),
61 KOKKOS_LAMBDA (
const int i) {
63 for (
int j=0; j<n; ++j)
70 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
72 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
75 typedef typename ViewTypeC::value_type scalar_type;
76 typedef typename ViewTypeC::execution_space execution_space;
78 #if defined (KOKKOS_ENABLE_CUDA)
80 const unsigned vector_size = is_cuda ? 32 : 1;
81 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
82 #elif defined (KOKKOS_ENABLE_HIP)
84 const unsigned vector_size = is_hip ? 64 : 1;
85 const unsigned team_size = is_hip ? 128 / vector_size : 1;
87 const unsigned vector_size = 1;
88 const unsigned team_size = 1;
91 const int m = A.extent(0);
92 const int n = A.extent(1);
93 const int range = (m+team_size-1)/team_size;
95 typedef Kokkos::TeamPolicy<execution_space> Policy;
97 Policy( range,team_size,vector_size ),
98 KOKKOS_LAMBDA (
const typename Policy::member_type& team) {
99 const int i = team.league_rank()*team.team_size() + team.team_rank();
104 for (
int j=0; j<n; ++j)
111 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL)
// Team-policy variant of the matrix-vector kernel, selected when
// SACADO_VIEW_CUDA_HIERARCHICAL is defined.  Each (league, team) thread pair
// is mapped to one row index i of A and then loops over the n columns.
// NOTE(review): the function name line, the is_cuda/is_hip definitions and the
// loop body are not visible in this excerpt; presumably the body accumulates
// A(i,j)*b(j) into c(i) -- confirm against the full file.
113 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
115 const ViewTypeC& c) {
// Scalar type comes from Kokkos::ThreadLocalScalarType rather than
// ViewTypeC::value_type as in the non-hierarchical variants.
116 typedef typename Kokkos::ThreadLocalScalarType<ViewTypeC>::type scalar_type;
117 typedef typename ViewTypeC::execution_space execution_space;
119 #if defined (KOKKOS_ENABLE_CUDA)
// CUDA: vector length 32, so team_size = 128 / 32 = 4; otherwise 1 and 1.
121 const unsigned vector_size = is_cuda ? 32 : 1;
122 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
123 #elif defined (KOKKOS_ENABLE_HIP)
// HIP: vector length 64, so team_size = 128 / 64 = 2; otherwise 1 and 1.
125 const unsigned vector_size = is_hip ? 64 : 1;
126 const unsigned team_size = is_hip ? 128 / vector_size : 1;
// Presumably the #else fallback (directive not visible here): no vector/team
// parallelism.
128 const unsigned vector_size = 1;
129 const unsigned team_size = 1;
132 const int m = A.extent(0);
133 const int n = A.extent(1);
// Number of teams: m divided by team_size, rounded up, so every row is covered.
134 const int range = (m+team_size-1)/team_size;
136 typedef Kokkos::TeamPolicy<execution_space> Policy;
137 Kokkos::parallel_for(
138 Policy( range,team_size,vector_size ),
139 KOKKOS_LAMBDA (
const typename Policy::member_type& team) {
// Row handled by this team thread.  Because range is rounded up, i can be >= m
// in the last team; NOTE(review): a bounds guard is presumably on the lines
// missing from this excerpt -- confirm.
140 const int i = team.league_rank()*team.team_size() + team.team_rank();
145 for (
int j=0; j<n; ++j)
154 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
156 const ViewTypeC& c) {
157 typedef typename ViewTypeC::value_type scalar_type;
158 typedef typename ViewTypeC::execution_space execution_space;
160 #if defined (KOKKOS_ENABLE_CUDA)
163 const bool is_cuda =
false;
165 const unsigned vector_size = 1;
166 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
168 const int m = A.extent(0);
169 const int n = A.extent(1);
170 const int range = (m+team_size-1)/team_size;
172 typedef Kokkos::TeamPolicy<execution_space> Policy;
173 Kokkos::parallel_for(
174 Policy( range,team_size,vector_size ),
175 KOKKOS_LAMBDA (
const typename Policy::member_type& team) {
176 const int i = team.league_rank()*team.team_size() + team.team_rank();
181 for (
int j=0; j<n; ++j)
190 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
192 check_val(
const ViewTypeA& A,
const ViewTypeB& b,
const ViewTypeC& c)
194 const double tol = 1.0e-14;
195 typedef typename ViewTypeC::value_type value_type;
196 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
197 Kokkos::deep_copy(h_c, c);
198 const size_t m = A.extent(0);
199 const size_t n = A.extent(1);
200 for (
size_t i=0; i<m; ++
i) {
203 std::cout <<
"Comparison failed! " << i <<
" : " << h_c(i) <<
" , " << t
// Checks the derivative components stored in c against expected values and
// prints a "Comparison failed!" line for mismatches.
//   A  matrix view; supplies m = extent(0), n = extent(1) and the scalar
//      dimension p = Kokkos::dimension_scalar(A)
//   b  vector view (not referenced in the lines visible here)
//   c  result view; copied to a host mirror before inspection
// NOTE(review): the return type and the actual comparison against tol
// (original line 223) are not visible in this excerpt.
209 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
211 check_deriv(
const ViewTypeA& A,
const ViewTypeB& b,
const ViewTypeC& c)
213 const double tol = 1.0e-14;
214 typedef typename ViewTypeC::value_type value_type;
// Bring the result to the host so it can be read element by element.
215 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
216 Kokkos::deep_copy(h_c, c);
217 const size_t m = A.extent(0);
218 const size_t n = A.extent(1);
219 const size_t p = Kokkos::dimension_scalar(A);
220 for (
size_t i=0; i<m; ++
i) {
221 for (
size_t j=0; j<
p; ++j) {
// Expected entry: n for the last scalar slot (j == p-1), 2*n for all others.
// Presumably the last slot holds the value and the rest hold derivatives of
// sum_j A(i,j)*b(j) with all values and derivatives set to 1 -- TODO confirm.
222 value_type t = (j == p-1 ? n : 2*n);
// Report row, component, computed derivative and expected value.
224 std::cout <<
"Comparison failed! " << i <<
"," << j <<
" : "
225 << h_c(i).fastAccessDx(j) <<
" , " << t << std::endl;
237 template <
typename FadType,
typename ... ViewArgs>
239 do_time_fad(
const size_t m,
const size_t n,
const size_t p,
const size_t nloop,
242 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
243 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
244 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
245 typedef typename ViewTypeA::execution_space execution_space;
247 ViewTypeA
A(
"A",m,n,p+1);
248 ViewTypeB b(
"B",n,p+1);
249 ViewTypeC
c(
"c",m,p+1);
252 for (
size_t k=0; k<
p; ++k)
253 a.fastAccessDx(k) = 1.0;
254 Kokkos::deep_copy(A, a);
255 Kokkos::deep_copy(b, a);
257 Kokkos::Timer wall_clock;
262 execution_space().fence();
265 for (
size_t l=0; l<nloop; l++) {
268 execution_space().fence();
270 perf.
time = wall_clock.seconds() / nloop;
281 template <
typename FadType,
typename ... ViewArgs>
284 const size_t nloop,
const bool check)
286 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
287 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
288 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
289 typedef typename ViewTypeA::execution_space execution_space;
// Select the Fad stride and the (possibly padded) Fad type used for the
// hierarchical views.
291 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL)
292 #if defined (KOKKOS_ENABLE_CUDA)
// CUDA: stride of 32 when running on the CUDA execution space, else 1.
294 const int FadStride = is_cuda ? 32 : 1;
295 #elif defined(KOKKOS_ENABLE_HIP)
// HIP: stride of 64 when running on the HIP execution space, else 1.
297 const int FadStride = is_hip ? 64 : 1;
// Presumably the #else fallback (directive not visible here): unit stride.
// Fixed: the initializer was missing its '=' and could not compile.
299 const int FadStride = 1;
301 #if defined(SACADO_ALIGN_SFAD)
// Round the static Fad size N and the runtime derivative count p up to the
// next multiple of FadStride.  p is only padded when N > 0.
303 const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
304 const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride :
p;
305 typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
// Without SACADO_ALIGN_SFAD the Fad type is used unchanged.
307 typedef FadType AlignedFadType;
// Non-hierarchical build: unit stride and the unmodified Fad type.
311 const int FadStride = 1;
312 typedef FadType AlignedFadType;
316 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL) || defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
321 typedef typename ViewTypeA::array_layout ConLayoutA;
322 typedef typename ViewTypeB::array_layout ConLayoutB;
323 typedef typename ViewTypeC::array_layout ConLayoutC;
328 typedef Kokkos::View<AlignedFadType**, ConLayoutA, execution_space> ConViewTypeA;
329 typedef Kokkos::View<AlignedFadType*, ConLayoutB, execution_space> ConViewTypeB;
330 typedef Kokkos::View<AlignedFadType*, ConLayoutC, execution_space> ConViewTypeC;
332 ConViewTypeA
A(
"A",m,n,pa+1);
333 ConViewTypeB b(
"B",n,pa+1);
334 ConViewTypeC
c(
"c",m,pa+1);
336 AlignedFadType
a(pa, 1.0);
337 for (
size_t k=0; k<pa; ++k)
338 a.fastAccessDx(k) = 1.0;
339 Kokkos::deep_copy(A, a);
340 Kokkos::deep_copy(b, a);
342 Kokkos::Timer wall_clock;
// Memory-pool sizing, compiled only when SACADO_KOKKOS_USE_MEMORY_POOL is
// defined.  NOTE(review): the call that consumes these values is not visible
// in this excerpt (presumably createGlobalMemoryPool) -- confirm.
345 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
346 const size_t concurrency = execution_space().concurrency();
348 #if defined (KOKKOS_ENABLE_CUDA)
349 const size_t warp_dim = is_cuda ? 32 : 1;
350 #elif defined (KOKKOS_ENABLE_HIP)
351 const size_t warp_dim = is_hip ? 64 : 1;
// Presumably the #else fallback (directive not visible here).
353 const size_t warp_dim = 1;
// One block holds pa doubles.
356 const size_t block_size = pa*
sizeof(double);
// One block is budgeted per group of warp_dim concurrent threads.
357 const size_t nkernels = concurrency / warp_dim;
// Total pool size: 1.2 * nkernels * block_size (20% over one block each).
358 const size_t mem_pool_size =
359 static_cast<size_t>(1.2*nkernels*block_size);
// Superblock: nkernels/100 blocks, but never fewer than one block.
360 const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
361 execution_space space;
371 execution_space().fence();
374 for (
size_t l=0; l<nloop; l++) {
377 execution_space().fence();
379 perf.
time = wall_clock.seconds() / nloop;
387 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
394 template <
typename ... ViewArgs>
399 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeA;
400 typedef Kokkos::View<
double*, ViewArgs...> ViewTypeB;
401 typedef Kokkos::View<
double*, ViewArgs...> ViewTypeC;
402 typedef typename ViewTypeA::execution_space execution_space;
404 ViewTypeA
A(
"A",m,n);
408 Kokkos::deep_copy(A, 1.0);
409 Kokkos::deep_copy(b, 1.0);
411 Kokkos::Timer wall_clock;
416 execution_space().fence();
419 for (
size_t l=0; l<nloop; l++) {
422 execution_space().fence();
424 perf.
time = wall_clock.seconds() / nloop;
436 const std::string& name)
438 std::cout << name <<
"\t "
439 << perf.
time <<
"\t "
446 typename ... ViewArgs>
457 const bool hierarchical,
461 perf_value.
time = 1.0;
471 if (sfad && p == SFadSize) {
473 do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,
p,nloop,
check);
478 if (slfad && p <= SLFadSize) {
480 do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,
p,nloop,
check);
487 do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,
p,nloop,
check);
493 if (sfad && ph == HierSFadSize) {
495 do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,ph,nloop,
check);
496 print_perf(perf, perf_value, ph,
"Hier SFad ");
498 if (slfad && ph <= HierSLFadSize) {
500 do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,ph,nloop,
check);
501 print_perf(perf, perf_value, ph,
"Hier SLFad");
505 do_time_fad_hierarchical<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,ph,nloop,
check);
506 print_perf(perf, perf_value, ph,
"Hier DFad ");
534 const bool hierarchical,
537 const std::string& device)
540 std::cout.setf(std::ios::scientific);
541 std::cout.precision(prec);
542 std::cout << std::endl
544 <<
" performance for layout "
545 << layout_names[layout]
546 <<
" m = " << m <<
" n = " << n <<
" p = " << p <<
" ph = " << ph
547 << std::endl << std::endl;
548 std::cout <<
"Computation \t Time \t Throughput \t Ratio" << std::endl;
551 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
552 m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check);
554 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
555 m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check);
557 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
558 (m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check);
563 std::stringstream cmd;
564 pid_t my_os_pid=getpid();
565 const std::string vtune_loc =
567 const std::string output_dir =
"./vtune";
569 <<
" -collect hotspots -result-dir " << output_dir
570 <<
" -target-pid " << my_os_pid <<
" &";
571 std::cout << cmd.str() << std::endl;
572 system(cmd.str().c_str());
577 const int SFadSize = 32;
580 const int HierSFadSize = 32;
583 int main(
int argc,
char* argv[]) {
589 clp.
setDocString(
"This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
591 clp.
setOption(
"m", &m,
"Number of matrix rows");
593 clp.
setOption(
"n", &n,
"Number of matrix columns");
595 clp.
setOption(
"p", &p,
"Number of derivative components");
597 clp.
setOption(
"ph", &ph,
"Number of derivative components for hierarchical");
599 clp.
setOption(
"nloop", &nloop,
"Number of loops");
600 #ifdef KOKKOS_ENABLE_SERIAL
602 clp.
setOption(
"serial",
"no-serial", &serial,
"Whether to run Serial");
604 #ifdef KOKKOS_ENABLE_OPENMP
606 clp.
setOption(
"openmp", &openmp,
"Number of OpenMP threads");
608 #ifdef KOKKOS_ENABLE_THREADS
610 clp.
setOption(
"threads", &threads,
"Number of pThreads threads");
612 #ifdef KOKKOS_ENABLE_CUDA
614 clp.
setOption(
"cuda",
"no-cuda", &cuda,
"Whether to run CUDA");
616 #ifdef KOKKOS_ENABLE_HIP
// Registers the --hip / --no-hip command-line switch.
// NOTE(review): this switch writes into `cuda`, whereas the serial, openmp,
// threads and cuda options each bind to their own flag.  The declaration on
// the preceding line is not visible in this excerpt; if a separate `hip` flag
// exists this looks like a copy-paste slip and should bind to it -- confirm.
618 clp.
setOption(
"hip",
"no-hip", &cuda,
"Whether to run HIP");
622 "Number of NUMA domains to use (set to 0 to use all NUMAs");
623 int cores_per_numa = 0;
624 clp.
setOption(
"cores-per-numa", &cores_per_numa,
625 "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
626 bool print_config =
false;
627 clp.
setOption(
"print-config",
"no-print-config", &print_config,
628 "Whether to print Kokkos device configuration");
630 clp.
setOption(
"layout", &layout, num_layout_types, layout_values,
631 layout_names,
"View layout");
633 clp.
setOption(
"vtune",
"no-vtune", &vtune,
"Profile with vtune");
635 clp.
setOption(
"value",
"no-value", &value,
"Run value calculation");
637 clp.
setOption(
"sfad",
"no-sfad", &sfad,
"Run SFad derivative calculation");
639 clp.
setOption(
"slfad",
"no-slfad", &slfad,
"Run SLFad derivative calculation");
641 clp.
setOption(
"dfad",
"no-dfad", &dfad,
"Run DFad derivative calculation");
642 bool hierarchical =
true;
643 clp.
setOption(
"hierarchical",
"no-hierarchical", &hierarchical,
"Run hierarchical Fad derivative calculation");
645 clp.
setOption(
"check",
"no-check", &check,
"Check calculations are correct");
648 switch (clp.
parse(argc, argv)) {
661 Kokkos::InitializationSettings init_args;
662 init_args.set_num_threads(cores_per_numa);
664 Kokkos::initialize(init_args);
667 Kokkos::print_configuration(std::cout,
true);
669 #ifdef KOKKOS_ENABLE_SERIAL
671 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
672 m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check,layout,
"Serial");
676 #ifdef KOKKOS_ENABLE_OPENMP
678 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
679 m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check,layout,
"OpenMP");
683 #ifdef KOKKOS_ENABLE_THREADS
685 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
686 m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check,layout,
"Threads");
690 #ifdef KOKKOS_ENABLE_CUDA
692 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
693 m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check,layout,
"Cuda");
697 #ifdef KOKKOS_ENABLE_HIP
699 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::HIP>(
700 m,n,
p,ph,nloop,
value,sfad,slfad,dfad,hierarchical,
check,layout,
"HIP");
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void do_times(const T x[], int nloop, Teuchos::Array< double > &times)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
void run_mat_vec_hierarchical(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
Base template specification for static size.
const char * layout_names[]
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool value, const bool analytic, const bool sfad, const bool slfad, const bool dfad, const bool flat, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
Perf do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
const int num_layout_types
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
const LayoutType layout_values[]
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
void setDocString(const char doc_string[])
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void destroyGlobalMemoryPool(const ExecSpace &space)
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)