30 #define SACADO_VIEW_CUDA_HIERARCHICAL 1
33 #define SACADO_ALIGN_SFAD 1
42 #include "impl/Kokkos_Timer.hpp"
45 #include <sys/types.h>
52 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
53 void run_mat_vec(
const ViewTypeA&
A,
const ViewTypeB& b,
const ViewTypeC&
c) {
54 typedef typename ViewTypeC::value_type scalar_type;
55 typedef typename ViewTypeC::execution_space execution_space;
57 const int m = A.extent(0);
58 const int n = A.extent(1);
60 Kokkos::RangePolicy<execution_space>( 0,m ),
61 KOKKOS_LAMBDA (
const int i) {
63 for (
int j=0; j<n; ++j)
70 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
72 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
75 typedef typename ViewTypeC::value_type scalar_type;
76 typedef typename ViewTypeC::execution_space execution_space;
78 #if defined (KOKKOS_ENABLE_CUDA)
79 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
81 const bool is_cuda =
false;
83 const unsigned vector_size = is_cuda ? 32 : 1;
84 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
86 const int m = A.extent(0);
87 const int n = A.extent(1);
88 const int range = (m+team_size-1)/team_size;
90 typedef Kokkos::TeamPolicy<execution_space> Policy;
92 Policy( range,team_size,vector_size ),
93 KOKKOS_LAMBDA (
const typename Policy::member_type& team) {
94 const int i = team.league_rank()*team.team_size() + team.team_rank();
99 for (
int j=0; j<n; ++j)
106 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL)
108 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
110 const ViewTypeC& c) {
111 typedef typename Kokkos::ThreadLocalScalarType<ViewTypeC>::type scalar_type;
112 typedef typename ViewTypeC::execution_space execution_space;
114 #if defined (KOKKOS_ENABLE_CUDA)
115 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
117 const bool is_cuda =
false;
119 const unsigned vector_size = is_cuda ? 32 : 1;
120 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
122 const int m = A.extent(0);
123 const int n = A.extent(1);
124 const int range = (m+team_size-1)/team_size;
126 typedef Kokkos::TeamPolicy<execution_space> Policy;
127 Kokkos::parallel_for(
128 Policy( range,team_size,vector_size ),
129 KOKKOS_LAMBDA (
const typename Policy::member_type& team) {
130 const int i = team.league_rank()*team.team_size() + team.team_rank();
135 for (
int j=0; j<n; ++j)
144 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
146 const ViewTypeC& c) {
147 typedef typename ViewTypeC::value_type scalar_type;
148 typedef typename ViewTypeC::execution_space execution_space;
150 #if defined (KOKKOS_ENABLE_CUDA)
151 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
153 const bool is_cuda =
false;
155 const unsigned vector_size = 1;
156 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
158 const int m = A.extent(0);
159 const int n = A.extent(1);
160 const int range = (m+team_size-1)/team_size;
162 typedef Kokkos::TeamPolicy<execution_space> Policy;
163 Kokkos::parallel_for(
164 Policy( range,team_size,vector_size ),
165 KOKKOS_LAMBDA (
const typename Policy::member_type& team) {
166 const int i = team.league_rank()*team.team_size() + team.team_rank();
171 for (
int j=0; j<n; ++j)
180 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
182 check_val(
const ViewTypeA& A,
const ViewTypeB& b,
const ViewTypeC& c)
184 const double tol = 1.0e-14;
185 typedef typename ViewTypeC::value_type value_type;
186 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
187 Kokkos::deep_copy(h_c, c);
188 const size_t m = A.extent(0);
189 const size_t n = A.extent(1);
190 for (
size_t i=0; i<m; ++i) {
193 std::cout <<
"Comparison failed! " << i <<
" : " << h_c(i) <<
" , " << t
199 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
201 check_deriv(
const ViewTypeA& A,
const ViewTypeB& b,
const ViewTypeC& c)
203 const double tol = 1.0e-14;
204 typedef typename ViewTypeC::value_type value_type;
205 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
206 Kokkos::deep_copy(h_c, c);
207 const size_t m = A.extent(0);
208 const size_t n = A.extent(1);
209 const size_t p = Kokkos::dimension_scalar(A);
210 for (
size_t i=0; i<m; ++i) {
211 for (
size_t j=0; j<p; ++j) {
212 value_type t = (j == p-1 ? n : 2*n);
214 std::cout <<
"Comparison failed! " << i <<
"," << j <<
" : "
215 << h_c(i).fastAccessDx(j) <<
" , " << t << std::endl;
227 template <
typename FadType,
typename ... ViewArgs>
229 do_time_fad(
const size_t m,
const size_t n,
const size_t p,
const size_t nloop,
232 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
233 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
234 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
235 typedef typename ViewTypeA::execution_space execution_space;
237 ViewTypeA
A(
"A",m,n,p+1);
238 ViewTypeB b(
"B",n,p+1);
239 ViewTypeC
c(
"c",m,p+1);
242 for (
size_t k=0; k<p; ++k)
243 a.fastAccessDx(k) = 1.0;
244 Kokkos::deep_copy(A, a);
245 Kokkos::deep_copy(b, a);
247 Kokkos::Impl::Timer wall_clock;
252 execution_space().fence();
255 for (
size_t l=0; l<nloop; l++) {
258 execution_space().fence();
260 perf.
time = wall_clock.seconds() / nloop;
261 perf.
flops = m*n*(2+4*p);
271 template <
typename FadType,
typename ... ViewArgs>
274 const size_t nloop,
const bool check)
276 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
277 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
278 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
279 typedef typename ViewTypeA::execution_space execution_space;
281 #if defined (KOKKOS_ENABLE_CUDA)
282 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
284 const bool is_cuda =
false;
287 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL)
288 const int FadStride = is_cuda ? 32 : 1;
289 #if defined(SACADO_ALIGN_SFAD)
291 const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
292 const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride : p;
293 typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
295 typedef FadType AlignedFadType;
299 const int FadStride = 1;
300 typedef FadType AlignedFadType;
304 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL) || defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
309 typedef typename ViewTypeA::array_layout ConLayoutA;
310 typedef typename ViewTypeB::array_layout ConLayoutB;
311 typedef typename ViewTypeC::array_layout ConLayoutC;
316 typedef Kokkos::View<AlignedFadType**, ConLayoutA, execution_space> ConViewTypeA;
317 typedef Kokkos::View<AlignedFadType*, ConLayoutB, execution_space> ConViewTypeB;
318 typedef Kokkos::View<AlignedFadType*, ConLayoutC, execution_space> ConViewTypeC;
320 ConViewTypeA
A(
"A",m,n,pa+1);
321 ConViewTypeB b(
"B",n,pa+1);
322 ConViewTypeC
c(
"c",m,pa+1);
324 AlignedFadType
a(pa, 1.0);
325 for (
size_t k=0; k<pa; ++k)
326 a.fastAccessDx(k) = 1.0;
327 Kokkos::deep_copy(A, a);
328 Kokkos::deep_copy(b, a);
330 Kokkos::Impl::Timer wall_clock;
333 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
334 const size_t concurrency = execution_space::concurrency();
335 const size_t warp_dim = is_cuda ? 32 : 1;
336 const size_t block_size = pa*
sizeof(double);
337 const size_t nkernels = concurrency / warp_dim;
338 const size_t mem_pool_size =
339 static_cast<size_t>(1.2*nkernels*block_size);
340 const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
341 execution_space space;
351 execution_space().fence();
354 for (
size_t l=0; l<nloop; l++) {
357 execution_space().fence();
359 perf.
time = wall_clock.seconds() / nloop;
360 perf.
flops = m*n*(2+4*p);
367 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
374 template <
typename ... ViewArgs>
379 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeA;
380 typedef Kokkos::View<
double*, ViewArgs...> ViewTypeB;
381 typedef Kokkos::View<
double*, ViewArgs...> ViewTypeC;
382 typedef typename ViewTypeA::execution_space execution_space;
384 ViewTypeA
A(
"A",m,n);
388 Kokkos::deep_copy(A, 1.0);
389 Kokkos::deep_copy(b, 1.0);
391 Kokkos::Impl::Timer wall_clock;
396 execution_space().fence();
399 for (
size_t l=0; l<nloop; l++) {
402 execution_space().fence();
404 perf.
time = wall_clock.seconds() / nloop;
416 const std::string& name)
418 std::cout << name <<
"\t "
419 << perf.
time <<
"\t "
426 typename ... ViewArgs>
437 const bool hierarchical,
441 perf_value.
time = 1.0;
451 if (sfad && p == SFadSize) {
453 do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,
check);
458 if (slfad && p <= SLFadSize) {
460 do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,
check);
467 do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,
check);
473 if (sfad && ph == HierSFadSize) {
475 do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,ph,nloop,
check);
476 print_perf(perf, perf_value, ph,
"Hier SFad ");
478 if (slfad && ph <= HierSLFadSize) {
480 do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,ph,nloop,
check);
481 print_perf(perf, perf_value, ph,
"Hier SLFad");
485 do_time_fad_hierarchical<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,ph,nloop,
check);
486 print_perf(perf, perf_value, ph,
"Hier DFad ");
514 const bool hierarchical,
517 const std::string& device)
520 std::cout.setf(std::ios::scientific);
521 std::cout.precision(prec);
522 std::cout << std::endl
524 <<
" performance for layout "
525 << layout_names[layout]
526 <<
" m = " << m <<
" n = " << n <<
" p = " << p <<
" ph = " << ph
527 << std::endl << std::endl;
528 std::cout <<
"Computation \t Time \t Throughput \t Ratio" << std::endl;
531 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
532 m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,
check);
534 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
535 m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,
check);
537 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
538 (m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,
check);
543 std::stringstream cmd;
544 pid_t my_os_pid=getpid();
545 const std::string vtune_loc =
547 const std::string output_dir =
"./vtune";
549 <<
" -collect hotspots -result-dir " << output_dir
550 <<
" -target-pid " << my_os_pid <<
" &";
551 std::cout << cmd.str() << std::endl;
552 system(cmd.str().c_str());
557 const int SFadSize = 32;
560 const int HierSFadSize = 32;
563 int main(
int argc,
char* argv[]) {
569 clp.
setDocString(
"This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
571 clp.
setOption(
"m", &m,
"Number of matrix rows");
573 clp.
setOption(
"n", &n,
"Number of matrix columns");
575 clp.
setOption(
"p", &p,
"Number of derivative components");
577 clp.
setOption(
"ph", &ph,
"Number of derivative components for hierarchical");
579 clp.
setOption(
"nloop", &nloop,
"Number of loops");
580 #ifdef KOKKOS_ENABLE_SERIAL
582 clp.
setOption(
"serial",
"no-serial", &serial,
"Whether to run Serial");
584 #ifdef KOKKOS_ENABLE_OPENMP
586 clp.
setOption(
"openmp", &openmp,
"Number of OpenMP threads");
588 #ifdef KOKKOS_ENABLE_THREADS
590 clp.
setOption(
"threads", &threads,
"Number of pThreads threads");
592 #ifdef KOKKOS_ENABLE_CUDA
594 clp.
setOption(
"cuda",
"no-cuda", &cuda,
"Whether to run CUDA");
598 "Number of NUMA domains to use (set to 0 to use all NUMAs");
599 int cores_per_numa = 0;
600 clp.
setOption(
"cores-per-numa", &cores_per_numa,
601 "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
602 bool print_config =
false;
603 clp.
setOption(
"print-config",
"no-print-config", &print_config,
604 "Whether to print Kokkos device configuration");
606 clp.
setOption(
"layout", &layout, num_layout_types, layout_values,
607 layout_names,
"View layout");
609 clp.
setOption(
"vtune",
"no-vtune", &vtune,
"Profile with vtune");
611 clp.
setOption(
"value",
"no-value", &value,
"Run value calculation");
613 clp.
setOption(
"sfad",
"no-sfad", &sfad,
"Run SFad derivative calculation");
615 clp.
setOption(
"slfad",
"no-slfad", &slfad,
"Run SLFad derivative calculation");
617 clp.
setOption(
"dfad",
"no-dfad", &dfad,
"Run DFad derivative calculation");
618 bool hierarchical =
true;
619 clp.
setOption(
"hierarchical",
"no-hierarchical", &hierarchical,
"Run hierarchical Fad derivative calculation");
621 clp.
setOption(
"check",
"no-check", &check,
"Check calculations are correct");
624 switch (clp.
parse(argc, argv)) {
637 Kokkos::InitArguments init_args;
638 init_args.num_threads = cores_per_numa;
639 init_args.num_numa = numa;
641 Kokkos::initialize(init_args);
644 Kokkos::print_configuration(std::cout,
true);
646 #ifdef KOKKOS_ENABLE_SERIAL
648 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
649 m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,
check,layout,
"Serial");
653 #ifdef KOKKOS_ENABLE_OPENMP
655 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
656 m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,
check,layout,
"OpenMP");
660 #ifdef KOKKOS_ENABLE_THREADS
662 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
663 m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,
check,layout,
"Threads");
667 #ifdef KOKKOS_ENABLE_CUDA
669 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
670 m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,
check,layout,
"Cuda");
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void do_times(const T x[], int nloop, Teuchos::Array< double > ×)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
void run_mat_vec_hierarchical(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
Base template specification for static size.
const char * layout_names[]
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool value, const bool analytic, const bool sfad, const bool slfad, const bool dfad, const bool flat, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
Perf do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
const int num_layout_types
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
const LayoutType layout_values[]
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
void setDocString(const char doc_string[])
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void destroyGlobalMemoryPool(const ExecSpace &space)
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)