doc/html/fad__kokkos__mat__vec__perf_8cpp_source.html

 // @HEADER

 // ***********************************************************************

 //

 //                           Sacado Package

 //                 Copyright (2006) Sandia Corporation

 //

 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,

 // the U.S. Government retains certain rights in this software.

 //

 // This library is free software; you can redistribute it and/or modify

 // it under the terms of the GNU Lesser General Public License as

 // published by the Free Software Foundation; either version 2.1 of the

 // License, or (at your option) any later version.

 //

 // This library is distributed in the hope that it will be useful, but

 // WITHOUT ANY WARRANTY; without even the implied warranty of

 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU

 // Lesser General Public License for more details.

 //

 // You should have received a copy of the GNU Lesser General Public

 // License along with this library; if not, write to the Free Software

 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301

 // USA

 // Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps

 // (etphipp@sandia.gov).

 //

 // ***********************************************************************

 // @HEADER


 //#define SACADO_VIEW_CUDA_HIERARCHICAL 1

 #define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1

 #define SACADO_KOKKOS_USE_MEMORY_POOL 1

 //#define SACADO_ALIGN_SFAD 1


 //#define SACADO_DISABLE_FAD_VIEW_SPEC

 #include "Sacado.hpp"


 #include "Teuchos_CommandLineProcessor.hpp"

 #include "Teuchos_StandardCatchMacros.hpp"

 #include "Teuchos_Time.hpp"


 #include "impl/Kokkos_Timer.hpp"


 // For vtune

 #include <sys/types.h>

 #include <unistd.h>

 #include <algorithm>


 // A performance test that computes the derivative of a simple Kokkos kernel

 // using various Fad classes


 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>

 void run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {

   typedef typename ViewTypeC::value_type scalar_type;

   typedef typename ViewTypeC::execution_space execution_space;


   const int m = A.extent(0);

   const int n = A.extent(1);

   Kokkos::parallel_for(

     Kokkos::RangePolicy<execution_space>( 0,m ),

     KOKKOS_LAMBDA (const int i) {

       scalar_type t = 0.0;

       for (int j=0; j<n; ++j)

         t += A(i,j)*b(j);

       c(i) = t;

     }

   );

 }


 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)


 // Team-policy version of the mat-vec kernel c = A*b, compiled when
 // SACADO_VIEW_CUDA_HIERARCHICAL_DFAD is defined.
 //
 // Each team thread handles one row: row = league_rank*team_size + team_rank.
 // When the execution space is Kokkos::Cuda the policy uses a vector length of
 // 32 and a team size of 128/32; otherwise both are 1.
 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
 void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
                               const ViewTypeC& c) {
   using scalar_type     = typename ViewTypeC::value_type;
   using execution_space = typename ViewTypeC::execution_space;
   using Policy          = Kokkos::TeamPolicy<execution_space>;

 #if defined (KOKKOS_ENABLE_CUDA)
   const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
 #else
   const bool is_cuda = false;
 #endif
   const unsigned vector_size = is_cuda ? 32 : 1;
   const unsigned team_size = is_cuda ? 128 / vector_size : 1;

   const int num_rows = A.extent(0);
   const int num_cols = A.extent(1);
   // Number of teams needed to cover every row (rounded up).
   const int league_size = (num_rows+team_size-1)/team_size;

   Kokkos::parallel_for(
     Policy( league_size,team_size,vector_size ),
     KOKKOS_LAMBDA (const typename Policy::member_type& team) {
       const int row = team.league_rank()*team.team_size() + team.team_rank();
       // The last team may extend past the final row; those threads do nothing.
       if (row < num_rows) {
         scalar_type sum = 0.0;
         for (int col=0; col<num_cols; ++col)
           sum += A(row,col)*b(col);
         c(row) = sum;
       }
     }
   );
 }


 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL)


 // Team-policy version of the mat-vec kernel c = A*b, compiled when
 // SACADO_VIEW_CUDA_HIERARCHICAL is defined (and the DFAD variant is not).
 //
 // Identical in structure to the other variants, except that the accumulator
 // type is Kokkos::ThreadLocalScalarType<ViewTypeC>::type rather than the
 // view's value_type.  Each team thread handles one row:
 // row = league_rank*team_size + team_rank.
 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
 void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
                               const ViewTypeC& c) {
   using scalar_type     = typename Kokkos::ThreadLocalScalarType<ViewTypeC>::type;
   using execution_space = typename ViewTypeC::execution_space;
   using Policy          = Kokkos::TeamPolicy<execution_space>;

 #if defined (KOKKOS_ENABLE_CUDA)
   const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
 #else
   const bool is_cuda = false;
 #endif
   const unsigned vector_size = is_cuda ? 32 : 1;
   const unsigned team_size = is_cuda ? 128 / vector_size : 1;

   const int num_rows = A.extent(0);
   const int num_cols = A.extent(1);
   // Number of teams needed to cover every row (rounded up).
   const int league_size = (num_rows+team_size-1)/team_size;

   Kokkos::parallel_for(
     Policy( league_size,team_size,vector_size ),
     KOKKOS_LAMBDA (const typename Policy::member_type& team) {
       const int row = team.league_rank()*team.team_size() + team.team_rank();
       // The last team may extend past the final row; those threads do nothing.
       if (row < num_rows) {
         scalar_type sum = 0.0;
         for (int col=0; col<num_cols; ++col)
           sum += A(row,col)*b(col);
         c(row) = sum;
       }
     }
   );
 }


 #else


 // Team-policy version of the mat-vec kernel c = A*b, compiled when neither
 // hierarchical macro is defined.
 //
 // The vector length is always 1; the team size is 128 when the execution
 // space is Kokkos::Cuda and 1 otherwise.  Each team thread handles one row:
 // row = league_rank*team_size + team_rank.
 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
 void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
                               const ViewTypeC& c) {
   using scalar_type     = typename ViewTypeC::value_type;
   using execution_space = typename ViewTypeC::execution_space;
   using Policy          = Kokkos::TeamPolicy<execution_space>;

 #if defined (KOKKOS_ENABLE_CUDA)
   const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
 #else
   const bool is_cuda = false;
 #endif
   const unsigned vector_size = 1;
   const unsigned team_size = is_cuda ? 128 / vector_size : 1;

   const int num_rows = A.extent(0);
   const int num_cols = A.extent(1);
   // Number of teams needed to cover every row (rounded up).
   const int league_size = (num_rows+team_size-1)/team_size;

   Kokkos::parallel_for(
     Policy( league_size,team_size,vector_size ),
     KOKKOS_LAMBDA (const typename Policy::member_type& team) {
       const int row = team.league_rank()*team.team_size() + team.team_rank();
       // The last team may extend past the final row; those threads do nothing.
       if (row < num_rows) {
         scalar_type sum = 0.0;
         for (int col=0; col<num_cols; ++col)
           sum += A(row,col)*b(col);
         c(row) = sum;
       }
     }
   );
 }


 #endif


 // Verifies the value-only mat-vec result.
 //
 // Copies c to a host mirror and compares every entry against the number of
 // columns of A (the expected result when A and b are filled with ones, as
 // do_time_val does).  Each entry that differs by more than 1e-14 is reported
 // on std::cout; nothing is returned.  b is accepted for signature symmetry
 // but is not read.
 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
 void
 check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
 {
   using value_type = typename ViewTypeC::value_type;
   const double tol = 1.0e-14;

   // Bring the result to the host before inspecting it.
   typename ViewTypeC::HostMirror host_c = Kokkos::create_mirror_view(c);
   Kokkos::deep_copy(host_c, c);

   const size_t num_rows = A.extent(0);
   const size_t num_cols = A.extent(1);

   for (size_t row=0; row<num_rows; ++row) {
     value_type expected = num_cols;
     const bool mismatch = std::abs(host_c(row)- expected) > tol;
     if (mismatch) {
       std::cout << "Comparison failed!  " << row << " : " << host_c(row)
                 << " , " << expected
                 << std::endl;
     }
   }
 }


 // Verifies the Fad mat-vec result.
 //
 // Copies c to a host mirror and, for every row i and every index j below
 // Kokkos::dimension_scalar(A), compares h_c(i).fastAccessDx(j) against
 //   2*n  for j <  p-1
 //   n    for j == p-1
 // where n is the number of columns of A.  Mismatches larger than 1e-14 are
 // reported on std::cout; nothing is returned.  b is not read.
 //
 // NOTE(review): the callers allocate their views with a trailing extent of
 // (number of derivatives + 1), so the j == p-1 slot presumably aliases the
 // Fad value rather than a derivative component -- confirm against Sacado's
 // view layout.
 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
 void
 check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
 {
   using value_type = typename ViewTypeC::value_type;
   const double tol = 1.0e-14;

   // Bring the result to the host before inspecting it.
   typename ViewTypeC::HostMirror host_c = Kokkos::create_mirror_view(c);
   Kokkos::deep_copy(host_c, c);

   const size_t num_rows = A.extent(0);
   const size_t num_cols = A.extent(1);
   const size_t num_slots = Kokkos::dimension_scalar(A);

   for (size_t row=0; row<num_rows; ++row) {
     for (size_t slot=0; slot<num_slots; ++slot) {
       const size_t expected_count =
         (slot == num_slots-1 ? num_cols : 2*num_cols);
       value_type expected = expected_count;
       if (std::abs(host_c(row).fastAccessDx(slot)- expected) > tol) {
         std::cout << "Comparison failed!  " << row << "," << slot << " : "
                   << host_c(row).fastAccessDx(slot) << " , " << expected
                   << std::endl;
       }
     }
   }
 }


 // Timing results for one benchmarked kernel configuration.
 //
 // All members are zero-initialized so that a default-constructed Perf never
 // carries indeterminate values (do_times creates one and sets only `time`).
 struct Perf {
   double time = 0.0;        // average wall-clock seconds per kernel launch
   double flops = 0.0;       // estimated floating-point operations per launch
   double throughput = 0.0;  // flops / time / 1e9
 };


 // Times the flat (range-policy) mat-vec kernel for a given Fad type.
 //
 //   m, n   : matrix rows / columns
 //   p      : number of derivative components
 //   nloop  : number of timed kernel launches (after one warm-up launch)
 //   check  : when true, run check_deriv on the result afterwards
 //
 // A and b are filled with a Fad whose value and all p derivative components
 // are 1.0.  Returns the average time per launch, the flop estimate
 // m*n*(2+4*p), and the resulting throughput (flops/time/1e9).
 template <typename FadType, typename ... ViewArgs>
 Perf
 do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop,
             const bool check)
 {
   using ViewTypeA       = Kokkos::View<FadType**, ViewArgs...>;
   using ViewTypeB       = Kokkos::View<FadType*,  ViewArgs...>;
   using ViewTypeC       = Kokkos::View<FadType*,  ViewArgs...>;
   using execution_space = typename ViewTypeA::execution_space;

   // The trailing extent holds the p derivative components plus the value.
   ViewTypeA A("A",m,n,p+1);
   ViewTypeB b("B",n,p+1);
   ViewTypeC c("c",m,p+1);

   // Seed every entry of A and b with value 1 and unit derivatives.
   FadType seed(p, 1.0);
   for (size_t comp=0; comp<p; ++comp)
     seed.fastAccessDx(comp) = 1.0;
   Kokkos::deep_copy(A, seed);
   Kokkos::deep_copy(b, seed);

   Kokkos::Impl::Timer timer;

   // Execute the kernel once to warm up
   run_mat_vec( A, b, c );
   execution_space::fence();

   // Timed region: nloop back-to-back launches followed by a single fence.
   timer.reset();
   for (size_t iter=0; iter<nloop; iter++)
     run_mat_vec( A, b, c );
   execution_space::fence();
   const double elapsed = timer.seconds();

   Perf perf;
   perf.time = elapsed / nloop;
   perf.flops = m*n*(2+4*p);
   perf.throughput = perf.flops / perf.time / 1.0e9;

   if (check)
     check_deriv(A, b, c);

   return perf;
 }


 // Times the hierarchical (team-policy) mat-vec kernel for a given Fad type.
 //
 //   m, n   : matrix rows / columns
 //   p      : number of derivative components
 //   nloop  : number of timed kernel launches (after one warm-up launch)
 //   check  : when true, run check_deriv on the result afterwards
 //
 // Depending on the SACADO_VIEW_CUDA_HIERARCHICAL* / SACADO_ALIGN_SFAD macros,
 // the views use a LayoutContiguous wrapper around the requested layout and
 // the derivative count may be rounded up to a multiple of the Fad stride.
 // When SACADO_KOKKOS_USE_MEMORY_POOL is defined, a Sacado global memory pool
 // is created before the kernels run and destroyed before returning.
 //
 // Returns the average time per launch, the flop estimate m*n*(2+4*p) (based
 // on the requested p, not the padded count), and flops/time/1e9.
 template <typename FadType, typename ... ViewArgs>
 Perf
 do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p,
                          const size_t nloop, const bool check)
 {
   using ViewTypeA       = Kokkos::View<FadType**, ViewArgs...>;
   using ViewTypeB       = Kokkos::View<FadType*,  ViewArgs...>;
   using ViewTypeC       = Kokkos::View<FadType*,  ViewArgs...>;
   using execution_space = typename ViewTypeA::execution_space;

 #if defined (KOKKOS_ENABLE_CUDA)
   const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
 #else
   const bool is_cuda = false;
 #endif

   // Select the Fad stride, the (possibly size-aligned) Fad type, and the
   // (possibly padded) derivative count.
 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL)
   const int fad_stride = is_cuda ? 32 : 1;
 #if defined(SACADO_ALIGN_SFAD)
   const int static_size = Sacado::StaticSize<FadType>::value;
   const int aligned_size = ((static_size+fad_stride-1)/fad_stride)*fad_stride;
   const size_t padded_p =
     static_size > 0 ? ((p+fad_stride-1)/fad_stride)*fad_stride : p;
   using AlignedFadType = typename FadType::template apply_N<aligned_size>::type;
 #else
   using AlignedFadType = FadType;
   const size_t padded_p = p;
 #endif
 #else
   const int fad_stride = 1;
   using AlignedFadType = FadType;
   const size_t padded_p = p;
 #endif

   // Select the view layouts.
 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL) || defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
   using ConLayoutA = Kokkos::LayoutContiguous<typename ViewTypeA::array_layout,fad_stride>;
   using ConLayoutB = Kokkos::LayoutContiguous<typename ViewTypeB::array_layout,fad_stride>;
   using ConLayoutC = Kokkos::LayoutContiguous<typename ViewTypeC::array_layout,fad_stride>;
 #else
   using ConLayoutA = typename ViewTypeA::array_layout;
   using ConLayoutB = typename ViewTypeB::array_layout;
   using ConLayoutC = typename ViewTypeC::array_layout;
   (void) fad_stride;
 #endif

   using ConViewTypeA = Kokkos::View<AlignedFadType**, ConLayoutA, execution_space>;
   using ConViewTypeB = Kokkos::View<AlignedFadType*,  ConLayoutB, execution_space>;
   using ConViewTypeC = Kokkos::View<AlignedFadType*,  ConLayoutC, execution_space>;

   // The trailing extent holds the derivative components plus the value.
   ConViewTypeA A("A",m,n,padded_p+1);
   ConViewTypeB b("B",n,padded_p+1);
   ConViewTypeC c("c",m,padded_p+1);

   // Seed every entry of A and b with value 1 and unit derivatives.
   AlignedFadType seed(padded_p, 1.0);
   for (size_t comp=0; comp<padded_p; ++comp)
     seed.fastAccessDx(comp) = 1.0;
   Kokkos::deep_copy(A, seed);
   Kokkos::deep_copy(b, seed);

   Kokkos::Impl::Timer timer;

 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
   // Size the pool from the execution space's concurrency: one block of
   // padded_p doubles per (concurrency / warp_dim), with a 1.2x margin.
   const size_t concurrency = execution_space::concurrency();
   const size_t warp_dim = is_cuda ? 32 : 1;
   const size_t block_size = padded_p*sizeof(double);
   const size_t nkernels = concurrency / warp_dim;
   const size_t mem_pool_size =
     static_cast<size_t>(1.2*nkernels*block_size);
   const size_t superblock_size =
     std::max<size_t>(nkernels / 100, 1) * block_size;
   execution_space space;
   Sacado::createGlobalMemoryPool(space, mem_pool_size,
                                  block_size, block_size, superblock_size);
 #endif

   // Execute the kernel once to warm up
   run_mat_vec_hierarchical( A, b, c );
   execution_space::fence();

   // Timed region: nloop back-to-back launches followed by a single fence.
   timer.reset();
   for (size_t iter=0; iter<nloop; iter++)
     run_mat_vec_hierarchical( A, b, c );
   execution_space::fence();
   const double elapsed = timer.seconds();

   Perf perf;
   perf.time = elapsed / nloop;
   perf.flops = m*n*(2+4*p);
   perf.throughput = perf.flops / perf.time / 1.0e9;

   if (check)
     check_deriv(A, b, c);

 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
   Sacado::destroyGlobalMemoryPool(space);
 #endif

   return perf;
 }


 // Times the flat (range-policy) mat-vec kernel on plain doubles; this is the
 // baseline the Fad timings are compared against.
 //
 //   m, n   : matrix rows / columns
 //   nloop  : number of timed kernel launches (after one warm-up launch)
 //   check  : when true, run check_val on the result afterwards
 //
 // A and b are filled with 1.0.  Returns the average time per launch, the
 // flop estimate m*n*2, and the resulting throughput (flops/time/1e9).
 template <typename ... ViewArgs>
 Perf
 do_time_val(const size_t m, const size_t n, const size_t nloop,
             const bool check)
 {
   using ViewTypeA       = Kokkos::View<double**, ViewArgs...>;
   using ViewTypeB       = Kokkos::View<double*,  ViewArgs...>;
   using ViewTypeC       = Kokkos::View<double*,  ViewArgs...>;
   using execution_space = typename ViewTypeA::execution_space;

   ViewTypeA A("A",m,n);
   ViewTypeB b("B",n);
   ViewTypeC c("c",m);

   Kokkos::deep_copy(A, 1.0);
   Kokkos::deep_copy(b, 1.0);

   Kokkos::Impl::Timer timer;

   // Execute the kernel once to warm up
   run_mat_vec( A, b, c );
   execution_space::fence();

   // Timed region: nloop back-to-back launches followed by a single fence.
   timer.reset();
   for (size_t iter=0; iter<nloop; iter++)
     run_mat_vec( A, b, c );
   execution_space::fence();
   const double elapsed = timer.seconds();

   Perf perf;
   perf.time = elapsed / nloop;
   perf.flops = m*n*2;
   perf.throughput = perf.flops / perf.time / 1.0e9;

   if (check) {
     check_val(A,b,c);
   }

   return perf;
 }


 // Prints one tab-separated result row to std::cout: the label, the time per
 // launch, the throughput, and the ratio perf.time / (perf_base.time * p),
 // i.e. the cost relative to the baseline scaled by the derivative count.
 void
 print_perf(const Perf& perf, const Perf& perf_base, const size_t p,
            const std::string& name)
 {
   const double ratio = perf.time / (perf_base.time*p);
   std::cout << name << "\t " << perf.time << "\t " << perf.throughput << "\t "
             << ratio << std::endl;
 }


 template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,

           typename ... ViewArgs>

 void

 do_times(const size_t m,

          const size_t n,

          const size_t p,

          const size_t ph,

          const size_t nloop,

          const bool value,

          const bool sfad,

          const bool slfad,

          const bool dfad,

          const bool hierarchical,

          const bool check)

 {

   Perf perf_value;

   perf_value.time = 1.0;


   // Run value

   if (value) {

     Perf perf = do_time_val<ViewArgs...>(m,n,nloop,check);

     perf_value = perf;

     print_perf(perf, perf_value, p, "Value     ");

   }


   // Run SFad

   if (sfad && p == SFadSize) {

     Perf perf =

       do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,check);

     print_perf(perf, perf_value, p, "SFad      ");

   }


   // Run SLFad

   if (slfad && p <= SLFadSize) {

     Perf perf =

       do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,check);

     print_perf(perf, perf_value, p, "SLFad     ");

   }


   // Run DFad

   if (dfad) {

     Perf perf =

       do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);

     print_perf(perf, perf_value, p, "DFad      ");

   }


   // Run hierarchical

   if (hierarchical) {

     if (sfad && ph == HierSFadSize) {

       Perf perf =

         do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,ph,nloop,check);

       print_perf(perf, perf_value, ph, "Hier SFad ");

     }

     if (slfad && ph <= HierSLFadSize) {

       Perf perf =

         do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,ph,nloop,check);

       print_perf(perf, perf_value, ph, "Hier SLFad");

     }

     if (dfad) {

       Perf perf =

         do_time_fad_hierarchical<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,ph,nloop,check);

       print_perf(perf, perf_value, ph, "Hier DFad ");

     }

   }


 }


 // View layouts selectable through the "layout" command-line option.
 enum LayoutType {
   LAYOUT_LEFT = 0,
   LAYOUT_RIGHT,
   LAYOUT_DEFAULT
 };

 // Enumerators and their command-line spellings; the two tables are
 // index-aligned, and layout_names is also indexed directly by a LayoutType.
 const LayoutType layout_values[] = {
   LAYOUT_LEFT,
   LAYOUT_RIGHT,
   LAYOUT_DEFAULT
 };
 const char *layout_names[] = {
   "left",
   "right",
   "default"
 };

 // Number of entries in the tables above.
 const int num_layout_types = sizeof(layout_values) / sizeof(layout_values[0]);


 // Prints the report header for one device/layout combination and dispatches
 // to do_times with the matching Kokkos layout.
 //
 // LAYOUT_LEFT and LAYOUT_RIGHT pass Kokkos::LayoutLeft / Kokkos::LayoutRight
 // explicitly; any other value passes only the device, leaving the layout to
 // the view's default.  As a side effect std::cout is switched to scientific
 // notation with a precision of 2.
 template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
           typename Device>
 void
 do_times_layout(const size_t m,
                 const size_t n,
                 const size_t p,
                 const size_t ph,
                 const size_t nloop,
                 const bool value,
                 const bool sfad,
                 const bool slfad,
                 const bool dfad,
                 const bool hierarchical,
                 const bool check,
                 const LayoutType& layout,
                 const std::string& device)
 {
   std::cout.setf(std::ios::scientific);
   std::cout.precision(2);

   // Header: which device/layout/problem size the following rows refer to.
   std::cout << std::endl
             << device << " performance for layout " << layout_names[layout]
             << " m = " << m << " n = " << n << " p = " << p << " ph = " << ph
             << std::endl << std::endl;
   std::cout << "Computation \t Time     \t Throughput \t Ratio" << std::endl;

   switch (layout) {
     case LAYOUT_LEFT:
       do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
         m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
       break;
     case LAYOUT_RIGHT:
       do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
         m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
       break;
     default:
       do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>(
         m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
       break;
   }
 }


 // Connect executable to vtune for profiling.
 //
 // Builds an "amplxe-cl -collect hotspots" command targeting this process's
 // pid with results written to ./vtune, echoes it to std::cout, launches it in
 // the background through the shell, and then runs "sleep 10" (presumably to
 // give the collector time to attach before the benchmark starts).  The exit
 // status of both shell commands is ignored.
 void connect_vtune() {
   const std::string vtune_loc = "amplxe-cl";
   const std::string output_dir = "./vtune";
   const pid_t my_os_pid = getpid();

   std::stringstream cmd;
   cmd << vtune_loc;
   cmd << " -collect hotspots -result-dir " << output_dir;
   cmd << " -target-pid " << my_os_pid << " &";

   const std::string command = cmd.str();
   std::cout << command << std::endl;
   system(command.c_str());
   system("sleep 10");
 }


 // Compile-time Fad sizes used as template arguments by main.
 // The flat SFad/SLFad runs share one size, as do the hierarchical ones; these
 // are also the defaults of the "p" and "ph" command-line options.
 constexpr int SFadSize      = 8;
 constexpr int SLFadSize     = SFadSize;
 constexpr int HierSFadSize  = 50;
 constexpr int HierSLFadSize = HierSFadSize;


 // Program entry point.
 //
 // Parses the command-line options, validates the problem sizes, optionally
 // attaches vtune, and then, for every Kokkos backend that is both compiled in
 // and requested, initializes Kokkos, runs the timing suite through
 // do_times_layout, and finalizes Kokkos.
 //
 // Returns 0 when help was printed, 1 on a parse error or invalid sizes, and
 // otherwise !success, where `success` is the flag handed to
 // TEUCHOS_STANDARD_CATCH_STATEMENTS (presumably cleared when an exception is
 // caught -- confirm against the Teuchos macro).
 int main(int argc, char* argv[]) {
   bool success = true;
   try {

     // Set up command line options
     Teuchos::CommandLineProcessor clp(false);
     clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
     int m = 100000;
     clp.setOption("m", &m, "Number of matrix rows");
     int n = 100;
     clp.setOption("n", &n, "Number of matrix columns");
     int p = SFadSize;
     clp.setOption("p", &p, "Number of derivative components");
     int ph = HierSFadSize;
     clp.setOption("ph", &ph, "Number of derivative components for hierarchical");
     int nloop = 10;
     clp.setOption("nloop", &nloop, "Number of loops");
 #ifdef KOKKOS_ENABLE_SERIAL
     // NOTE(review): this option is parsed, but the Serial run below is
     // commented out, so it currently has no effect.
     bool serial = false;
     clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
 #endif
 #ifdef KOKKOS_ENABLE_OPENMP
     int openmp = 0;
     clp.setOption("openmp", &openmp, "Number of OpenMP threads");
 #endif
 #ifdef KOKKOS_ENABLE_THREADS
     int threads = 0;
     clp.setOption("threads", &threads, "Number of pThreads threads");
 #endif
 #ifdef KOKKOS_ENABLE_CUDA
     bool cuda = false;
     clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
 #endif
     int numa = 0;
     clp.setOption("numa", &numa,
                   "Number of NUMA domains to use (set to 0 to use all NUMAs)");
     // NOTE(review): parsed but not forwarded to Kokkos anywhere in this
     // function.
     int cores_per_numa = 0;
     clp.setOption("cores-per-numa", &cores_per_numa,
                   "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
     bool print_config = false;
     clp.setOption("print-config", "no-print-config", &print_config,
                   "Whether to print Kokkos device configuration");
     LayoutType layout = LAYOUT_DEFAULT;
     clp.setOption("layout", &layout, num_layout_types, layout_values,
                   layout_names, "View layout");
     bool vtune = false;
     clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune");
     bool value = true;
     clp.setOption("value", "no-value", &value, "Run value calculation");
     bool sfad = true;
     clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation");
     bool slfad = true;
     clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation");
     bool dfad = true;
     clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation");
     bool hierarchical = true;
     clp.setOption("hierarchical", "no-hierarchical", &hierarchical, "Run hierarchical Fad derivative calculation");
     bool check = false;
     clp.setOption("check", "no-check", &check, "Check calculations are correct");

     // Parse options
     switch (clp.parse(argc, argv)) {
       case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED:
         return 0;
       case Teuchos::CommandLineProcessor::PARSE_ERROR:
       case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION:
         return 1;
       case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL:
         break;
     }

     // The sizes are parsed as int but consumed as size_t: a negative value
     // would silently wrap to an enormous size, and a non-positive nloop would
     // make the per-launch time a division by zero.  Reject both up front.
     if (m < 0 || n < 0 || p < 0 || ph < 0 || nloop <= 0) {
       std::cerr << "Error: m, n, p and ph must be non-negative and nloop "
                 << "must be positive" << std::endl;
       return 1;
     }

     if (vtune)
       connect_vtune();

     // Serial backend is currently disabled.
 // #ifdef KOKKOS_ENABLE_SERIAL
 //     if (serial) {
 //       Kokkos::initialize();
 //       if (print_config)
 //         Kokkos::print_configuration(std::cout, true);
 //       do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
 //         m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Serial");
 //       Kokkos::finalize();
 //     }
 // #endif

 #ifdef KOKKOS_ENABLE_OPENMP
     // A non-zero thread count both enables the OpenMP run and sizes it.
     if (openmp) {
       Kokkos::InitArgs init_args;
       init_args.num_threads = openmp;
       init_args.num_numa = numa;
       Kokkos::initialize( init_args );
       if (print_config)
         Kokkos::print_configuration(std::cout, true);
       do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
         m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"OpenMP");
       Kokkos::finalize();
     }
 #endif

 #ifdef KOKKOS_ENABLE_THREADS
     // A non-zero thread count both enables the Threads run and sizes it.
     if (threads) {
       Kokkos::InitArgs init_args;
       init_args.num_threads = threads;
       init_args.num_numa = numa;
       Kokkos::initialize( init_args );
       if (print_config)
         Kokkos::print_configuration(std::cout, true);
       do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
         m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Threads");
       Kokkos::finalize();
     }
 #endif

 #ifdef KOKKOS_ENABLE_CUDA
     if (cuda) {
       Kokkos::initialize();
       if (print_config)
         Kokkos::print_configuration(std::cout, true);
       do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
         m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Cuda");
       Kokkos::finalize();
     }
 #endif

   }
   TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);

   return !success;
 }

layout_names
const char * layout_names[]
Definition: fad_kokkos_mat_vec_perf.cpp:500

Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED

abs
abs(expr.val())

run_mat_vec
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: fad_kokkos_mat_vec_perf.cpp:53

do_times
void do_times(const T x[], int nloop, Teuchos::Array< double > &times)
Definition: fad_expr_depth.cpp:55

Perf::flops
double flops
Definition: fad_kokkos_mat_vec_perf.cpp:223

HierSLFadSize
const int HierSLFadSize
Definition: fad_kokkos_mat_vec_perf.cpp:559

Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION

Sacado::createGlobalMemoryPool
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
Definition: Sacado_DynamicArrayTraits.hpp:51

run_mat_vec_hierarchical
void run_mat_vec_hierarchical(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: fad_kokkos_mat_vec_perf.cpp:73

Perf
Definition: fad_kokkos_mat_vec_perf.cpp:221

print_perf
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)
Definition: fad_kokkos_mat_vec_perf.cpp:415

Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL

SFadSize
const int SFadSize
Definition: fad_kokkos_mat_vec_perf.cpp:556

Perf::time
double time
Definition: fad_kokkos_mat_vec_perf.cpp:222

Teuchos_Time.hpp

do_time_fad
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Definition: fad_kokkos_mat_vec_perf.cpp:229

FadType
Sacado::Fad::DFad< double > FadType
Definition: blas_example.cpp:49

A
Definition: ConversionTests.cpp:42

do_times_layout
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t ph, const size_t nloop, const bool value, const bool sfad, const bool slfad, const bool dfad, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
Definition: fad_kokkos_mat_vec_perf.cpp:505

Sacado::StaticSize
Base template specification for static size.
Definition: Sacado_Traits.hpp:409

check
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)

Sacado.hpp

a
a
Definition: Sacado_CacheFad_Ops.hpp:426

connect_vtune
void connect_vtune()
Definition: fad_kokkos_mat_vec_perf.cpp:542

c
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
Definition: Sacado_LFad_LogicalSparseOps.hpp:452

A
#define A
Definition: Sacado_rad.hpp:572

Sacado::Fad::DFad< double >

Teuchos::CommandLineProcessor::setOption
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)

TEUCHOS_STANDARD_CATCH_STATEMENTS
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)

do_time_fad_hierarchical
Perf do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Definition: fad_kokkos_mat_vec_perf.cpp:273

layout_values
const LayoutType layout_values[]
Definition: fad_kokkos_mat_vec_perf.cpp:498

main
int main()
Definition: ad_example.cpp:191

Teuchos::CommandLineProcessor::parse
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const

do_time_val
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
Definition: fad_kokkos_mat_vec_perf.cpp:376

Teuchos_StandardCatchMacros.hpp

Kokkos::LayoutContiguous
Definition: Kokkos_LayoutContiguous.hpp:41

void
void
Definition: uninit.c:96

Perf::throughput
double throughput
Definition: fad_kokkos_mat_vec_perf.cpp:224

fastAccessDx
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp

LAYOUT_DEFAULT
Definition: fad_kokkos_mat_vec_perf.cpp:495

Teuchos::CommandLineProcessor::PARSE_ERROR

Teuchos_CommandLineProcessor.hpp

Teuchos::CommandLineProcessor::setDocString
void setDocString(const char doc_string[])

HierSFadSize
const int HierSFadSize
Definition: fad_kokkos_mat_vec_perf.cpp:558

check_deriv
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: fad_kokkos_mat_vec_perf.cpp:201

LAYOUT_RIGHT
Definition: fad_kokkos_mat_vec_perf.cpp:494

tol
const double tol
Definition: tradoptest_01.cpp:61

SLFadSize
const int SLFadSize
Definition: fad_kokkos_mat_vec_perf.cpp:557

check_val
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Definition: fad_kokkos_mat_vec_perf.cpp:182

LAYOUT_LEFT
Definition: fad_kokkos_mat_vec_perf.cpp:493

LayoutType
LayoutType
Definition: fad_kokkos_mat_vec_perf.cpp:492

Sacado::destroyGlobalMemoryPool
void destroyGlobalMemoryPool(const ExecSpace &space)
Definition: Sacado_DynamicArrayTraits.hpp:59

num_layout_types
const int num_layout_types
Definition: fad_kokkos_mat_vec_perf.cpp:497

n
int n

Teuchos::CommandLineProcessor