36 #include "impl/Kokkos_Timer.hpp" 
   38 template <
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
   39 void run_mat_vec(
const ViewTypeA& 
A, 
const ViewTypeB& b, 
const ViewTypeC& 
c) {
 
   40   typedef typename ViewTypeC::value_type scalar_type;
 
   41   typedef typename ViewTypeC::execution_space execution_space;
 
   43   const int m = A.extent(0);
 
   44   const int n = A.extent(1);
 
   46     Kokkos::RangePolicy<execution_space>( 0,m ),
 
   47     KOKKOS_LAMBDA (
const int i) {
 
   49       for (
int j=0; j<n; ++j)
 
   56 template <
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
   60   typedef typename ViewTypeC::value_type scalar_type;
 
   61   typedef typename ViewTypeC::execution_space execution_space;
 
   62   typedef Kokkos::TeamPolicy<execution_space> Policy;
 
   63   typedef typename Policy::member_type team_member;
 
   64   typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
 
   66   const int m = A.extent(0);
 
   67   const int n = A.extent(1);
 
   68   const int p = dimension_scalar(A);
 
   70 #ifdef KOKKOS_ENABLE_CUDA 
   71   const bool is_cuda = std::is_same<execution_space,Kokkos::Cuda>::value;
 
   73   const bool is_cuda = 
false;
 
   75   const int TeamSize = is_cuda ? 128 : 1;
 
   76   const int N = (m+TeamSize-1)/TeamSize;
 
   77   Policy policy(N, TeamSize, 1);
 
   78   const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
 
   80     policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
 
   81     KOKKOS_LAMBDA (
const team_member& team) {
 
   82       const int team_rank = team.team_rank();
 
   83       const int team_size = team.team_size();
 
   84       TmpScratchSpace t(team.team_scratch(0), team_size, p);
 
   85       const int i = team.league_rank()*team_size + team_rank;
 
   88         for (
int j=0; j<n; ++j)
 
   89           t(team_rank) += 
A(i,j)*b(j);
 
   96 template <
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
  100   typedef typename ViewTypeC::execution_space execution_space;
 
  102   const int m = A.extent(0);
 
  103   const int n = A.extent(1);
 
  104   const int p = A.extent(2)-1;
 
  105   Kokkos::parallel_for(
 
  106     Kokkos::RangePolicy<execution_space>( 0,m ),
 
  107     KOKKOS_LAMBDA (
const int i) {
 
  109       for (
int k=0; k<p; ++k)
 
  111       for (
int j=0; j<n; ++j) {
 
  112         c(i,p) += 
A(i,j,p)*b(j,p);
 
  113         for (
int k=0; k<p; ++k) {
 
  114           c(i,k) += 
A(i,j,k)*b(j,p) + 
A(i,j,p)*b(j,k);
 
  121 template <
int MaxP, 
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
  125   typedef typename ViewTypeC::value_type scalar_type;
 
  126   typedef typename ViewTypeC::execution_space execution_space;
 
  128   const int m = A.extent(0);
 
  129   const int n = A.extent(1);
 
  130   const int p = A.extent(2)-1;
 
  131   Kokkos::parallel_for(
 
  132     Kokkos::RangePolicy<execution_space>( 0,m ),
 
  133     KOKKOS_LAMBDA (
const int i) {
 
  134       scalar_type cv = 0.0;
 
  136       for (
int k=0; k<p; ++k)
 
  139       for (
int j=0; j<n; ++j) {
 
  140         scalar_type av = 
A(i,j,p);
 
  141         scalar_type bv = b(j,p);
 
  143         for (
int k=0; k<p; ++k) {
 
  144           t[k] += 
A(i,j,k)*bv + av*b(j,k);
 
  148       for (
int k=0; k<p; ++k)
 
  155 template <
int p, 
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
  159   typedef typename ViewTypeC::value_type scalar_type;
 
  160   typedef typename ViewTypeC::execution_space execution_space;
 
  162   const int m = A.extent(0);
 
  163   const int n = A.extent(1);
 
  164   Kokkos::parallel_for(
 
  165     Kokkos::RangePolicy<execution_space>( 0,m ),
 
  166     KOKKOS_LAMBDA (
const int i) {
 
  167       scalar_type cv = 0.0;
 
  169       for (
int k=0; k<p; ++k)
 
  172       for (
int j=0; j<n; ++j) {
 
  173         const scalar_type av = 
A(i,j,p);
 
  174         const scalar_type bv = b(j,p);
 
  180 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__) 
  183         for (
int k=0; k<p; ++k) {
 
  184           t[k] += 
A(i,j,k)*bv + av*b(j,k);
 
  188       for (
int k=0; k<p; ++k)
 
  195 template <
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
  197 check_val(
const ViewTypeA& 
A, 
const ViewTypeB& b, 
const ViewTypeC& 
c)
 
  199   const double tol = 1.0e-14;
 
  200   typedef typename ViewTypeC::value_type value_type;
 
  201   typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
 
  202   Kokkos::deep_copy(h_c, c);
 
  203   const size_t m = A.extent(0);
 
  204   const size_t n = A.extent(1);
 
  205   for (
size_t i=0; i<m; ++i) {
 
  208       std::cout << 
"Comparison failed!  " << i << 
" : " << h_c(i) << 
" , " << t
 
  214 template <
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
  218   const double tol = 1.0e-14;
 
  219   typedef typename ViewTypeC::value_type value_type;
 
  220   typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
 
  221   Kokkos::deep_copy(h_c, c);
 
  222   const size_t m = A.extent(0);
 
  223   const size_t n = A.extent(1);
 
  224   const size_t p = A.extent(2);
 
  225   for (
size_t i=0; i<m; ++i) {
 
  226     for (
size_t j=0; j<p; ++j) {
 
  227       value_type t = (j == p-1 ? n : 2*n);
 
  229         std::cout << 
"Comparison failed!  " << i << 
"," << j << 
" : " 
  230                   << h_c(i,j) << 
" , " << t << std::endl;
 
  236 template <
typename ... ViewArgs>
 
  241   typedef Kokkos::View<
double**, ViewArgs...> ViewTypeA;
 
  242   typedef Kokkos::View<
double*,  ViewArgs...> ViewTypeB;
 
  243   typedef Kokkos::View<
double*,  ViewArgs...> ViewTypeC;
 
  244   typedef typename ViewTypeA::execution_space execution_space;
 
  246   ViewTypeA 
A(
"A",m,n);
 
  250   Kokkos::deep_copy(A, 1.0);
 
  251   Kokkos::deep_copy(b, 1.0);
 
  253   Kokkos::Impl::Timer wall_clock;
 
  258   execution_space().fence();
 
  261   for (
size_t l=0; l<nloop; l++) {
 
  264   execution_space().fence();
 
  266   perf.
time = wall_clock.seconds() / nloop;
 
  276 template <
typename FadType, 
typename ... ViewArgs>
 
  278 do_time_fad(
const size_t m, 
const size_t n, 
const size_t p, 
const size_t nloop,
 
  281   typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
 
  282   typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeB;
 
  283   typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeC;
 
  284   typedef typename ViewTypeA::execution_space execution_space;
 
  287 #ifdef KOKKOS_ENABLE_CUDA 
  288   if (std::is_same<execution_space,Kokkos::Cuda>::value &&
 
  290     const size_t concurrency = execution_space::concurrency();
 
  291     const size_t mem = 
std::min(m,concurrency) * p * 
sizeof(double);
 
  293     cudaDeviceSetLimit(cudaLimitMallocHeapSize, mem);
 
  297 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC 
  298   ViewTypeA 
A(
"A",m,n,p+1);
 
  299   ViewTypeB b(
"B",n,p+1);
 
  300   ViewTypeC 
c(
"c",m,p+1);
 
  302   ViewTypeA 
A(
"A",m,n);
 
  310   Kokkos::deep_copy(
typename ViewTypeA::array_type(A), 1.0);
 
  311   Kokkos::deep_copy(
typename ViewTypeB::array_type(b), 1.0);
 
  313   Kokkos::Impl::Timer wall_clock;
 
  318   execution_space().fence();
 
  321   for (
size_t l=0; l<nloop; l++) {
 
  324   execution_space().fence();
 
  326   perf.
time = wall_clock.seconds() / nloop;
 
  327   perf.
flops = m*n*(2+4*p);
 
  330 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC 
  332     typename ViewTypeA::array_type A_flat = 
A;
 
  333     typename ViewTypeB::array_type b_flat = b;
 
  334     typename ViewTypeC::array_type c_flat = 
c;
 
  342 template <
typename FadType, 
typename ... ViewArgs>
 
  347   typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
 
  348   typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeB;
 
  349   typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeC;
 
  350   typedef typename ViewTypeA::execution_space execution_space;
 
  352 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC 
  353   ViewTypeA 
A(
"A",m,n,p+1);
 
  354   ViewTypeB b(
"B",n,p+1);
 
  355   ViewTypeC 
c(
"c",m,p+1);
 
  357   ViewTypeA 
A(
"A",m,n);
 
  365   Kokkos::deep_copy(
typename ViewTypeA::array_type(A), 1.0);
 
  366   Kokkos::deep_copy(
typename ViewTypeB::array_type(b), 1.0);
 
  368   Kokkos::Impl::Timer wall_clock;
 
  373   execution_space().fence();
 
  376   for (
size_t l=0; l<nloop; l++) {
 
  379   execution_space().fence();
 
  381   perf.
time = wall_clock.seconds() / nloop;
 
  382   perf.
flops = m*n*(2+4*p);
 
  385 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC 
  387     typename ViewTypeA::array_type A_flat = 
A;
 
  388     typename ViewTypeB::array_type b_flat = b;
 
  389     typename ViewTypeC::array_type c_flat = 
c;
 
  397 template <
typename ... ViewArgs>
 
  400                  const size_t nloop, 
const bool check)
 
  402   typedef Kokkos::View<
double***, ViewArgs...> ViewTypeA;
 
  403   typedef Kokkos::View<
double**,  ViewArgs...> ViewTypeB;
 
  404   typedef Kokkos::View<
double**,  ViewArgs...> ViewTypeC;
 
  405   typedef typename ViewTypeA::execution_space execution_space;
 
  407   ViewTypeA 
A(
"A",m,n,p+1);
 
  408   ViewTypeB b(
"B",n,p+1);
 
  409   ViewTypeC 
c(
"c",m,p+1);
 
  411   Kokkos::deep_copy(A, 1.0);
 
  412   Kokkos::deep_copy(b, 1.0);
 
  414   Kokkos::Impl::Timer wall_clock;
 
  419   execution_space().fence();
 
  421   for (
size_t l=0; l<nloop; l++) {
 
  424   execution_space().fence();
 
  426   perf.
time = wall_clock.seconds() / nloop;
 
  427   perf.
flops = m*n*(2+4*p);
 
  436 template <
int MaxP, 
typename ... ViewArgs>
 
  439                     const size_t nloop, 
const bool check)
 
  441   typedef Kokkos::View<
double***, ViewArgs...> ViewTypeA;
 
  442   typedef Kokkos::View<
double**,  ViewArgs...> ViewTypeB;
 
  443   typedef Kokkos::View<
double**,  ViewArgs...> ViewTypeC;
 
  444   typedef typename ViewTypeA::execution_space execution_space;
 
  446   ViewTypeA 
A(
"A",m,n,p+1);
 
  447   ViewTypeB b(
"B",n,p+1);
 
  448   ViewTypeC 
c(
"c",m,p+1);
 
  450   Kokkos::deep_copy(A, 1.0);
 
  451   Kokkos::deep_copy(b, 1.0);
 
  453   Kokkos::Impl::Timer wall_clock;
 
  457   run_mat_vec_deriv_sl<MaxP>( 
A, b, 
c );
 
  458   execution_space().fence();
 
  460   for (
size_t l=0; l<nloop; l++) {
 
  461     run_mat_vec_deriv_sl<MaxP>( 
A, b, 
c );
 
  463   execution_space().fence();
 
  465   perf.
time = wall_clock.seconds() / nloop;
 
  466   perf.
flops = m*n*(2+4*p);
 
  475 template <
int p, 
typename ... ViewArgs>
 
  478                    const size_t nloop, 
const bool check)
 
  480   typedef Kokkos::View<
double**[p+1], ViewArgs...> ViewTypeA;
 
  481   typedef Kokkos::View<
double**,  ViewArgs...> ViewTypeB;
 
  482   typedef Kokkos::View<
double**,  ViewArgs...> ViewTypeC;
 
  483   typedef typename ViewTypeA::execution_space execution_space;
 
  485   ViewTypeA 
A(
"A",m,n);
 
  486   ViewTypeB b(
"B",n,p+1);
 
  487   ViewTypeC 
c(
"c",m,p+1);
 
  489   Kokkos::deep_copy(A, 1.0);
 
  490   Kokkos::deep_copy(b, 1.0);
 
  492   Kokkos::Impl::Timer wall_clock;
 
  496   run_mat_vec_deriv_s<p>( 
A, b, 
c );
 
  497   execution_space().fence();
 
  499   for (
size_t l=0; l<nloop; l++) {
 
  500     run_mat_vec_deriv_s<p>( 
A, b, 
c );
 
  502   execution_space().fence();
 
  504   perf.
time = wall_clock.seconds() / nloop;
 
  505   perf.
flops = m*n*(2+4*p);
 
  518 #define INST_FUNC_VAL_DEV(DEV) \ 
  519   template Perf do_time_val< Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \ 
  520   template Perf do_time_val< Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \ 
  521   template Perf do_time_val< DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \ 
  522   template Perf do_time_analytic< Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \ 
  523   template Perf do_time_analytic< Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \ 
  524   template Perf do_time_analytic< DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \ 
  525   template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \ 
  526   template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \ 
  527   template Perf do_time_analytic_sl< SLFadSize, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \ 
  528   template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); \ 
  529   template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); \ 
  530   template Perf do_time_analytic_s< SFadSize, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); 
  532 #define INST_FUNC_FAD_DEV(FAD,DEV)                                      \ 
  533   template Perf do_time_fad< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ 
  534   template Perf do_time_fad< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ 
  535   template Perf do_time_fad< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ 
  536   template Perf do_time_scratch< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ 
  537   template Perf do_time_scratch< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ 
  538   template Perf do_time_scratch< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); 
  540 #define INST_FUNC_DEV(DEV)                                       \ 
  541   INST_FUNC_VAL_DEV( DEV )                                       \ 
  542   INST_FUNC_FAD_DEV( SFad_type, DEV )   \ 
  543   INST_FUNC_FAD_DEV( SLFad_type, DEV ) \ 
  544   INST_FUNC_FAD_DEV( DFad_type, DEV ) 
  546 #ifdef KOKKOS_ENABLE_SERIAL 
  550 #ifdef KOKKOS_ENABLE_OPENMP 
  554 #ifdef KOKKOS_ENABLE_THREADS 
  558 #ifdef KOKKOS_ENABLE_CUDA 
Sacado::Fad::DFad< double > DFad_type
 
double do_time_analytic(int nderiv, int nloop)
 
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
 
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
 
void run_mat_vec_scratch(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
 
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
 
Sacado::Fad::DFad< double > FadType
 
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
 
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
 
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
 
SimpleFad< ValueT > min(const SimpleFad< ValueT > &a, const SimpleFad< ValueT > &b)
 
Sacado::Fad::SFad< double, SFadSize > SFad_type
 
Perf do_time_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
 
#define INST_FUNC_DEV(DEV)
 
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
 
void run_mat_vec_deriv_sl(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
 
void run_mat_vec_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
 
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
 
void run_mat_vec_deriv_s(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
 
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
 
Sacado::Fad::SLFad< double, SLFadSize > SLFad_type