36 #include "impl/Kokkos_Timer.hpp"
38 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
39 void run_mat_vec(
const ViewTypeA&
A,
const ViewTypeB& b,
const ViewTypeC&
c) {
40 typedef typename ViewTypeC::value_type scalar_type;
41 typedef typename ViewTypeC::execution_space execution_space;
43 const int m = A.extent(0);
44 const int n = A.extent(1);
46 Kokkos::RangePolicy<execution_space>( 0,m ),
47 KOKKOS_LAMBDA (
const int i) {
49 for (
int j=0; j<n; ++j)
56 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
60 typedef typename ViewTypeC::value_type scalar_type;
61 typedef typename ViewTypeC::execution_space execution_space;
62 typedef Kokkos::TeamPolicy<execution_space> Policy;
63 typedef typename Policy::member_type team_member;
64 typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
66 const int m = A.extent(0);
67 const int n = A.extent(1);
68 const int p = dimension_scalar(A);
70 #ifdef KOKKOS_ENABLE_CUDA
71 const bool is_cuda = std::is_same<execution_space,Kokkos::Cuda>::value;
73 const bool is_cuda =
false;
75 const int TeamSize = is_cuda ? 128 : 1;
76 const int N = (m+TeamSize-1)/TeamSize;
77 Policy policy(N, TeamSize, 1);
78 const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
80 policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
81 KOKKOS_LAMBDA (
const team_member& team) {
82 const int team_rank = team.team_rank();
83 const int team_size = team.team_size();
84 TmpScratchSpace t(team.team_scratch(0), team_size, p);
85 const int i = team.league_rank()*team_size + team_rank;
88 for (
int j=0; j<n; ++j)
89 t(team_rank) +=
A(i,j)*b(j);
96 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
100 typedef typename ViewTypeC::execution_space execution_space;
102 const int m = A.extent(0);
103 const int n = A.extent(1);
104 const int p = A.extent(2)-1;
105 Kokkos::parallel_for(
106 Kokkos::RangePolicy<execution_space>( 0,m ),
107 KOKKOS_LAMBDA (
const int i) {
109 for (
int k=0; k<p; ++k)
111 for (
int j=0; j<n; ++j) {
112 c(i,p) +=
A(i,j,p)*b(j,p);
113 for (
int k=0; k<p; ++k) {
114 c(i,k) +=
A(i,j,k)*b(j,p) +
A(i,j,p)*b(j,k);
121 template <
int MaxP,
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
125 typedef typename ViewTypeC::value_type scalar_type;
126 typedef typename ViewTypeC::execution_space execution_space;
128 const int m = A.extent(0);
129 const int n = A.extent(1);
130 const int p = A.extent(2)-1;
131 Kokkos::parallel_for(
132 Kokkos::RangePolicy<execution_space>( 0,m ),
133 KOKKOS_LAMBDA (
const int i) {
134 scalar_type cv = 0.0;
136 for (
int k=0; k<p; ++k)
139 for (
int j=0; j<n; ++j) {
140 scalar_type av =
A(i,j,p);
141 scalar_type bv = b(j,p);
143 for (
int k=0; k<p; ++k) {
144 t[k] +=
A(i,j,k)*bv + av*b(j,k);
148 for (
int k=0; k<p; ++k)
155 template <
int p,
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
159 typedef typename ViewTypeC::value_type scalar_type;
160 typedef typename ViewTypeC::execution_space execution_space;
162 const int m = A.extent(0);
163 const int n = A.extent(1);
164 Kokkos::parallel_for(
165 Kokkos::RangePolicy<execution_space>( 0,m ),
166 KOKKOS_LAMBDA (
const int i) {
167 scalar_type cv = 0.0;
169 for (
int k=0; k<p; ++k)
172 for (
int j=0; j<n; ++j) {
173 const scalar_type av =
A(i,j,p);
174 const scalar_type bv = b(j,p);
180 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
183 for (
int k=0; k<p; ++k) {
184 t[k] +=
A(i,j,k)*bv + av*b(j,k);
188 for (
int k=0; k<p; ++k)
195 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
197 check_val(
const ViewTypeA&
A,
const ViewTypeB& b,
const ViewTypeC&
c)
199 const double tol = 1.0e-14;
200 typedef typename ViewTypeC::value_type value_type;
201 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
202 Kokkos::deep_copy(h_c, c);
203 const size_t m = A.extent(0);
204 const size_t n = A.extent(1);
205 for (
size_t i=0; i<m; ++i) {
208 std::cout <<
"Comparison failed! " << i <<
" : " << h_c(i) <<
" , " << t
214 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
218 const double tol = 1.0e-14;
219 typedef typename ViewTypeC::value_type value_type;
220 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
221 Kokkos::deep_copy(h_c, c);
222 const size_t m = A.extent(0);
223 const size_t n = A.extent(1);
224 const size_t p = A.extent(2);
225 for (
size_t i=0; i<m; ++i) {
226 for (
size_t j=0; j<p; ++j) {
227 value_type t = (j == p-1 ? n : 2*n);
229 std::cout <<
"Comparison failed! " << i <<
"," << j <<
" : "
230 << h_c(i,j) <<
" , " << t << std::endl;
236 template <
typename ... ViewArgs>
241 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeA;
242 typedef Kokkos::View<
double*, ViewArgs...> ViewTypeB;
243 typedef Kokkos::View<
double*, ViewArgs...> ViewTypeC;
244 typedef typename ViewTypeA::execution_space execution_space;
246 ViewTypeA
A(
"A",m,n);
250 Kokkos::deep_copy(A, 1.0);
251 Kokkos::deep_copy(b, 1.0);
253 Kokkos::Impl::Timer wall_clock;
258 execution_space().fence();
261 for (
size_t l=0; l<nloop; l++) {
264 execution_space().fence();
266 perf.
time = wall_clock.seconds() / nloop;
276 template <
typename FadType,
typename ... ViewArgs>
278 do_time_fad(
const size_t m,
const size_t n,
const size_t p,
const size_t nloop,
281 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
282 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
283 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
284 typedef typename ViewTypeA::execution_space execution_space;
287 #ifdef KOKKOS_ENABLE_CUDA
288 if (std::is_same<execution_space,Kokkos::Cuda>::value &&
290 const size_t concurrency = execution_space::concurrency();
291 const size_t mem =
std::min(m,concurrency) * p *
sizeof(double);
293 cudaDeviceSetLimit(cudaLimitMallocHeapSize, mem);
297 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
298 ViewTypeA
A(
"A",m,n,p+1);
299 ViewTypeB b(
"B",n,p+1);
300 ViewTypeC
c(
"c",m,p+1);
302 ViewTypeA
A(
"A",m,n);
310 Kokkos::deep_copy(
typename ViewTypeA::array_type(A), 1.0);
311 Kokkos::deep_copy(
typename ViewTypeB::array_type(b), 1.0);
313 Kokkos::Impl::Timer wall_clock;
318 execution_space().fence();
321 for (
size_t l=0; l<nloop; l++) {
324 execution_space().fence();
326 perf.
time = wall_clock.seconds() / nloop;
327 perf.
flops = m*n*(2+4*p);
330 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
332 typename ViewTypeA::array_type A_flat =
A;
333 typename ViewTypeB::array_type b_flat = b;
334 typename ViewTypeC::array_type c_flat =
c;
342 template <
typename FadType,
typename ... ViewArgs>
347 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
348 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
349 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
350 typedef typename ViewTypeA::execution_space execution_space;
352 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
353 ViewTypeA
A(
"A",m,n,p+1);
354 ViewTypeB b(
"B",n,p+1);
355 ViewTypeC
c(
"c",m,p+1);
357 ViewTypeA
A(
"A",m,n);
365 Kokkos::deep_copy(
typename ViewTypeA::array_type(A), 1.0);
366 Kokkos::deep_copy(
typename ViewTypeB::array_type(b), 1.0);
368 Kokkos::Impl::Timer wall_clock;
373 execution_space().fence();
376 for (
size_t l=0; l<nloop; l++) {
379 execution_space().fence();
381 perf.
time = wall_clock.seconds() / nloop;
382 perf.
flops = m*n*(2+4*p);
385 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
387 typename ViewTypeA::array_type A_flat =
A;
388 typename ViewTypeB::array_type b_flat = b;
389 typename ViewTypeC::array_type c_flat =
c;
397 template <
typename ... ViewArgs>
400 const size_t nloop,
const bool check)
402 typedef Kokkos::View<
double***, ViewArgs...> ViewTypeA;
403 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeB;
404 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeC;
405 typedef typename ViewTypeA::execution_space execution_space;
407 ViewTypeA
A(
"A",m,n,p+1);
408 ViewTypeB b(
"B",n,p+1);
409 ViewTypeC
c(
"c",m,p+1);
411 Kokkos::deep_copy(A, 1.0);
412 Kokkos::deep_copy(b, 1.0);
414 Kokkos::Impl::Timer wall_clock;
419 execution_space().fence();
421 for (
size_t l=0; l<nloop; l++) {
424 execution_space().fence();
426 perf.
time = wall_clock.seconds() / nloop;
427 perf.
flops = m*n*(2+4*p);
436 template <
int MaxP,
typename ... ViewArgs>
439 const size_t nloop,
const bool check)
441 typedef Kokkos::View<
double***, ViewArgs...> ViewTypeA;
442 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeB;
443 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeC;
444 typedef typename ViewTypeA::execution_space execution_space;
446 ViewTypeA
A(
"A",m,n,p+1);
447 ViewTypeB b(
"B",n,p+1);
448 ViewTypeC
c(
"c",m,p+1);
450 Kokkos::deep_copy(A, 1.0);
451 Kokkos::deep_copy(b, 1.0);
453 Kokkos::Impl::Timer wall_clock;
457 run_mat_vec_deriv_sl<MaxP>(
A, b,
c );
458 execution_space().fence();
460 for (
size_t l=0; l<nloop; l++) {
461 run_mat_vec_deriv_sl<MaxP>(
A, b,
c );
463 execution_space().fence();
465 perf.
time = wall_clock.seconds() / nloop;
466 perf.
flops = m*n*(2+4*p);
475 template <
int p,
typename ... ViewArgs>
478 const size_t nloop,
const bool check)
480 typedef Kokkos::View<
double**[p+1], ViewArgs...> ViewTypeA;
481 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeB;
482 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeC;
483 typedef typename ViewTypeA::execution_space execution_space;
485 ViewTypeA
A(
"A",m,n);
486 ViewTypeB b(
"B",n,p+1);
487 ViewTypeC
c(
"c",m,p+1);
489 Kokkos::deep_copy(A, 1.0);
490 Kokkos::deep_copy(b, 1.0);
492 Kokkos::Impl::Timer wall_clock;
496 run_mat_vec_deriv_s<p>(
A, b,
c );
497 execution_space().fence();
499 for (
size_t l=0; l<nloop; l++) {
500 run_mat_vec_deriv_s<p>(
A, b,
c );
502 execution_space().fence();
504 perf.
time = wall_clock.seconds() / nloop;
505 perf.
flops = m*n*(2+4*p);
// INST_FUNC_VAL_DEV(DEV): explicit template instantiations of the
// double-valued timing drivers for one DEV argument.
//
// For each of do_time_val, do_time_analytic, do_time_analytic_sl and
// do_time_analytic_s, three instantiations are emitted: one with
// Kokkos::LayoutLeft, one with Kokkos::LayoutRight, and one with no layout
// argument at all (only DEV is passed).
//
// do_time_analytic_sl is additionally parameterized by SLFadSize and
// do_time_analytic_s by SFadSize; note that the do_time_val and
// do_time_analytic_s signatures take no `p` argument, while the other two do.
//
// NOTE(review): DEV is presumably a Kokkos execution space / device type that
// is forwarded into the View template arguments -- confirm against the
// INST_FUNC_DEV call sites.
518 #define INST_FUNC_VAL_DEV(DEV) \
519 template Perf do_time_val< Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
520 template Perf do_time_val< Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
521 template Perf do_time_val< DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
522 template Perf do_time_analytic< Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
523 template Perf do_time_analytic< Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
524 template Perf do_time_analytic< DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
525 template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
526 template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
527 template Perf do_time_analytic_sl< SLFadSize, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
528 template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); \
529 template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); \
530 template Perf do_time_analytic_s< SFadSize, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check);
// INST_FUNC_FAD_DEV(FAD,DEV): explicit template instantiations of the
// Fad-typed timing drivers for one (FAD, DEV) pair.
//
// Both do_time_fad and do_time_scratch are instantiated three times each:
// with Kokkos::LayoutLeft, with Kokkos::LayoutRight, and with no layout
// argument (only FAD and DEV are passed). All six share the same
// (m, n, p, nloop, check) parameter list.
//
// NOTE(review): FAD is expected to be one of the *Fad_type aliases passed in
// by INST_FUNC_DEV; DEV is presumably a Kokkos execution space -- confirm.
532 #define INST_FUNC_FAD_DEV(FAD,DEV) \
533 template Perf do_time_fad< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
534 template Perf do_time_fad< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
535 template Perf do_time_fad< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
536 template Perf do_time_scratch< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
537 template Perf do_time_scratch< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
538 template Perf do_time_scratch< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check );
// INST_FUNC_DEV(DEV): top-level instantiation macro for one DEV argument.
//
// Expands to INST_FUNC_VAL_DEV(DEV) followed by INST_FUNC_FAD_DEV for each of
// the three Fad aliases SFad_type, SLFad_type and DFad_type.
//
// NOTE(review): presumably invoked once per enabled Kokkos backend inside the
// KOKKOS_ENABLE_* guards that follow -- the invocations are not visible here,
// confirm.
540 #define INST_FUNC_DEV(DEV) \
541 INST_FUNC_VAL_DEV( DEV ) \
542 INST_FUNC_FAD_DEV( SFad_type, DEV ) \
543 INST_FUNC_FAD_DEV( SLFad_type, DEV ) \
544 INST_FUNC_FAD_DEV( DFad_type, DEV )
546 #ifdef KOKKOS_ENABLE_SERIAL
550 #ifdef KOKKOS_ENABLE_OPENMP
554 #ifdef KOKKOS_ENABLE_THREADS
558 #ifdef KOKKOS_ENABLE_CUDA
Sacado::Fad::DFad< double > DFad_type
double do_time_analytic(int nderiv, int nloop)
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void run_mat_vec_scratch(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
SimpleFad< ValueT > min(const SimpleFad< ValueT > &a, const SimpleFad< ValueT > &b)
Sacado::Fad::SFad< double, SFadSize > SFad_type
Perf do_time_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
#define INST_FUNC_DEV(DEV)
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
void run_mat_vec_deriv_sl(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void run_mat_vec_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void run_mat_vec_deriv_s(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Sacado::Fad::SLFad< double, SLFadSize > SLFad_type