16 #include "Kokkos_Timer.hpp"
18 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
19 void run_mat_vec(
const ViewTypeA&
A,
const ViewTypeB& b,
const ViewTypeC&
c) {
20 typedef typename ViewTypeC::value_type scalar_type;
21 typedef typename ViewTypeC::execution_space execution_space;
23 const int m = A.extent(0);
24 const int n = A.extent(1);
26 Kokkos::RangePolicy<execution_space>( 0,m ),
27 KOKKOS_LAMBDA (
const int i) {
29 for (
int j=0; j<n; ++j)
36 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
40 typedef typename ViewTypeC::value_type scalar_type;
41 typedef typename ViewTypeC::execution_space execution_space;
42 typedef Kokkos::TeamPolicy<execution_space> Policy;
43 typedef typename Policy::member_type team_member;
44 typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
46 const int m = A.extent(0);
47 const int n = A.extent(1);
48 const int p = dimension_scalar(A);
50 #ifdef KOKKOS_ENABLE_CUDA
53 const bool is_cuda =
false;
55 const int TeamSize = is_cuda ? 128 : 1;
56 const int N = (m+TeamSize-1)/TeamSize;
57 Policy policy(N, TeamSize, 1);
58 const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
60 policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
61 KOKKOS_LAMBDA (
const team_member& team) {
62 const int team_rank = team.team_rank();
63 const int team_size = team.team_size();
64 TmpScratchSpace t(team.team_scratch(0), team_size,
p);
65 const int i = team.league_rank()*team_size + team_rank;
68 for (
int j=0; j<n; ++j)
69 t(team_rank) +=
A(i,j)*b(j);
76 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
80 typedef typename ViewTypeC::execution_space execution_space;
82 const int m = A.extent(0);
83 const int n = A.extent(1);
84 const int p = A.extent(2)-1;
86 Kokkos::RangePolicy<execution_space>( 0,m ),
87 KOKKOS_LAMBDA (
const int i) {
89 for (
int k=0; k<
p; ++k)
91 for (
int j=0; j<n; ++j) {
92 c(i,p) +=
A(i,j,p)*b(j,p);
93 for (
int k=0; k<
p; ++k) {
94 c(i,k) +=
A(i,j,k)*b(j,p) +
A(i,j,p)*b(j,k);
101 template <
int MaxP,
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
105 typedef typename ViewTypeC::value_type scalar_type;
106 typedef typename ViewTypeC::execution_space execution_space;
108 const int m = A.extent(0);
109 const int n = A.extent(1);
110 const int p = A.extent(2)-1;
111 Kokkos::parallel_for(
112 Kokkos::RangePolicy<execution_space>( 0,m ),
113 KOKKOS_LAMBDA (
const int i) {
114 scalar_type cv = 0.0;
116 for (
int k=0; k<
p; ++k)
119 for (
int j=0; j<n; ++j) {
120 scalar_type av =
A(i,j,p);
121 scalar_type bv = b(j,p);
123 for (
int k=0; k<
p; ++k) {
124 t[k] +=
A(i,j,k)*bv + av*b(j,k);
128 for (
int k=0; k<
p; ++k)
135 template <
int p,
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
139 typedef typename ViewTypeC::value_type scalar_type;
140 typedef typename ViewTypeC::execution_space execution_space;
142 const int m = A.extent(0);
143 const int n = A.extent(1);
144 Kokkos::parallel_for(
145 Kokkos::RangePolicy<execution_space>( 0,m ),
146 KOKKOS_LAMBDA (
const int i) {
147 scalar_type cv = 0.0;
149 for (
int k=0; k<
p; ++k)
152 for (
int j=0; j<n; ++j) {
153 const scalar_type av =
A(i,j,p);
154 const scalar_type bv = b(j,p);
160 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
163 for (
int k=0; k<
p; ++k) {
164 t[k] +=
A(i,j,k)*bv + av*b(j,k);
168 for (
int k=0; k<
p; ++k)
175 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
177 check_val(
const ViewTypeA&
A,
const ViewTypeB& b,
const ViewTypeC&
c)
179 const double tol = 1.0e-14;
180 typedef typename ViewTypeC::value_type value_type;
181 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
182 Kokkos::deep_copy(h_c, c);
183 const size_t m = A.extent(0);
184 const size_t n = A.extent(1);
185 for (
size_t i=0;
i<m; ++
i) {
188 std::cout <<
"Comparison failed! " <<
i <<
" : " << h_c(
i) <<
" , " << t
194 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
198 const double tol = 1.0e-14;
199 typedef typename ViewTypeC::value_type value_type;
200 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
201 Kokkos::deep_copy(h_c, c);
202 const size_t m = A.extent(0);
203 const size_t n = A.extent(1);
204 const size_t p = A.extent(2);
205 for (
size_t i=0;
i<m; ++
i) {
206 for (
size_t j=0; j<
p; ++j) {
207 value_type t = (j == p-1 ? n : 2*n);
209 std::cout <<
"Comparison failed! " <<
i <<
"," << j <<
" : "
210 << h_c(
i,j) <<
" , " << t << std::endl;
216 template <
typename ... ViewArgs>
221 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeA;
222 typedef Kokkos::View<
double*, ViewArgs...> ViewTypeB;
223 typedef Kokkos::View<
double*, ViewArgs...> ViewTypeC;
224 typedef typename ViewTypeA::execution_space execution_space;
226 ViewTypeA
A(
"A",m,n);
230 Kokkos::deep_copy(A, 1.0);
231 Kokkos::deep_copy(b, 1.0);
233 Kokkos::Timer wall_clock;
238 execution_space().fence();
241 for (
size_t l=0; l<nloop; l++) {
244 execution_space().fence();
246 perf.
time = wall_clock.seconds() / nloop;
256 template <
typename FadType,
typename ... ViewArgs>
258 do_time_fad(
const size_t m,
const size_t n,
const size_t p,
const size_t nloop,
261 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
262 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
263 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
264 typedef typename ViewTypeA::execution_space execution_space;
267 #ifdef KOKKOS_ENABLE_CUDA
270 const size_t concurrency = execution_space().concurrency();
271 const size_t mem =
std::min(m,concurrency) * p *
sizeof(double);
273 cudaDeviceSetLimit(cudaLimitMallocHeapSize, mem);
277 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
278 ViewTypeA
A(
"A",m,n,p+1);
279 ViewTypeB b(
"B",n,p+1);
280 ViewTypeC
c(
"c",m,p+1);
282 ViewTypeA
A(
"A",m,n);
290 Kokkos::deep_copy(
typename ViewTypeA::array_type(A), 1.0);
291 Kokkos::deep_copy(
typename ViewTypeB::array_type(b), 1.0);
293 Kokkos::Timer wall_clock;
298 execution_space().fence();
301 for (
size_t l=0; l<nloop; l++) {
304 execution_space().fence();
306 perf.
time = wall_clock.seconds() / nloop;
310 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
312 typename ViewTypeA::array_type A_flat =
A;
313 typename ViewTypeB::array_type b_flat = b;
314 typename ViewTypeC::array_type c_flat =
c;
322 template <
typename FadType,
typename ... ViewArgs>
327 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
328 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
329 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
330 typedef typename ViewTypeA::execution_space execution_space;
332 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
333 ViewTypeA
A(
"A",m,n,p+1);
334 ViewTypeB b(
"B",n,p+1);
335 ViewTypeC
c(
"c",m,p+1);
337 ViewTypeA
A(
"A",m,n);
345 Kokkos::deep_copy(
typename ViewTypeA::array_type(A), 1.0);
346 Kokkos::deep_copy(
typename ViewTypeB::array_type(b), 1.0);
348 Kokkos::Timer wall_clock;
353 execution_space().fence();
356 for (
size_t l=0; l<nloop; l++) {
359 execution_space().fence();
361 perf.
time = wall_clock.seconds() / nloop;
365 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
367 typename ViewTypeA::array_type A_flat =
A;
368 typename ViewTypeB::array_type b_flat = b;
369 typename ViewTypeC::array_type c_flat =
c;
377 template <
typename ... ViewArgs>
380 const size_t nloop,
const bool check)
382 typedef Kokkos::View<
double***, ViewArgs...> ViewTypeA;
383 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeB;
384 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeC;
385 typedef typename ViewTypeA::execution_space execution_space;
387 ViewTypeA
A(
"A",m,n,p+1);
388 ViewTypeB b(
"B",n,p+1);
389 ViewTypeC
c(
"c",m,p+1);
391 Kokkos::deep_copy(A, 1.0);
392 Kokkos::deep_copy(b, 1.0);
394 Kokkos::Timer wall_clock;
399 execution_space().fence();
401 for (
size_t l=0; l<nloop; l++) {
404 execution_space().fence();
406 perf.
time = wall_clock.seconds() / nloop;
416 template <
int MaxP,
typename ... ViewArgs>
419 const size_t nloop,
const bool check)
421 typedef Kokkos::View<
double***, ViewArgs...> ViewTypeA;
422 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeB;
423 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeC;
424 typedef typename ViewTypeA::execution_space execution_space;
426 ViewTypeA
A(
"A",m,n,p+1);
427 ViewTypeB b(
"B",n,p+1);
428 ViewTypeC
c(
"c",m,p+1);
430 Kokkos::deep_copy(A, 1.0);
431 Kokkos::deep_copy(b, 1.0);
433 Kokkos::Timer wall_clock;
437 run_mat_vec_deriv_sl<MaxP>(
A, b,
c );
438 execution_space().fence();
440 for (
size_t l=0; l<nloop; l++) {
441 run_mat_vec_deriv_sl<MaxP>(
A, b,
c );
443 execution_space().fence();
445 perf.
time = wall_clock.seconds() / nloop;
455 template <
int p,
typename ... ViewArgs>
458 const size_t nloop,
const bool check)
460 typedef Kokkos::View<
double**[
p+1], ViewArgs...> ViewTypeA;
461 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeB;
462 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeC;
463 typedef typename ViewTypeA::execution_space execution_space;
465 ViewTypeA
A(
"A",m,n);
466 ViewTypeB b(
"B",n,
p+1);
467 ViewTypeC
c(
"c",m,
p+1);
469 Kokkos::deep_copy(A, 1.0);
470 Kokkos::deep_copy(b, 1.0);
472 Kokkos::Timer wall_clock;
476 run_mat_vec_deriv_s<p>(
A, b,
c );
477 execution_space().fence();
479 for (
size_t l=0; l<nloop; l++) {
480 run_mat_vec_deriv_s<p>(
A, b,
c );
482 execution_space().fence();
484 perf.
time = wall_clock.seconds() / nloop;
// INST_FUNC_VAL_DEV(DEV): explicit instantiation definitions of the
// timing drivers for device/execution-space argument DEV.
//
// For each of do_time_val, do_time_analytic, do_time_analytic_sl and
// do_time_analytic_s, three instantiations are emitted: one with
// Kokkos::LayoutLeft, one with Kokkos::LayoutRight, and one that passes
// only DEV (no explicit layout argument).  do_time_analytic_sl is
// instantiated with SLFadSize and do_time_analytic_s with SFadSize as the
// leading integer template argument.  do_time_val and do_time_analytic_s
// take (m, n, nloop, check); the other two additionally take p.
//
// NOTE(review): the leading numerals on each line look like listing line
// numbers left over from extraction -- confirm and strip before compiling.
498 #define INST_FUNC_VAL_DEV(DEV) \
499 template Perf do_time_val< Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
500 template Perf do_time_val< Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
501 template Perf do_time_val< DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
502 template Perf do_time_analytic< Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
503 template Perf do_time_analytic< Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
504 template Perf do_time_analytic< DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
505 template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
506 template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
507 template Perf do_time_analytic_sl< SLFadSize, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
508 template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); \
509 template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); \
510 template Perf do_time_analytic_s< SFadSize, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check);
// INST_FUNC_FAD_DEV(FAD,DEV): explicit instantiation definitions of the
// Fad-based timing drivers for scalar type FAD on device argument DEV.
//
// Both do_time_fad and do_time_scratch are instantiated three times each:
// with Kokkos::LayoutLeft, with Kokkos::LayoutRight, and with DEV alone
// (no explicit layout argument).  Every instantiation has the signature
// (m, n, p, nloop, check) and returns Perf.
512 #define INST_FUNC_FAD_DEV(FAD,DEV) \
513 template Perf do_time_fad< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
514 template Perf do_time_fad< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
515 template Perf do_time_fad< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
516 template Perf do_time_scratch< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
517 template Perf do_time_scratch< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
518 template Perf do_time_scratch< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check );
// INST_FUNC_DEV(DEV): emits every instantiation needed for one device
// argument DEV by expanding INST_FUNC_VAL_DEV once and INST_FUNC_FAD_DEV
// once per Fad scalar type (SFad_type, SLFad_type, DFad_type).
520 #define INST_FUNC_DEV(DEV) \
521 INST_FUNC_VAL_DEV( DEV ) \
522 INST_FUNC_FAD_DEV( SFad_type, DEV ) \
523 INST_FUNC_FAD_DEV( SLFad_type, DEV ) \
524 INST_FUNC_FAD_DEV( DFad_type, DEV )
526 #ifdef KOKKOS_ENABLE_SERIAL
530 #ifdef KOKKOS_ENABLE_OPENMP
534 #ifdef KOKKOS_ENABLE_THREADS
538 #ifdef KOKKOS_ENABLE_CUDA
542 #ifdef KOKKOS_ENABLE_HIP
Sacado::Fad::DFad< double > DFad_type
double do_time_analytic(int nderiv, int nloop)
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void run_mat_vec_scratch(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
SimpleFad< ValueT > min(const SimpleFad< ValueT > &a, const SimpleFad< ValueT > &b)
Sacado::Fad::SFad< double, SFadSize > SFad_type
Perf do_time_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
#define INST_FUNC_DEV(DEV)
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
void run_mat_vec_deriv_sl(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void run_mat_vec_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void run_mat_vec_deriv_s(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Sacado::Fad::SLFad< double, SLFadSize > SLFad_type