16 #include "Kokkos_Timer.hpp"
18 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
19 void run_mat_vec(
const ViewTypeA&
A,
const ViewTypeB& b,
const ViewTypeC&
c) {
20 typedef typename ViewTypeC::value_type scalar_type;
21 typedef typename ViewTypeC::execution_space execution_space;
23 const int m = A.extent(0);
24 const int n = A.extent(1);
26 Kokkos::RangePolicy<execution_space>( 0,m ),
27 KOKKOS_LAMBDA (
const int i) {
29 for (
int j=0; j<n; ++j)
36 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
40 typedef typename ViewTypeC::value_type scalar_type;
41 typedef typename ViewTypeC::execution_space execution_space;
42 typedef Kokkos::TeamPolicy<execution_space> Policy;
43 typedef typename Policy::member_type team_member;
44 typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
46 const int m = A.extent(0);
47 const int n = A.extent(1);
48 const int p = dimension_scalar(A);
50 #ifdef KOKKOS_ENABLE_CUDA
53 const bool is_cuda =
false;
55 const int TeamSize = is_cuda ? 128 : 1;
56 const int N = (m+TeamSize-1)/TeamSize;
57 Policy policy(N, TeamSize, 1);
58 const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
60 policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
61 KOKKOS_LAMBDA (
const team_member& team) {
62 const int team_rank = team.team_rank();
63 const int team_size = team.team_size();
64 TmpScratchSpace t(team.team_scratch(0), team_size,
p);
65 const int i = team.league_rank()*team_size + team_rank;
68 for (
int j=0; j<n; ++j)
69 t(team_rank) +=
A(i,j)*b(j);
76 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
80 typedef typename ViewTypeC::execution_space execution_space;
82 const int m = A.extent(0);
83 const int n = A.extent(1);
84 const int p = A.extent(2)-1;
86 Kokkos::RangePolicy<execution_space>( 0,m ),
87 KOKKOS_LAMBDA (
const int i) {
89 for (
int k=0; k<
p; ++k)
91 for (
int j=0; j<n; ++j) {
92 c(i,p) +=
A(i,j,p)*b(j,p);
93 for (
int k=0; k<
p; ++k) {
94 c(i,k) +=
A(i,j,k)*b(j,p) +
A(i,j,p)*b(j,k);
101 template <
int MaxP,
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
105 typedef typename ViewTypeC::value_type scalar_type;
106 typedef typename ViewTypeC::execution_space execution_space;
108 const int m = A.extent(0);
109 const int n = A.extent(1);
110 const int p = A.extent(2)-1;
111 Kokkos::parallel_for(
112 Kokkos::RangePolicy<execution_space>( 0,m ),
113 KOKKOS_LAMBDA (
const int i) {
114 scalar_type cv = 0.0;
116 for (
int k=0; k<
p; ++k)
119 for (
int j=0; j<n; ++j) {
120 scalar_type av =
A(i,j,p);
121 scalar_type bv = b(j,p);
123 for (
int k=0; k<
p; ++k) {
124 t[k] +=
A(i,j,k)*bv + av*b(j,k);
128 for (
int k=0; k<
p; ++k)
135 template <
int p,
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
139 typedef typename ViewTypeC::value_type scalar_type;
140 typedef typename ViewTypeC::execution_space execution_space;
142 const int m = A.extent(0);
143 const int n = A.extent(1);
144 Kokkos::parallel_for(
145 Kokkos::RangePolicy<execution_space>( 0,m ),
146 KOKKOS_LAMBDA (
const int i) {
147 scalar_type cv = 0.0;
149 for (
int k=0; k<
p; ++k)
152 for (
int j=0; j<n; ++j) {
153 const scalar_type av =
A(i,j,p);
154 const scalar_type bv = b(j,p);
160 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
163 for (
int k=0; k<
p; ++k) {
164 t[k] +=
A(i,j,k)*bv + av*b(j,k);
168 for (
int k=0; k<
p; ++k)
175 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
177 check_val(
const ViewTypeA&
A,
const ViewTypeB& b,
const ViewTypeC&
c)
179 const double tol = 1.0e-14;
180 typedef typename ViewTypeC::value_type value_type;
181 typename ViewTypeC::host_mirror_type h_c = Kokkos::create_mirror_view(c);
182 Kokkos::deep_copy(h_c, c);
183 const size_t m = A.extent(0);
184 const size_t n = A.extent(1);
185 for (
size_t i=0;
i<m; ++
i) {
188 std::cout <<
"Comparison failed! " <<
i <<
" : " << h_c(
i) <<
" , " << t
194 template <
typename ViewTypeA,
typename ViewTypeB,
typename ViewTypeC>
198 const double tol = 1.0e-14;
199 typedef typename ViewTypeC::value_type value_type;
200 typename ViewTypeC::host_mirror_type h_c = Kokkos::create_mirror_view(c);
201 Kokkos::deep_copy(h_c, c);
202 const size_t m = A.extent(0);
203 const size_t n = A.extent(1);
204 const size_t p = A.extent(2);
205 for (
size_t i=0;
i<m; ++
i) {
206 for (
size_t j=0; j<
p; ++j) {
207 value_type t = (j == p-1 ? n : 2*n);
209 std::cout <<
"Comparison failed! " <<
i <<
"," << j <<
" : "
210 << h_c(
i,j) <<
" , " << t << std::endl;
216 template <
typename ... ViewArgs>
221 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeA;
222 typedef Kokkos::View<
double*, ViewArgs...> ViewTypeB;
223 typedef Kokkos::View<
double*, ViewArgs...> ViewTypeC;
224 typedef typename ViewTypeA::execution_space execution_space;
226 ViewTypeA
A(
"A",m,n);
230 Kokkos::deep_copy(A, 1.0);
231 Kokkos::deep_copy(b, 1.0);
233 Kokkos::Timer wall_clock;
238 execution_space().fence();
241 for (
size_t l=0; l<nloop; l++) {
244 execution_space().fence();
246 perf.
time = wall_clock.seconds() / nloop;
256 template <
typename FadType,
typename ... ViewArgs>
258 do_time_fad(
const size_t m,
const size_t n,
const size_t p,
const size_t nloop,
261 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
262 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
263 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
264 typedef typename ViewTypeA::execution_space execution_space;
267 #ifdef KOKKOS_ENABLE_CUDA
270 const size_t concurrency = execution_space().concurrency();
271 const size_t mem =
std::min(m,concurrency) * p *
sizeof(double);
273 cudaDeviceSetLimit(cudaLimitMallocHeapSize, mem);
277 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
278 ViewTypeA
A(
"A",m,n,p+1);
279 ViewTypeB b(
"B",n,p+1);
280 ViewTypeC
c(
"c",m,p+1);
282 ViewTypeA
A(
"A",m,n);
290 Kokkos::deep_copy(
typename ViewTypeA::array_type(A), 1.0);
291 Kokkos::deep_copy(
typename ViewTypeB::array_type(b), 1.0);
293 Kokkos::Timer wall_clock;
298 execution_space().fence();
301 for (
size_t l=0; l<nloop; l++) {
304 execution_space().fence();
306 perf.
time = wall_clock.seconds() / nloop;
311 #if !defined(SACADO_DISABLE_FAD_VIEW_SPEC) && !defined(SACADO_HAS_NEW_KOKKOS_VIEW_IMPL)
313 typename ViewTypeA::array_type A_flat =
A;
314 typename ViewTypeB::array_type b_flat = b;
315 typename ViewTypeC::array_type c_flat =
c;
323 template <
typename FadType,
typename ... ViewArgs>
328 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
329 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
330 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
331 typedef typename ViewTypeA::execution_space execution_space;
333 #ifndef SACADO_DISABLE_FAD_VIEW_SPEC
334 ViewTypeA
A(
"A",m,n,p+1);
335 ViewTypeB b(
"B",n,p+1);
336 ViewTypeC
c(
"c",m,p+1);
338 ViewTypeA
A(
"A",m,n);
346 Kokkos::deep_copy(
typename ViewTypeA::array_type(A), 1.0);
347 Kokkos::deep_copy(
typename ViewTypeB::array_type(b), 1.0);
349 Kokkos::Timer wall_clock;
354 execution_space().fence();
357 for (
size_t l=0; l<nloop; l++) {
360 execution_space().fence();
362 perf.
time = wall_clock.seconds() / nloop;
367 #if !defined(SACADO_DISABLE_FAD_VIEW_SPEC) && !defined(SACADO_HAS_NEW_KOKKOS_VIEW_IMPL)
369 typename ViewTypeA::array_type A_flat =
A;
370 typename ViewTypeB::array_type b_flat = b;
371 typename ViewTypeC::array_type c_flat =
c;
379 template <
typename ... ViewArgs>
382 const size_t nloop,
const bool check)
384 typedef Kokkos::View<
double***, ViewArgs...> ViewTypeA;
385 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeB;
386 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeC;
387 typedef typename ViewTypeA::execution_space execution_space;
389 ViewTypeA
A(
"A",m,n,p+1);
390 ViewTypeB b(
"B",n,p+1);
391 ViewTypeC
c(
"c",m,p+1);
393 Kokkos::deep_copy(A, 1.0);
394 Kokkos::deep_copy(b, 1.0);
396 Kokkos::Timer wall_clock;
401 execution_space().fence();
403 for (
size_t l=0; l<nloop; l++) {
406 execution_space().fence();
408 perf.
time = wall_clock.seconds() / nloop;
418 template <
int MaxP,
typename ... ViewArgs>
421 const size_t nloop,
const bool check)
423 typedef Kokkos::View<
double***, ViewArgs...> ViewTypeA;
424 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeB;
425 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeC;
426 typedef typename ViewTypeA::execution_space execution_space;
428 ViewTypeA
A(
"A",m,n,p+1);
429 ViewTypeB b(
"B",n,p+1);
430 ViewTypeC
c(
"c",m,p+1);
432 Kokkos::deep_copy(A, 1.0);
433 Kokkos::deep_copy(b, 1.0);
435 Kokkos::Timer wall_clock;
439 run_mat_vec_deriv_sl<MaxP>(
A, b,
c );
440 execution_space().fence();
442 for (
size_t l=0; l<nloop; l++) {
443 run_mat_vec_deriv_sl<MaxP>(
A, b,
c );
445 execution_space().fence();
447 perf.
time = wall_clock.seconds() / nloop;
457 template <
int p,
typename ... ViewArgs>
460 const size_t nloop,
const bool check)
462 typedef Kokkos::View<
double**[
p+1], ViewArgs...> ViewTypeA;
463 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeB;
464 typedef Kokkos::View<
double**, ViewArgs...> ViewTypeC;
465 typedef typename ViewTypeA::execution_space execution_space;
467 ViewTypeA
A(
"A",m,n);
468 ViewTypeB b(
"B",n,
p+1);
469 ViewTypeC
c(
"c",m,
p+1);
471 Kokkos::deep_copy(A, 1.0);
472 Kokkos::deep_copy(b, 1.0);
474 Kokkos::Timer wall_clock;
478 run_mat_vec_deriv_s<p>(
A, b,
c );
479 execution_space().fence();
481 for (
size_t l=0; l<nloop; l++) {
482 run_mat_vec_deriv_s<p>(
A, b,
c );
484 execution_space().fence();
486 perf.
time = wall_clock.seconds() / nloop;
// INST_FUNC_VAL_DEV(DEV): explicit template instantiations of the timing
// drivers do_time_val, do_time_analytic, do_time_analytic_sl and
// do_time_analytic_s for the device DEV.
//
// Each driver is instantiated three times: with Kokkos::LayoutLeft, with
// Kokkos::LayoutRight, and with DEV as the only view argument (no explicit
// layout). do_time_analytic_sl is instantiated with SLFadSize and
// do_time_analytic_s with SFadSize as the leading integer template argument.
// do_time_val and do_time_analytic_s take (m, n, nloop, check); the other two
// additionally take p.
500 #define INST_FUNC_VAL_DEV(DEV) \
501 template Perf do_time_val< Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
502 template Perf do_time_val< Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
503 template Perf do_time_val< DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
504 template Perf do_time_analytic< Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
505 template Perf do_time_analytic< Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
506 template Perf do_time_analytic< DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
507 template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
508 template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
509 template Perf do_time_analytic_sl< SLFadSize, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
510 template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); \
511 template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); \
512 template Perf do_time_analytic_s< SFadSize, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check);
// INST_FUNC_FAD_DEV(FAD,DEV): explicit template instantiations of the
// Fad-based timing drivers do_time_fad and do_time_scratch for the scalar
// type FAD on device DEV.
//
// Each driver is instantiated three times: with Kokkos::LayoutLeft, with
// Kokkos::LayoutRight, and with DEV as the only view argument (no explicit
// layout). All instantiations share the signature (m, n, p, nloop, check).
514 #define INST_FUNC_FAD_DEV(FAD,DEV) \
515 template Perf do_time_fad< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
516 template Perf do_time_fad< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
517 template Perf do_time_fad< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
518 template Perf do_time_scratch< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
519 template Perf do_time_scratch< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
520 template Perf do_time_scratch< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check );
// INST_FUNC_DEV(DEV): emits every explicit instantiation for device DEV by
// expanding INST_FUNC_VAL_DEV once and INST_FUNC_FAD_DEV once per Fad scalar
// type (SFad_type, SLFad_type, DFad_type).
522 #define INST_FUNC_DEV(DEV) \
523 INST_FUNC_VAL_DEV( DEV ) \
524 INST_FUNC_FAD_DEV( SFad_type, DEV ) \
525 INST_FUNC_FAD_DEV( SLFad_type, DEV ) \
526 INST_FUNC_FAD_DEV( DFad_type, DEV )
528 #ifdef KOKKOS_ENABLE_SERIAL
532 #ifdef KOKKOS_ENABLE_OPENMP
536 #ifdef KOKKOS_ENABLE_THREADS
540 #ifdef KOKKOS_ENABLE_CUDA
544 #ifdef KOKKOS_ENABLE_HIP
Sacado::Fad::DFad< double > DFad_type
double do_time_analytic(int nderiv, int nloop)
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void run_mat_vec_scratch(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
SimpleFad< ValueT > min(const SimpleFad< ValueT > &a, const SimpleFad< ValueT > &b)
Sacado::Fad::SFad< double, SFadSize > SFad_type
Perf do_time_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
#define INST_FUNC_DEV(DEV)
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
void run_mat_vec_deriv_sl(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void run_mat_vec_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void run_mat_vec_deriv_s(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Sacado::Fad::SLFad< double, SLFadSize > SLFad_type