42 #ifndef STOKHOS_TINY_VEC_HPP
43 #define STOKHOS_TINY_VEC_HPP
46 #if defined(HAVE_STOKHOS_INTRINSICS) && !defined( __CUDACC__ )
49 #include <immintrin.h>
54 #include "Kokkos_Macros.hpp"
58 #if defined(__INTEL_COMPILER) && ! defined( __CUDA_ARCH__)
// Generic TinyVec variant guarded by the preceding
// `#if defined(__INTEL_COMPILER) && ! defined( __CUDA_ARCH__)`.
// Template parameters: element type, element count N, an intrinsics switch,
// and a Mask flag that defaults to false.
// NOTE(review): several declaration lines and most loop bodies of this class
// are not present in this part of the file; the comments below describe only
// what is visible.
60 template <
typename ValueType,
int N,
bool UseIntrinsics,
bool Mask = false >
// Compile-time element count; every loop in this variant runs over Num entries.
64 static const int Num = N;
66 KOKKOS_INLINE_FUNCTION
69 KOKKOS_INLINE_FUNCTION
// Constructor taking a value array plus an index array of any ordinal type.
74 template <
typename OrdinalType>
75 KOKKOS_INLINE_FUNCTION
76 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
80 KOKKOS_INLINE_FUNCTION
85 KOKKOS_INLINE_FUNCTION
// `#pragma vector aligned` is an Intel-compiler directive telling the
// vectorizer to use aligned memory accesses for the loop that follows.
88 #pragma vector aligned
89 for (
int i=0; i<
Num; ++i)
93 KOKKOS_INLINE_FUNCTION
96 #pragma vector aligned
97 for (
int i=0; i<
Num; ++i)
// load() overload taking an array argument.
// NOTE(review): the aligned pragma below presumably also covers accesses to
// the caller's `a` -- confirm callers only pass suitably aligned data.
102 KOKKOS_INLINE_FUNCTION
103 void load(
const ValueType a[]) {
105 #pragma vector aligned
106 for (
int i=0; i<
Num; ++i)
// load() overload taking a single value instead of an array.
110 KOKKOS_INLINE_FUNCTION
111 void load(
const ValueType a) {
113 #pragma vector aligned
114 for (
int i=0; i<
Num; ++i)
118 KOKKOS_INLINE_FUNCTION
121 #pragma vector aligned
122 for (
int i=0; i<
Num; ++i)
// gather(): takes a value array and an index array of any ordinal type.
126 template <
typename OrdinalType>
127 KOKKOS_INLINE_FUNCTION
128 void gather(
const ValueType a[],
const OrdinalType idx[]) {
130 #pragma vector aligned
131 for (
int i=0; i<
Num; ++i)
// scatter(): const member taking a mutable output array.
135 KOKKOS_INLINE_FUNCTION
136 void scatter(ValueType a[])
const {
138 #pragma vector aligned
139 for (
int i=0; i<
Num; ++i)
143 KOKKOS_INLINE_FUNCTION
146 #pragma vector aligned
147 for (
int i=0; i<
Num; ++i)
// Sets every element of v to zero.
151 KOKKOS_INLINE_FUNCTION
154 #pragma vector aligned
155 for (
int i=0; i<
Num; ++i)
156 v[i] = ValueType(0.0);
159 KOKKOS_INLINE_FUNCTION
162 #pragma vector aligned
163 for (
int i=0; i<
Num; ++i)
167 KOKKOS_INLINE_FUNCTION
170 #pragma vector aligned
171 for (
int i=0; i<
Num; ++i)
// Accumulates the element-wise product of t1 and t2 into v.
176 KOKKOS_INLINE_FUNCTION
179 #pragma vector aligned
180 for (
int i=0; i<
Num; ++i)
181 v[i] += t1.v[i]*t2.v[i];
// sum(): const member returning a ValueType; loops over all Num entries.
184 KOKKOS_INLINE_FUNCTION
185 ValueType
sum()
const {
188 #pragma vector aligned
189 for (
int i=0; i<
Num; ++i)
// Element storage: Num values, 64-byte aligned via the GNU-style attribute.
195 ValueType
v[
Num] __attribute__((aligned(64)));
// Partial specialization of TinyVec for Mask == true (Intel-compiler branch).
// Constructors take an extra `size` argument and the loops run over `sz`
// entries rather than the full Num.
// NOTE(review): several declaration lines and most loop bodies are not present
// in this part of the file; the comments below describe only what is visible.
198 template <
typename ValueType,
int N,
bool UseIntrinsics >
199 class TinyVec<ValueType,N,UseIntrinsics,
true> {
// Capacity of the storage array; the active length is the runtime `sz`.
202 static const int Num = N;
// Records the active length only.
// NOTE(review): no check that size <= Num is visible here -- confirm callers
// guarantee it, since v holds only Num entries.
204 KOKKOS_INLINE_FUNCTION
205 TinyVec(
int size) { sz = size; }
// Constructor from a value array and an active length.
207 KOKKOS_INLINE_FUNCTION
208 TinyVec(
const ValueType a[],
int size) {
// Constructor from a value array, an index array, and an active length.
213 template <
typename OrdinalType>
214 KOKKOS_INLINE_FUNCTION
215 TinyVec(
const ValueType a[],
const OrdinalType idx[],
int size) {
// Constructor from a single value and an active length.
220 KOKKOS_INLINE_FUNCTION
221 TinyVec(
const ValueType a,
int size) {
226 KOKKOS_INLINE_FUNCTION
// `#pragma vector aligned` is an Intel-compiler directive telling the
// vectorizer to use aligned memory accesses for the loop that follows.
230 #pragma vector aligned
231 for (
int i=0; i<sz; ++i)
235 KOKKOS_INLINE_FUNCTION
239 #pragma vector aligned
240 for (
int i=0; i<sz; ++i)
// load() overload taking an array argument; iterates over the first sz entries.
245 KOKKOS_INLINE_FUNCTION
246 void load(
const ValueType a[]) {
248 #pragma vector aligned
249 for (
int i=0; i<sz; ++i)
// load() overload taking a single value.
253 KOKKOS_INLINE_FUNCTION
254 void load(
const ValueType a) {
256 #pragma vector aligned
257 for (
int i=0; i<sz; ++i)
261 KOKKOS_INLINE_FUNCTION
264 #pragma vector aligned
265 for (
int i=0; i<sz; ++i)
// gather(): takes a value array and an index array of any ordinal type.
269 template <
typename OrdinalType>
270 KOKKOS_INLINE_FUNCTION
271 void gather(
const ValueType a[],
const OrdinalType idx[]) {
273 #pragma vector aligned
274 for (
int i=0; i<sz; ++i)
// scatter(): const member taking a mutable output array.
278 KOKKOS_INLINE_FUNCTION
279 void scatter(ValueType a[])
const {
281 #pragma vector aligned
282 for (
int i=0; i<sz; ++i)
286 KOKKOS_INLINE_FUNCTION
289 #pragma vector aligned
290 for (
int i=0; i<sz; ++i)
// Zeroes the first sz entries of v; entries at index sz and above are untouched.
294 KOKKOS_INLINE_FUNCTION
297 #pragma vector aligned
298 for (
int i=0; i<sz; ++i)
299 v[i] = ValueType(0.0);
302 KOKKOS_INLINE_FUNCTION
305 #pragma vector aligned
306 for (
int i=0; i<sz; ++i)
310 KOKKOS_INLINE_FUNCTION
313 #pragma vector aligned
314 for (
int i=0; i<sz; ++i)
// Accumulates the element-wise product of t1 and t2 into the first sz entries.
319 KOKKOS_INLINE_FUNCTION
322 #pragma vector aligned
323 for (
int i=0; i<sz; ++i)
324 v[i] += t1.v[i]*t2.v[i];
// sum(): const member returning a ValueType; loops over the first sz entries.
327 KOKKOS_INLINE_FUNCTION
328 ValueType
sum()
const {
331 #pragma vector aligned
332 for (
int i=0; i<sz; ++i)
// Element storage: Num values, 64-byte aligned via the GNU-style attribute.
338 ValueType
v[
Num] __attribute__((aligned(64)));
// Generic TinyVec variant with the same template signature as the
// Intel-compiler version earlier in the file, but its loops carry no
// `#pragma vector aligned`.
// NOTE(review): presumably the alternative branch of the __INTEL_COMPILER
// conditional; the preprocessor line separating the two is not visible here.
// NOTE(review): several declaration lines and most loop bodies are not present
// in this part of the file; the comments below describe only what is visible.
344 template <
typename ValueType,
int N,
bool UseIntrinsics,
bool Mask = false >
// Compile-time element count; every loop in this variant runs over Num entries.
348 static const int Num = N;
350 KOKKOS_INLINE_FUNCTION
353 KOKKOS_INLINE_FUNCTION
// Constructor taking a value array plus an index array of any ordinal type.
358 template <
typename OrdinalType>
359 KOKKOS_INLINE_FUNCTION
360 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
364 KOKKOS_INLINE_FUNCTION
369 KOKKOS_INLINE_FUNCTION
371 for (
int i=0; i<
Num; ++i)
375 KOKKOS_INLINE_FUNCTION
377 for (
int i=0; i<
Num; ++i)
// load() overload taking an array argument.
382 KOKKOS_INLINE_FUNCTION
383 void load(
const ValueType a[]) {
384 for (
int i=0; i<
Num; ++i)
388 KOKKOS_INLINE_FUNCTION
390 for (
int i=0; i<
Num; ++i)
394 KOKKOS_INLINE_FUNCTION
396 for (
int i=0; i<
Num; ++i)
// gather(): takes a value array and an index array of any ordinal type.
400 template <
typename OrdinalType>
401 KOKKOS_INLINE_FUNCTION
402 void gather(
const ValueType a[],
const OrdinalType idx[]) {
403 for (
int i=0; i<
Num; ++i)
407 KOKKOS_INLINE_FUNCTION
409 for (
int i=0; i<
Num; ++i)
413 KOKKOS_INLINE_FUNCTION
415 for (
int i=0; i<
Num; ++i)
// Sets every element of v to zero.
419 KOKKOS_INLINE_FUNCTION
421 for (
int i=0; i<
Num; ++i)
422 v[i] = ValueType(0.0);
425 KOKKOS_INLINE_FUNCTION
427 for (
int i=0; i<
Num; ++i)
431 KOKKOS_INLINE_FUNCTION
433 for (
int i=0; i<
Num; ++i)
// Accumulates the element-wise product of t1 and t2 into v.
438 KOKKOS_INLINE_FUNCTION
440 for (
int i=0; i<
Num; ++i)
441 v[i] += t1.v[i]*t2.v[i];
444 KOKKOS_INLINE_FUNCTION
447 for (
int i=0; i<
Num; ++i)
// Size-limited TinyVec variant without vectorization pragmas: loops run over
// the runtime length `sz` instead of Num.
// NOTE(review): the three-parameter template header suggests this is the
// Mask == true partial specialization, but the class line itself is not
// visible here -- confirm.
// NOTE(review): several declaration lines and most loop bodies are not present
// in this part of the file; the comments below describe only what is visible.
456 template <
typename ValueType,
int N,
bool UseIntrinsics >
// Compile-time capacity.
460 static const int Num = N;
462 KOKKOS_INLINE_FUNCTION
465 KOKKOS_INLINE_FUNCTION
// Constructor from a value array, an index array, and an active length.
471 template <
typename OrdinalType>
472 KOKKOS_INLINE_FUNCTION
473 TinyVec(
const ValueType a[],
const OrdinalType idx[],
int size) {
478 KOKKOS_INLINE_FUNCTION
484 KOKKOS_INLINE_FUNCTION
487 for (
int i=0; i<sz; ++i)
491 KOKKOS_INLINE_FUNCTION
494 for (
int i=0; i<sz; ++i)
// load() overload taking an array argument; iterates over the first sz entries.
499 KOKKOS_INLINE_FUNCTION
500 void load(
const ValueType a[]) {
501 for (
int i=0; i<sz; ++i)
505 KOKKOS_INLINE_FUNCTION
507 for (
int i=0; i<sz; ++i)
511 KOKKOS_INLINE_FUNCTION
513 for (
int i=0; i<sz; ++i)
// gather(): takes a value array and an index array of any ordinal type.
517 template <
typename OrdinalType>
518 KOKKOS_INLINE_FUNCTION
519 void gather(
const ValueType a[],
const OrdinalType idx[]) {
520 for (
int i=0; i<sz; ++i)
524 KOKKOS_INLINE_FUNCTION
526 for (
int i=0; i<sz; ++i)
530 KOKKOS_INLINE_FUNCTION
532 for (
int i=0; i<sz; ++i)
// Zeroes the first sz entries of v; entries at index sz and above are untouched.
536 KOKKOS_INLINE_FUNCTION
538 for (
int i=0; i<sz; ++i)
539 v[i] = ValueType(0.0);
542 KOKKOS_INLINE_FUNCTION
544 for (
int i=0; i<sz; ++i)
548 KOKKOS_INLINE_FUNCTION
550 for (
int i=0; i<sz; ++i)
// Accumulates the element-wise product of t1 and t2 into the first sz entries.
555 KOKKOS_INLINE_FUNCTION
557 for (
int i=0; i<sz; ++i)
558 v[i] += t1.v[i]*t2.v[i];
561 KOKKOS_INLINE_FUNCTION
564 for (
int i=0; i<sz; ++i)
576 #if defined(HAVE_STOKHOS_INTRINSICS) && !defined( __CUDACC__ )
// Two-element double variant built on a single 128-bit SSE register (`v` is
// used with the _mm_*_pd intrinsics throughout).
// NOTE(review): the class header and several member declarations are not
// present in this part of the file; comments describe only what is visible.
583 typedef double ValueType;
584 static const int Num = 2;
// Constructor from a value array and an index array of any ordinal type.
592 template <
typename OrdinalType>
593 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
// Loads a[0] into the low lane and a[1] into the high lane. _mm_set_pd takes
// its arguments high-lane first, hence the reversed order. Reading element by
// element places no alignment requirement on `a`.
610 void load(
const ValueType a[]) {
611 v = _mm_set_pd(a[1], a[0]);
// load() overload taking a single value.
614 void load(
const ValueType a) {
// Indexed load: low lane = a[idx[0]], high lane = a[idx[1]].
622 template <
typename OrdinalType>
623 void gather(
const ValueType a[],
const OrdinalType idx[]) {
624 v = _mm_set_pd(a[idx[1]], a[idx[0]]);
// Writes the low lane to a[0] and the high lane to a[1] with two scalar
// stores, so `a` does not need 16-byte alignment.
627 void scatter(ValueType a[])
const {
628 _mm_storel_pd(&a[0],
v);
629 _mm_storeh_pd(&a[1],
v);
// Clears both lanes.
637 v = _mm_setzero_pd();
// Lane-wise v += t.v.
641 v = _mm_add_pd(
v, t.v);
// Lane-wise v *= t.v.
645 v = _mm_mul_pd(
v, t.v);
// v += t1.v * t2.v, done as a separate multiply followed by an add.
650 __m128d t = _mm_mul_pd(t1.v, t2.v);
651 v = _mm_add_pd(
v, t);
// sum(): const member returning a ValueType (body not visible here).
654 ValueType
sum()
const {
// Full specialization for eight floats, intrinsics enabled, no mask.
// All eight lanes live in one 256-bit AVX register `v`.
// NOTE(review): several member declarations are not present in this part of
// the file; comments describe only what is visible.
667 class TinyVec<float,8,
true,false> {
670 typedef float ValueType;
671 static const int Num = 8;
// Constructor from a value array and an index array of any ordinal type.
679 template <
typename OrdinalType>
680 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
// Unaligned 8-float load; `a` may have any alignment.
697 void load(
const ValueType a[]) {
698 v = _mm256_loadu_ps(a);
// Broadcasts the scalar to all eight lanes.
701 void load(
const ValueType a) {
702 v = _mm256_set1_ps(a);
// Aligned 8-float load; `a` must be 32-byte aligned.
// NOTE(review): the enclosing declaration is not visible here.
706 v = _mm256_load_ps(a);
// Indexed load: builds the low and high 128-bit halves from scalar reads and
// inserts each half into v.
// NOTE(review): `v` is passed as the insert base before it has been assigned
// in this function. Both halves are overwritten, so no old lane survives, but
// if `v` was never initialized this still reads an indeterminate value --
// consider building the result from v1 directly.
709 template <
typename OrdinalType>
710 void gather(
const ValueType a[],
const OrdinalType idx[]) {
711 __m128 v1 = _mm_set_ps(a[idx[3]], a[idx[2]], a[idx[1]], a[idx[0]]);
712 __m128 v2 = _mm_set_ps(a[idx[7]], a[idx[6]], a[idx[5]], a[idx[4]]);
713 v = _mm256_insertf128_ps(
v, v1, 0);
714 v = _mm256_insertf128_ps(
v, v2, 1);
// Unaligned 8-float store.
717 void scatter(ValueType a[])
const {
718 _mm256_storeu_ps(a,
v);
// Aligned 8-float store; `a` must be 32-byte aligned.
// NOTE(review): the enclosing declaration is not visible here.
722 _mm256_store_ps(a,
v);
// Clears all eight lanes.
726 v = _mm256_setzero_ps();
// Lane-wise v += t.v.
730 v = _mm256_add_ps(
v, t.v);
// Lane-wise v *= t.v.
734 v = _mm256_mul_ps(
v, t.v);
// v += t1.v * t2.v, done as a separate multiply followed by an add.
739 __m256 t = _mm256_mul_ps(t1.v, t2.v);
740 v = _mm256_add_ps(
v, t);
// Horizontal reduction: pairwise sums within each 128-bit half, then the two
// halves are added, then one more pairwise sum leaves the total of all eight
// lanes in lane 0 of sl.
744 __m256 s = _mm256_hadd_ps(
v,
v);
745 __m128 sl = _mm256_extractf128_ps(s, 0);
746 __m128 sh = _mm256_extractf128_ps(s, 1);
747 sl = _mm_add_ps(sl,sh);
748 sl = _mm_hadd_ps(sl,sl);
// Copies lane 0 of sl into `res` (declared on a line not visible here).
750 _MM_EXTRACT_FLOAT(res, sl, 0);
// Four-element double variant held in one 256-bit AVX register `v`.
// NOTE(review): the class header and several member declarations are not
// present in this part of the file; comments describe only what is visible.
763 typedef double ValueType;
764 static const int Num = 4;
// Constructor from a value array and an index array of any ordinal type.
772 template <
typename OrdinalType>
773 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
// Unaligned 4-double load; `a` may have any alignment.
790 void load(
const ValueType a[]) {
791 v = _mm256_loadu_pd(a);
// Broadcasts the scalar to all four lanes.
794 void load(
const ValueType a) {
795 v = _mm256_set1_pd(a);
// Aligned 4-double load; `a` must be 32-byte aligned.
// NOTE(review): the enclosing declaration is not visible here.
799 v = _mm256_load_pd(a);
// Indexed load: builds each 128-bit half from two scalar reads and inserts it
// into v.
// NOTE(review): `v` is passed as the insert base before it has been assigned
// in this function. Both halves are overwritten, so no old lane survives, but
// if `v` was never initialized this still reads an indeterminate value.
802 template <
typename OrdinalType>
803 void gather(
const ValueType a[],
const OrdinalType idx[]) {
804 __m128d v1 = _mm_set_pd(a[idx[1]], a[idx[0]]);
805 __m128d v2 = _mm_set_pd(a[idx[3]], a[idx[2]]);
806 v = _mm256_insertf128_pd(
v, v1, 0);
807 v = _mm256_insertf128_pd(
v, v2, 1);
// Unaligned 4-double store.
810 void scatter(ValueType a[])
const {
811 _mm256_storeu_pd(a,
v);
// Aligned 4-double store; `a` must be 32-byte aligned.
// NOTE(review): the enclosing declaration is not visible here.
815 _mm256_store_pd(a,
v);
// Clears all four lanes.
819 v = _mm256_setzero_pd();
// Lane-wise v += t.v.
823 v = _mm256_add_pd(
v, t.v);
// Lane-wise v *= t.v.
827 v = _mm256_mul_pd(
v, t.v);
// v += t1.v * t2.v, done as a separate multiply followed by an add.
832 __m256d t = _mm256_mul_pd(t1.v, t2.v);
833 v = _mm256_add_pd(
v, t);
// Horizontal reduction: pairwise sums within each 128-bit half, then the two
// halves are added so the low lane of sl holds the total of all four lanes.
849 __m256d s = _mm256_hadd_pd(
v,
v);
850 __m128d sl = _mm256_extractf128_pd(s, 0);
851 __m128d sh = _mm256_extractf128_pd(s, 1);
852 sl = _mm_add_pd(sl,sh);
// Stores the low lane of sl into `res` (declared on a line not visible here).
854 _mm_storel_pd(&res, sl);
// Eight-element double variant split across two 256-bit AVX registers:
// v1 holds elements 0..3 and v2 holds elements 4..7 (see the a / a+4 loads).
// NOTE(review): the class header and several member declarations are not
// present in this part of the file; comments describe only what is visible.
866 typedef double ValueType;
867 static const int Num = 8;
// Constructor from a value array and an index array of any ordinal type.
875 template <
typename OrdinalType>
876 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
// Copies both register halves from `tv`. The statement appears twice; the two
// enclosing declarations are not visible here.
885 v1 = tv.v1; v2 = tv.v2;
889 v1 = tv.v1; v2 = tv.v2;
// Unaligned load of eight doubles as two groups of four.
893 void load(
const ValueType a[]) {
894 v1 = _mm256_loadu_pd(a);
895 v2 = _mm256_loadu_pd(a+4);
// Broadcasts the scalar to all eight lanes.
898 void load(
const ValueType a) {
899 v1 = _mm256_set1_pd(a);
900 v2 = _mm256_set1_pd(a);
// Aligned load of eight doubles; `a` must be 32-byte aligned.
// NOTE(review): the enclosing declaration is not visible here.
904 v1 = _mm256_load_pd(a);
905 v2 = _mm256_load_pd(a+4);
// Indexed load: four 128-bit pairs are assembled from scalar reads and
// inserted into the low/high halves of v1 and v2.
// NOTE(review): v1 and v2 are passed as insert bases before they have been
// assigned in this function. Both halves of each are overwritten, but if the
// members were never initialized this still reads indeterminate values.
908 template <
typename OrdinalType>
909 void gather(
const ValueType a[],
const OrdinalType idx[]) {
910 __m128d t1 = _mm_set_pd(a[idx[1]], a[idx[0]]);
911 __m128d t2 = _mm_set_pd(a[idx[3]], a[idx[2]]);
912 __m128d t3 = _mm_set_pd(a[idx[5]], a[idx[4]]);
913 __m128d t4 = _mm_set_pd(a[idx[7]], a[idx[6]]);
914 v1 = _mm256_insertf128_pd(v1, t1, 0);
915 v1 = _mm256_insertf128_pd(v1, t2, 1);
916 v2 = _mm256_insertf128_pd(v2, t3, 0);
917 v2 = _mm256_insertf128_pd(v2, t4, 1);
// Unaligned store of all eight doubles.
920 void scatter(ValueType a[])
const {
921 _mm256_storeu_pd(a, v1);
922 _mm256_storeu_pd(a+4, v2);
// Aligned store of all eight doubles; `a` must be 32-byte aligned.
// NOTE(review): the enclosing declaration is not visible here.
926 _mm256_store_pd(a, v1);
927 _mm256_store_pd(a+4, v2);
// Clears all eight lanes.
931 v1 = _mm256_setzero_pd();
932 v2 = _mm256_setzero_pd();
// Lane-wise add of t into this vector.
936 v1 = _mm256_add_pd(v1, t.v1);
937 v2 = _mm256_add_pd(v2, t.v2);
// Lane-wise multiply of this vector by t.
941 v1 = _mm256_mul_pd(v1, t.v1);
942 v2 = _mm256_mul_pd(v2, t.v2);
// Adds the lane-wise product t1 * t2 to this vector, as a separate multiply
// followed by an add for each half.
947 __m256d t = _mm256_mul_pd(t1.v1, t2.v1);
948 __m256d s = _mm256_mul_pd(t1.v2, t2.v2);
949 v1 = _mm256_add_pd(v1, t);
950 v2 = _mm256_add_pd(v2, s);
// Horizontal reduction of v1: pairwise sums per 128-bit half, then the halves
// are added; the low lane is stored into res1 (declared on a line not visible
// here).
954 __m256d s1 = _mm256_hadd_pd(v1,v1);
955 __m128d s1l = _mm256_extractf128_pd(s1, 0);
956 __m128d s1h = _mm256_extractf128_pd(s1, 1);
957 s1l = _mm_add_pd(s1l,s1h);
959 _mm_storel_pd(&res1, s1l);
// Same reduction for v2, producing res2.
961 __m256d s2 = _mm256_hadd_pd(v2,v2);
962 __m128d s2l = _mm256_extractf128_pd(s2, 0);
963 __m128d s2h = _mm256_extractf128_pd(s2, 1);
964 s2l = _mm_add_pd(s2l,s2h);
966 _mm_storel_pd(&res2, s2l);
976 #if defined( __MIC__ )
// Eight-element double variant held in one 512-bit register `v`; this section
// sits under the preceding `#if defined( __MIC__ )`.
// NOTE(review): the class header and several member declarations are not
// present in this part of the file; comments describe only what is visible.
981 typedef double ValueType;
982 static const int Num = 8;
// Constructor from a value array and an index array of any ordinal type.
990 template <
typename OrdinalType>
991 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
// NOTE(review): unlike the AVX variants, which use an unaligned load here,
// this load() uses the aligned form, so `a` must be 64-byte aligned -- confirm
// every caller satisfies that.
1008 void load(
const ValueType a[]) {
1009 v = _mm512_load_pd(a);
// Broadcasts the scalar to all eight lanes.
1012 void load(
const ValueType a) {
1013 v = _mm512_set1_pd(a);
// Aligned 8-double load (enclosing declaration not visible here).
1017 v = _mm512_load_pd(a);
// Indexed load: mask 255 selects the low eight 32-bit lanes, so eight indices
// are read from idx into a zeroed index register; the gather then fetches
// a[index] for each using a byte scale of 8 (the size of a double).
// NOTE(review): idx is consumed as packed 32-bit integers regardless of
// OrdinalType, and the masked load presumably needs idx 64-byte aligned --
// confirm both.
1020 template <
typename OrdinalType>
1021 void gather(
const ValueType a[],
const OrdinalType idx[]) {
1022 __mmask16 mask = _mm512_int2mask(255);
1023 __m512i vidx = _mm512_setzero_epi32();
1024 vidx = _mm512_mask_load_epi32(vidx, mask, idx);
1025 v = _mm512_i32logather_pd(vidx, a, 8);
// Aligned 8-double store; `a` must be 64-byte aligned.
1028 void scatter(ValueType a[])
const {
1029 _mm512_store_pd(a,
v);
// Aligned 8-double store (enclosing declaration not visible here).
1033 _mm512_store_pd(a,
v);
// Clears all eight lanes.
1037 v = _mm512_setzero_pd();
// Lane-wise v += t.v.
1041 v = _mm512_add_pd(
v, t.v);
// Lane-wise v *= t.v.
1045 v = _mm512_mul_pd(
v, t.v);
// Fused multiply-add: v = t1.v * t2.v + v.
1050 v = _mm512_fmadd_pd(t1.v, t2.v,
v);
// Returns the sum of all eight lanes.
1054 return _mm512_reduce_add_pd(
v);
// Eight-element double variant on one 512-bit register with a lane mask that
// limits which lanes the masked operations touch.
// NOTE(review): the class header and several member declarations are not
// present in this part of the file; comments describe only what is visible.
1065 typedef double ValueType;
1066 static const int Num = 8;
// Builds a lane mask with the low (sz+1) bits set; the same expression is
// repeated in each constructor below.
// NOTE(review): that enables sz+1 lanes, whereas the generic size-limited
// TinyVec in this file loops `i<sz`. Confirm whether sz is a count (in which
// case `(1 << sz)-1` would be expected) or a last index.
1069 mask = _mm512_int2mask((1 << (sz+1))-1);
// Constructor from a value array and a size.
1072 TinyVec(
const ValueType a[],
const int sz) {
1073 mask = _mm512_int2mask((1 << (sz+1))-1);
// Constructor from a value array, an index array, and a size.
1077 template <
typename OrdinalType>
1078 TinyVec(
const ValueType a[],
const OrdinalType idx[],
const int sz) {
1079 mask = _mm512_int2mask((1 << (sz+1))-1);
// Constructor from a single value and a size.
1083 TinyVec(
const ValueType a,
int sz) {
1084 mask = _mm512_int2mask((1 << (sz+1))-1);
// Masked load: v is zeroed first, so lanes outside the mask end up as 0.
1099 void load(
const ValueType a[]) {
1100 v = _mm512_setzero_pd();
1101 v = _mm512_mask_load_pd(
v, mask, a);
// Broadcasts the scalar to all eight lanes.
// NOTE(review): this ignores the mask, so inactive lanes hold `a` rather than
// 0 as they do after the array load above -- confirm this is intended.
1104 void load(
const ValueType a) {
1105 v = _mm512_set1_pd(a);
// Masked load, identical to load(a[]) above (enclosing declaration not
// visible here).
1109 v = _mm512_setzero_pd();
1110 v = _mm512_mask_load_pd(
v, mask, a);
// Indexed load restricted to the masked lanes; inactive lanes are left as 0.
// NOTE(review): the index load is unmasked and reads a full 64 bytes from idx
// regardless of sz. That presumably requires idx to be 64-byte aligned with at
// least 16 32-bit entries readable -- confirm against callers.
1113 template <
typename OrdinalType>
1114 void gather(
const ValueType a[],
const OrdinalType idx[]) {
1120 __m512i vidx = _mm512_load_epi32(idx);
1121 v = _mm512_setzero_pd();
1122 v = _mm512_mask_i32logather_pd(
v, mask, vidx, a, 8);
// Stores only the masked lanes to `a`.
1125 void scatter(ValueType a[])
const {
1126 _mm512_mask_store_pd(a, mask,
v);
// Same masked store (enclosing declaration not visible here).
1130 _mm512_mask_store_pd(a, mask,
v);
// Clears all eight lanes, including inactive ones.
1134 v = _mm512_setzero_pd();
// v += t.v on masked lanes; other lanes keep their current value of v.
1138 v = _mm512_mask_add_pd(
v, mask,
v, t.v);
// v *= t.v on masked lanes; other lanes keep their current value of v.
1142 v = _mm512_mask_mul_pd(
v, mask,
v, t.v);
// v = t1.v * t2.v + v on masked lanes; other lanes keep their current v.
1147 v = _mm512_mask3_fmadd_pd(t1.v, t2.v,
v, mask);
// Returns the sum over masked lanes only.
1151 return _mm512_mask_reduce_add_pd(mask,
v);
// Sixteen-element double variant split across two 512-bit registers:
// v1 holds elements 0..7 and v2 holds elements 8..15 (see the a / a+8 loads).
// NOTE(review): the class header and several member declarations are not
// present in this part of the file; comments describe only what is visible.
1163 typedef double ValueType;
1164 static const int Num = 16;
// Constructor from a value array.
1168 TinyVec(
const ValueType a[]) {
// Constructor from a value array and an index array of any ordinal type.
1172 template <
typename OrdinalType>
1173 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
// Copies both registers from `tv`. The statement appears twice; the two
// enclosing declarations are not visible here.
1182 v1 = tv.v1; v2 = tv.v2;
1186 v1 = tv.v1; v2 = tv.v2;
// Aligned load of sixteen doubles; `a` must be 64-byte aligned.
1190 void load(
const ValueType a[]) {
1191 v1 = _mm512_load_pd(a);
1192 v2 = _mm512_load_pd(a+8);
// Broadcasts the scalar to all sixteen lanes.
1195 void load(
const ValueType a) {
1196 v1 = _mm512_set1_pd(a);
1197 v2 = _mm512_set1_pd(a);
// Aligned load (enclosing declaration not visible here).
1201 v1 = _mm512_load_pd(a);
1202 v2 = _mm512_load_pd(a+8);
// Indexed load of sixteen doubles. All sixteen 32-bit indices are loaded at
// once. The gather only consumes the low eight indices of its index register,
// so the 128-bit lanes are permuted to move indices 8..15 into the low half
// for the second gather.
// NOTE(review): idx is consumed as packed 32-bit integers regardless of
// OrdinalType, and the aligned index load presumably needs idx 64-byte
// aligned -- confirm both.
1205 template <
typename OrdinalType>
1206 void gather(
const ValueType a[],
const OrdinalType idx[]) {
1212 __m512i v1idx = _mm512_load_epi32(idx);
1213 __m512i v2idx = _mm512_permute4f128_epi32(v1idx, _MM_PERM_BADC);
1214 v1 = _mm512_i32logather_pd(v1idx, a, 8);
1215 v2 = _mm512_i32logather_pd(v2idx, a, 8);
// Aligned store of all sixteen doubles; `a` must be 64-byte aligned.
1218 void scatter(ValueType a[])
const {
1219 _mm512_store_pd(a, v1);
1220 _mm512_store_pd(a+8, v2);
// Aligned store (enclosing declaration not visible here).
1224 _mm512_store_pd(a, v1);
1225 _mm512_store_pd(a+8, v2);
// Clears all sixteen lanes.
1229 v1 = _mm512_setzero_pd();
1230 v2 = _mm512_setzero_pd();
// Lane-wise add of t into this vector.
1234 v1 = _mm512_add_pd(v1, t.v1);
1235 v2 = _mm512_add_pd(v2, t.v2);
// Lane-wise multiply of this vector by t.
1239 v1 = _mm512_mul_pd(v1, t.v1);
1240 v2 = _mm512_mul_pd(v2, t.v2);
// Fused multiply-add per register: v = t1 * t2 + v.
1245 v1 = _mm512_fmadd_pd(t1.v1, t2.v1, v1);
1246 v2 = _mm512_fmadd_pd(t1.v2, t2.v2, v2);
// Returns the sum of all sixteen lanes.
1250 return _mm512_reduce_add_pd(v1) + _mm512_reduce_add_pd(v2);
// Sixteen-element double variant on two 512-bit registers where the first
// register (v1) is always fully used and a lane mask limits the second (v2).
// NOTE(review): the class header and several member declarations are not
// present in this part of the file; comments describe only what is visible.
1261 typedef double ValueType;
1262 static const int Num = 16;
// Builds the mask for v2 with the low (sz-7) bits set; the same expression is
// repeated in each constructor below.
// NOTE(review): with v1 always covering 8 elements this makes 8 + (sz-7) =
// sz+1 elements active, whereas the generic size-limited TinyVec in this file
// loops `i<sz`. Confirm whether sz is a count or a last index.
// NOTE(review): for sz < 7 the shift count is negative, which is undefined
// behaviour in C++ -- confirm callers never pass such values.
1265 mask = _mm512_int2mask((1 << (sz-7))-1);
// Constructor from a value array and a size.
1268 TinyVec(
const ValueType a[],
int sz) {
1269 mask = _mm512_int2mask((1 << (sz-7))-1);
// Constructor from a value array, an index array, and a size.
1273 template <
typename OrdinalType>
1274 TinyVec(
const ValueType a[],
const OrdinalType idx[],
int sz) {
1275 mask = _mm512_int2mask((1 << (sz-7))-1);
// Constructor from a single value and a size.
1279 TinyVec(
const ValueType a,
int sz) {
1280 mask = _mm512_int2mask((1 << (sz-7))-1);
// Copies both registers from `tv`. The statement appears twice; the two
// enclosing declarations are not visible here.
1286 v1 = tv.v1; v2 = tv.v2;
1291 v1 = tv.v1; v2 = tv.v2;
// Loads elements 0..7 with a full aligned load and elements 8.. with a masked
// load into a zeroed register, so inactive lanes of v2 end up as 0.
1295 void load(
const ValueType a[]) {
1296 v1 = _mm512_load_pd(a);
1297 v2 = _mm512_setzero_pd();
1298 v2 = _mm512_mask_load_pd(v2, mask, a+8);
// Broadcasts the scalar to all sixteen lanes.
// NOTE(review): this ignores the mask, so inactive lanes of v2 hold `a` rather
// than 0 as they do after the array load above -- confirm this is intended.
1301 void load(
const ValueType a) {
1302 v1 = _mm512_set1_pd(a);
1303 v2 = _mm512_set1_pd(a);
// Same full-plus-masked load (enclosing declaration not visible here).
1307 v1 = _mm512_load_pd(a);
1308 v2 = _mm512_setzero_pd();
1309 v2 = _mm512_mask_load_pd(v2, mask, a+8);
// Indexed load. The first eight elements are gathered unmasked from the low
// eight indices. The 128-bit lanes of the index register are then permuted to
// bring indices 8..15 into the low half for a masked gather into a zeroed v2.
// NOTE(review): the index load is unmasked and reads a full 64 bytes from idx
// regardless of sz. That presumably requires idx to be 64-byte aligned with 16
// 32-bit entries readable -- confirm against callers.
1312 template <
typename OrdinalType>
1313 void gather(
const ValueType a[],
const OrdinalType idx[]) {
1320 __m512i v1idx = _mm512_load_epi32(idx);
1321 v1 = _mm512_i32logather_pd(v1idx, a, 8);
1323 v1idx = _mm512_permute4f128_epi32(v1idx, _MM_PERM_BADC);
1324 v2 = _mm512_setzero_pd();
1325 v2 = _mm512_mask_i32logather_pd(v2, mask, v1idx, a, 8);
// Stores elements 0..7 in full and only the masked lanes of v2 to a+8.
1328 void scatter(ValueType a[])
const {
1329 _mm512_store_pd(a, v1);
1330 _mm512_mask_store_pd(a+8, mask, v2);
// Same full-plus-masked store (enclosing declaration not visible here).
1334 _mm512_store_pd(a, v1);
1335 _mm512_mask_store_pd(a+8, mask, v2);
// Clears all sixteen lanes, including inactive ones.
1339 v1 = _mm512_setzero_pd();
1340 v2 = _mm512_setzero_pd();
// Adds t lane-wise; v2 is updated on masked lanes only.
1344 v1 = _mm512_add_pd(v1, t.v1);
1345 v2 = _mm512_mask_add_pd(v2, mask, v2, t.v2);
// Multiplies by t lane-wise; v2 is updated on masked lanes only.
1349 v1 = _mm512_mul_pd(v1, t.v1);
1350 v2 = _mm512_mask_mul_pd(v2, mask, v2, t.v2);
// Fused multiply-add v = t1 * t2 + v; v2 is updated on masked lanes only.
1355 v1 = _mm512_fmadd_pd(t1.v1, t2.v1, v1);
1356 v2 = _mm512_mask3_fmadd_pd(t1.v2, t2.v2, v2, mask);
// Returns the sum of all of v1 plus the masked lanes of v2.
1360 return _mm512_reduce_add_pd(v1) + _mm512_mask_reduce_add_pd(mask, v2);
1369 #endif // #if defined(HAVE_STOKHOS_INTRINSICS) && !defined( __CUDACC__ )
// Flat list of KOKKOS_INLINE_FUNCTION member signatures: constructors,
// operator=, load/aligned_load, gather, scatter/aligned_scatter, zero,
// plus_equal, times_equal, multiply_add and sum. No bodies or terminators
// follow these entries in this part of the file, and several signatures appear
// twice.
// NOTE(review): presumably these belong to the generic and size-limited
// TinyVec variants above -- confirm against the full header.
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a[])
KOKKOS_INLINE_FUNCTION void gather(const ValueType a[], const OrdinalType idx[])
KOKKOS_INLINE_FUNCTION TinyVec & operator=(const TinyVec &tv)
KOKKOS_INLINE_FUNCTION void load(const ValueType a[])
KOKKOS_INLINE_FUNCTION void scatter(ValueType a[]) const
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a[], int size)
KOKKOS_INLINE_FUNCTION void aligned_load(const ValueType a[])
KOKKOS_INLINE_FUNCTION void zero()
KOKKOS_INLINE_FUNCTION TinyVec & operator=(const TinyVec &tv)
KOKKOS_INLINE_FUNCTION void multiply_add(const TinyVec &t1, const TinyVec &t2)
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a[], const OrdinalType idx[], int size)
KOKKOS_INLINE_FUNCTION void times_equal(const TinyVec &t)
KOKKOS_INLINE_FUNCTION TinyVec()
KOKKOS_INLINE_FUNCTION void multiply_add(const TinyVec &t1, const TinyVec &t2)
KOKKOS_INLINE_FUNCTION TinyVec(const TinyVec &tv)
KOKKOS_INLINE_FUNCTION void aligned_scatter(ValueType a[]) const
KOKKOS_INLINE_FUNCTION ValueType sum() const
KOKKOS_INLINE_FUNCTION void load(const ValueType a)
KOKKOS_INLINE_FUNCTION ValueType sum() const
KOKKOS_INLINE_FUNCTION void load(const ValueType a[])
KOKKOS_INLINE_FUNCTION void times_equal(const TinyVec &t)
KOKKOS_INLINE_FUNCTION void scatter(ValueType a[]) const
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a, int size)
KOKKOS_INLINE_FUNCTION void plus_equal(const TinyVec &t)
KOKKOS_INLINE_FUNCTION TinyVec(int size)
KOKKOS_INLINE_FUNCTION void gather(const ValueType a[], const OrdinalType idx[])
KOKKOS_INLINE_FUNCTION void plus_equal(const TinyVec &t)
KOKKOS_INLINE_FUNCTION void aligned_scatter(ValueType a[]) const
KOKKOS_INLINE_FUNCTION void load(const ValueType a)
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a[], const OrdinalType idx[])
KOKKOS_INLINE_FUNCTION TinyVec(const TinyVec &tv)
KOKKOS_INLINE_FUNCTION void zero()
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a)
KOKKOS_INLINE_FUNCTION void aligned_load(const ValueType a[])