10 #ifndef STOKHOS_TINY_VEC_HPP
11 #define STOKHOS_TINY_VEC_HPP
14 #if defined(HAVE_STOKHOS_INTRINSICS) && !defined( __CUDACC__ )
17 #include <immintrin.h>
22 #include "Kokkos_Macros.hpp"
26 #if defined(__INTEL_COMPILER) && ! defined( __CUDA_ARCH__)
// Primary TinyVec template as shown here: a length-N vector of ValueType with
// two boolean policy parameters (UseIntrinsics, and Mask defaulting to false).
// NOTE(review): the embedded listing numbers skip values, so the class header,
// several member signatures and most loop bodies are not visible in this view;
// the comments below describe only what is shown.
28 template <
typename ValueType,
int N,
bool UseIntrinsics,
bool Mask = false >
// Compile-time element count.
32 static const int Num = N;
34 KOKKOS_INLINE_FUNCTION
37 KOKKOS_INLINE_FUNCTION
// Constructor taking a value array and an index array of arbitrary
// OrdinalType (body not visible here).
42 template <
typename OrdinalType>
43 KOKKOS_INLINE_FUNCTION
44 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
48 KOKKOS_INLINE_FUNCTION
53 KOKKOS_INLINE_FUNCTION
// Every element-wise loop in this variant runs over all Num entries and is
// preceded by `#pragma vector aligned`, a compiler hint about aligned access
// (presumably the reason this variant is guarded by __INTEL_COMPILER - confirm).
56 #pragma vector aligned
57 for (
int i=0; i<
Num; ++i)
61 KOKKOS_INLINE_FUNCTION
64 #pragma vector aligned
65 for (
int i=0; i<
Num; ++i)
// load from an array: loops over Num entries (loop body not visible here).
70 KOKKOS_INLINE_FUNCTION
71 void load(
const ValueType a[]) {
73 #pragma vector aligned
74 for (
int i=0; i<
Num; ++i)
// load overload taking a single scalar by value (loop body not visible here).
78 KOKKOS_INLINE_FUNCTION
79 void load(
const ValueType a) {
81 #pragma vector aligned
82 for (
int i=0; i<
Num; ++i)
86 KOKKOS_INLINE_FUNCTION
89 #pragma vector aligned
90 for (
int i=0; i<
Num; ++i)
// gather: takes a value array plus an index array (loop body not visible here).
94 template <
typename OrdinalType>
95 KOKKOS_INLINE_FUNCTION
96 void gather(
const ValueType a[],
const OrdinalType idx[]) {
98 #pragma vector aligned
99 for (
int i=0; i<
Num; ++i)
// scatter: const member writing into caller-provided array a
// (loop body not visible here).
103 KOKKOS_INLINE_FUNCTION
104 void scatter(ValueType a[])
const {
106 #pragma vector aligned
107 for (
int i=0; i<
Num; ++i)
111 KOKKOS_INLINE_FUNCTION
114 #pragma vector aligned
115 for (
int i=0; i<
Num; ++i)
119 KOKKOS_INLINE_FUNCTION
122 #pragma vector aligned
123 for (
int i=0; i<
Num; ++i)
// Sets every element to zero (converted to ValueType).
124 v[i] = ValueType(0.0);
127 KOKKOS_INLINE_FUNCTION
130 #pragma vector aligned
131 for (
int i=0; i<
Num; ++i)
135 KOKKOS_INLINE_FUNCTION
138 #pragma vector aligned
139 for (
int i=0; i<
Num; ++i)
144 KOKKOS_INLINE_FUNCTION
147 #pragma vector aligned
148 for (
int i=0; i<
Num; ++i)
// Accumulates the element-wise product of t1 and t2 into this vector.
149 v[i] += t1.v[i]*t2.v[i];
// sum: const member returning a ValueType (accumulation statement and return
// are not visible here).
152 KOKKOS_INLINE_FUNCTION
153 ValueType
sum()
const {
156 #pragma vector aligned
157 for (
int i=0; i<
Num; ++i)
// Element storage: Num values, 64-byte aligned via the GNU-style attribute.
163 ValueType
v[
Num] __attribute__((aligned(64)));
// Partial specialisation of TinyVec for Mask == true: same storage of Num
// elements, but every loop shown runs to a runtime length `sz` instead of Num.
// NOTE(review): the listing has gaps; the declaration of `sz`, several member
// signatures and most loop bodies are not visible in this view.
166 template <
typename ValueType,
int N,
bool UseIntrinsics >
167 class TinyVec<ValueType,N,UseIntrinsics,
true> {
// Compile-time capacity.
170 static const int Num = N;
// Stores the runtime length; elements are not initialised by this constructor.
// NOTE(review): nothing shown checks size against Num - presumably callers
// guarantee size <= Num; confirm.
172 KOKKOS_INLINE_FUNCTION
173 TinyVec(
int size) { sz = size; }
// Constructor from a value array and a runtime length (body not visible here).
175 KOKKOS_INLINE_FUNCTION
176 TinyVec(
const ValueType a[],
int size) {
// Constructor from a value array, an index array and a runtime length
// (body not visible here).
181 template <
typename OrdinalType>
182 KOKKOS_INLINE_FUNCTION
183 TinyVec(
const ValueType a[],
const OrdinalType idx[],
int size) {
// Constructor from a single scalar and a runtime length (body not visible here).
188 KOKKOS_INLINE_FUNCTION
189 TinyVec(
const ValueType a,
int size) {
194 KOKKOS_INLINE_FUNCTION
// Loops in this specialisation carry `#pragma vector aligned` (a compiler hint
// about aligned access) and iterate over the first sz entries only.
198 #pragma vector aligned
199 for (
int i=0; i<sz; ++i)
203 KOKKOS_INLINE_FUNCTION
207 #pragma vector aligned
208 for (
int i=0; i<sz; ++i)
// load from an array (loop body not visible here).
213 KOKKOS_INLINE_FUNCTION
214 void load(
const ValueType a[]) {
216 #pragma vector aligned
217 for (
int i=0; i<sz; ++i)
// load overload taking a single scalar by value (loop body not visible here).
221 KOKKOS_INLINE_FUNCTION
222 void load(
const ValueType a) {
224 #pragma vector aligned
225 for (
int i=0; i<sz; ++i)
229 KOKKOS_INLINE_FUNCTION
232 #pragma vector aligned
233 for (
int i=0; i<sz; ++i)
// gather: takes a value array plus an index array (loop body not visible here).
237 template <
typename OrdinalType>
238 KOKKOS_INLINE_FUNCTION
239 void gather(
const ValueType a[],
const OrdinalType idx[]) {
241 #pragma vector aligned
242 for (
int i=0; i<sz; ++i)
// scatter: const member writing into caller-provided array a
// (loop body not visible here).
246 KOKKOS_INLINE_FUNCTION
247 void scatter(ValueType a[])
const {
249 #pragma vector aligned
250 for (
int i=0; i<sz; ++i)
254 KOKKOS_INLINE_FUNCTION
257 #pragma vector aligned
258 for (
int i=0; i<sz; ++i)
262 KOKKOS_INLINE_FUNCTION
265 #pragma vector aligned
266 for (
int i=0; i<sz; ++i)
// Zeroes the first sz elements only; entries from sz up to Num are untouched.
267 v[i] = ValueType(0.0);
270 KOKKOS_INLINE_FUNCTION
273 #pragma vector aligned
274 for (
int i=0; i<sz; ++i)
278 KOKKOS_INLINE_FUNCTION
281 #pragma vector aligned
282 for (
int i=0; i<sz; ++i)
287 KOKKOS_INLINE_FUNCTION
290 #pragma vector aligned
291 for (
int i=0; i<sz; ++i)
// Accumulates the element-wise product of t1 and t2 over the first sz entries.
292 v[i] += t1.v[i]*t2.v[i];
// sum: const member returning a ValueType (accumulation and return statements
// are not visible here).
295 KOKKOS_INLINE_FUNCTION
296 ValueType
sum()
const {
299 #pragma vector aligned
300 for (
int i=0; i<sz; ++i)
// Element storage: Num values, 64-byte aligned via the GNU-style attribute.
306 ValueType
v[
Num] __attribute__((aligned(64)));
// Second copy of the primary TinyVec template header (same four parameters,
// Mask defaulting to false). The loops below carry no vectorisation pragmas.
// NOTE(review): presumably this is the alternative preprocessor branch to the
// pragma-annotated variant - the #else line is not visible in this view; confirm.
// The class header, storage declaration and most loop bodies are also missing
// from this listing.
312 template <
typename ValueType,
int N,
bool UseIntrinsics,
bool Mask = false >
// Compile-time element count.
316 static const int Num = N;
318 KOKKOS_INLINE_FUNCTION
321 KOKKOS_INLINE_FUNCTION
// Constructor taking a value array and an index array of arbitrary
// OrdinalType (body not visible here).
326 template <
typename OrdinalType>
327 KOKKOS_INLINE_FUNCTION
328 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
332 KOKKOS_INLINE_FUNCTION
337 KOKKOS_INLINE_FUNCTION
// All loops in this variant run over the full Num entries.
339 for (
int i=0; i<
Num; ++i)
343 KOKKOS_INLINE_FUNCTION
345 for (
int i=0; i<
Num; ++i)
// load from an array (loop body not visible here).
350 KOKKOS_INLINE_FUNCTION
351 void load(
const ValueType a[]) {
352 for (
int i=0; i<
Num; ++i)
356 KOKKOS_INLINE_FUNCTION
358 for (
int i=0; i<
Num; ++i)
362 KOKKOS_INLINE_FUNCTION
364 for (
int i=0; i<
Num; ++i)
// gather: takes a value array plus an index array (loop body not visible here).
368 template <
typename OrdinalType>
369 KOKKOS_INLINE_FUNCTION
370 void gather(
const ValueType a[],
const OrdinalType idx[]) {
371 for (
int i=0; i<
Num; ++i)
375 KOKKOS_INLINE_FUNCTION
377 for (
int i=0; i<
Num; ++i)
381 KOKKOS_INLINE_FUNCTION
383 for (
int i=0; i<
Num; ++i)
387 KOKKOS_INLINE_FUNCTION
389 for (
int i=0; i<
Num; ++i)
// Sets every element to zero (converted to ValueType).
390 v[i] = ValueType(0.0);
393 KOKKOS_INLINE_FUNCTION
395 for (
int i=0; i<
Num; ++i)
399 KOKKOS_INLINE_FUNCTION
401 for (
int i=0; i<
Num; ++i)
406 KOKKOS_INLINE_FUNCTION
408 for (
int i=0; i<
Num; ++i)
// Accumulates the element-wise product of t1 and t2 into this vector.
409 v[i] += t1.v[i]*t2.v[i];
412 KOKKOS_INLINE_FUNCTION
415 for (
int i=0; i<
Num; ++i)
// Three-parameter template header followed by members whose loops run to a
// runtime length `sz`, without vectorisation pragmas.
// NOTE(review): presumably the Mask == true partial specialisation for the
// non-pragma branch - the `class TinyVec<...>` line, the declarations of `sz`
// and `v`, and most loop bodies are not visible in this view; confirm.
424 template <
typename ValueType,
int N,
bool UseIntrinsics >
// Compile-time capacity.
428 static const int Num = N;
430 KOKKOS_INLINE_FUNCTION
433 KOKKOS_INLINE_FUNCTION
// Constructor from a value array, an index array and a runtime length
// (body not visible here).
439 template <
typename OrdinalType>
440 KOKKOS_INLINE_FUNCTION
441 TinyVec(
const ValueType a[],
const OrdinalType idx[],
int size) {
446 KOKKOS_INLINE_FUNCTION
452 KOKKOS_INLINE_FUNCTION
// All loops in this variant iterate over the first sz entries only.
455 for (
int i=0; i<sz; ++i)
459 KOKKOS_INLINE_FUNCTION
462 for (
int i=0; i<sz; ++i)
// load from an array (loop body not visible here).
467 KOKKOS_INLINE_FUNCTION
468 void load(
const ValueType a[]) {
469 for (
int i=0; i<sz; ++i)
473 KOKKOS_INLINE_FUNCTION
475 for (
int i=0; i<sz; ++i)
479 KOKKOS_INLINE_FUNCTION
481 for (
int i=0; i<sz; ++i)
// gather: takes a value array plus an index array (loop body not visible here).
485 template <
typename OrdinalType>
486 KOKKOS_INLINE_FUNCTION
487 void gather(
const ValueType a[],
const OrdinalType idx[]) {
488 for (
int i=0; i<sz; ++i)
492 KOKKOS_INLINE_FUNCTION
494 for (
int i=0; i<sz; ++i)
498 KOKKOS_INLINE_FUNCTION
500 for (
int i=0; i<sz; ++i)
504 KOKKOS_INLINE_FUNCTION
506 for (
int i=0; i<sz; ++i)
// Zeroes the first sz elements only.
507 v[i] = ValueType(0.0);
510 KOKKOS_INLINE_FUNCTION
512 for (
int i=0; i<sz; ++i)
516 KOKKOS_INLINE_FUNCTION
518 for (
int i=0; i<sz; ++i)
523 KOKKOS_INLINE_FUNCTION
525 for (
int i=0; i<sz; ++i)
// Accumulates the element-wise product of t1 and t2 over the first sz entries.
526 v[i] += t1.v[i]*t2.v[i];
529 KOKKOS_INLINE_FUNCTION
532 for (
int i=0; i<sz; ++i)
544 #if defined(HAVE_STOKHOS_INTRINSICS) && !defined( __CUDACC__ )
// Members of a two-element double vector implemented with 128-bit SSE2
// intrinsics (`_mm_*_pd`) on a single register member `v`.
// NOTE(review): the class header and the declaration of `v` are not visible in
// this view; several member signatures are missing as well.
551 typedef double ValueType;
552 static const int Num = 2;
// Constructor from a value array and an index array (body not visible here).
560 template <
typename OrdinalType>
561 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
// load from an array: _mm_set_pd takes the high element first, so a[0] lands in
// the low lane and a[1] in the high lane. No alignment requirement on a.
578 void load(
const ValueType a[]) {
579 v = _mm_set_pd(a[1], a[0]);
// load overload taking a single scalar (body not visible here).
582 void load(
const ValueType a) {
// gather: low lane <- a[idx[0]], high lane <- a[idx[1]].
590 template <
typename OrdinalType>
591 void gather(
const ValueType a[],
const OrdinalType idx[]) {
592 v = _mm_set_pd(a[idx[1]], a[idx[0]]);
// scatter: writes the low lane to a[0] and the high lane to a[1] with two
// scalar stores.
595 void scatter(ValueType a[])
const {
596 _mm_storel_pd(&a[0],
v);
597 _mm_storeh_pd(&a[1],
v);
// Sets both lanes to zero.
605 v = _mm_setzero_pd();
// Lane-wise v += t.v.
609 v = _mm_add_pd(
v, t.v);
// Lane-wise v *= t.v.
613 v = _mm_mul_pd(
v, t.v);
// Lane-wise v += t1.v * t2.v, done as a separate multiply then add.
618 __m128d t = _mm_mul_pd(t1.v, t2.v);
619 v = _mm_add_pd(
v, t);
// sum: const member returning a ValueType (body not visible here).
622 ValueType
sum()
const {
// Full specialisation TinyVec<float,8,true,false>: eight floats held in one
// 256-bit AVX register member `v`.
// NOTE(review): the declaration of `v` and several member signatures are not
// visible in this view.
635 class TinyVec<float,8,
true,false> {
638 typedef float ValueType;
639 static const int Num = 8;
// Constructor from a value array and an index array (body not visible here).
647 template <
typename OrdinalType>
648 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
// load from an array using the unaligned-load intrinsic.
665 void load(
const ValueType a[]) {
666 v = _mm256_loadu_ps(a);
// load from a scalar: broadcasts a to all eight lanes.
669 void load(
const ValueType a) {
670 v = _mm256_set1_ps(a);
// Aligned-load intrinsic; the enclosing member's signature is not visible here
// (presumably aligned_load - confirm). Requires a to be 32-byte aligned.
674 v = _mm256_load_ps(a);
// gather: builds the low half from a[idx[0..3]] and the high half from
// a[idx[4..7]] (set_ps lists the highest element first), then inserts them
// into the low and high 128-bit halves of v.
// NOTE(review): the first insert passes the current v as its base operand; both
// halves are overwritten, so prior contents do not survive, but v is formally
// read before being written if this runs on a freshly constructed object.
677 template <
typename OrdinalType>
678 void gather(
const ValueType a[],
const OrdinalType idx[]) {
679 __m128 v1 = _mm_set_ps(a[idx[3]], a[idx[2]], a[idx[1]], a[idx[0]]);
680 __m128 v2 = _mm_set_ps(a[idx[7]], a[idx[6]], a[idx[5]], a[idx[4]]);
681 v = _mm256_insertf128_ps(
v, v1, 0);
682 v = _mm256_insertf128_ps(
v, v2, 1);
// scatter: unaligned store of all eight lanes to a.
685 void scatter(ValueType a[])
const {
686 _mm256_storeu_ps(a,
v);
// Aligned store; enclosing signature not visible here (presumably
// aligned_scatter - confirm).
690 _mm256_store_ps(a,
v);
// Sets all lanes to zero.
694 v = _mm256_setzero_ps();
// Lane-wise v += t.v.
698 v = _mm256_add_ps(
v, t.v);
// Lane-wise v *= t.v.
702 v = _mm256_mul_ps(
v, t.v);
// Lane-wise v += t1.v * t2.v, done as a separate multiply then add.
707 __m256 t = _mm256_mul_ps(t1.v, t2.v);
708 v = _mm256_add_ps(
v, t);
// Horizontal reduction: pairwise add within each 128-bit half, add the two
// halves together, pairwise add once more, then extract lane 0 into `res`
// (declared on a line not visible here).
712 __m256 s = _mm256_hadd_ps(
v,
v);
713 __m128 sl = _mm256_extractf128_ps(s, 0);
714 __m128 sh = _mm256_extractf128_ps(s, 1);
715 sl = _mm_add_ps(sl,sh);
716 sl = _mm_hadd_ps(sl,sl);
718 _MM_EXTRACT_FLOAT(res, sl, 0);
// Members of a four-element double vector held in one 256-bit AVX register
// member `v`.
// NOTE(review): the class header, the declaration of `v` and several member
// signatures are not visible in this view.
731 typedef double ValueType;
732 static const int Num = 4;
// Constructor from a value array and an index array (body not visible here).
740 template <
typename OrdinalType>
741 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
// load from an array using the unaligned-load intrinsic.
758 void load(
const ValueType a[]) {
759 v = _mm256_loadu_pd(a);
// load from a scalar: broadcasts a to all four lanes.
762 void load(
const ValueType a) {
763 v = _mm256_set1_pd(a);
// Aligned-load intrinsic; enclosing signature not visible here (presumably
// aligned_load - confirm). Requires a to be 32-byte aligned.
767 v = _mm256_load_pd(a);
// gather: low half <- {a[idx[0]], a[idx[1]]}, high half <- {a[idx[2]], a[idx[3]]}
// (set_pd lists the higher element first), inserted into v half by half.
// NOTE(review): the first insert passes the current v as its base operand; both
// halves are overwritten, so prior contents do not survive.
770 template <
typename OrdinalType>
771 void gather(
const ValueType a[],
const OrdinalType idx[]) {
772 __m128d v1 = _mm_set_pd(a[idx[1]], a[idx[0]]);
773 __m128d v2 = _mm_set_pd(a[idx[3]], a[idx[2]]);
774 v = _mm256_insertf128_pd(
v, v1, 0);
775 v = _mm256_insertf128_pd(
v, v2, 1);
// scatter: unaligned store of all four lanes to a.
778 void scatter(ValueType a[])
const {
779 _mm256_storeu_pd(a,
v);
// Aligned store; enclosing signature not visible here (presumably
// aligned_scatter - confirm).
783 _mm256_store_pd(a,
v);
// Sets all lanes to zero.
787 v = _mm256_setzero_pd();
// Lane-wise v += t.v.
791 v = _mm256_add_pd(
v, t.v);
// Lane-wise v *= t.v.
795 v = _mm256_mul_pd(
v, t.v);
// Lane-wise v += t1.v * t2.v, done as a separate multiply then add.
800 __m256d t = _mm256_mul_pd(t1.v, t2.v);
801 v = _mm256_add_pd(
v, t);
// Horizontal reduction: pairwise add within each 128-bit half, add the two
// halves, then store the low lane into `res` (declared on a line not visible
// here).
817 __m256d s = _mm256_hadd_pd(
v,
v);
818 __m128d sl = _mm256_extractf128_pd(s, 0);
819 __m128d sh = _mm256_extractf128_pd(s, 1);
820 sl = _mm_add_pd(sl,sh);
822 _mm_storel_pd(&res, sl);
// Members of an eight-element double vector held in two 256-bit AVX register
// members: v1 covers elements 0..3 and v2 covers elements 4..7 (see load).
// NOTE(review): the class header, the declarations of v1/v2 and several member
// signatures are not visible in this view.
834 typedef double ValueType;
835 static const int Num = 8;
// Constructor from a value array and an index array (body not visible here).
843 template <
typename OrdinalType>
844 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
// Two copy paths from another instance `tv`; their signatures are not visible
// here (presumably the copy constructor and copy assignment - confirm).
853 v1 = tv.v1; v2 = tv.v2;
857 v1 = tv.v1; v2 = tv.v2;
// load from an array with unaligned loads: a[0..3] -> v1, a[4..7] -> v2.
861 void load(
const ValueType a[]) {
862 v1 = _mm256_loadu_pd(a);
863 v2 = _mm256_loadu_pd(a+4);
// load from a scalar: broadcasts a to every lane of both registers.
866 void load(
const ValueType a) {
867 v1 = _mm256_set1_pd(a);
868 v2 = _mm256_set1_pd(a);
// Aligned loads of both halves; enclosing signature not visible here
// (presumably aligned_load - confirm). Requires a to be 32-byte aligned.
872 v1 = _mm256_load_pd(a);
873 v2 = _mm256_load_pd(a+4);
// gather: builds four 128-bit pairs from a[idx[0..7]] (set_pd lists the higher
// element first) and inserts them into the low/high halves of v1 and v2.
// NOTE(review): the first insert into each register passes the current v1/v2 as
// its base operand; both halves are overwritten, so prior contents do not
// survive.
876 template <
typename OrdinalType>
877 void gather(
const ValueType a[],
const OrdinalType idx[]) {
878 __m128d t1 = _mm_set_pd(a[idx[1]], a[idx[0]]);
879 __m128d t2 = _mm_set_pd(a[idx[3]], a[idx[2]]);
880 __m128d t3 = _mm_set_pd(a[idx[5]], a[idx[4]]);
881 __m128d t4 = _mm_set_pd(a[idx[7]], a[idx[6]]);
882 v1 = _mm256_insertf128_pd(v1, t1, 0);
883 v1 = _mm256_insertf128_pd(v1, t2, 1);
884 v2 = _mm256_insertf128_pd(v2, t3, 0);
885 v2 = _mm256_insertf128_pd(v2, t4, 1);
// scatter: unaligned stores, v1 -> a[0..3], v2 -> a[4..7].
888 void scatter(ValueType a[])
const {
889 _mm256_storeu_pd(a, v1);
890 _mm256_storeu_pd(a+4, v2);
// Aligned stores of both halves; enclosing signature not visible here
// (presumably aligned_scatter - confirm).
894 _mm256_store_pd(a, v1);
895 _mm256_store_pd(a+4, v2);
// Sets all lanes of both registers to zero.
899 v1 = _mm256_setzero_pd();
900 v2 = _mm256_setzero_pd();
// Lane-wise += t on both registers.
904 v1 = _mm256_add_pd(v1, t.v1);
905 v2 = _mm256_add_pd(v2, t.v2);
// Lane-wise *= t on both registers.
909 v1 = _mm256_mul_pd(v1, t.v1);
910 v2 = _mm256_mul_pd(v2, t.v2);
// Lane-wise += t1 * t2 on both registers, as separate multiply then add.
915 __m256d t = _mm256_mul_pd(t1.v1, t2.v1);
916 __m256d s = _mm256_mul_pd(t1.v2, t2.v2);
917 v1 = _mm256_add_pd(v1, t);
918 v2 = _mm256_add_pd(v2, s);
// Horizontal reduction of v1 into `res1`: pairwise add within each 128-bit
// half, add the halves, store the low lane. `res1` is declared on a line not
// visible here.
922 __m256d s1 = _mm256_hadd_pd(v1,v1);
923 __m128d s1l = _mm256_extractf128_pd(s1, 0);
924 __m128d s1h = _mm256_extractf128_pd(s1, 1);
925 s1l = _mm_add_pd(s1l,s1h);
927 _mm_storel_pd(&res1, s1l);
// Same reduction for v2 into `res2`; how res1 and res2 are combined is not
// visible here.
929 __m256d s2 = _mm256_hadd_pd(v2,v2);
930 __m128d s2l = _mm256_extractf128_pd(s2, 0);
931 __m128d s2h = _mm256_extractf128_pd(s2, 1);
932 s2l = _mm_add_pd(s2l,s2h);
934 _mm_storel_pd(&res2, s2l);
944 #if defined( __MIC__ )
// Members of an eight-element double vector held in one 512-bit register
// member `v`, using `_mm512_*` intrinsics.
// NOTE(review): the class header, the declaration of `v` and several member
// signatures are not visible in this view.
949 typedef double ValueType;
950 static const int Num = 8;
// Constructor from a value array and an index array (body not visible here).
958 template <
typename OrdinalType>
959 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
// load from an array.
// NOTE(review): this uses the aligned-load intrinsic, the same one as the
// second load path further down, so a presumably must be 64-byte aligned even
// for plain load() - confirm against callers.
976 void load(
const ValueType a[]) {
977 v = _mm512_load_pd(a);
// load from a scalar: broadcasts a to all eight lanes.
980 void load(
const ValueType a) {
981 v = _mm512_set1_pd(a);
// Aligned load; enclosing signature not visible here (presumably aligned_load -
// confirm).
985 v = _mm512_load_pd(a);
// gather: mask 255 selects the low eight 32-bit lanes, so only eight indices
// are read from idx into a zeroed index register; the gather then fetches
// a[index] with a byte scale of 8 (the size of a double).
// NOTE(review): assumes OrdinalType is a 32-bit integer - confirm.
988 template <
typename OrdinalType>
989 void gather(
const ValueType a[],
const OrdinalType idx[]) {
990 __mmask16 mask = _mm512_int2mask(255);
991 __m512i vidx = _mm512_setzero_epi32();
992 vidx = _mm512_mask_load_epi32(vidx, mask, idx);
993 v = _mm512_i32logather_pd(vidx, a, 8);
// scatter: stores all eight lanes to a with the aligned-store intrinsic.
996 void scatter(ValueType a[])
const {
997 _mm512_store_pd(a,
v);
// Same aligned store; enclosing signature not visible here (presumably
// aligned_scatter - confirm).
1001 _mm512_store_pd(a,
v);
// Sets all lanes to zero.
1005 v = _mm512_setzero_pd();
// Lane-wise v += t.v.
1009 v = _mm512_add_pd(
v, t.v);
// Lane-wise v *= t.v.
1013 v = _mm512_mul_pd(
v, t.v);
// Lane-wise fused multiply-add: v = t1.v * t2.v + v.
1018 v = _mm512_fmadd_pd(t1.v, t2.v,
v);
// Returns the sum of all eight lanes.
1022 return _mm512_reduce_add_pd(
v);
// Members of an eight-element double vector held in one 512-bit register `v`
// together with a lane mask `mask` derived from a runtime size `sz`.
// NOTE(review): the class header, member declarations and several member
// signatures are not visible in this view.
1033 typedef double ValueType;
1034 static const int Num = 8;
// Every constructor builds the mask as (1 << (sz+1)) - 1, i.e. the low sz+1
// bits set.
// NOTE(review): if sz is an element count this enables one lane more than sz
// (a count of sz would be (1 << sz) - 1) - confirm the intended meaning of sz
// before relying on masked lanes.
1037 mask = _mm512_int2mask((1 << (sz+1))-1);
// Constructor from a value array and a size (rest of body not visible here).
1040 TinyVec(
const ValueType a[],
const int sz) {
1041 mask = _mm512_int2mask((1 << (sz+1))-1);
// Constructor from a value array, an index array and a size (rest of body not
// visible here).
1045 template <
typename OrdinalType>
1046 TinyVec(
const ValueType a[],
const OrdinalType idx[],
const int sz) {
1047 mask = _mm512_int2mask((1 << (sz+1))-1);
// Constructor from a scalar and a size (rest of body not visible here).
1051 TinyVec(
const ValueType a,
int sz) {
1052 mask = _mm512_int2mask((1 << (sz+1))-1);
// load from an array: v is zeroed first, then only the masked lanes are read
// from a, so unmasked lanes stay zero.
1067 void load(
const ValueType a[]) {
1068 v = _mm512_setzero_pd();
1069 v = _mm512_mask_load_pd(
v, mask, a);
// load from a scalar: broadcasts a to all lanes; the mask is not applied here.
1072 void load(
const ValueType a) {
1073 v = _mm512_set1_pd(a);
// Same zero-then-masked-load sequence; enclosing signature not visible here
// (presumably aligned_load - confirm).
1077 v = _mm512_setzero_pd();
1078 v = _mm512_mask_load_pd(
v, mask, a);
// gather: loads a full 512-bit vector of 32-bit indices from idx, zeroes v, and
// gathers only the masked lanes from a with a byte scale of 8.
// NOTE(review): the index load is unmasked and uses the aligned-load intrinsic,
// so it reads 64 bytes from idx - presumably idx is 64-byte aligned, padded, and
// holds 32-bit integers; confirm.
1081 template <
typename OrdinalType>
1082 void gather(
const ValueType a[],
const OrdinalType idx[]) {
1088 __m512i vidx = _mm512_load_epi32(idx);
1089 v = _mm512_setzero_pd();
1090 v = _mm512_mask_i32logather_pd(
v, mask, vidx, a, 8);
// scatter: stores only the masked lanes to a.
1093 void scatter(ValueType a[])
const {
1094 _mm512_mask_store_pd(a, mask,
v);
// Same masked store; enclosing signature not visible here (presumably
// aligned_scatter - confirm).
1098 _mm512_mask_store_pd(a, mask,
v);
// Sets all lanes to zero (unmasked).
1102 v = _mm512_setzero_pd();
// Masked lane-wise v += t.v; unmasked lanes keep their current v value.
1106 v = _mm512_mask_add_pd(
v, mask,
v, t.v);
// Masked lane-wise v *= t.v; unmasked lanes keep their current v value.
1110 v = _mm512_mask_mul_pd(
v, mask,
v, t.v);
// Masked fused multiply-add: v = t1.v * t2.v + v in masked lanes, v unchanged
// elsewhere.
1115 v = _mm512_mask3_fmadd_pd(t1.v, t2.v,
v, mask);
// Returns the sum of the masked lanes only.
1119 return _mm512_mask_reduce_add_pd(mask,
v);
// Members of a sixteen-element double vector held in two 512-bit register
// members: v1 covers elements 0..7 and v2 covers elements 8..15 (see load).
// NOTE(review): the class header, the declarations of v1/v2 and several member
// signatures are not visible in this view.
1131 typedef double ValueType;
1132 static const int Num = 16;
// Constructor from a value array (body not visible here).
1136 TinyVec(
const ValueType a[]) {
// Constructor from a value array and an index array (body not visible here).
1140 template <
typename OrdinalType>
1141 TinyVec(
const ValueType a[],
const OrdinalType idx[]) {
// Two copy paths from another instance `tv`; their signatures are not visible
// here (presumably the copy constructor and copy assignment - confirm).
1150 v1 = tv.v1; v2 = tv.v2;
1154 v1 = tv.v1; v2 = tv.v2;
// load from an array: a[0..7] -> v1, a[8..15] -> v2.
// NOTE(review): uses the aligned-load intrinsic, so a presumably must be
// 64-byte aligned even for plain load() - confirm against callers.
1158 void load(
const ValueType a[]) {
1159 v1 = _mm512_load_pd(a);
1160 v2 = _mm512_load_pd(a+8);
// load from a scalar: broadcasts a to every lane of both registers.
1163 void load(
const ValueType a) {
1164 v1 = _mm512_set1_pd(a);
1165 v2 = _mm512_set1_pd(a);
// Same aligned loads; enclosing signature not visible here (presumably
// aligned_load - confirm).
1169 v1 = _mm512_load_pd(a);
1170 v2 = _mm512_load_pd(a+8);
// gather: loads sixteen 32-bit indices from idx in one aligned 512-bit load,
// gathers v1 from that register, then gathers v2 from a 128-bit-lane
// permutation of it, each with a byte scale of 8.
// NOTE(review): presumably the gather consumes only the low eight indices and
// _MM_PERM_BADC moves the upper eight into the low half - confirm; also assumes
// idx holds 32-bit integers and is 64-byte aligned.
1173 template <
typename OrdinalType>
1174 void gather(
const ValueType a[],
const OrdinalType idx[]) {
1180 __m512i v1idx = _mm512_load_epi32(idx);
1181 __m512i v2idx = _mm512_permute4f128_epi32(v1idx, _MM_PERM_BADC);
1182 v1 = _mm512_i32logather_pd(v1idx, a, 8);
1183 v2 = _mm512_i32logather_pd(v2idx, a, 8);
// scatter: aligned stores, v1 -> a[0..7], v2 -> a[8..15].
1186 void scatter(ValueType a[])
const {
1187 _mm512_store_pd(a, v1);
1188 _mm512_store_pd(a+8, v2);
// Same aligned stores; enclosing signature not visible here (presumably
// aligned_scatter - confirm).
1192 _mm512_store_pd(a, v1);
1193 _mm512_store_pd(a+8, v2);
// Sets all lanes of both registers to zero.
1197 v1 = _mm512_setzero_pd();
1198 v2 = _mm512_setzero_pd();
// Lane-wise += t on both registers.
1202 v1 = _mm512_add_pd(v1, t.v1);
1203 v2 = _mm512_add_pd(v2, t.v2);
// Lane-wise *= t on both registers.
1207 v1 = _mm512_mul_pd(v1, t.v1);
1208 v2 = _mm512_mul_pd(v2, t.v2);
// Lane-wise fused multiply-add on both registers: v = t1 * t2 + v.
1213 v1 = _mm512_fmadd_pd(t1.v1, t2.v1, v1);
1214 v2 = _mm512_fmadd_pd(t1.v2, t2.v2, v2);
// Returns the sum of all sixteen lanes (per-register reductions added together).
1218 return _mm512_reduce_add_pd(v1) + _mm512_reduce_add_pd(v2);
// Members of a sixteen-element double vector in two 512-bit registers where
// v1 (elements 0..7) is always fully used and v2 (elements 8..15) is governed
// by a lane mask derived from a runtime size `sz`.
// NOTE(review): the class header, member declarations and several member
// signatures are not visible in this view.
1229 typedef double ValueType;
1230 static const int Num = 16;
// Every constructor builds the mask as (1 << (sz-7)) - 1, i.e. the low sz-7
// bits set, and the mask is only ever applied to v2.
// NOTE(review): if sz is an element count, v2 holds sz-8 elements, so this
// enables one lane more than needed (a count would give (1 << (sz-8)) - 1);
// also sz < 7 makes the shift count negative, which is undefined - presumably
// callers guarantee sz >= 8; confirm both.
1233 mask = _mm512_int2mask((1 << (sz-7))-1);
// Constructor from a value array and a size (rest of body not visible here).
1236 TinyVec(
const ValueType a[],
int sz) {
1237 mask = _mm512_int2mask((1 << (sz-7))-1);
// Constructor from a value array, an index array and a size (rest of body not
// visible here).
1241 template <
typename OrdinalType>
1242 TinyVec(
const ValueType a[],
const OrdinalType idx[],
int sz) {
1243 mask = _mm512_int2mask((1 << (sz-7))-1);
// Constructor from a scalar and a size (rest of body not visible here).
1247 TinyVec(
const ValueType a,
int sz) {
1248 mask = _mm512_int2mask((1 << (sz-7))-1);
// Two copy paths from another instance `tv`; their signatures, and whether the
// mask is copied too, are not visible here.
1254 v1 = tv.v1; v2 = tv.v2;
1259 v1 = tv.v1; v2 = tv.v2;
// load from an array: full aligned load into v1; v2 is zeroed and then only its
// masked lanes are read from a+8, so unmasked lanes stay zero.
1263 void load(
const ValueType a[]) {
1264 v1 = _mm512_load_pd(a);
1265 v2 = _mm512_setzero_pd();
1266 v2 = _mm512_mask_load_pd(v2, mask, a+8);
// load from a scalar: broadcasts a to every lane of both registers; the mask is
// not applied here.
1269 void load(
const ValueType a) {
1270 v1 = _mm512_set1_pd(a);
1271 v2 = _mm512_set1_pd(a);
// Same load sequence as load(a[]); enclosing signature not visible here
// (presumably aligned_load - confirm).
1275 v1 = _mm512_load_pd(a);
1276 v2 = _mm512_setzero_pd();
1277 v2 = _mm512_mask_load_pd(v2, mask, a+8);
// gather: loads sixteen 32-bit indices in one aligned 512-bit load, gathers v1
// unmasked, permutes the index register's 128-bit lanes, then gathers only the
// masked lanes of a zeroed v2. Byte scale is 8 in both gathers.
// NOTE(review): the index load is unmasked regardless of sz - presumably idx is
// 64-byte aligned, padded to sixteen entries and holds 32-bit integers; confirm.
1280 template <
typename OrdinalType>
1281 void gather(
const ValueType a[],
const OrdinalType idx[]) {
1288 __m512i v1idx = _mm512_load_epi32(idx);
1289 v1 = _mm512_i32logather_pd(v1idx, a, 8);
1291 v1idx = _mm512_permute4f128_epi32(v1idx, _MM_PERM_BADC);
1292 v2 = _mm512_setzero_pd();
1293 v2 = _mm512_mask_i32logather_pd(v2, mask, v1idx, a, 8);
// scatter: full aligned store of v1 to a[0..7], masked store of v2 to a+8.
1296 void scatter(ValueType a[])
const {
1297 _mm512_store_pd(a, v1);
1298 _mm512_mask_store_pd(a+8, mask, v2);
// Same stores; enclosing signature not visible here (presumably
// aligned_scatter - confirm).
1302 _mm512_store_pd(a, v1);
1303 _mm512_mask_store_pd(a+8, mask, v2);
// Sets all lanes of both registers to zero (unmasked).
1307 v1 = _mm512_setzero_pd();
1308 v2 = _mm512_setzero_pd();
// += t: full width on v1, masked lanes only on v2.
1312 v1 = _mm512_add_pd(v1, t.v1);
1313 v2 = _mm512_mask_add_pd(v2, mask, v2, t.v2);
// *= t: full width on v1, masked lanes only on v2.
1317 v1 = _mm512_mul_pd(v1, t.v1);
1318 v2 = _mm512_mask_mul_pd(v2, mask, v2, t.v2);
// Fused multiply-add v = t1 * t2 + v: full width on v1, masked lanes on v2.
1323 v1 = _mm512_fmadd_pd(t1.v1, t2.v1, v1);
1324 v2 = _mm512_mask3_fmadd_pd(t1.v2, t2.v2, v2, mask);
// Returns the sum of all lanes of v1 plus the masked lanes of v2.
1328 return _mm512_reduce_add_pd(v1) + _mm512_mask_reduce_add_pd(mask, v2);
1337 #endif // #if defined(HAVE_STOKHOS_INTRINSICS) && !defined( __CUDACC__ )
// NOTE(review): the lines below are bare TinyVec member signatures, each
// prefixed with KOKKOS_INLINE_FUNCTION, with no bodies or terminators and no
// enclosing class. They follow the closing #endif of the intrinsics section,
// appear in no particular order, and several are duplicated. This looks like
// extraction residue rather than compilable source - confirm against the
// original header before relying on it.
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a[])
KOKKOS_INLINE_FUNCTION void gather(const ValueType a[], const OrdinalType idx[])
KOKKOS_INLINE_FUNCTION TinyVec & operator=(const TinyVec &tv)
KOKKOS_INLINE_FUNCTION void load(const ValueType a[])
KOKKOS_INLINE_FUNCTION void scatter(ValueType a[]) const
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a[], int size)
KOKKOS_INLINE_FUNCTION void aligned_load(const ValueType a[])
KOKKOS_INLINE_FUNCTION void zero()
KOKKOS_INLINE_FUNCTION TinyVec & operator=(const TinyVec &tv)
KOKKOS_INLINE_FUNCTION void multiply_add(const TinyVec &t1, const TinyVec &t2)
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a[], const OrdinalType idx[], int size)
KOKKOS_INLINE_FUNCTION void times_equal(const TinyVec &t)
KOKKOS_INLINE_FUNCTION TinyVec()
KOKKOS_INLINE_FUNCTION void multiply_add(const TinyVec &t1, const TinyVec &t2)
KOKKOS_INLINE_FUNCTION TinyVec(const TinyVec &tv)
KOKKOS_INLINE_FUNCTION void aligned_scatter(ValueType a[]) const
KOKKOS_INLINE_FUNCTION ValueType sum() const
KOKKOS_INLINE_FUNCTION void load(const ValueType a)
KOKKOS_INLINE_FUNCTION ValueType sum() const
KOKKOS_INLINE_FUNCTION void load(const ValueType a[])
KOKKOS_INLINE_FUNCTION void times_equal(const TinyVec &t)
KOKKOS_INLINE_FUNCTION void scatter(ValueType a[]) const
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a, int size)
KOKKOS_INLINE_FUNCTION void plus_equal(const TinyVec &t)
KOKKOS_INLINE_FUNCTION TinyVec(int size)
KOKKOS_INLINE_FUNCTION void gather(const ValueType a[], const OrdinalType idx[])
KOKKOS_INLINE_FUNCTION void plus_equal(const TinyVec &t)
KOKKOS_INLINE_FUNCTION void aligned_scatter(ValueType a[]) const
KOKKOS_INLINE_FUNCTION void load(const ValueType a)
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a[], const OrdinalType idx[])
KOKKOS_INLINE_FUNCTION TinyVec(const TinyVec &tv)
KOKKOS_INLINE_FUNCTION void zero()
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a)
KOKKOS_INLINE_FUNCTION void aligned_load(const ValueType a[])