21 Ordinal NumPerThread, Ordinal ThreadsPerVector>
23 Ordinal num_vec_threads,
24 Ordinal num_row_threads,
26 typedef Kokkos::Cuda Device;
28 const Ordinal
VectorSize = NumPerThread * ThreadsPerVector;
29 typedef typename Storage::template apply_N<VectorSize>::type
storage_type;
32 const Ordinal nGrid = 5;
33 KokkosSparse::DeviceConfig dev_config(num_blocks, num_vec_threads, num_row_threads);
35 bool success = test_embedded_vector<Vector>(
36 nGrid,
VectorSize, dev_config, MultiplyOp(), out);
43 Kokkos_CrsMatrix_MP, Multiply_Default,
Storage, MultiplyOp )
// Multiply_Default case: 1 entry per thread across 16 threads per vector.
// test_cuda_embedded_vector forms VectorSize = NumPerThread * ThreadsPerVector,
// i.e. 16 here.
46 const Ordinal NumPerThread = 1;
47 const Ordinal ThreadsPerVector = 16;
// All three launch parameters are passed as 0.
// NOTE(review): presumably 0 lets the multiply kernel pick its own launch
// configuration -- confirm against KokkosSparse::DeviceConfig.
49 const Ordinal num_blocks = 0;
50 const Ordinal num_vec_threads = 0;
51 const Ordinal num_row_threads = 0;
// Forward the compile-time sizes and the launch parameters to the shared helper.
54 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
58 Kokkos_CrsMatrix_MP, Multiply_1,
Storage, MultiplyOp )
// Multiply_1 case: 1 entry per thread across 16 threads per vector.
// test_cuda_embedded_vector forms VectorSize = NumPerThread * ThreadsPerVector,
// i.e. 16 here.
61 const Ordinal NumPerThread = 1;
62 const Ordinal ThreadsPerVector = 16;
// Explicit launch parameters: 10 blocks, num_vec_threads tied to
// ThreadsPerVector (16), and 4 row threads.
64 const Ordinal num_blocks = 10;
65 const Ordinal num_vec_threads = ThreadsPerVector;
66 const Ordinal num_row_threads = 4;
// Forward the compile-time sizes and the launch parameters to the shared helper.
69 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
73 Kokkos_CrsMatrix_MP, Multiply_2,
Storage, MultiplyOp )
// Multiply_2 case: 2 entries per thread across 16 threads per vector.
// test_cuda_embedded_vector forms VectorSize = NumPerThread * ThreadsPerVector,
// i.e. 32 here.
76 const Ordinal NumPerThread = 2;
77 const Ordinal ThreadsPerVector = 16;
// Explicit launch parameters: 10 blocks, num_vec_threads tied to
// ThreadsPerVector (16), and 4 row threads.
79 const Ordinal num_blocks = 10;
80 const Ordinal num_vec_threads = ThreadsPerVector;
81 const Ordinal num_row_threads = 4;
// Forward the compile-time sizes and the launch parameters to the shared helper.
84 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
88 Kokkos_CrsMatrix_MP, Multiply_3,
Storage, MultiplyOp )
// Multiply_3 case: 3 entries per thread across 16 threads per vector.
// test_cuda_embedded_vector forms VectorSize = NumPerThread * ThreadsPerVector,
// i.e. 48 here.
91 const Ordinal NumPerThread = 3;
92 const Ordinal ThreadsPerVector = 16;
// Explicit launch parameters: 10 blocks, num_vec_threads tied to
// ThreadsPerVector (16), and 4 row threads.
94 const Ordinal num_blocks = 10;
95 const Ordinal num_vec_threads = ThreadsPerVector;
96 const Ordinal num_row_threads = 4;
// Forward the compile-time sizes and the launch parameters to the shared helper.
99 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
103 Kokkos_CrsMatrix_MP, Multiply_4,
Storage, MultiplyOp )
// Multiply_4 case: 4 entries per thread across 16 threads per vector.
// test_cuda_embedded_vector forms VectorSize = NumPerThread * ThreadsPerVector,
// i.e. 64 here.
106 const Ordinal NumPerThread = 4;
107 const Ordinal ThreadsPerVector = 16;
// Explicit launch parameters: 10 blocks, num_vec_threads tied to
// ThreadsPerVector (16), and 4 row threads.
109 const Ordinal num_blocks = 10;
110 const Ordinal num_vec_threads = ThreadsPerVector;
111 const Ordinal num_row_threads = 4;
// Forward the compile-time sizes and the launch parameters to the shared helper.
114 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
// Instantiates the five Kokkos_CrsMatrix_MP multiply unit tests
// (Multiply_Default, then Multiply_1 through Multiply_4, in that order)
// for a single pairing of storage type (STORAGE) and multiply operator (OP).
#define CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( STORAGE, OP ) \
  TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( Kokkos_CrsMatrix_MP, Multiply_Default, STORAGE, OP ) \
  TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( Kokkos_CrsMatrix_MP, Multiply_1, STORAGE, OP ) \
  TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( Kokkos_CrsMatrix_MP, Multiply_2, STORAGE, OP ) \
  TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( Kokkos_CrsMatrix_MP, Multiply_3, STORAGE, OP ) \
  TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( Kokkos_CrsMatrix_MP, Multiply_4, STORAGE, OP )
// Expands the multiply test suite for every storage/operator pairing:
// {SFS, DS} x {DefaultMultiply, KokkosMultiply}.
// The ORDINAL, SCALAR and DEVICE arguments are accepted but not referenced in
// the expansion; SFS, DS, DefaultMultiply and KokkosMultiply are not defined
// here and must already be in scope where this macro is expanded.
// NOTE(review): SFS / DS are presumably storage typedefs supplied by the
// shared test header -- confirm.
#define CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_ORDINAL_SCALAR_DEVICE( ORDINAL, SCALAR, DEVICE ) \
  CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( SFS, DefaultMultiply ) \
  CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( SFS, KokkosMultiply ) \
  CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( DS, DefaultMultiply ) \
  CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( DS, KokkosMultiply )
// Build the Kokkos initialization settings, selecting device id 0, and
// initialize Kokkos with them.
144 Kokkos::InitializationSettings init_args;
145 init_args.set_device_id(0);
146 Kokkos::initialize( init_args );
// Write the Kokkos configuration to standard output.
147 Kokkos::print_configuration(std::cout);
Stokhos::StandardStorage< int, double > storage_type
#define CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_ORDINAL_SCALAR_DEVICE(ORDINAL, SCALAR, DEVICE)
#define CRSMATRIX_MP_VECTOR_TESTS_DEVICE(DEVICE)
static int runUnitTestsFromMain(int argc, char *argv[])
TEUCHOS_UNIT_TEST_TEMPLATE_2_DECL(Kokkos_SG_SpMv, CrsProductTensorCijk, Scalar, Device)
bool test_cuda_embedded_vector(Ordinal num_blocks, Ordinal num_vec_threads, Ordinal num_row_threads, Teuchos::FancyOStream &out)
int main(int argc, char **argv)
basic_FancyOStream< char > FancyOStream
const unsigned VectorSize