53 Ordinal NumPerThread, Ordinal ThreadsPerVector>
55 Ordinal num_vec_threads,
56 Ordinal num_row_threads,
58 typedef Kokkos::Cuda Device;
60 const Ordinal
VectorSize = NumPerThread * ThreadsPerVector;
61 typedef typename Storage::template apply_N<VectorSize>::type
storage_type;
64 const Ordinal nGrid = 5;
65 KokkosSparse::DeviceConfig dev_config(num_blocks, num_vec_threads, num_row_threads);
67 bool success = test_embedded_vector<Vector>(
68 nGrid,
VectorSize, dev_config, MultiplyOp(), out);
// Test case Multiply_Default of group Kokkos_CrsMatrix_MP, parameterized on
// Storage and MultiplyOp.
// Runs test_cuda_embedded_vector with NumPerThread = 1 and
// ThreadsPerVector = 16, and with all three launch parameters set to 0.
// NOTE(review): 0 presumably asks the implementation to pick its own launch
// configuration -- confirm against KokkosSparse::DeviceConfig.
75 Kokkos_CrsMatrix_MP, Multiply_Default,
Storage, MultiplyOp )
78 const Ordinal NumPerThread = 1;
79 const Ordinal ThreadsPerVector = 16;
81 const Ordinal num_blocks = 0;
82 const Ordinal num_vec_threads = 0;
83 const Ordinal num_row_threads = 0;
// Forward the configuration and the test output stream to the shared driver.
86 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
// Test case Multiply_1 of group Kokkos_CrsMatrix_MP, parameterized on
// Storage and MultiplyOp.
// Uses NumPerThread = 1 with ThreadsPerVector = 16 and an explicit launch
// configuration: 10 blocks, ThreadsPerVector vector threads, 4 row threads.
90 Kokkos_CrsMatrix_MP, Multiply_1,
Storage, MultiplyOp )
93 const Ordinal NumPerThread = 1;
94 const Ordinal ThreadsPerVector = 16;
96 const Ordinal num_blocks = 10;
// The vector-thread count is tied to the ThreadsPerVector template argument.
97 const Ordinal num_vec_threads = ThreadsPerVector;
98 const Ordinal num_row_threads = 4;
// Forward the configuration and the test output stream to the shared driver.
101 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
// Test case Multiply_2 of group Kokkos_CrsMatrix_MP, parameterized on
// Storage and MultiplyOp.
// Uses NumPerThread = 2 with ThreadsPerVector = 16 and an explicit launch
// configuration: 10 blocks, ThreadsPerVector vector threads, 4 row threads.
105 Kokkos_CrsMatrix_MP, Multiply_2,
Storage, MultiplyOp )
108 const Ordinal NumPerThread = 2;
109 const Ordinal ThreadsPerVector = 16;
111 const Ordinal num_blocks = 10;
// The vector-thread count is tied to the ThreadsPerVector template argument.
112 const Ordinal num_vec_threads = ThreadsPerVector;
113 const Ordinal num_row_threads = 4;
// Forward the configuration and the test output stream to the shared driver.
116 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
// Test case Multiply_3 of group Kokkos_CrsMatrix_MP, parameterized on
// Storage and MultiplyOp.
// Uses NumPerThread = 3 with ThreadsPerVector = 16 and an explicit launch
// configuration: 10 blocks, ThreadsPerVector vector threads, 4 row threads.
120 Kokkos_CrsMatrix_MP, Multiply_3,
Storage, MultiplyOp )
123 const Ordinal NumPerThread = 3;
124 const Ordinal ThreadsPerVector = 16;
126 const Ordinal num_blocks = 10;
// The vector-thread count is tied to the ThreadsPerVector template argument.
127 const Ordinal num_vec_threads = ThreadsPerVector;
128 const Ordinal num_row_threads = 4;
// Forward the configuration and the test output stream to the shared driver.
131 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
// Test case Multiply_4 of group Kokkos_CrsMatrix_MP, parameterized on
// Storage and MultiplyOp.
// Uses NumPerThread = 4 with ThreadsPerVector = 16 and an explicit launch
// configuration: 10 blocks, ThreadsPerVector vector threads, 4 row threads.
135 Kokkos_CrsMatrix_MP, Multiply_4,
Storage, MultiplyOp )
138 const Ordinal NumPerThread = 4;
139 const Ordinal ThreadsPerVector = 16;
141 const Ordinal num_blocks = 10;
// The vector-thread count is tied to the ThreadsPerVector template argument.
142 const Ordinal num_vec_threads = ThreadsPerVector;
143 const Ordinal num_row_threads = 4;
// Forward the configuration and the test output stream to the shared driver.
146 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
// Instantiates the five Kokkos_CrsMatrix_MP multiply test templates
// (Multiply_Default and Multiply_1 through Multiply_4) for a single
// STORAGE / OP pair.
//   STORAGE - storage type substituted for the tests' Storage parameter.
//   OP      - multiply-operator type substituted for the MultiplyOp parameter.
149 #define CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( STORAGE, OP ) \
150 TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( \
151 Kokkos_CrsMatrix_MP, Multiply_Default, STORAGE, OP ) \
152 TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( \
153 Kokkos_CrsMatrix_MP, Multiply_1, STORAGE, OP ) \
154 TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( \
155 Kokkos_CrsMatrix_MP, Multiply_2, STORAGE, OP ) \
156 TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( \
157 Kokkos_CrsMatrix_MP, Multiply_3, STORAGE, OP ) \
158 TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( \
159 Kokkos_CrsMatrix_MP, Multiply_4, STORAGE, OP )
// Instantiates the multiply tests for all four combinations of storage
// (SFS, DS) and multiply operator (DefaultMultiply, KokkosMultiply).
// NOTE(review): SFS and DS are not defined in the visible lines, and ORDINAL,
// SCALAR and DEVICE are not referenced in the visible macro body; presumably
// they feed the typedefs that define SFS and DS -- confirm.
164 #define CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_ORDINAL_SCALAR_DEVICE( ORDINAL, SCALAR, DEVICE ) \
165 CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( SFS, DefaultMultiply ) \
166 CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( SFS, KokkosMultiply ) \
167 CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( DS, DefaultMultiply ) \
168 CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( DS, KokkosMultiply )
// Initialize Kokkos with device_id fixed to 0, then print the resulting
// Kokkos configuration to std::cout.
// NOTE(review): the device is hard-coded to 0 here; confirm that is intended
// on multi-GPU hosts.
176 Kokkos::InitArguments init_args;
177 init_args.device_id = 0;
178 Kokkos::initialize( init_args );
179 Kokkos::print_configuration(std::cout);
Stokhos::StandardStorage< int, double > storage_type
#define CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_ORDINAL_SCALAR_DEVICE(ORDINAL, SCALAR, DEVICE)
#define CRSMATRIX_MP_VECTOR_TESTS_DEVICE(DEVICE)
static int runUnitTestsFromMain(int argc, char *argv[])
TEUCHOS_UNIT_TEST_TEMPLATE_2_DECL(Kokkos_SG_SpMv, CrsProductTensorCijk, Scalar, Device)
bool test_cuda_embedded_vector(Ordinal num_blocks, Ordinal num_vec_threads, Ordinal num_row_threads, Teuchos::FancyOStream &out)
int main(int argc, char **argv)
basic_FancyOStream< char > FancyOStream
const unsigned VectorSize