13 #include <cusp/array2d.h>
14 #include <cusp/csr_matrix.h>
16 #include <cusp/gallery/poisson.h>
17 #include <cusp/detail/timer.h>
// Integer type used for the right-hand-side counts and iteration counters in
// this file, and as the index type of the CSR matrix.
25 typedef int IndexType;
// Scalar type of the CSR matrix entries and of the dense array2d operands.
26 typedef double ValueType;
// Memory-space tag handed to the CSR matrix and to the array2d operands.
27 typedef cusp::device_memory MemorySpace;
36 CLP.
setDocString(
"This test performance of block multiply routines.\n");
38 CLP.
setOption(
"n", &n,
"Number of mesh points in the each direction");
39 IndexType nrhs_begin = 32;
41 "Staring number of right-hand-sides");
42 IndexType nrhs_end = 512;
44 "Ending number of right-hand-sides");
45 IndexType nrhs_step = 32;
47 "Increment in number of right-hand-sides");
50 "Number of multiply iterations");
52 CLP.
setOption(
"device", &device_id,
"CUDA device ID");
53 CLP.
parse( argc, argv );
56 cudaSetDevice(device_id);
57 cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
60 cusp::csr_matrix<IndexType, ValueType, MemorySpace>
A;
61 cusp::gallery::poisson27pt(A, n, n, n);
63 std::cout <<
"nrhs , num_rows , num_entries , row_time , row_gflops , "
64 <<
"col_time , col_gflops" << std::endl;
66 for (IndexType nrhs = nrhs_begin; nrhs <= nrhs_end; nrhs += nrhs_step) {
69 2.0 *
static_cast<double>(A.num_entries) * static_cast<double>(nrhs);
72 cusp::array2d<ValueType, MemorySpace, cusp::row_major> x_row(
74 cusp::array2d<ValueType, MemorySpace, cusp::row_major> y_row(
77 cusp::detail::timer row_timer;
79 for (IndexType iter=0; iter<nits; ++iter) {
82 cudaDeviceSynchronize();
83 double row_time = row_timer.seconds_elapsed() / nits;
84 double row_gflops = 1.0e-9 * flops / row_time;
87 cusp::array2d<ValueType, MemorySpace, cusp::column_major> x_col(
89 cusp::array2d<ValueType, MemorySpace, cusp::column_major> y_col(
92 cusp::detail::timer col_timer;
94 for (IndexType iter=0; iter<nits; ++iter) {
97 cudaDeviceSynchronize();
98 double col_time = col_timer.seconds_elapsed() / nits;
99 double col_gflops = 1.0e-9 * flops / col_time;
101 std::cout << nrhs <<
" , "
102 << A.num_rows <<
" , " << A.num_entries <<
" , "
103 << row_time <<
" , " << row_gflops <<
" , "
104 << col_time <<
" , " << col_gflops
void MVmultiply(LinearOperator &A, MatrixOrVector1 &B, MatrixOrVector2 &C)
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
int main(int argc, char **argv)
void setDocString(const char doc_string[])