#include <cusp/detail/device/arch.h>
#include <cusp/detail/device/common.h>
#include <cusp/detail/device/utils.h>
#include <cusp/detail/device/texture.h>
#include <thrust/device_ptr.h>
#include <cudaProfiler.h>
#include <cuda_profiler_api.h>

#include "Stokhos_config.h"
#if 0 && defined(HAVE_STOKHOS_CUSPARSE)
#define USE_CUSPARSE_ROW 0
#define USE_CUSPARSE_COL 1
#else
#define USE_CUSPARSE_ROW 0
#define USE_CUSPARSE_COL 0
#endif

#if USE_CUSPARSE_ROW || USE_CUSPARSE_COL
#include <stdexcept>
#include <string>
#include <cusparse.h>
#include <cuda_runtime.h>
#endif
#if USE_CUSPARSE_ROW || USE_CUSPARSE_COL

class CudaSparseSingleton {
public:

  cusparseStatus_t status;
  cusparseHandle_t handle;
  cusparseMatDescr_t descra;

  static CudaSparseSingleton & singleton();

private:

  CudaSparseSingleton()
  {
    status = cusparseCreate(&handle);
    if (status != CUSPARSE_STATUS_SUCCESS)
    {
      throw std::runtime_error( std::string(
        "ERROR - CUSPARSE Library Initialization failed" ) );
    }

    status = cusparseCreateMatDescr(&descra);
    if (status != CUSPARSE_STATUS_SUCCESS)
    {
      throw std::runtime_error( std::string(
        "ERROR - CUSPARSE Library Matrix descriptor failed" ) );
    }

    cusparseSetMatType(descra , CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descra , CUSPARSE_INDEX_BASE_ZERO);
  }

  // Non-copyable: declared private and left undefined (pre-C++11 idiom)
  CudaSparseSingleton( const CudaSparseSingleton & );
  CudaSparseSingleton & operator = ( const CudaSparseSingleton & );
};

CudaSparseSingleton & CudaSparseSingleton::singleton()
{
  // Constructed on first use (Meyers singleton)
  static CudaSparseSingleton s ;
  return s ;
}
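// Usage sketch (illustrative, not part of the original source): every
// cuSPARSE wrapper below shares the one lazily created handle and matrix
// descriptor held by the singleton.
//
//   cusparseHandle_t h = CudaSparseSingleton::singleton().handle;
//   // pass h (and .descra) to cusparseDcsrmm / cusparseDcsrmm2 calls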
#endif

#if USE_CUSPARSE_ROW

void __spmm_csr_vector(
  const csr_matrix<int,double,device_memory>& A,
  const array2d<double, device_memory, column_major>& x,
  array2d<double, device_memory, column_major>& y,
  cusp::row_major)
{
  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
  const double alpha = 1 , beta = 0 ;
  cusparseStatus_t status =
    cusparseDcsrmm2(s.handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    A.num_rows, x.num_cols, A.num_cols, A.num_entries,
                    &alpha,
                    s.descra,
                    thrust::raw_pointer_cast(&A.values[0]),
                    thrust::raw_pointer_cast(&A.row_offsets[0]),
                    thrust::raw_pointer_cast(&A.column_indices[0]),
                    thrust::raw_pointer_cast(&(x.values)[0]),
                    x.num_cols,   // ldb (reconstructed leading dimension)
                    &beta,
                    thrust::raw_pointer_cast(&(y.values)[0]),
                    y.num_rows);  // ldc (reconstructed leading dimension)

  if ( CUSPARSE_STATUS_SUCCESS != status ) {
    throw std::runtime_error( std::string(
      "ERROR - cusparseDcsrmm2 " ) );
  }
}
#else

template <typename IndexType,
          typename ValueType,
          unsigned MAX_NNZ_PER_ROW>
__global__ void
spmm_csr_vector_kernel_row(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ar,
                           const IndexType * Ac,
                           const ValueType * Aval,
                           const ValueType * x,
                           ValueType * y)
{
  extern __shared__ int sh[];
  volatile ValueType * const sh_Aval = (ValueType*) sh;
  volatile IndexType * const sh_Ac =
    (IndexType*) (sh_Aval + MAX_NNZ_PER_ROW * blockDim.y);

  // One row of A per threadIdx.y; threadIdx.x strides across columns of x
  const IndexType row = threadIdx.y + blockDim.y * blockIdx.x;
  if (row < Anum_rows) {
    const IndexType row_start = Ar[row];
    const IndexType row_end = Ar[row+1];

    // Zero this row of y (x and y are stored row-major here)
    for (IndexType j=threadIdx.x; j<xnum_cols; j+=blockDim.x) {
      y[j+xnum_cols*row] = 0.0;
    }

    // Walk the row in chunks of at most MAX_NNZ_PER_ROW entries
    for (IndexType block_col=row_start; block_col<row_end;
         block_col+=MAX_NNZ_PER_ROW) {
      const IndexType r = block_col + MAX_NNZ_PER_ROW < row_end ?
        MAX_NNZ_PER_ROW : row_end-block_col;

      // Stage the chunk of matrix entries in shared memory
      for (IndexType i=threadIdx.x; i<r; i+=blockDim.x){
        sh_Aval[i*blockDim.y+threadIdx.y] = Aval[i+block_col];
        sh_Ac[ i*blockDim.y+threadIdx.y] = Ac [i+block_col];
      }

      // Accumulate the chunk's contribution for each column j of x
      for (IndexType j=threadIdx.x; j<xnum_cols; j+=blockDim.x){
        ValueType sum = 0.0;
        for (IndexType jj=0; jj<r; jj++){
          IndexType J = sh_Ac[jj*blockDim.y+threadIdx.y];
          sum += sh_Aval[jj*blockDim.y+threadIdx.y] * x[j+xnum_cols*J];
        }
        y[j+xnum_cols*row] += sum;
      }
    }
  }
}
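// Reference semantics: the kernel above is a parallelization of this serial
// loop nest (x and y addressed row-major, computing y = A*x):
//
//   for (IndexType row = 0; row < Anum_rows; ++row)
//     for (IndexType j = 0; j < xnum_cols; ++j) {
//       ValueType s = 0;
//       for (IndexType k = Ar[row]; k < Ar[row+1]; ++k)
//         s += Aval[k] * x[j + xnum_cols*Ac[k]];
//       y[j + xnum_cols*row] = s;
//     }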
template <typename Matrix,
          typename Vector2,
          typename Vector3>
void __spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y,
                       cusp::row_major)
{
  typedef typename Matrix::index_type IndexType;
  typedef typename Matrix::value_type ValueType;
  typedef typename Matrix::memory_space MemorySpace;

  const unsigned MAX_NNZ_PER_ROW = 32;
  const size_t COLS_PER_BLOCK = 16;
  const size_t ROWS_PER_BLOCK = 8;
  const size_t shared =
    ROWS_PER_BLOCK * MAX_NNZ_PER_ROW * ( sizeof(IndexType) +
                                         sizeof(ValueType) );
  const size_t NUM_BLOCKS = (A.num_rows + ROWS_PER_BLOCK-1) / ROWS_PER_BLOCK;

  dim3 block(COLS_PER_BLOCK, ROWS_PER_BLOCK, 1);
  dim3 grid(NUM_BLOCKS, 1);

  spmm_csr_vector_kernel_row<IndexType, ValueType, MAX_NNZ_PER_ROW>
    <<<grid, block, shared>>>
    (A.num_rows, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&A.row_offsets[0]),
     thrust::raw_pointer_cast(&A.column_indices[0]),
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&(x.values)[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}
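// Shared-memory footprint check: with IndexType = int (4 bytes) and
// ValueType = double (8 bytes),
//   shared = 8 * 32 * (4 + 8) = 3072 bytes per block,
// comfortably within the per-multiprocessor shared memory of any CUDA GPU.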
#endif

#if USE_CUSPARSE_COL

void __spmm_csr_vector(
  const csr_matrix<int,double,device_memory>& A,
  const array2d<double, device_memory, column_major>& x,
  array2d<double, device_memory, column_major>& y,
  cusp::column_major)
{
  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
  const double alpha = 1 , beta = 0 ;
  cusparseStatus_t status =
    cusparseDcsrmm(s.handle,
                   CUSPARSE_OPERATION_NON_TRANSPOSE,
                   A.num_rows, x.num_cols, A.num_cols,
                   A.num_entries,
                   &alpha,
                   s.descra,
                   thrust::raw_pointer_cast(&A.values[0]),
                   thrust::raw_pointer_cast(&A.row_offsets[0]),
                   thrust::raw_pointer_cast(&A.column_indices[0]),
                   thrust::raw_pointer_cast(&(x.values)[0]),
                   x.num_rows,   // ldb (reconstructed leading dimension)
                   &beta,
                   thrust::raw_pointer_cast(&(y.values)[0]),
                   y.num_rows);  // ldc (reconstructed leading dimension)

  if ( CUSPARSE_STATUS_SUCCESS != status ) {
    throw std::runtime_error( std::string(
      "ERROR - cusparseDcsrmm " ) );
  }
}
#else

template <typename IndexType,
          typename ValueType,
          unsigned int VECTORS_PER_BLOCK,
          unsigned int THREADS_PER_VECTOR>
__global__ void
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
spmm_csr_vector_kernel_col(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ap,
                           const IndexType * Aj,
                           const ValueType * Ax,
                           const ValueType * x,
                           ValueType * y)
{
  __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];
  __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];

  const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;

  const IndexType thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
  const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);       // lane within the vector
  const IndexType vector_id   = thread_id / THREADS_PER_VECTOR;               // global vector index
  const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR;             // vector index within the block
  const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x;                // total number of vectors

  for(IndexType row = vector_id; row < Anum_rows; row += num_vectors)
  {
    // Two lanes cooperatively fetch the row extent [Ap[row], Ap[row+1])
    if (thread_lane < 2)
      ptrs[vector_lane][thread_lane] = Ap[row + thread_lane];

    const IndexType row_start = ptrs[vector_lane][0];
    const IndexType row_end   = ptrs[vector_lane][1];

    for (IndexType j = 0; j < xnum_cols; j++){
      ValueType sum = 0.0;

      if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
      {
        // Ensure aligned memory access to Aj and Ax
        IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;

        // Accumulate local sums
        if(jj >= row_start && jj < row_end)
          sum += Ax[jj] * x[Aj[jj]+xnum_rows*j];

        for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Ax[jj] * x[Aj[jj]+xnum_rows*j];
      }
      else
      {
        for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Ax[jj] * x[Aj[jj]+xnum_rows*j];
      }

      // Warp-synchronous tree reduction of the per-lane partial sums
      sdata[threadIdx.x] = sum;
      if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
      if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
      if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
      if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
      if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];

      // First lane writes the result (y is column-major: column j, row `row`)
      if (thread_lane == 0)
        y[j*Anum_rows+row] = sdata[threadIdx.x];
    }
  }
}
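// Reduction example: for THREADS_PER_VECTOR == 4 with per-lane partial sums
// [a, b, c, d], only the "> 2" and "> 1" steps fire:
//   after "> 2": [a+c, b+d, c, d]
//   after "> 1": [a+b+c+d, ...]
// leaving the full row sum in lane 0, which is what thread_lane == 0 writes.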
// NOTE: this variant additionally caches the current row of A in dynamic
// shared memory so it can be reused across all columns of x. Its original
// name was lost in the damaged source and is reconstructed below; it is not
// launched by the wrapper further down, which uses spmm_csr_vector_kernel_col
// with no dynamic shared memory.
template <typename IndexType,
          typename ValueType,
          unsigned int VECTORS_PER_BLOCK,
          unsigned int THREADS_PER_VECTOR>
__global__ void
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
spmm_csr_vector_kernel_col_cached(const IndexType Anum_rows,
                                  const IndexType xnum_rows,
                                  const IndexType xnum_cols,
                                  const IndexType * Ar,
                                  const IndexType * Ac,
                                  const ValueType * Aval,
                                  const ValueType * x,
                                  ValueType * y)
{
  __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];
  __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];

  // Dynamic shared memory: up to 32 cached entries of A per vector
  extern __shared__ int sha[];
  ValueType * const sh_Aval = (ValueType*) sha;
  IndexType * const sh_Ac = (IndexType*) (sh_Aval + 32 * VECTORS_PER_BLOCK);

  const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;

  const IndexType thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;
  const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);
  const IndexType vector_id   = thread_id / THREADS_PER_VECTOR;
  const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR;
  const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x;

  for(IndexType row = vector_id; row < Anum_rows; row += num_vectors) {
    if (thread_lane < 2)
      ptrs[vector_lane][thread_lane] = Ar[row + thread_lane];

    const IndexType row_start = ptrs[vector_lane][0];
    const IndexType row_end   = ptrs[vector_lane][1];

    const IndexType r = row_end - row_start;

    // Cache this row of A in shared memory (guard added during
    // reconstruction: the cache holds at most 32 entries per vector; this
    // variant assumes longer rows occur only when THREADS_PER_VECTOR == 32,
    // in which case the global-memory branch below is taken)
    if (r <= 32) {
      for (IndexType i = thread_lane; i < r; i+= THREADS_PER_VECTOR){
        sh_Aval[vector_lane*32+i] = Aval[i+row_start];
        sh_Ac[vector_lane*32+i] = Ac[i+row_start];
      }
    }

    for (IndexType j = 0; j < xnum_cols; j++){
      ValueType sum = 0.0;

      if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
      {
        // Long row: read A directly from global memory, aligned
        IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;

        if(jj >= row_start && jj < row_end)
          sum += Aval[jj] * x[j*xnum_rows+Ac[jj]];

        for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Aval[jj] * x[j*xnum_rows+Ac[jj]];
      }
      else
      {
        // Short row: reuse the cached copy across all columns j
        for (IndexType jj = thread_lane; jj < r; jj+=THREADS_PER_VECTOR)
          sum += sh_Aval[vector_lane*32+jj] * x[j*xnum_rows+sh_Ac[vector_lane*32+jj]];
      }

      sdata[threadIdx.x] = sum;

      if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
      if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
      if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
      if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
      if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];

      if (thread_lane == 0)
        y[j*Anum_rows+row] = sdata[threadIdx.x];
    }
  }
}
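// A launch site for this cached variant is not shown in this file; it would
// require
//   VECTORS_PER_BLOCK * 32 * (sizeof(ValueType) + sizeof(IndexType))
// bytes of dynamic shared memory for the sh_Aval / sh_Ac arrays above, e.g.
// 8 * 32 * (8 + 4) = 3072 bytes for double/int with VECTORS_PER_BLOCK = 8.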
template <bool UseCache,
          unsigned int THREADS_PER_VECTOR,
          typename Matrix,
          typename Vector2,
          typename Vector3>
void __spmm_csr_vector_col(const Matrix& A, const Vector2& x, Vector3& y)
{
  typedef typename Matrix::index_type IndexType;
  typedef typename Matrix::value_type ValueType;

  const size_t THREADS_PER_BLOCK = 256;
  const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;

  const size_t SHARED = 0;

  // Size the grid to the occupancy limit; the kernel strides over rows
  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(
    spmm_csr_vector_kernel_col<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR>,
    THREADS_PER_BLOCK, SHARED);
  const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, VECTORS_PER_BLOCK));

  spmm_csr_vector_kernel_col<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR>
    <<<NUM_BLOCKS, THREADS_PER_BLOCK, SHARED>>>
    (A.num_rows, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&A.row_offsets[0]),
     thrust::raw_pointer_cast(&A.column_indices[0]),
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&(x.values)[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}
template <typename Matrix,
          typename Vector2,
          typename Vector3>
void __spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y,
                       cusp::column_major)
{
#if 0
  // Fallback: one sparse matrix-vector multiply per column of x
  typedef typename Vector2::index_type IndexType;
  for (IndexType col=0; col<x.num_cols; ++col) {
    multiply(A, x.column(col), y.column(col));
  }
#else
  typedef typename Matrix::index_type IndexType;

  // Pick THREADS_PER_VECTOR to roughly match the average row length
  const IndexType nnz_per_row = A.num_entries / A.num_rows;

  if (nnz_per_row <=  2) { __spmm_csr_vector_col<false, 2>(A, x, y); return; }
  if (nnz_per_row <=  4) { __spmm_csr_vector_col<false, 4>(A, x, y); return; }
  if (nnz_per_row <=  8) { __spmm_csr_vector_col<false, 8>(A, x, y); return; }
  if (nnz_per_row <= 16) { __spmm_csr_vector_col<false,16>(A, x, y); return; }

  __spmm_csr_vector_col<false,32>(A, x, y);
#endif
}
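// Example of the heuristic: a matrix with 1,000,000 rows and 7,000,000
// entries gives nnz_per_row = 7, so the <false,8> instantiation is chosen:
// 8 threads cooperate per row, and each 256-thread block covers 32 rows.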
#endif

template <typename Matrix,
          typename Vector2,
          typename Vector3>
void spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y)
{
  y.resize(A.num_rows, x.num_cols);
  // Dispatch on the storage order of the multivector
  __spmm_csr_vector(A, x, y, typename Vector2::orientation());
}
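// Usage sketch (hypothetical caller, using the cusp types assumed above):
//
//   cusp::csr_matrix<int,double,cusp::device_memory> A /* = ... */;
//   cusp::array2d<double,cusp::device_memory,cusp::column_major> x(A.num_cols, k, 1.0);
//   cusp::array2d<double,cusp::device_memory,cusp::column_major> y;
//   spmm_csr_vector(A, x, y);  // resizes y to A.num_rows x k, computes y = A*x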