#include <cusp/detail/device/arch.h>
#include <cusp/detail/device/common.h>
#include <cusp/detail/device/utils.h>
#include <cusp/detail/device/texture.h>
#include <thrust/device_ptr.h>
#include <cudaProfiler.h>
#include <cuda_profiler_api.h>
#include <algorithm>
#include "Stokhos_config.h"
#if 0 && defined(HAVE_STOKHOS_CUSPARSE)
#define USE_CUSPARSE_ROW 0
#define USE_CUSPARSE_COL 1
#else
#define USE_CUSPARSE_ROW 0
#define USE_CUSPARSE_COL 0
#endif

#if USE_CUSPARSE_ROW || USE_CUSPARSE_COL
#include <stdexcept>
#include <string>
#include <cuda_runtime.h>
#include <cusparse.h>
#endif

namespace cusp { namespace detail { namespace device {
#if USE_CUSPARSE_ROW || USE_CUSPARSE_COL

// Lazily-constructed singleton owning the process-wide cuSPARSE handle and a
// general, zero-based matrix descriptor shared by the cuSPARSE paths below.
class CudaSparseSingleton {
public:

  cusparseStatus_t   status;
  cusparseHandle_t   handle;
  cusparseMatDescr_t descra;

  static CudaSparseSingleton & singleton();

private:

  CudaSparseSingleton()
  {
    status = cusparseCreate(&handle);
    if (status != CUSPARSE_STATUS_SUCCESS) {
      throw std::runtime_error( std::string(
        "ERROR - CUSPARSE Library Initialization failed" ) );
    }

    status = cusparseCreateMatDescr(&descra);
    if (status != CUSPARSE_STATUS_SUCCESS) {
      throw std::runtime_error( std::string(
        "ERROR - CUSPARSE Library Matrix descriptor failed" ) );
    }

    cusparseSetMatType(descra, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descra, CUSPARSE_INDEX_BASE_ZERO);
  }

  // Non-copyable.
  CudaSparseSingleton( const CudaSparseSingleton & );
  CudaSparseSingleton & operator = ( const CudaSparseSingleton & );
};

CudaSparseSingleton & CudaSparseSingleton::singleton()
{
  static CudaSparseSingleton s ; return s ;
}

#endif
#if USE_CUSPARSE_ROW

void __spmm_csr_vector(
  const csr_matrix<int,double,device_memory>& A,
  const array2d<double, device_memory, row_major>& x,
  array2d<double, device_memory, row_major>& y,
  cusp::row_major)
{
  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
  const double alpha = 1 , beta = 0 ;

  // csrmm2 with op(B) = B^T treats the row-major x as a transposed
  // column-major matrix with leading dimension x.num_cols.
  cusparseStatus_t status =
    cusparseDcsrmm2(s.handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    A.num_rows, x.num_cols, A.num_cols, A.num_entries,
                    &alpha,
                    s.descra,
                    thrust::raw_pointer_cast(&A.values[0]),
                    thrust::raw_pointer_cast(&A.row_offsets[0]),
                    thrust::raw_pointer_cast(&A.column_indices[0]),
                    thrust::raw_pointer_cast(&(x.values)[0]),
                    x.num_cols,
                    &beta,
                    thrust::raw_pointer_cast(&(y.values)[0]),
                    y.num_rows);

  if ( CUSPARSE_STATUS_SUCCESS != status ) {
    throw std::runtime_error( std::string(
      "ERROR - cusparseDcsrmm2 failed" ) );
  }
}

#else
// Row-major kernel: each block handles blockDim.y matrix rows, with
// blockDim.x threads spread across the columns of x. Each matrix row is
// staged through shared memory MAX_NNZ_PER_ROW entries at a time.
template <typename IndexType, typename ValueType, unsigned MAX_NNZ_PER_ROW>
__global__ void
spmm_csr_vector_kernel_row(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ar,
                           const IndexType * Ac,
                           const ValueType * Aval,
                           const ValueType * x,
                           ValueType * y)
{
  extern __shared__ int sh[];
  volatile ValueType * const sh_Aval = (ValueType*) sh;
  volatile IndexType * const sh_Ac =
    (IndexType*) (sh_Aval + MAX_NNZ_PER_ROW * blockDim.y);

  const IndexType row = threadIdx.y + blockDim.y * blockIdx.x;
  if (row < Anum_rows) {
    const IndexType row_start = Ar[row];
    const IndexType row_end   = Ar[row+1];

    // Zero this row of y across all columns of x.
    for (IndexType j = threadIdx.x; j < xnum_cols; j += blockDim.x) {
      y[j + xnum_cols*row] = 0.0;
    }

    // Process the row in chunks of at most MAX_NNZ_PER_ROW entries.
    for (IndexType block_col = row_start; block_col < row_end;
         block_col += MAX_NNZ_PER_ROW) {
      const IndexType r = block_col + MAX_NNZ_PER_ROW < row_end ?
        MAX_NNZ_PER_ROW : row_end - block_col;

      // Stage this chunk of the row (values and column indices) into the
      // block's shared-memory slice, strided by blockDim.y.
      for (IndexType i = threadIdx.x; i < r; i += blockDim.x) {
        sh_Aval[i*blockDim.y + threadIdx.y] = Aval[i + block_col];
        sh_Ac  [i*blockDim.y + threadIdx.y] = Ac  [i + block_col];
      }

      // Each thread accumulates the dot product of this matrix row chunk
      // with its columns of x (row-major x: entry (J, j) is x[j+xnum_cols*J]).
      for (IndexType j = threadIdx.x; j < xnum_cols; j += blockDim.x) {
        ValueType sum = 0.0;
        for (IndexType jj = 0; jj < r; jj++) {
          const IndexType J = sh_Ac[jj*blockDim.y + threadIdx.y];
          sum += sh_Aval[jj*blockDim.y + threadIdx.y] * x[j + xnum_cols*J];
        }
        y[j + xnum_cols*row] += sum;
      }
    }
  }
}
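// Illustrative note (assumes the 16x8 launch shape used below): the staged
// row for thread-row ty occupies the strided slice sh_Aval[i*blockDim.y + ty],
// i = 0..r-1. That slice is written and read only by the 16 threads sharing
// ty, which lie in the same half-warp, so the kernel relies on (pre-Volta)
// warp-synchronous execution plus the volatile qualifiers instead of
// __syncthreads().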
template <typename Matrix, typename Vector2, typename Vector3>
void __spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y,
                       cusp::row_major)
{
  typedef typename Matrix::index_type IndexType;
  typedef typename Matrix::value_type ValueType;
  typedef typename Matrix::memory_space MemorySpace;

  // Launch shape: 16 threads across the columns of x, 8 matrix rows per
  // block, and shared-memory staging for up to 32 entries per row.
  const unsigned MAX_NNZ_PER_ROW = 32;
  const size_t COLS_PER_BLOCK = 16;
  const size_t ROWS_PER_BLOCK = 8;
  const size_t shared =
    ROWS_PER_BLOCK * MAX_NNZ_PER_ROW * (sizeof(IndexType) + sizeof(ValueType));
  const size_t NUM_BLOCKS = (A.num_rows + ROWS_PER_BLOCK-1) / ROWS_PER_BLOCK;

  dim3 block(COLS_PER_BLOCK, ROWS_PER_BLOCK, 1);
  dim3 grid(NUM_BLOCKS, 1);

  spmm_csr_vector_kernel_row<IndexType, ValueType, MAX_NNZ_PER_ROW>
    <<<grid, block, shared>>>
    (A.num_rows, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&A.row_offsets[0]),
     thrust::raw_pointer_cast(&A.column_indices[0]),
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&(x.values)[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}
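// Worked size check (illustrative, assuming int indices and double values):
// the dynamic shared-memory request above is
//   ROWS_PER_BLOCK * MAX_NNZ_PER_ROW * (sizeof(int) + sizeof(double))
//     = 8 * 32 * (4 + 8) = 3072 bytes per block,
// comfortably below the 48 KB per-block limit of the devices this targets.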
#endif // USE_CUSPARSE_ROW

#if USE_CUSPARSE_COL

void __spmm_csr_vector(
  const csr_matrix<int,double,device_memory>& A,
  const array2d<double, device_memory, column_major>& x,
  array2d<double, device_memory, column_major>& y,
  cusp::column_major)
{
  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
  const double alpha = 1 , beta = 0 ;
  cusparseStatus_t status =
    cusparseDcsrmm(s.handle,
                   CUSPARSE_OPERATION_NON_TRANSPOSE,
                   A.num_rows, x.num_cols, A.num_cols, A.num_entries,
                   &alpha,
                   s.descra,
                   thrust::raw_pointer_cast(&A.values[0]),
                   thrust::raw_pointer_cast(&A.row_offsets[0]),
                   thrust::raw_pointer_cast(&A.column_indices[0]),
                   thrust::raw_pointer_cast(&(x.values)[0]),
                   x.num_rows,
                   &beta,
                   thrust::raw_pointer_cast(&(y.values)[0]),
                   y.num_rows);

  if ( CUSPARSE_STATUS_SUCCESS != status ) {
    throw std::runtime_error( std::string(
      "ERROR - cusparseDcsrmm failed" ) );
  }
}

#else
// Column-major kernel: a THREADS_PER_VECTOR-wide "vector" of threads
// cooperates on each matrix row, looping over the columns of x.
template <typename IndexType, typename ValueType,
          unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR, 1)
__global__ void
spmm_csr_vector_kernel_col(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ap,
                           const IndexType * Aj,
                           const ValueType * Ax,
                           const ValueType * x,
                           ValueType * y)
{
  __shared__ volatile ValueType
    sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];
  __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];

  const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;

  const IndexType thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;
  const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);
  const IndexType vector_id   = thread_id / THREADS_PER_VECTOR;
  const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR;
  const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x;

  for (IndexType row = vector_id; row < Anum_rows; row += num_vectors)
  {
    // Use two threads to fetch Ap[row] and Ap[row+1].
    if (thread_lane < 2)
      ptrs[vector_lane][thread_lane] = Ap[row + thread_lane];

    const IndexType row_start = ptrs[vector_lane][0];
    const IndexType row_end   = ptrs[vector_lane][1];

    for (IndexType j = 0; j < xnum_cols; j++) {
      // Initialize this thread's partial sum.
      ValueType sum = 0;

      if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
      {
        // Ensure aligned memory access to Aj and Ax.
        IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;

        // Accumulate local sums.
        if (jj >= row_start && jj < row_end)
          sum += Ax[jj] * x[Aj[jj] + xnum_rows*j];

        for (jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Ax[jj] * x[Aj[jj] + xnum_rows*j];
      }
      else
      {
        // Accumulate local sums.
        for (IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Ax[jj] * x[Aj[jj] + xnum_rows*j];
      }

      // Warp-synchronous intra-vector reduction (sdata is volatile).
      sdata[threadIdx.x] = sum;
      if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
      if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
      if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
      if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
      if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];

      // The first thread of the vector writes the result (column-major y).
      if (thread_lane == 0)
        y[j*Anum_rows + row] = sdata[threadIdx.x];
    }
  }
}
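// Reduction walk-through (illustrative, THREADS_PER_VECTOR = 8): lanes 0-7
// hold partial sums, and only the +4, +2, +1 steps are compiled in. Lane 0
// accumulates (l0+l4) + (l2+l6) + (l1+l5+l3+l7), i.e. the full row total.
// The THREADS_PER_VECTOR/2 padding on sdata lets the last vector read
// sdata[threadIdx.x + THREADS_PER_VECTOR/2] without a bounds check.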
// Variant of the column-major kernel that first caches each row of A (up to
// 32 entries per vector) in dynamically allocated shared memory. It is not
// selected by the dispatch below.
template <typename IndexType, typename ValueType,
          unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR, 1)
__global__ void
spmm_csr_vector_kernel_col_cached(const IndexType Anum_rows,
                                  const IndexType xnum_rows,
                                  const IndexType xnum_cols,
                                  const IndexType * Ar,
                                  const IndexType * Ac,
                                  const ValueType * Aval,
                                  const ValueType * x,
                                  ValueType * y)
{
  __shared__ volatile ValueType
    sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];
  __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];

  extern __shared__ int sha[];
  ValueType * const sh_Aval = (ValueType*) sha;
  IndexType * const sh_Ac   = (IndexType*) (sh_Aval + 32 * VECTORS_PER_BLOCK);

  const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;

  const IndexType thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;
  const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);
  const IndexType vector_id   = thread_id / THREADS_PER_VECTOR;
  const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR;
  const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x;

  for (IndexType row = vector_id; row < Anum_rows; row += num_vectors) {
    // Use two threads to fetch Ar[row] and Ar[row+1].
    if (thread_lane < 2)
      ptrs[vector_lane][thread_lane] = Ar[row + thread_lane];

    const IndexType row_start = ptrs[vector_lane][0];
    const IndexType row_end   = ptrs[vector_lane][1];
    const IndexType r = row_end - row_start;

    // Cache this row of A in shared memory. Only the first 32 entries fit;
    // longer rows take the uncached branch below.
    for (IndexType i = thread_lane; i < r && i < 32; i += THREADS_PER_VECTOR) {
      sh_Aval[vector_lane*32+i] = Aval[i+row_start];
      sh_Ac  [vector_lane*32+i] = Ac  [i+row_start];
    }

    for (IndexType j = 0; j < xnum_cols; j++) {
      ValueType sum = 0;

      if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
      {
        // Long rows: read A directly from global memory, aligned to the
        // vector width.
        IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;

        if (jj >= row_start && jj < row_end)
          sum += Aval[jj] * x[j*xnum_rows + Ac[jj]];

        for (jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Aval[jj] * x[j*xnum_rows + Ac[jj]];
      }
      else
      {
        // Short rows: jj indexes into the cached copy of the row.
        for (IndexType jj = thread_lane; jj < r; jj += THREADS_PER_VECTOR)
          sum += sh_Aval[vector_lane*32+jj] * x[j*xnum_rows + sh_Ac[vector_lane*32+jj]];
      }

      // Warp-synchronous intra-vector reduction.
      sdata[threadIdx.x] = sum;
      if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
      if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
      if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
      if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
      if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];

      if (thread_lane == 0)
        y[j*Anum_rows + row] = sdata[threadIdx.x];
    }
  }
}
template <bool UseCache, unsigned int THREADS_PER_VECTOR,
          typename Matrix, typename Vector2, typename Vector3>
void __spmm_csr_vector_col(const Matrix& A, const Vector2& x, Vector3& y)
{
  typedef typename Matrix::index_type IndexType;
  typedef typename Matrix::value_type ValueType;

  // UseCache is retained from the cusp SpMV dispatch; no texture-cache path
  // is taken here.
  const size_t THREADS_PER_BLOCK = 256;
  const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;

  // The uncached kernel uses no dynamic shared memory.
  const size_t SHARED = 0;

  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(
    spmm_csr_vector_kernel_col<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR>,
    THREADS_PER_BLOCK, SHARED);
  const size_t NUM_BLOCKS =
    std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, VECTORS_PER_BLOCK));

  spmm_csr_vector_kernel_col<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR>
    <<<NUM_BLOCKS, THREADS_PER_BLOCK, SHARED>>>
    (A.num_rows, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&A.row_offsets[0]),
     thrust::raw_pointer_cast(&A.column_indices[0]),
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&(x.values)[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}
template <typename Matrix, typename Vector2, typename Vector3>
void __spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y,
                       cusp::column_major)
{
#if 0
  // Reference path: one cusp::multiply per column of x.
  typedef typename Vector2::index_type IndexType;
  for (IndexType col=0; col<x.num_cols; ++col) {
    multiply(A, x.column(col), y.column(col));
  }
#else
  typedef typename Matrix::index_type IndexType;

  // Pick the vector width from the average number of entries per row.
  const IndexType nnz_per_row = A.num_entries / A.num_rows;

  if (nnz_per_row <=  2) { __spmm_csr_vector_col<false, 2>(A, x, y); return; }
  if (nnz_per_row <=  4) { __spmm_csr_vector_col<false, 4>(A, x, y); return; }
  if (nnz_per_row <=  8) { __spmm_csr_vector_col<false, 8>(A, x, y); return; }
  if (nnz_per_row <= 16) { __spmm_csr_vector_col<false,16>(A, x, y); return; }

  __spmm_csr_vector_col<false,32>(A, x, y);
#endif
}
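// Dispatch example (illustrative): a matrix with 1,000,000 rows and
// 5,000,000 entries has nnz_per_row = 5, so the THREADS_PER_VECTOR = 8
// instantiation is chosen: each 256-thread block holds 32 vectors, with
// 8 threads cooperating on each matrix row.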
#endif // USE_CUSPARSE_COL

template <typename Matrix, typename Vector2, typename Vector3>
void spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y)
{
  y.resize(A.num_rows, x.num_cols);
  __spmm_csr_vector(A, x, y, typename Vector2::orientation());
}

} } } // namespace cusp::detail::device
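// Usage sketch (illustrative only, excluded from compilation): one way the
// entry point above could be driven. The example_spmm name, the poisson5pt
// test matrix, and the sizes are arbitrary choices for illustration, not
// anything this header prescribes.
#if 0
#include <cusp/csr_matrix.h>
#include <cusp/array2d.h>
#include <cusp/gallery/poisson.h>

void example_spmm()
{
  // Small 2-D Poisson test matrix assembled in device memory.
  cusp::csr_matrix<int, double, cusp::device_memory> A;
  cusp::gallery::poisson5pt(A, 16, 16);

  // Eight right-hand sides stored column-major, initialized to 1.
  cusp::array2d<double, cusp::device_memory, cusp::column_major>
    x(A.num_cols, 8, 1.0), y;

  // Resizes y to A.num_rows x 8 and dispatches on x's orientation
  // (column_major here, so the column kernels above are used).
  cusp::detail::device::spmm_csr_vector(A, x, y);
}
#endif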