csr_vector.h
/*
 * Copyright 2008-2009 NVIDIA Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


#pragma once

#include <cusp/detail/device/arch.h>
#include <cusp/detail/device/common.h>
#include <cusp/detail/device/utils.h>
#include <cusp/detail/device/texture.h>
#include <thrust/device_ptr.h>
#include <cudaProfiler.h>
#include <cuda_profiler_api.h>
#include <stdio.h>

#include "Stokhos_config.h"
#if 0 && defined(HAVE_STOKHOS_CUSPARSE)
#define USE_CUSPARSE_ROW 0
#define USE_CUSPARSE_COL 1
#else
#define USE_CUSPARSE_ROW 0
#define USE_CUSPARSE_COL 0
#endif

#if USE_CUSPARSE_ROW || USE_CUSPARSE_COL
#include <sstream>
#include <stdexcept>
#include <cuda_runtime.h>
#include <cusparse.h>
#endif
namespace cusp
{
namespace detail
{
namespace device
{

#if USE_CUSPARSE_ROW || USE_CUSPARSE_COL

class CudaSparseSingleton {
public:

  cusparseStatus_t status;
  cusparseHandle_t handle;
  cusparseMatDescr_t descra;

  static CudaSparseSingleton & singleton();

private:

  CudaSparseSingleton()
  {
    status = cusparseCreate(&handle);
    if (status != CUSPARSE_STATUS_SUCCESS)
    {
      throw std::runtime_error( std::string("ERROR - CUSPARSE Library Initialization failed" ) );
    }

    status = cusparseCreateMatDescr(&descra);
    if (status != CUSPARSE_STATUS_SUCCESS)
    {
      throw std::runtime_error( std::string("ERROR - CUSPARSE Library Matrix descriptor failed" ) );
    }

    cusparseSetMatType(descra, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descra, CUSPARSE_INDEX_BASE_ZERO);
  }

  CudaSparseSingleton( const CudaSparseSingleton & );
  CudaSparseSingleton & operator = ( const CudaSparseSingleton & );
};

CudaSparseSingleton & CudaSparseSingleton::singleton()
{
  static CudaSparseSingleton s ; return s ;
}

#endif

#if USE_CUSPARSE_ROW

void __spmm_csr_vector(
  const csr_matrix<int,double,device_memory>& A,
  const array2d<double, device_memory, column_major>& x,
  array2d<double, device_memory, column_major>& y,
  cusp::row_major)
{
  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
  const double alpha = 1 , beta = 0 ;
  cusparseStatus_t status =
    cusparseDcsrmm2(s.handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    A.num_rows, x.num_cols, A.num_cols, A.num_entries,
                    &alpha,
                    s.descra,
                    thrust::raw_pointer_cast(&A.values[0]),
                    thrust::raw_pointer_cast(&A.row_offsets[0]),
                    thrust::raw_pointer_cast(&A.column_indices[0]),
                    thrust::raw_pointer_cast(&(x.values)[0]),
                    x.num_rows,
                    &beta,
                    thrust::raw_pointer_cast(&(y.values)[0]),
                    y.num_rows);

  if ( CUSPARSE_STATUS_SUCCESS != status ) {
    throw std::runtime_error( std::string("ERROR - cusparseDcsrmm2 " ) );
  }
}

#else

template <typename IndexType, typename ValueType, unsigned MAX_NNZ_PER_ROW>
__global__ void
spmm_csr_vector_kernel_row(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ar,
                           const IndexType * Ac,
                           const ValueType * Aval,
                           const ValueType * x,
                           ValueType * y)
{

  // Allocate storage for shared A row
  extern __shared__ int sh[];
  volatile ValueType * const sh_Aval = (ValueType*) sh;
  volatile IndexType * const sh_Ac = (IndexType*) (sh_Aval + MAX_NNZ_PER_ROW * blockDim.y);

  // Global row
  const IndexType row = threadIdx.y + blockDim.y * blockIdx.x;
  if (row < Anum_rows) {
    const IndexType row_start = Ar[row];
    const IndexType row_end = Ar[row+1];

    // Initialize y for this row
    for (IndexType j=threadIdx.x; j<xnum_cols; j+=blockDim.x) {
      y[j+xnum_cols*row] = 0.0;
    }

    // Loop over cols of A, MAX_NNZ_PER_ROW at a time
    for (IndexType block_col=row_start; block_col<row_end;
         block_col+=MAX_NNZ_PER_ROW) {
      const IndexType r = block_col + MAX_NNZ_PER_ROW < row_end ?
        MAX_NNZ_PER_ROW : row_end-block_col;

      // Read this chunk of the row of A into shared memory using all threads
      // in the block.  Store it in a transposed layout to avoid bank
      // conflicts between warps (the kernel is completely bandwidth limited,
      // so this doesn't really help performance).
      // No synchronization is needed since blockDim.x <= warp_size.
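      // (Illustration of the layout: nonzero i of the chunk owned by
      // thread-row threadIdx.y lands at sh_Aval[i*blockDim.y + threadIdx.y],
      // so threads with different threadIdx.y touch adjacent shared-memory
      // words rather than words MAX_NNZ_PER_ROW apart.)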
      for (IndexType i=threadIdx.x; i<r; i+=blockDim.x){
        sh_Aval[i*blockDim.y+threadIdx.y] = Aval[i+block_col];
        sh_Ac[ i*blockDim.y+threadIdx.y] = Ac [i+block_col];
      }

      // Loop over cols of x
      for (IndexType j=threadIdx.x; j<xnum_cols; j+=blockDim.x){

        // Loop over cols of A
        ValueType sum = 0.0;
        for (IndexType jj=0; jj<r; jj++){
          IndexType J = sh_Ac[jj*blockDim.y+threadIdx.y];
          sum += sh_Aval[jj*blockDim.y+threadIdx.y] * x[j+xnum_cols*J];
        }
        y[j+xnum_cols*row] += sum;

      } // Loop over cols of x

    } // Loop over block cols of A
  }
}

template <typename Matrix, typename Vector2, typename Vector3>
void __spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y,
                       cusp::row_major)
{
  typedef typename Matrix::index_type IndexType;
  typedef typename Matrix::value_type ValueType;
  typedef typename Matrix::memory_space MemorySpace;

  // Use a half-warp across the columns of x to get a little better
  // granularity when the number of columns is not divisible by 32.
  // These numbers work out to 4 warps/block and 16 blocks/SM (based both on
  // shared memory and registers for Kepler), which gives 100% occupancy.
  const unsigned MAX_NNZ_PER_ROW = 32;
  const size_t COLS_PER_BLOCK = 16;
  const size_t ROWS_PER_BLOCK = 8;
  const size_t shared =
    ROWS_PER_BLOCK * MAX_NNZ_PER_ROW * (sizeof(IndexType) + sizeof(ValueType));
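  // (Sanity check on the sizes above, assuming 4-byte indices and 8-byte
  // values: shared = 8 * 32 * (4 + 8) = 3072 bytes per block, so 16 blocks
  // fit in a 48 KB shared-memory partition; with 16 x 8 = 128 threads per
  // block that is 2048 resident threads per SM.)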
  const size_t NUM_BLOCKS = (A.num_rows + ROWS_PER_BLOCK-1) / ROWS_PER_BLOCK;

  dim3 block(COLS_PER_BLOCK, ROWS_PER_BLOCK, 1);
  dim3 grid(NUM_BLOCKS, 1);

  spmm_csr_vector_kernel_row<IndexType, ValueType, MAX_NNZ_PER_ROW> <<<grid, block, shared>>>
    (A.num_rows, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&A.row_offsets[0]),
     thrust::raw_pointer_cast(&A.column_indices[0]),
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&(x.values)[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}

#endif

#if USE_CUSPARSE_COL

void __spmm_csr_vector(
  const csr_matrix<int,double,device_memory>& A,
  const array2d<double, device_memory, column_major>& x,
  array2d<double, device_memory, column_major>& y,
  cusp::column_major)
{
  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
  const double alpha = 1 , beta = 0 ;
  cusparseStatus_t status =
    cusparseDcsrmm(s.handle,
                   CUSPARSE_OPERATION_NON_TRANSPOSE,
                   A.num_rows, x.num_cols, A.num_cols, A.num_entries,
                   &alpha,
                   s.descra,
                   thrust::raw_pointer_cast(&A.values[0]),
                   thrust::raw_pointer_cast(&A.row_offsets[0]),
                   thrust::raw_pointer_cast(&A.column_indices[0]),
                   thrust::raw_pointer_cast(&(x.values)[0]),
                   x.num_rows,
                   &beta,
                   thrust::raw_pointer_cast(&(y.values)[0]),
                   y.num_rows);

  if ( CUSPARSE_STATUS_SUCCESS != status ) {
    throw std::runtime_error( std::string("ERROR - cusparseDcsrmm " ) );
  }
}

#else

#if 1

template <typename IndexType, typename ValueType, unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
__global__ void
spmm_csr_vector_kernel_col(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ap,
                           const IndexType * Aj,
                           const ValueType * Ax,
                           const ValueType * x,
                           ValueType * y)
{
  __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];  // padded to avoid reduction conditionals
  __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];

  const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;

  const IndexType thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;  // global thread index
  const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);        // thread index within the vector
  const IndexType vector_id   = thread_id / THREADS_PER_VECTOR;                // global vector index
  const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR;              // vector index within the block
  const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x;                 // total number of active vectors

  for(IndexType row = vector_id; row < Anum_rows; row += num_vectors)
  {
    // use two threads to fetch Ap[row] and Ap[row+1]
    // this is considerably faster than the straightforward version
    if(thread_lane < 2)
      ptrs[vector_lane][thread_lane] = Ap[row + thread_lane];

    const IndexType row_start = ptrs[vector_lane][0];  // same as: row_start = Ap[row];
    const IndexType row_end   = ptrs[vector_lane][1];  // same as: row_end   = Ap[row+1];

    // loop over cols of x
    for (IndexType j = 0; j < xnum_cols; j++){

      // initialize local sum
      ValueType sum = 0;

      if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
      {
        // ensure aligned memory access to Aj and Ax

        IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;
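        // (row_start & (THREADS_PER_VECTOR - 1)) is row_start mod 32, so jj
        // starts at the 32-aligned position at or below row_start; e.g. for
        // row_start = 70, lane 0 starts at jj = 64 and the guard below skips
        // the entries before 70.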

        // accumulate local sums
        if(jj >= row_start && jj < row_end)
          sum += Ax[jj] * x[Aj[jj]+xnum_rows*j];

        // accumulate local sums
        for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Ax[jj] * x[Aj[jj]+xnum_rows*j];
      }
      else
      {
        // accumulate local sums
        for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Ax[jj] * x[Aj[jj]+xnum_rows*j];
      }

      // store local sum in shared memory
      sdata[threadIdx.x] = sum;

      // reduce local sums to row sum
      if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
      if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
      if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
      if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
      if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];
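      // (The reduction relies on the threads of a vector belonging to a
      // single warp and on the volatile qualifier on sdata, so no explicit
      // synchronization is issued between the steps.)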

      // first thread writes the result
      if (thread_lane == 0)
        y[j*Anum_rows+row] = sdata[threadIdx.x];

    }
  }
}

#else

template <typename IndexType, typename ValueType, unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
__global__ void
spmm_csr_vector_kernel_col(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ar,
                           const IndexType * Ac,
                           const ValueType * Aval,
                           const ValueType * x,
                           ValueType * y)
{
  __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];  // padded to avoid reduction conditionals
  __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];

  extern __shared__ int sha[];
  ValueType * const sh_Aval = (ValueType*) sha;
  IndexType * const sh_Ac = (IndexType*) (sh_Aval + 32 * VECTORS_PER_BLOCK);

  const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;

  const IndexType thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;  // global thread index
  const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);        // thread index within the vector
  const IndexType vector_id   = thread_id / THREADS_PER_VECTOR;                // global vector index, i.e. global warp id
  const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR;              // vector index within the block
  const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x;                 // total number of active vectors

  for(IndexType row = vector_id; row < Anum_rows; row += num_vectors) {
    // use two threads to fetch Ar[row] and Ar[row+1]
    // this is considerably faster than the straightforward version
    if(thread_lane < 2)
      ptrs[vector_lane][thread_lane] = Ar[row + thread_lane];

    const IndexType row_start = ptrs[vector_lane][0];  // same as: row_start = Ar[row];
    const IndexType row_end   = ptrs[vector_lane][1];  // same as: row_end   = Ar[row+1];

    // nnz in row
    const IndexType r = row_end - row_start;

    // fetch Aval and Ac for the current row of A and store in shared mem
    for (IndexType i = thread_lane; i < r; i += THREADS_PER_VECTOR){
      sh_Aval[vector_lane*32+i] = Aval[i+row_start];
      sh_Ac[vector_lane*32+i] = Ac[i+row_start];
    }

    // loop over cols of x
    for (IndexType j = 0; j < xnum_cols; j++){

      // initialize local sum
      ValueType sum = 0;

      if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
      {
        // ensure aligned memory access to Ac and Aval
        IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;

        // accumulate local sums
        if(jj >= row_start && jj < row_end)
          sum += Aval[jj] * x[j*xnum_rows+Ac[jj]];

        // accumulate local sums
        for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Aval[jj] * x[j*xnum_rows+Ac[jj]];
      }
      else
      {
        // accumulate local sums (jj is relative to row_start here)
        for (IndexType jj = thread_lane; jj < r; jj += THREADS_PER_VECTOR)
          //sum += sh_Aval[jj+vector_lane*32] * x[j * xnum_rows + sh_Ac[vector_lane*32+jj]];
          sum += Aval[row_start+jj] * x[j*xnum_rows+Ac[row_start+jj]];
      }

      // store local sum in shared memory
      sdata[threadIdx.x] = sum;

      // reduce local sums to row sum
      if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
      if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
      if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
      if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
      if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];

      // first thread writes the result
      if (thread_lane == 0)
        y[j*Anum_rows+row] = sdata[threadIdx.x];
    }
  }
}

#endif

template <bool UseCache, unsigned int THREADS_PER_VECTOR,
          typename Matrix, typename Vector2, typename Vector3>
void __spmm_csr_vector_col(const Matrix& A, const Vector2& x, Vector3& y)
{
  typedef typename Matrix::index_type IndexType;
  typedef typename Matrix::value_type ValueType;

  const size_t THREADS_PER_BLOCK = 256;
  const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
  //const size_t SHARED = VECTORS_PER_BLOCK * 32 * (sizeof(IndexType)+sizeof(ValueType));
  const size_t SHARED = 0;

  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmm_csr_vector_kernel_col<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR>, THREADS_PER_BLOCK, SHARED);
  const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, VECTORS_PER_BLOCK));

  spmm_csr_vector_kernel_col<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR> <<<NUM_BLOCKS, THREADS_PER_BLOCK, SHARED>>>
    (A.num_rows, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&A.row_offsets[0]),
     thrust::raw_pointer_cast(&A.column_indices[0]),
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&(x.values)[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}

template <typename Matrix, typename Vector2, typename Vector3>
void __spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y,
                       cusp::column_major)
{
#if 0
  typedef typename Vector2::index_type IndexType;
  for (IndexType col=0; col<x.num_cols; ++col) {
    multiply(A, x.column(col), y.column(col));
  }
#else
  typedef typename Matrix::index_type IndexType;

  const IndexType nnz_per_row = A.num_entries / A.num_rows;

  if (nnz_per_row <=  2) { __spmm_csr_vector_col<false, 2>(A, x, y); return; }
  if (nnz_per_row <=  4) { __spmm_csr_vector_col<false, 4>(A, x, y); return; }
  if (nnz_per_row <=  8) { __spmm_csr_vector_col<false, 8>(A, x, y); return; }
  if (nnz_per_row <= 16) { __spmm_csr_vector_col<false,16>(A, x, y); return; }

  __spmm_csr_vector_col<false,32>(A, x, y);
#endif
}

#endif

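// Example (sketch, assuming the usual cusp containers for a double-precision
// CSR matrix and column-major multivectors; names such as num_rhs are
// illustrative):
//
//   cusp::csr_matrix<int,double,cusp::device_memory> A;  // assembled elsewhere
//   cusp::array2d<double,cusp::device_memory,cusp::column_major> x(A.num_cols, num_rhs, 1.0);
//   cusp::array2d<double,cusp::device_memory,cusp::column_major> y;
//   cusp::detail::device::spmm_csr_vector(A, x, y);      // y becomes A.num_rows x num_rhs
//
// The orientation tag of the multivector (row_major or column_major) selects
// which __spmm_csr_vector overload above is dispatched.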
template <typename Matrix, typename Vector2, typename Vector3>
void spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y)
{
  y.resize(A.num_rows, x.num_cols);
  __spmm_csr_vector(A, x, y, typename Vector2::orientation());
}

} // end namespace device
} // end namespace detail
} // end namespace cusp