csr_vector.h
// @HEADER
// *****************************************************************************
//                           Stokhos Package
//
// Copyright 2009 NTESS and the Stokhos contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

/*
 * Copyright 2008-2009 NVIDIA Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <cusp/detail/device/arch.h>
#include <cusp/detail/device/common.h>
#include <cusp/detail/device/utils.h>
#include <cusp/detail/device/texture.h>
#include <thrust/device_ptr.h>
#include <cudaProfiler.h>
#include <cuda_profiler_api.h>
#include <stdio.h>

#include "Stokhos_config.h"
#if 0 && defined(HAVE_STOKHOS_CUSPARSE)
#define USE_CUSPARSE_ROW 0
#define USE_CUSPARSE_COL 1
#else
#define USE_CUSPARSE_ROW 0
#define USE_CUSPARSE_COL 0
#endif

#if USE_CUSPARSE_ROW || USE_CUSPARSE_COL
#include <sstream>
#include <stdexcept>
#include <cuda_runtime.h>
#include <cusparse.h>
#endif

namespace cusp
{
namespace detail
{
namespace device
{

#if USE_CUSPARSE_ROW || USE_CUSPARSE_COL

class CudaSparseSingleton {
public:

  cusparseStatus_t status;
  cusparseHandle_t handle;
  cusparseMatDescr_t descra;

  static CudaSparseSingleton & singleton();

private:

  CudaSparseSingleton()
  {
    status = cusparseCreate(&handle);
    if (status != CUSPARSE_STATUS_SUCCESS)
    {
      throw std::runtime_error( std::string("ERROR - CUSPARSE Library Initialization failed" ) );
    }

    status = cusparseCreateMatDescr(&descra);
    if (status != CUSPARSE_STATUS_SUCCESS)
    {
      throw std::runtime_error( std::string("ERROR - CUSPARSE Library Matrix descriptor failed" ) );
    }

    cusparseSetMatType(descra, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descra, CUSPARSE_INDEX_BASE_ZERO);
  }

  CudaSparseSingleton( const CudaSparseSingleton & );
  CudaSparseSingleton & operator = ( const CudaSparseSingleton & );
};

CudaSparseSingleton & CudaSparseSingleton::singleton()
{
  static CudaSparseSingleton s; return s;
}
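
// Illustrative sketch (not part of the original file): how a caller would
// typically consume the singleton -- fetch it once and reuse its handle and
// matrix descriptor for every cuSPARSE call, exactly as the cuSPARSE branches
// of __spmm_csr_vector below do.  The function name is hypothetical and the
// block is disabled so it adds nothing to the compiled library.
#if 0
inline void example_get_cusparse_state()
{
  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
  // s.handle is an initialized cusparseHandle_t and s.descra describes a
  // general matrix with zero-based indexing; both are ready to be passed to
  // the cusparse*csrmm-style routines used in this file.
  (void) s.handle;
  (void) s.descra;
}
#endif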

#endif

#if USE_CUSPARSE_ROW

void __spmm_csr_vector(
  const csr_matrix<int,double,device_memory>& A,
  const array2d<double, device_memory, column_major>& x,
  array2d<double, device_memory, column_major>& y,
  cusp::row_major)
{
  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
  const double alpha = 1, beta = 0;
  cusparseStatus_t status =
    cusparseDcsrmm2(s.handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    A.num_rows, x.num_cols, A.num_cols, A.num_entries,
                    alpha,
                    s.descra,
                    thrust::raw_pointer_cast(&A.values[0]),
                    thrust::raw_pointer_cast(&A.row_offsets[0]),
                    thrust::raw_pointer_cast(&A.column_indices[0]),
                    thrust::raw_pointer_cast(&(x.values)[0]),
                    x.num_rows,
                    beta,
                    thrust::raw_pointer_cast(&(y.values)[0]),
                    y.num_rows);

  if ( CUSPARSE_STATUS_SUCCESS != status ) {
    throw std::runtime_error( std::string("ERROR - cusparseDcsrmm2" ) );
  }
}

#else

template <typename IndexType, typename ValueType, unsigned MAX_NNZ_PER_ROW>
__global__ void
spmm_csr_vector_kernel_row(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ar,
                           const IndexType * Ac,
                           const ValueType * Aval,
                           const ValueType * x,
                           ValueType * y)
{

  // Allocate storage for shared A row
  extern __shared__ int sh[];
  volatile ValueType * const sh_Aval = (ValueType*) sh;
  volatile IndexType * const sh_Ac = (IndexType*) (sh_Aval + MAX_NNZ_PER_ROW * blockDim.y);

  // Global row
  const IndexType row = threadIdx.y + blockDim.y * blockIdx.x;
  if (row < Anum_rows) {
    const IndexType row_start = Ar[row];
    const IndexType row_end = Ar[row+1];

    // Initialize y for this row
    for (IndexType j=threadIdx.x; j<xnum_cols; j+=blockDim.x) {
      y[j+xnum_cols*row] = 0.0;
    }

    // Loop over cols of A, MAX_NNZ_PER_ROW at a time
    for (IndexType block_col=row_start; block_col<row_end;
         block_col+=MAX_NNZ_PER_ROW) {
      const IndexType r = block_col + MAX_NNZ_PER_ROW < row_end ?
        MAX_NNZ_PER_ROW : row_end-block_col;

      // Read row of A into shared mem using all threads in block.
      // Store in shared memory in a transposed layout to avoid
      // bank conflicts between warps (the kernel is completely bandwidth
      // limited, so this doesn't really help performance).
      // No synchronization is needed since blockDim.x <= warp_size.
      for (IndexType i=threadIdx.x; i<r; i+=blockDim.x) {
        sh_Aval[i*blockDim.y+threadIdx.y] = Aval[i+block_col];
        sh_Ac[ i*blockDim.y+threadIdx.y] = Ac [i+block_col];
      }

      // Loop over cols of x
      for (IndexType j=threadIdx.x; j<xnum_cols; j+=blockDim.x) {

        // Loop over cols of A
        ValueType sum = 0.0;
        for (IndexType jj=0; jj<r; jj++) {
          IndexType J = sh_Ac[jj*blockDim.y+threadIdx.y];
          sum += sh_Aval[jj*blockDim.y+threadIdx.y] * x[j+xnum_cols*J];
        }
        y[j+xnum_cols*row] += sum;

      } // Loop over cols of x

    } // Loop over block cols of A
  }
}
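
// Note on the transposed staging above (illustrative, assuming the default
// launch below with blockDim = (16, 8)): the i-th staged nonzero of the row
// owned by threadIdx.y lands at sh_Aval[i*8 + threadIdx.y], so in the
// accumulation loop the 16 threads that share a threadIdx.y all read the same
// shared-memory word (a broadcast rather than a strided access), which is the
// bank-conflict avoidance the comment in the kernel refers to.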

template <typename Matrix, typename Vector2, typename Vector3>
void __spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y,
                       cusp::row_major)
{
  typedef typename Matrix::index_type IndexType;
  typedef typename Matrix::value_type ValueType;
  typedef typename Matrix::memory_space MemorySpace;

  // Here we use a half-warp for the columns of x to get a little better
  // granularity when the number of columns is not divisible by 32.
  // These numbers work out to 4 warps/block and 16 blocks/SM (based both on
  // shared memory and registers for Kepler).  This results in 100% occupancy.
  const unsigned MAX_NNZ_PER_ROW = 32;
  const size_t COLS_PER_BLOCK = 16;
  const size_t ROWS_PER_BLOCK = 8;
  const size_t shared =
    ROWS_PER_BLOCK * MAX_NNZ_PER_ROW * (sizeof(IndexType) + sizeof(ValueType));
  const size_t NUM_BLOCKS = (A.num_rows + ROWS_PER_BLOCK-1) / ROWS_PER_BLOCK;

  dim3 block(COLS_PER_BLOCK, ROWS_PER_BLOCK, 1);
  dim3 grid(NUM_BLOCKS, 1);

  spmm_csr_vector_kernel_row<IndexType, ValueType, MAX_NNZ_PER_ROW> <<<grid, block, shared>>>
    (A.num_rows, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&A.row_offsets[0]),
     thrust::raw_pointer_cast(&A.column_indices[0]),
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&(x.values)[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}
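
// For concreteness (an illustration only, assuming the common int/double
// instantiation with 4-byte IndexType and 8-byte ValueType): each block
// stages ROWS_PER_BLOCK * MAX_NNZ_PER_ROW = 8 * 32 = 256 index/value pairs,
// so shared = 256 * (4 + 8) = 3072 bytes per block; the block is
// 16 x 8 = 128 threads (4 warps), and a matrix with 1000 rows launches
// NUM_BLOCKS = (1000 + 7) / 8 = 125 blocks.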

#endif

#if USE_CUSPARSE_COL

void __spmm_csr_vector(
  const csr_matrix<int,double,device_memory>& A,
  const array2d<double, device_memory, column_major>& x,
  array2d<double, device_memory, column_major>& y,
  cusp::column_major)
{
  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
  const double alpha = 1, beta = 0;
  cusparseStatus_t status =
    cusparseDcsrmm(s.handle,
                   CUSPARSE_OPERATION_NON_TRANSPOSE,
                   A.num_rows, x.num_cols, A.num_cols,
                   alpha,
                   s.descra,
                   thrust::raw_pointer_cast(&A.values[0]),
                   thrust::raw_pointer_cast(&A.row_offsets[0]),
                   thrust::raw_pointer_cast(&A.column_indices[0]),
                   thrust::raw_pointer_cast(&(x.values)[0]),
                   x.num_rows,
                   beta,
                   thrust::raw_pointer_cast(&(y.values)[0]),
                   y.num_rows);

  if ( CUSPARSE_STATUS_SUCCESS != status ) {
    throw std::runtime_error( std::string("ERROR - cusparseDcsrmm" ) );
  }
}

#else

#if 1

template <typename IndexType, typename ValueType, unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
__global__ void
spmm_csr_vector_kernel_col(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ap,
                           const IndexType * Aj,
                           const ValueType * Ax,
                           const ValueType * x,
                           ValueType * y)
{
  __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals
  __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];

  const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;

  const IndexType thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
  const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);       // thread index within the vector
  const IndexType vector_id   = thread_id / THREADS_PER_VECTOR;               // global vector index
  const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR;             // vector index within the block
  const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x;                // total number of active vectors

  for (IndexType row = vector_id; row < Anum_rows; row += num_vectors)
  {
    // use two threads to fetch Ap[row] and Ap[row+1]
    // this is considerably faster than the straightforward version
    if (thread_lane < 2)
      ptrs[vector_lane][thread_lane] = Ap[row + thread_lane];

    const IndexType row_start = ptrs[vector_lane][0]; // same as: row_start = Ap[row];
    const IndexType row_end   = ptrs[vector_lane][1]; // same as: row_end   = Ap[row+1];

    // loop over cols of x
    for (IndexType j = 0; j < xnum_cols; j++) {

      // initialize local sum
      ValueType sum = 0;

      if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
      {
        // ensure aligned memory access to Aj and Ax

        IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;

        // accumulate local sums
        if (jj >= row_start && jj < row_end)
          sum += Ax[jj] * x[Aj[jj]+xnum_rows*j];

        // accumulate local sums
        for (jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Ax[jj] * x[Aj[jj]+xnum_rows*j];
      }
      else
      {
        // accumulate local sums
        for (IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Ax[jj] * x[Aj[jj]+xnum_rows*j];
      }

      // store local sum in shared memory
      sdata[threadIdx.x] = sum;

      // reduce local sums to row sum
      if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
      if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
      if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
      if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
      if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];

      // first thread writes the result
      if (thread_lane == 0)
        y[j*Anum_rows+row] = sdata[threadIdx.x];

    }
  }
}
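
// Note on the reduction above (descriptive only): the five guarded adds fold
// the THREADS_PER_VECTOR partial sums pairwise at strides 16, 8, 4, 2, 1.
// For THREADS_PER_VECTOR = 8, for example, only the last three steps are
// active (8 -> 4 -> 2 -> 1 values), after which lane 0 of each vector holds
// the complete dot product of row `row` with column j of x.  The code relies
// on the volatile shared array and implicit warp-synchronous execution of the
// sub-warp vector.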

#else

template <typename IndexType, typename ValueType, unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
__global__ void
spmm_csr_vector_kernel_col(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ar,
                           const IndexType * Ac,
                           const ValueType * Aval,
                           const ValueType * x,
                           ValueType * y)
{
  __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals
  __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];

  extern __shared__ int sha[];
  ValueType * const sh_Aval = (ValueType*) sha;
  IndexType * const sh_Ac   = (IndexType*) (sh_Aval + 32 * VECTORS_PER_BLOCK);

  const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;

  const IndexType thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
  const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);       // thread index within the vector
  const IndexType vector_id   = thread_id / THREADS_PER_VECTOR;               // global vector index, i.e. global warp id
  const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR;             // vector index within the block
  const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x;                // total number of active vectors

  for (IndexType row = vector_id; row < Anum_rows; row += num_vectors) {
    // use two threads to fetch Ar[row] and Ar[row+1]
    // this is considerably faster than the straightforward version
    if (thread_lane < 2)
      ptrs[vector_lane][thread_lane] = Ar[row + thread_lane];

    const IndexType row_start = ptrs[vector_lane][0]; // same as: row_start = Ar[row];
    const IndexType row_end   = ptrs[vector_lane][1]; // same as: row_end   = Ar[row+1];
    // nnz in row
    const IndexType r = row_end - row_start;

    // fetch Aval and Ac for current row of A and store in shared mem
    for (IndexType i = thread_lane; i < r; i += THREADS_PER_VECTOR) {
      sh_Aval[vector_lane*32+i] = Aval[i+row_start];
      sh_Ac[vector_lane*32+i] = Ac[i+row_start];
    }

    // loop over cols of x
    for (IndexType j = 0; j < xnum_cols; j++) {

      // initialize local sum
      ValueType sum = 0;

      if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
      {
        // ensure aligned memory access to Ac and Aval
        IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;
        // accumulate local sums
        if (jj >= row_start && jj < row_end)
          sum += Aval[jj] * x[j*xnum_rows+Ac[jj]];
        // accumulate local sums
        for (jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Aval[jj] * x[j*xnum_rows+Ac[jj]];
      }
      else
      {
        // accumulate local sums (jj is relative to the row, so offset by row_start)
        for (IndexType jj = thread_lane; jj < r; jj += THREADS_PER_VECTOR)
          //sum += sh_Aval[jj+vector_lane*32] * x[j * xnum_rows + sh_Ac[vector_lane*32+jj]];
          sum += Aval[jj+row_start] * x[j*xnum_rows+Ac[jj+row_start]];
      }
      // store local sum in shared memory
      sdata[threadIdx.x] = sum;
      // reduce local sums to row sum
      if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
      if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
      if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
      if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
      if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];
      // first thread writes the result
      if (thread_lane == 0)
        y[j*Anum_rows+row] = sdata[threadIdx.x];
    }
  }
}

#endif

template <bool UseCache, unsigned int THREADS_PER_VECTOR,
          typename Matrix, typename Vector2, typename Vector3>
void __spmm_csr_vector_col(const Matrix& A, const Vector2& x, Vector3& y)
{
  typedef typename Matrix::index_type IndexType;
  typedef typename Matrix::value_type ValueType;

  const size_t THREADS_PER_BLOCK = 256;
  const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
  //const size_t SHARED = VECTORS_PER_BLOCK * 32 * (sizeof(IndexType)+sizeof(ValueType));
  const size_t SHARED = 0;

  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmm_csr_vector_kernel_col<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR>, THREADS_PER_BLOCK, SHARED);
  const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, VECTORS_PER_BLOCK));

  spmm_csr_vector_kernel_col<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR> <<<NUM_BLOCKS, THREADS_PER_BLOCK, SHARED>>>
    (A.num_rows, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&A.row_offsets[0]),
     thrust::raw_pointer_cast(&A.column_indices[0]),
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&(x.values)[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}
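
// Illustration of the sizing above (not an additional constraint): with
// THREADS_PER_BLOCK = 256 and, say, THREADS_PER_VECTOR = 8, each block holds
// VECTORS_PER_BLOCK = 256 / 8 = 32 row-vectors, so a matrix with 10000 rows
// asks for DIVIDE_INTO(10000, 32) = 313 blocks, further capped by the
// occupancy-based MAX_BLOCKS returned by arch::max_active_blocks.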

template <typename Matrix, typename Vector2, typename Vector3>
void __spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y,
                       cusp::column_major)
{
#if 0
  typedef typename Vector2::index_type IndexType;
  for (IndexType col=0; col<x.num_cols; ++col) {
    multiply(A, x.column(col), y.column(col));
  }
#else
  typedef typename Matrix::index_type IndexType;

  const IndexType nnz_per_row = A.num_entries / A.num_rows;

  if (nnz_per_row <=  2) { __spmm_csr_vector_col<false, 2>(A, x, y); return; }
  if (nnz_per_row <=  4) { __spmm_csr_vector_col<false, 4>(A, x, y); return; }
  if (nnz_per_row <=  8) { __spmm_csr_vector_col<false, 8>(A, x, y); return; }
  if (nnz_per_row <= 16) { __spmm_csr_vector_col<false,16>(A, x, y); return; }

  __spmm_csr_vector_col<false,32>(A, x, y);
#endif
}
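
// Example of the dispatch above (illustrative numbers only): a matrix with
// 1000 rows and 7500 entries has nnz_per_row = 7500 / 1000 = 7 (integer
// division of the average), so the <false, 8> instantiation is selected and
// each row is processed by an 8-thread sub-warp vector.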

#endif

template <typename Matrix, typename Vector2, typename Vector3>
void spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y)
{
  y.resize(A.num_rows, x.num_cols);
  __spmm_csr_vector(A, x, y, typename Vector2::orientation());
}
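
// Illustrative sketch of a caller (hypothetical, not part of the original
// interface), showing the entry point with the cusp containers this file is
// written against.  The orientation of Vector2 (row_major vs column_major)
// selects which __spmm_csr_vector overload above runs; y is resized to
// A.num_rows x x.num_cols internally.  Disabled so it adds nothing to the
// compiled library.
#if 0
inline void example_spmm(
  const csr_matrix<int,double,device_memory>& A,
  const array2d<double, device_memory, column_major>& x,
  array2d<double, device_memory, column_major>& y)
{
  spmm_csr_vector(A, x, y);   // y(i,j) = sum_k A(i,k) * x(k,j)
}
#endif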

} // end namespace device
} // end namespace detail
} // end namespace cusp