Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
array2d.h
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 /*
11  * Copyright 2008-2009 NVIDIA Corporation
12  *
13  * Licensed under the Apache License, Version 2.0 (the "License");
14  * you may not use this file except in compliance with the License.
15  * You may obtain a copy of the License at
16  *
17  * http://www.apache.org/licenses/LICENSE-2.0
18  *
19  * Unless required by applicable law or agreed to in writing, software
20  * distributed under the License is distributed on an "AS IS" BASIS,
21  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22  * See the License for the specific language governing permissions and
23  * limitations under the License.
24  */
25 #include <cusp/array2d.h>
26 #include <cusp/array1d.h>
27 #include <cuda.h>
28 #include <iostream>
29 
30 namespace cusp
31 {
32 namespace detail
33 {
34 namespace device
35 {
36 
37 
38 template <typename IndexType,
39  typename ValueType>
40 __global__ void
41 row_axpby_kernel(const ValueType a, const ValueType b, const IndexType num_rows,
42  const IndexType num_cols,
43  const ValueType * x,
44  const ValueType * y,
45  ValueType * z)
46 {
47  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < num_rows;
48  row += gridDim.x * blockDim.y)
49  for (IndexType j = threadIdx.x; j < num_cols; j+=blockDim.x )
50  z[j+num_cols*row]= a * x[j+num_cols*row] + b * y[j+num_cols*row];
51 }
52 
53 template <typename IndexType,
54  typename ValueType>
55 __global__ void
56 col_axpby_kernel(const ValueType a, const ValueType b, const IndexType num_rows,
57  const IndexType num_cols,
58  const ValueType * x,
59  const ValueType * y,
60  ValueType * z)
61 {
62  for (IndexType j = threadIdx.y; j < num_cols; j+=blockDim.y )
63  for (IndexType row = threadIdx.x + blockDim.x * blockIdx.x; row < num_rows;
64  row += gridDim.x * blockDim.x)
65  z[j*num_rows+row]= a * x[j*num_rows+row] + b * y[j*num_rows+row];
66 }
67 
68 template <typename IndexType,
69  typename ValueType>
70 __global__ void
71 row_spmm_dense_diag_kernel(const IndexType Anum_rows,
72  const IndexType Anum_cols,
73  const ValueType * Aval,
74  const ValueType * x,
75  ValueType * y)
76 {
77  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < Anum_rows;
78  row += gridDim.x * blockDim.y)
79  for (IndexType j = threadIdx.x; j < Anum_cols; j+=blockDim.x )
80  y[j+Anum_cols*row]= x[j] * Aval[j+Anum_cols*row];
81 }
82 
83 template <typename IndexType,
84  typename ValueType>
85 __global__ void
86 col_spmm_dense_diag_kernel(const IndexType Anum_rows,
87  const IndexType Anum_cols,
88  const ValueType * Aval,
89  const ValueType * x,
90  ValueType * y)
91 {
92  for (IndexType j = threadIdx.y; j < Anum_cols; j+=blockDim.y )
93  for (IndexType row = threadIdx.x + blockDim.x * blockIdx.x; row < Anum_rows;
94  row += gridDim.x * blockDim.x)
95  y[j*Anum_rows+row]= x[j] * Aval[j*Anum_rows+row];
96 }
97 
98 template <typename IndexType,
99  typename ValueType>
100 __global__ void
101 row_spmm_MVdot_kernel(IndexType ROWS_PER_BLOCK, const IndexType Anum_rows,
102  const IndexType Anum_cols,
103  const ValueType * Aval,
104  const ValueType * x,
105  ValueType * temp)
106 {
107  extern __shared__ int sh[];
108  ValueType * const sdata = (ValueType*) sh;
109 
110  for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x){
111  sdata[col + Anum_cols*threadIdx.y] = 0;
112  }
113  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < Anum_rows;
114  row += gridDim.x * blockDim.y) {
115  for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x){
116  sdata[col + Anum_cols*threadIdx.y] +=
117  Aval[col + Anum_cols * row] * x[col + Anum_cols * row];
118  }
119  }
120  //sum all local sums together to get
121  __syncthreads();
122  IndexType nwarp = blockDim.y / 2;
123  while (nwarp > 0) {
124  for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x){
125  IndexType j = threadIdx.y;
126  if (j < nwarp){
127  IndexType j2 = j+nwarp;
128  sdata[col+j*Anum_cols] += sdata[col+j2*Anum_cols];
129  }
130  __syncthreads();
131  }
132  nwarp /= 2;
133  }
134  __syncthreads();
135  for (IndexType col = threadIdx.x; col < Anum_cols; col+=blockDim.x){
136  temp[ Anum_cols * blockIdx.x + col ] = sdata[col];
137  }
138 
139 }
140 
141 template <typename IndexType,
142  typename ValueType>
143 __global__ void
144 col_spmm_MVdot_kernel(const IndexType Anum_rows,
145  const IndexType Anum_cols,
146  const ValueType * Aval,
147  const ValueType * x,
148  ValueType * temp)
149 {
150  extern __shared__ int sh[];
151  volatile ValueType * const sdata = (ValueType*) sh;
152 
153  IndexType tid = threadIdx.x + threadIdx.y*blockDim.x;
154 
155  for (IndexType col = threadIdx.y; col < Anum_cols; col += blockDim.y) {
156 
157  ValueType s = 0.0;
158  for (IndexType row = threadIdx.x+blockDim.x*blockIdx.x; row < Anum_rows;
159  row += gridDim.x*blockDim.x) {
160  s += Aval[col*Anum_rows+row] * x[col*Anum_rows+row];
161  }
162 
163  // sum all local sums together to get
164  sdata[tid] = s;
165  if (threadIdx.x+16 < blockDim.x) sdata[tid] += sdata[tid + 16];
166  if (threadIdx.x+ 8 < blockDim.x) sdata[tid] += sdata[tid + 8];
167  if (threadIdx.x+ 4 < blockDim.x) sdata[tid] += sdata[tid + 4];
168  if (threadIdx.x+ 2 < blockDim.x) sdata[tid] += sdata[tid + 2];
169  if (threadIdx.x+ 1 < blockDim.x) sdata[tid] += sdata[tid + 1];
170 
171  // store results
172  if (threadIdx.x == 0)
173  temp[col*gridDim.x+blockIdx.x] = sdata[tid];
174  }
175 
176 }
177 
178 //kernel to compute final dot product by adding the sums across the blocks
179 template <typename IndexType,
180  typename ValueType>
181 __global__ void
182 row_reduce_kernel(const IndexType num_rows,
183  const IndexType num_cols,
184  const ValueType * temp,
185  ValueType * y)
186 {
187  extern __shared__ int sh[];
188  ValueType * const sdata = (ValueType*) sh;
189  for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x){
190  sdata[col + num_cols*threadIdx.y] = 0;
191  }
192  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < num_rows;
193  row += gridDim.x * blockDim.y) {
194  for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x){
195  sdata[col + num_cols*threadIdx.y] += temp[col + num_cols * row];
196  }
197  }
198  __syncthreads();
199  IndexType nwarp = blockDim.y / 2;
200  while (nwarp > 0) {
201  for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x){
202  IndexType j = threadIdx.y;
203  if (j < nwarp){
204  IndexType j2 = j+nwarp;
205  sdata[col+j*num_cols] += sdata[col+j2*num_cols];
206  }
207  __syncthreads();
208  }
209  nwarp /= 2;
210  }
211  __syncthreads();
212  for (IndexType col = threadIdx.x; col < num_cols; col+=blockDim.x){
213  y[col + num_cols * blockIdx.x] = sdata[col];
214  }
215 }
216 
217 //kernel to compute final dot product by adding the sums across the blocks
218 template <typename IndexType,
219  typename ValueType>
220 __global__ void
221 col_reduce_kernel(const IndexType num_rows,
222  const IndexType num_cols,
223  const ValueType * temp,
224  ValueType * y)
225 {
226  extern __shared__ int sh[];
227  volatile ValueType * const sdata = (ValueType*) sh;
228 
229  IndexType tid = threadIdx.x + threadIdx.y*blockDim.x;
230 
231  for (IndexType col = threadIdx.y; col < num_cols; col += blockDim.y) {
232 
233  ValueType s = 0.0;
234  for (IndexType row = threadIdx.x; row < num_rows; row += blockDim.x) {
235  s += temp[col*num_rows+row];
236  }
237 
238  // sum all local sums together to get
239  sdata[tid] = s;
240  if (threadIdx.x+16 < blockDim.x) sdata[tid] += sdata[tid + 16];
241  if (threadIdx.x+ 8 < blockDim.x) sdata[tid] += sdata[tid + 8];
242  if (threadIdx.x+ 4 < blockDim.x) sdata[tid] += sdata[tid + 4];
243  if (threadIdx.x+ 2 < blockDim.x) sdata[tid] += sdata[tid + 2];
244  if (threadIdx.x+ 1 < blockDim.x) sdata[tid] += sdata[tid + 1];
245 
246  // store results
247  if (threadIdx.x == 0)
248  y[col] = sdata[tid];
249  }
250 }
251 
252 
253 template <typename Vector1,
254  typename Vector2,
255  typename Vector3>
256 void __spmm_MVdot(const int numRHS, const Vector1& A,
257  const Vector2& x,
258  Vector3& y, cusp::row_major)
259 {
260  CUSP_PROFILE_SCOPED();
261  typedef typename Vector2::index_type IndexType;
262  typedef typename Vector2::value_type ValueType;
263  typedef typename Vector2::memory_space MemorySpace;
264  const size_t BLOCK_SIZE = 32;
265 
266  const size_t ROWS_PER_BLOCK = 8;
267 
268  dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
269  const size_t shared = block.y * numRHS * sizeof(ValueType);
270 
271 
272  const size_t MAX_BLOCKS = 96;
273  const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, ROWS_PER_BLOCK));
274  dim3 grid(NUM_BLOCKS, 1);
275 
276 
277  cusp::array2d<ValueType, MemorySpace, cusp::row_major> temp(NUM_BLOCKS, A.num_cols, 0);
278 
279  row_spmm_MVdot_kernel<IndexType,ValueType> <<<grid, block, shared>>>
280  (ROWS_PER_BLOCK, A.num_rows, A.num_cols,
281  thrust::raw_pointer_cast(&A.values[0]),
282  thrust::raw_pointer_cast(&x.values[0]),
283  thrust::raw_pointer_cast(&temp.values[0]));
284 
285  // add rows of temp (sum from each block) for final answer
286  IndexType num_rows = NUM_BLOCKS;
287  IndexType num_blocks = 16;
288  cusp::array2d<ValueType, MemorySpace, cusp::row_major> sum_temp(num_blocks, numRHS, 0);
289  //Need size to be power of 2
290  while (num_blocks > 0){
291  dim3 sum_grid(num_blocks, 1);
292  IndexType size = 1;
293  while (size < num_rows / num_blocks){
294  size <<=1;
295  }
296  dim3 sum_block( 32, size);
297  size_t sum_shared = sum_block.y * numRHS * sizeof(ValueType);
298  sum_temp.resize(num_blocks, numRHS);
299  row_reduce_kernel<IndexType,ValueType> <<<sum_grid, sum_block, sum_shared>>>
300  (temp.num_rows, temp.num_cols, thrust::raw_pointer_cast(&temp.values[0]),
301  thrust::raw_pointer_cast(&sum_temp.values[0]));
302  cusp::copy(sum_temp, temp);
303  num_rows = num_blocks;
304  num_blocks /= 2;
305  }
306  for (IndexType i = 0; i < numRHS; i++){
307  y[i] = sum_temp(0, i);
308  }
309 }
310 
311 template <typename Vector1,
312  typename Vector2,
313  typename Vector3>
314 void __spmm_MVdot(const int numRHS, const Vector1& A,
315  const Vector2& x,
316  Vector3& y, cusp::column_major)
317 {
318 #if 0
319  typedef typename Vector2::index_type IndexType;
320  for (IndexType col=0; col<A.num_cols; ++col) {
321  y[col] = cusp::blas::dotc(A.column(col), x.column(col));
322  }
323 #else
324  CUSP_PROFILE_SCOPED();
325  typedef typename Vector2::index_type IndexType;
326  typedef typename Vector2::value_type ValueType;
327  typedef typename Vector2::memory_space MemorySpace;
328 
329  const size_t BLOCK_SIZE = 32;
330  const size_t COLS_PER_BLOCK = 8;
331 
332  dim3 block(BLOCK_SIZE, COLS_PER_BLOCK);
333  const size_t shared = block.x * block.y * sizeof(ValueType);
334 
335  const size_t MAX_BLOCKS = 96;
336  const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, BLOCK_SIZE));
337  dim3 grid(NUM_BLOCKS, 1);
338 
339 
340  cusp::array2d<ValueType, MemorySpace, cusp::column_major> temp(NUM_BLOCKS, A.num_cols, 0);
341 
342  col_spmm_MVdot_kernel<IndexType,ValueType> <<<grid, block, shared>>>
343  (A.num_rows, A.num_cols,
344  thrust::raw_pointer_cast(&A.values[0]),
345  thrust::raw_pointer_cast(&x.values[0]),
346  thrust::raw_pointer_cast(&temp.values[0]));
347 
348  // add rows of temp (sum from each block) for final answer
349  dim3 sum_grid(1, 1);
350  col_reduce_kernel<IndexType,ValueType> <<<sum_grid, block, shared>>>
351  (NUM_BLOCKS, A.num_cols,
352  thrust::raw_pointer_cast(&temp.values[0]),
353  thrust::raw_pointer_cast(&y[0]));
354 #endif
355 }
356 
357 template <typename Vector1,
358  typename Vector2,
359  typename Vector3>
360 void __spmm_dense_diag(const Vector1& A,
361  const Vector2& x,
362  Vector3& y,
363  cusp::row_major)
364 {
365  CUSP_PROFILE_SCOPED();
366  typedef typename Vector3::index_type IndexType;
367  typedef typename Vector3::value_type ValueType;
368  typedef typename Vector3::memory_space MemorySpace;
369  const size_t BLOCK_SIZE = 32;
370  const size_t ROWS_PER_BLOCK = 8;
371  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(row_spmm_dense_diag_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
372  const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, ROWS_PER_BLOCK));
373  dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
374  dim3 grid(NUM_BLOCKS, 1);
375 
376  row_spmm_dense_diag_kernel<IndexType,ValueType> <<<grid, block>>>
377  (A.num_rows, A.num_cols,
378  thrust::raw_pointer_cast(&A.values[0]),
379  thrust::raw_pointer_cast(&x[0]),
380  thrust::raw_pointer_cast(&(y.values)[0]));
381 }
382 
383 template <typename Vector1,
384  typename Vector2,
385  typename Vector3>
386 void __spmm_dense_diag(const Vector1& A,
387  const Vector2& x,
388  Vector3& y,
389  cusp::column_major)
390 {
391 #if 0
392  typedef typename Vector1::index_type IndexType;
393  typedef typename Vector1::value_type ValueType;
394  for (IndexType col=0; col<A.num_cols; ++col) {
395  cusp::blas::axpby(A.column(col), A.column(col), y.column(col),
396  x[col], ValueType(0.0));
397  }
398 #else
399  CUSP_PROFILE_SCOPED();
400  typedef typename Vector3::index_type IndexType;
401  typedef typename Vector3::value_type ValueType;
402  typedef typename Vector3::memory_space MemorySpace;
403  const size_t BLOCK_SIZE = 32;
404  const size_t COLS_PER_BLOCK = 8;
405  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(col_spmm_dense_diag_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
406  const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, BLOCK_SIZE));
407  dim3 block(BLOCK_SIZE, COLS_PER_BLOCK, 1);
408  dim3 grid(NUM_BLOCKS, 1);
409 
410  col_spmm_dense_diag_kernel<IndexType,ValueType> <<<grid, block>>>
411  (A.num_rows, A.num_cols,
412  thrust::raw_pointer_cast(&A.values[0]),
413  thrust::raw_pointer_cast(&x[0]),
414  thrust::raw_pointer_cast(&(y.values)[0]));
415 #endif
416 }
417 
418 template <typename ValueType,
419  typename Vector1,
420  typename Vector2>
421 void __spmm_axpby(const ValueType& a,
422  const Vector1& x,
423  const ValueType& b,
424  const Vector1& y,
425  Vector2& z,
426  cusp::row_major)
427 {
428  CUSP_PROFILE_SCOPED();
429  typedef typename Vector2::index_type IndexType;
430  typedef typename Vector2::memory_space MemorySpace;
431  const size_t BLOCK_SIZE = 32;
432  const size_t ROWS_PER_BLOCK = 6;
433  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(row_axpby_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
434  const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(x.num_rows, ROWS_PER_BLOCK));
435  dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
436  dim3 grid(NUM_BLOCKS, 1);
437 
438  row_axpby_kernel<IndexType,ValueType> <<<grid, block>>>
439  (a,b, x.num_rows, x.num_cols,
440  thrust::raw_pointer_cast(&x.values[0]),
441  thrust::raw_pointer_cast(&(y.values)[0]),
442  thrust::raw_pointer_cast(&(z.values)[0]));
443 }
444 
445 template <typename ValueType,
446  typename Vector1,
447  typename Vector2>
448 void __spmm_axpby(const ValueType& a,
449  const Vector1& x,
450  const ValueType& b,
451  const Vector1& y,
452  Vector2& z,
453  cusp::column_major)
454 {
455 #if 0
456  typedef typename Vector1::index_type IndexType;
457  for (IndexType col=0; col<x.num_cols; ++col) {
458  cusp::blas::axpby(x.column(col), y.column(col), z.column(col), a, b);
459  }
460 #else
461  CUSP_PROFILE_SCOPED();
462  typedef typename Vector2::index_type IndexType;
463  typedef typename Vector2::memory_space MemorySpace;
464  const size_t BLOCK_SIZE = 32;
465  const size_t COLS_PER_BLOCK = 6;
466  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(col_axpby_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
467  const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(x.num_rows, BLOCK_SIZE));
468  dim3 block(BLOCK_SIZE, COLS_PER_BLOCK, 1);
469  dim3 grid(NUM_BLOCKS, 1);
470 
471  col_axpby_kernel<IndexType,ValueType> <<<grid, block>>>
472  (a,b, x.num_rows, x.num_cols,
473  thrust::raw_pointer_cast(&x.values[0]),
474  thrust::raw_pointer_cast(&(y.values)[0]),
475  thrust::raw_pointer_cast(&(z.values)[0]));
476 #endif
477 }
478 
479 template <typename Vector1,
480  typename Vector2,
481  typename Vector3>
482 void spmm_MVdot(const Vector1& A,
483  const Vector2& x,
484  Vector3& y)
485 {
486  //Determine if row-wise or col-wise then call appropriate multiply
487  __spmm_MVdot(A.num_cols, A, x, y, typename Vector2::orientation());
488 }
489 
490 template <typename Vector1,
491  typename Vector2,
492  typename Vector3>
493 void spmm_dense_diag(const Vector1& A,
494  const Vector2& x,
495  Vector3& y)
496 {
497  //Determine if row-wise or col-wise then call appropriate multiply
498  __spmm_dense_diag(A, x, y, typename Vector1::orientation());
499 }
500 
501 template <typename ValueType,
502  typename Vector1,
503  typename Vector2>
504 void spmm_axpby(const ValueType& a, const Vector1& x, const ValueType& b,
505  const Vector1& y,
506  Vector2& z)
507 {
508  __spmm_axpby(a, x, b, y, z, typename Vector1::orientation());
509 }
510 
511 
512 } // end namespace device
513 } // end namespace detail
514 } // end namespace cusp
void spmm_MVdot(const Vector1 &A, const Vector2 &x, Vector3 &y)
Definition: array2d.h:482
__global__ void col_spmm_dense_diag_kernel(const IndexType Anum_rows, const IndexType Anum_cols, const ValueType *Aval, const ValueType *x, ValueType *y)
Definition: array2d.h:86
__global__ void col_axpby_kernel(const ValueType a, const ValueType b, const IndexType num_rows, const IndexType num_cols, const ValueType *x, const ValueType *y, ValueType *z)
Definition: array2d.h:56
__global__ void row_reduce_kernel(const IndexType num_rows, const IndexType num_cols, const ValueType *temp, ValueType *y)
Definition: array2d.h:182
__global__ void row_spmm_dense_diag_kernel(const IndexType Anum_rows, const IndexType Anum_cols, const ValueType *Aval, const ValueType *x, ValueType *y)
Definition: array2d.h:71
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
__global__ void col_reduce_kernel(const IndexType num_rows, const IndexType num_cols, const ValueType *temp, ValueType *y)
Definition: array2d.h:221
void spmm_axpby(const ValueType &a, const Vector1 &x, const ValueType &b, const Vector1 &y, Vector2 &z)
Definition: array2d.h:504
void spmm_dense_diag(const Vector1 &A, const Vector2 &x, Vector3 &y)
Definition: array2d.h:493
void __spmm_dense_diag(const Vector1 &A, const Vector2 &x, Vector3 &y, cusp::row_major)
Definition: array2d.h:360
void __spmm_axpby(const ValueType &a, const Vector1 &x, const ValueType &b, const Vector1 &y, Vector2 &z, cusp::row_major)
Definition: array2d.h:421
Kokkos::DefaultExecutionSpace device
void axpby(const ValueType &A, const MV1 &X, const ValueType &B, const MV1 &Y, MV2 &Z)
__global__ void row_axpby_kernel(const ValueType a, const ValueType b, const IndexType num_rows, const IndexType num_cols, const ValueType *x, const ValueType *y, ValueType *z)
Definition: array2d.h:41
__global__ void col_spmm_MVdot_kernel(const IndexType Anum_rows, const IndexType Anum_cols, const ValueType *Aval, const ValueType *x, ValueType *temp)
Definition: array2d.h:144
__global__ void row_spmm_MVdot_kernel(IndexType ROWS_PER_BLOCK, const IndexType Anum_rows, const IndexType Anum_cols, const ValueType *Aval, const ValueType *x, ValueType *temp)
Definition: array2d.h:101
void __spmm_MVdot(const int numRHS, const Vector1 &A, const Vector2 &x, Vector3 &y, cusp::row_major)
Definition: array2d.h:256