Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
array2d.h
Go to the documentation of this file.
1 /*
2  * Copyright 2008-2009 NVIDIA Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include <cusp/array2d.h>
17 #include <cusp/array1d.h>
18 #include <cuda.h>
19 #include <iostream>
20 
21 namespace cusp
22 {
23 namespace detail
24 {
25 namespace device
26 {
27 
28 
29 template <typename IndexType,
30  typename ValueType>
31 __global__ void
32 row_axpby_kernel(const ValueType a, const ValueType b, const IndexType num_rows,
33  const IndexType num_cols,
34  const ValueType * x,
35  const ValueType * y,
36  ValueType * z)
37 {
38  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < num_rows;
39  row += gridDim.x * blockDim.y)
40  for (IndexType j = threadIdx.x; j < num_cols; j+=blockDim.x )
41  z[j+num_cols*row]= a * x[j+num_cols*row] + b * y[j+num_cols*row];
42 }
43 
44 template <typename IndexType,
45  typename ValueType>
46 __global__ void
47 col_axpby_kernel(const ValueType a, const ValueType b, const IndexType num_rows,
48  const IndexType num_cols,
49  const ValueType * x,
50  const ValueType * y,
51  ValueType * z)
52 {
53  for (IndexType j = threadIdx.y; j < num_cols; j+=blockDim.y )
54  for (IndexType row = threadIdx.x + blockDim.x * blockIdx.x; row < num_rows;
55  row += gridDim.x * blockDim.x)
56  z[j*num_rows+row]= a * x[j*num_rows+row] + b * y[j*num_rows+row];
57 }
58 
59 template <typename IndexType,
60  typename ValueType>
61 __global__ void
62 row_spmm_dense_diag_kernel(const IndexType Anum_rows,
63  const IndexType Anum_cols,
64  const ValueType * Aval,
65  const ValueType * x,
66  ValueType * y)
67 {
68  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < Anum_rows;
69  row += gridDim.x * blockDim.y)
70  for (IndexType j = threadIdx.x; j < Anum_cols; j+=blockDim.x )
71  y[j+Anum_cols*row]= x[j] * Aval[j+Anum_cols*row];
72 }
73 
74 template <typename IndexType,
75  typename ValueType>
76 __global__ void
77 col_spmm_dense_diag_kernel(const IndexType Anum_rows,
78  const IndexType Anum_cols,
79  const ValueType * Aval,
80  const ValueType * x,
81  ValueType * y)
82 {
83  for (IndexType j = threadIdx.y; j < Anum_cols; j+=blockDim.y )
84  for (IndexType row = threadIdx.x + blockDim.x * blockIdx.x; row < Anum_rows;
85  row += gridDim.x * blockDim.x)
86  y[j*Anum_rows+row]= x[j] * Aval[j*Anum_rows+row];
87 }
88 
89 template <typename IndexType,
90  typename ValueType>
91 __global__ void
92 row_spmm_MVdot_kernel(IndexType ROWS_PER_BLOCK, const IndexType Anum_rows,
93  const IndexType Anum_cols,
94  const ValueType * Aval,
95  const ValueType * x,
96  ValueType * temp)
97 {
98  extern __shared__ int sh[];
99  ValueType * const sdata = (ValueType*) sh;
100 
101  for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x){
102  sdata[col + Anum_cols*threadIdx.y] = 0;
103  }
104  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < Anum_rows;
105  row += gridDim.x * blockDim.y) {
106  for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x){
107  sdata[col + Anum_cols*threadIdx.y] +=
108  Aval[col + Anum_cols * row] * x[col + Anum_cols * row];
109  }
110  }
111  //sum all local sums together to get
112  __syncthreads();
113  IndexType nwarp = blockDim.y / 2;
114  while (nwarp > 0) {
115  for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x){
116  IndexType j = threadIdx.y;
117  if (j < nwarp){
118  IndexType j2 = j+nwarp;
119  sdata[col+j*Anum_cols] += sdata[col+j2*Anum_cols];
120  }
121  __syncthreads();
122  }
123  nwarp /= 2;
124  }
125  __syncthreads();
126  for (IndexType col = threadIdx.x; col < Anum_cols; col+=blockDim.x){
127  temp[ Anum_cols * blockIdx.x + col ] = sdata[col];
128  }
129 
130 }
131 
132 template <typename IndexType,
133  typename ValueType>
134 __global__ void
135 col_spmm_MVdot_kernel(const IndexType Anum_rows,
136  const IndexType Anum_cols,
137  const ValueType * Aval,
138  const ValueType * x,
139  ValueType * temp)
140 {
141  extern __shared__ int sh[];
142  volatile ValueType * const sdata = (ValueType*) sh;
143 
144  IndexType tid = threadIdx.x + threadIdx.y*blockDim.x;
145 
146  for (IndexType col = threadIdx.y; col < Anum_cols; col += blockDim.y) {
147 
148  ValueType s = 0.0;
149  for (IndexType row = threadIdx.x+blockDim.x*blockIdx.x; row < Anum_rows;
150  row += gridDim.x*blockDim.x) {
151  s += Aval[col*Anum_rows+row] * x[col*Anum_rows+row];
152  }
153 
154  // sum all local sums together to get
155  sdata[tid] = s;
156  if (threadIdx.x+16 < blockDim.x) sdata[tid] += sdata[tid + 16];
157  if (threadIdx.x+ 8 < blockDim.x) sdata[tid] += sdata[tid + 8];
158  if (threadIdx.x+ 4 < blockDim.x) sdata[tid] += sdata[tid + 4];
159  if (threadIdx.x+ 2 < blockDim.x) sdata[tid] += sdata[tid + 2];
160  if (threadIdx.x+ 1 < blockDim.x) sdata[tid] += sdata[tid + 1];
161 
162  // store results
163  if (threadIdx.x == 0)
164  temp[col*gridDim.x+blockIdx.x] = sdata[tid];
165  }
166 
167 }
168 
169 //kernel to compute final dot product by adding the sums across the blocks
170 template <typename IndexType,
171  typename ValueType>
172 __global__ void
173 row_reduce_kernel(const IndexType num_rows,
174  const IndexType num_cols,
175  const ValueType * temp,
176  ValueType * y)
177 {
178  extern __shared__ int sh[];
179  ValueType * const sdata = (ValueType*) sh;
180  for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x){
181  sdata[col + num_cols*threadIdx.y] = 0;
182  }
183  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < num_rows;
184  row += gridDim.x * blockDim.y) {
185  for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x){
186  sdata[col + num_cols*threadIdx.y] += temp[col + num_cols * row];
187  }
188  }
189  __syncthreads();
190  IndexType nwarp = blockDim.y / 2;
191  while (nwarp > 0) {
192  for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x){
193  IndexType j = threadIdx.y;
194  if (j < nwarp){
195  IndexType j2 = j+nwarp;
196  sdata[col+j*num_cols] += sdata[col+j2*num_cols];
197  }
198  __syncthreads();
199  }
200  nwarp /= 2;
201  }
202  __syncthreads();
203  for (IndexType col = threadIdx.x; col < num_cols; col+=blockDim.x){
204  y[col + num_cols * blockIdx.x] = sdata[col];
205  }
206 }
207 
208 //kernel to compute final dot product by adding the sums across the blocks
209 template <typename IndexType,
210  typename ValueType>
211 __global__ void
212 col_reduce_kernel(const IndexType num_rows,
213  const IndexType num_cols,
214  const ValueType * temp,
215  ValueType * y)
216 {
217  extern __shared__ int sh[];
218  volatile ValueType * const sdata = (ValueType*) sh;
219 
220  IndexType tid = threadIdx.x + threadIdx.y*blockDim.x;
221 
222  for (IndexType col = threadIdx.y; col < num_cols; col += blockDim.y) {
223 
224  ValueType s = 0.0;
225  for (IndexType row = threadIdx.x; row < num_rows; row += blockDim.x) {
226  s += temp[col*num_rows+row];
227  }
228 
229  // sum all local sums together to get
230  sdata[tid] = s;
231  if (threadIdx.x+16 < blockDim.x) sdata[tid] += sdata[tid + 16];
232  if (threadIdx.x+ 8 < blockDim.x) sdata[tid] += sdata[tid + 8];
233  if (threadIdx.x+ 4 < blockDim.x) sdata[tid] += sdata[tid + 4];
234  if (threadIdx.x+ 2 < blockDim.x) sdata[tid] += sdata[tid + 2];
235  if (threadIdx.x+ 1 < blockDim.x) sdata[tid] += sdata[tid + 1];
236 
237  // store results
238  if (threadIdx.x == 0)
239  y[col] = sdata[tid];
240  }
241 }
242 
243 
244 template <typename Vector1,
245  typename Vector2,
246  typename Vector3>
247 void __spmm_MVdot(const int numRHS, const Vector1& A,
248  const Vector2& x,
249  Vector3& y, cusp::row_major)
250 {
251  CUSP_PROFILE_SCOPED();
252  typedef typename Vector2::index_type IndexType;
253  typedef typename Vector2::value_type ValueType;
254  typedef typename Vector2::memory_space MemorySpace;
255  const size_t BLOCK_SIZE = 32;
256 
257  const size_t ROWS_PER_BLOCK = 8;
258 
259  dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
260  const size_t shared = block.y * numRHS * sizeof(ValueType);
261 
262 
263  const size_t MAX_BLOCKS = 96;
264  const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, ROWS_PER_BLOCK));
265  dim3 grid(NUM_BLOCKS, 1);
266 
267 
268  cusp::array2d<ValueType, MemorySpace, cusp::row_major> temp(NUM_BLOCKS, A.num_cols, 0);
269 
270  row_spmm_MVdot_kernel<IndexType,ValueType> <<<grid, block, shared>>>
271  (ROWS_PER_BLOCK, A.num_rows, A.num_cols,
272  thrust::raw_pointer_cast(&A.values[0]),
273  thrust::raw_pointer_cast(&x.values[0]),
274  thrust::raw_pointer_cast(&temp.values[0]));
275 
276  // add rows of temp (sum from each block) for final answer
277  IndexType num_rows = NUM_BLOCKS;
278  IndexType num_blocks = 16;
279  cusp::array2d<ValueType, MemorySpace, cusp::row_major> sum_temp(num_blocks, numRHS, 0);
280  //Need size to be power of 2
281  while (num_blocks > 0){
282  dim3 sum_grid(num_blocks, 1);
283  IndexType size = 1;
284  while (size < num_rows / num_blocks){
285  size <<=1;
286  }
287  dim3 sum_block( 32, size);
288  size_t sum_shared = sum_block.y * numRHS * sizeof(ValueType);
289  sum_temp.resize(num_blocks, numRHS);
290  row_reduce_kernel<IndexType,ValueType> <<<sum_grid, sum_block, sum_shared>>>
291  (temp.num_rows, temp.num_cols, thrust::raw_pointer_cast(&temp.values[0]),
292  thrust::raw_pointer_cast(&sum_temp.values[0]));
293  cusp::copy(sum_temp, temp);
294  num_rows = num_blocks;
295  num_blocks /= 2;
296  }
297  for (IndexType i = 0; i < numRHS; i++){
298  y[i] = sum_temp(0, i);
299  }
300 }
301 
302 template <typename Vector1,
303  typename Vector2,
304  typename Vector3>
305 void __spmm_MVdot(const int numRHS, const Vector1& A,
306  const Vector2& x,
307  Vector3& y, cusp::column_major)
308 {
309 #if 0
310  typedef typename Vector2::index_type IndexType;
311  for (IndexType col=0; col<A.num_cols; ++col) {
312  y[col] = cusp::blas::dotc(A.column(col), x.column(col));
313  }
314 #else
315  CUSP_PROFILE_SCOPED();
316  typedef typename Vector2::index_type IndexType;
317  typedef typename Vector2::value_type ValueType;
318  typedef typename Vector2::memory_space MemorySpace;
319 
320  const size_t BLOCK_SIZE = 32;
321  const size_t COLS_PER_BLOCK = 8;
322 
323  dim3 block(BLOCK_SIZE, COLS_PER_BLOCK);
324  const size_t shared = block.x * block.y * sizeof(ValueType);
325 
326  const size_t MAX_BLOCKS = 96;
327  const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, BLOCK_SIZE));
328  dim3 grid(NUM_BLOCKS, 1);
329 
330 
331  cusp::array2d<ValueType, MemorySpace, cusp::column_major> temp(NUM_BLOCKS, A.num_cols, 0);
332 
333  col_spmm_MVdot_kernel<IndexType,ValueType> <<<grid, block, shared>>>
334  (A.num_rows, A.num_cols,
335  thrust::raw_pointer_cast(&A.values[0]),
336  thrust::raw_pointer_cast(&x.values[0]),
337  thrust::raw_pointer_cast(&temp.values[0]));
338 
339  // add rows of temp (sum from each block) for final answer
340  dim3 sum_grid(1, 1);
341  col_reduce_kernel<IndexType,ValueType> <<<sum_grid, block, shared>>>
342  (NUM_BLOCKS, A.num_cols,
343  thrust::raw_pointer_cast(&temp.values[0]),
344  thrust::raw_pointer_cast(&y[0]));
345 #endif
346 }
347 
348 template <typename Vector1,
349  typename Vector2,
350  typename Vector3>
351 void __spmm_dense_diag(const Vector1& A,
352  const Vector2& x,
353  Vector3& y,
354  cusp::row_major)
355 {
356  CUSP_PROFILE_SCOPED();
357  typedef typename Vector3::index_type IndexType;
358  typedef typename Vector3::value_type ValueType;
359  typedef typename Vector3::memory_space MemorySpace;
360  const size_t BLOCK_SIZE = 32;
361  const size_t ROWS_PER_BLOCK = 8;
362  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(row_spmm_dense_diag_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
363  const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, ROWS_PER_BLOCK));
364  dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
365  dim3 grid(NUM_BLOCKS, 1);
366 
367  row_spmm_dense_diag_kernel<IndexType,ValueType> <<<grid, block>>>
368  (A.num_rows, A.num_cols,
369  thrust::raw_pointer_cast(&A.values[0]),
370  thrust::raw_pointer_cast(&x[0]),
371  thrust::raw_pointer_cast(&(y.values)[0]));
372 }
373 
374 template <typename Vector1,
375  typename Vector2,
376  typename Vector3>
377 void __spmm_dense_diag(const Vector1& A,
378  const Vector2& x,
379  Vector3& y,
380  cusp::column_major)
381 {
382 #if 0
383  typedef typename Vector1::index_type IndexType;
384  typedef typename Vector1::value_type ValueType;
385  for (IndexType col=0; col<A.num_cols; ++col) {
386  cusp::blas::axpby(A.column(col), A.column(col), y.column(col),
387  x[col], ValueType(0.0));
388  }
389 #else
390  CUSP_PROFILE_SCOPED();
391  typedef typename Vector3::index_type IndexType;
392  typedef typename Vector3::value_type ValueType;
393  typedef typename Vector3::memory_space MemorySpace;
394  const size_t BLOCK_SIZE = 32;
395  const size_t COLS_PER_BLOCK = 8;
396  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(col_spmm_dense_diag_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
397  const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, BLOCK_SIZE));
398  dim3 block(BLOCK_SIZE, COLS_PER_BLOCK, 1);
399  dim3 grid(NUM_BLOCKS, 1);
400 
401  col_spmm_dense_diag_kernel<IndexType,ValueType> <<<grid, block>>>
402  (A.num_rows, A.num_cols,
403  thrust::raw_pointer_cast(&A.values[0]),
404  thrust::raw_pointer_cast(&x[0]),
405  thrust::raw_pointer_cast(&(y.values)[0]));
406 #endif
407 }
408 
409 template <typename ValueType,
410  typename Vector1,
411  typename Vector2>
412 void __spmm_axpby(const ValueType& a,
413  const Vector1& x,
414  const ValueType& b,
415  const Vector1& y,
416  Vector2& z,
417  cusp::row_major)
418 {
419  CUSP_PROFILE_SCOPED();
420  typedef typename Vector2::index_type IndexType;
421  typedef typename Vector2::memory_space MemorySpace;
422  const size_t BLOCK_SIZE = 32;
423  const size_t ROWS_PER_BLOCK = 6;
424  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(row_axpby_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
425  const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(x.num_rows, ROWS_PER_BLOCK));
426  dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
427  dim3 grid(NUM_BLOCKS, 1);
428 
429  row_axpby_kernel<IndexType,ValueType> <<<grid, block>>>
430  (a,b, x.num_rows, x.num_cols,
431  thrust::raw_pointer_cast(&x.values[0]),
432  thrust::raw_pointer_cast(&(y.values)[0]),
433  thrust::raw_pointer_cast(&(z.values)[0]));
434 }
435 
436 template <typename ValueType,
437  typename Vector1,
438  typename Vector2>
439 void __spmm_axpby(const ValueType& a,
440  const Vector1& x,
441  const ValueType& b,
442  const Vector1& y,
443  Vector2& z,
444  cusp::column_major)
445 {
446 #if 0
447  typedef typename Vector1::index_type IndexType;
448  for (IndexType col=0; col<x.num_cols; ++col) {
449  cusp::blas::axpby(x.column(col), y.column(col), z.column(col), a, b);
450  }
451 #else
452  CUSP_PROFILE_SCOPED();
453  typedef typename Vector2::index_type IndexType;
454  typedef typename Vector2::memory_space MemorySpace;
455  const size_t BLOCK_SIZE = 32;
456  const size_t COLS_PER_BLOCK = 6;
457  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(col_axpby_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
458  const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(x.num_rows, BLOCK_SIZE));
459  dim3 block(BLOCK_SIZE, COLS_PER_BLOCK, 1);
460  dim3 grid(NUM_BLOCKS, 1);
461 
462  col_axpby_kernel<IndexType,ValueType> <<<grid, block>>>
463  (a,b, x.num_rows, x.num_cols,
464  thrust::raw_pointer_cast(&x.values[0]),
465  thrust::raw_pointer_cast(&(y.values)[0]),
466  thrust::raw_pointer_cast(&(z.values)[0]));
467 #endif
468 }
469 
470 template <typename Vector1,
471  typename Vector2,
472  typename Vector3>
473 void spmm_MVdot(const Vector1& A,
474  const Vector2& x,
475  Vector3& y)
476 {
477  //Determine if row-wise or col-wise then call appropriate multiply
478  __spmm_MVdot(A.num_cols, A, x, y, typename Vector2::orientation());
479 }
480 
481 template <typename Vector1,
482  typename Vector2,
483  typename Vector3>
484 void spmm_dense_diag(const Vector1& A,
485  const Vector2& x,
486  Vector3& y)
487 {
488  //Determine if row-wise or col-wise then call appropriate multiply
489  __spmm_dense_diag(A, x, y, typename Vector1::orientation());
490 }
491 
492 template <typename ValueType,
493  typename Vector1,
494  typename Vector2>
495 void spmm_axpby(const ValueType& a, const Vector1& x, const ValueType& b,
496  const Vector1& y,
497  Vector2& z)
498 {
499  __spmm_axpby(a, x, b, y, z, typename Vector1::orientation());
500 }
501 
502 
503 } // end namespace device
504 } // end namespace detail
505 } // end namespace cusp
void spmm_MVdot(const Vector1 &A, const Vector2 &x, Vector3 &y)
Definition: array2d.h:473
__global__ void col_spmm_dense_diag_kernel(const IndexType Anum_rows, const IndexType Anum_cols, const ValueType *Aval, const ValueType *x, ValueType *y)
Definition: array2d.h:77
__global__ void col_axpby_kernel(const ValueType a, const ValueType b, const IndexType num_rows, const IndexType num_cols, const ValueType *x, const ValueType *y, ValueType *z)
Definition: array2d.h:47
__global__ void row_reduce_kernel(const IndexType num_rows, const IndexType num_cols, const ValueType *temp, ValueType *y)
Definition: array2d.h:173
__global__ void row_spmm_dense_diag_kernel(const IndexType Anum_rows, const IndexType Anum_cols, const ValueType *Aval, const ValueType *x, ValueType *y)
Definition: array2d.h:62
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
__global__ void col_reduce_kernel(const IndexType num_rows, const IndexType num_cols, const ValueType *temp, ValueType *y)
Definition: array2d.h:212
void spmm_axpby(const ValueType &a, const Vector1 &x, const ValueType &b, const Vector1 &y, Vector2 &z)
Definition: array2d.h:495
void spmm_dense_diag(const Vector1 &A, const Vector2 &x, Vector3 &y)
Definition: array2d.h:484
void __spmm_dense_diag(const Vector1 &A, const Vector2 &x, Vector3 &y, cusp::row_major)
Definition: array2d.h:351
void __spmm_axpby(const ValueType &a, const Vector1 &x, const ValueType &b, const Vector1 &y, Vector2 &z, cusp::row_major)
Definition: array2d.h:412
Kokkos::DefaultExecutionSpace device
void axpby(const ValueType &A, const MV1 &X, const ValueType &B, const MV1 &Y, MV2 &Z)
__global__ void row_axpby_kernel(const ValueType a, const ValueType b, const IndexType num_rows, const IndexType num_cols, const ValueType *x, const ValueType *y, ValueType *z)
Definition: array2d.h:32
__global__ void col_spmm_MVdot_kernel(const IndexType Anum_rows, const IndexType Anum_cols, const ValueType *Aval, const ValueType *x, ValueType *temp)
Definition: array2d.h:135
__global__ void row_spmm_MVdot_kernel(IndexType ROWS_PER_BLOCK, const IndexType Anum_rows, const IndexType Anum_cols, const ValueType *Aval, const ValueType *x, ValueType *temp)
Definition: array2d.h:92
void __spmm_MVdot(const int numRHS, const Vector1 &A, const Vector2 &x, Vector3 &y, cusp::row_major)
Definition: array2d.h:247