#include <cusp/array2d.h>
#include <cusp/array1d.h>
#include <cusp/copy.h>
#include <cusp/blas.h>

#include <thrust/device_ptr.h>

#include <algorithm>

// CUSP_PROFILE_SCOPED, DIVIDE_INTO, and
// cusp::detail::device::arch::max_active_blocks come from CUSP's internal
// detail headers.
template <typename IndexType, typename ValueType>
__global__ void row_axpby_kernel(const ValueType a, const ValueType b,
                                 const IndexType num_rows,
                                 const IndexType num_cols,
                                 const ValueType * x, const ValueType * y,
                                 ValueType * z)
{
  // Row-major layout: each thread row (threadIdx.y) grid-strides over matrix
  // rows, while threadIdx.x strides across the columns within that row.
  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < num_rows;
       row += gridDim.x * blockDim.y)
    for (IndexType j = threadIdx.x; j < num_cols; j += blockDim.x)
      z[j + num_cols*row] = a * x[j + num_cols*row] + b * y[j + num_cols*row];
}
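/* Indexing example (illustrative, not from the original source): with
   block = dim3(32, 6) and grid = dim3(NUM_BLOCKS, 1), thread (tx, ty) of
   block bx handles rows ty + 6*bx, ty + 6*bx + 6*NUM_BLOCKS, ..., and within
   each row the entries j = tx, tx + 32, tx + 64, ...; z[j + num_cols*row]
   addresses element (row, j) of a row-major array2d. */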
template <typename IndexType, typename ValueType>
__global__ void col_axpby_kernel(const ValueType a, const ValueType b,
                                 const IndexType num_rows,
                                 const IndexType num_cols,
                                 const ValueType * x, const ValueType * y,
                                 ValueType * z)
{
  // Column-major layout: threadIdx.x grid-strides over rows within a column,
  // threadIdx.y strides across columns.
  for (IndexType j = threadIdx.y; j < num_cols; j += blockDim.y)
    for (IndexType row = threadIdx.x + blockDim.x * blockIdx.x; row < num_rows;
         row += gridDim.x * blockDim.x)
      z[j*num_rows + row] = a * x[j*num_rows + row] + b * y[j*num_rows + row];
}
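/* In the column-major variant the thread roles are swapped so that the 32
   threads of a warp (threadIdx.x) touch consecutive rows of one column,
   i.e. consecutive addresses x[j*num_rows + row], which keeps global loads
   and stores coalesced. */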
template <typename IndexType, typename ValueType>
__global__ void row_spmm_dense_diag_kernel(const IndexType Anum_rows,
                                           const IndexType Anum_cols,
                                           const ValueType * Aval,
                                           const ValueType * x,
                                           ValueType * y)
{
  // Y = A * diag(x): scale column j of A by x[j] (row-major storage).
  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < Anum_rows;
       row += gridDim.x * blockDim.y)
    for (IndexType j = threadIdx.x; j < Anum_cols; j += blockDim.x)
      y[j + Anum_cols*row] = x[j] * Aval[j + Anum_cols*row];
}
template <typename IndexType, typename ValueType>
__global__ void col_spmm_dense_diag_kernel(const IndexType Anum_rows,
                                           const IndexType Anum_cols,
                                           const ValueType * Aval,
                                           const ValueType * x,
                                           ValueType * y)
{
  // Y = A * diag(x) for column-major storage.
  for (IndexType j = threadIdx.y; j < Anum_cols; j += blockDim.y)
    for (IndexType row = threadIdx.x + blockDim.x * blockIdx.x; row < Anum_rows;
         row += gridDim.x * blockDim.x)
      y[j*Anum_rows + row] = x[j] * Aval[j*Anum_rows + row];
}
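/* Both dense_diag kernels compute Y = A * diag(x), i.e. column j of A is
   scaled by x[j]. A small worked example (illustrative):

     A = [1 2]   x = [10, 100]   =>   Y = [10 200]
         [3 4]                            [30 400]
*/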
template <typename IndexType, typename ValueType>
__global__ void row_spmm_MVdot_kernel(IndexType ROWS_PER_BLOCK,
                                      const IndexType Anum_rows,
                                      const IndexType Anum_cols,
                                      const ValueType * Aval,
                                      const ValueType * x,
                                      ValueType * temp)
{
  extern __shared__ int sh[];
  ValueType * const sdata = (ValueType*) sh;

  // Each of the blockDim.y thread rows keeps a running partial sum per column.
  for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x) {
    sdata[col + Anum_cols*threadIdx.y] = 0;
  }
  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < Anum_rows;
       row += gridDim.x * blockDim.y) {
    for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x) {
      sdata[col + Anum_cols*threadIdx.y] +=
        Aval[col + Anum_cols * row] * x[col + Anum_cols * row];
    }
  }
  __syncthreads();

  // Tree-reduce the blockDim.y partial sums down to thread row 0.
  IndexType nwarp = blockDim.y / 2;
  while (nwarp > 0) {
    if (threadIdx.y < nwarp) {
      for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x) {
        IndexType j = threadIdx.y;
        IndexType j2 = j + nwarp;
        sdata[col + j*Anum_cols] += sdata[col + j2*Anum_cols];
      }
    }
    nwarp /= 2;
    __syncthreads();
  }

  // One partial result per block.
  if (threadIdx.y == 0) {
    for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x) {
      temp[ Anum_cols * blockIdx.x + col ] = sdata[col];
    }
  }
}
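/* Shared-memory layout for the kernel above: sdata is a blockDim.y x
   Anum_cols array of partial sums, so the launch must supply at least
   blockDim.y * Anum_cols * sizeof(ValueType) bytes of dynamic shared
   memory; the row-major host below sizes it as
   block.y * numRHS * sizeof(ValueType), with numRHS == Anum_cols. */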
template <typename IndexType, typename ValueType>
__global__ void col_spmm_MVdot_kernel(const IndexType Anum_rows,
                                      const IndexType Anum_cols,
                                      const ValueType * Aval,
                                      const ValueType * x,
                                      ValueType * temp)
{
  extern __shared__ int sh[];
  volatile ValueType * const sdata = (ValueType*) sh;

  IndexType tid = threadIdx.x + threadIdx.y*blockDim.x;

  for (IndexType col = threadIdx.y; col < Anum_cols; col += blockDim.y) {
    // Grid-stride accumulation down the column.
    ValueType s = 0;
    for (IndexType row = threadIdx.x + blockDim.x*blockIdx.x; row < Anum_rows;
         row += gridDim.x*blockDim.x) {
      s += Aval[col*Anum_rows+row] * x[col*Anum_rows+row];
    }
    sdata[tid] = s;

    // Warp-synchronous reduction across the 32 threads of this thread row.
    if (threadIdx.x+16 < blockDim.x) sdata[tid] += sdata[tid + 16];
    if (threadIdx.x+ 8 < blockDim.x) sdata[tid] += sdata[tid +  8];
    if (threadIdx.x+ 4 < blockDim.x) sdata[tid] += sdata[tid +  4];
    if (threadIdx.x+ 2 < blockDim.x) sdata[tid] += sdata[tid +  2];
    if (threadIdx.x+ 1 < blockDim.x) sdata[tid] += sdata[tid +  1];

    if (threadIdx.x == 0)
      temp[col*gridDim.x+blockIdx.x] = sdata[tid];
  }
}
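/* The unrolled reduction above is warp-synchronous: it relies on
   blockDim.x == 32 (one warp per thread row) executing in lockstep over the
   volatile sdata, so no __syncthreads() is needed between steps. On
   architectures with independent thread scheduling (Volta and later) the
   same pattern would need __syncwarp() between steps, or warp shuffles. */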
template <typename IndexType, typename ValueType>
__global__ void row_reduce_kernel(const IndexType num_rows,
                                  const IndexType num_cols,
                                  const ValueType * temp,
                                  ValueType * y)
{
  extern __shared__ int sh[];
  ValueType * const sdata = (ValueType*) sh;

  // Accumulate the per-block partials in temp, one thread row per input row.
  for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x) {
    sdata[col + num_cols*threadIdx.y] = 0;
  }
  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < num_rows;
       row += gridDim.x * blockDim.y) {
    for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x) {
      sdata[col + num_cols*threadIdx.y] += temp[col + num_cols * row];
    }
  }
  __syncthreads();

  // Tree-reduce the blockDim.y partial sums down to thread row 0.
  IndexType nwarp = blockDim.y / 2;
  while (nwarp > 0) {
    if (threadIdx.y < nwarp) {
      for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x) {
        IndexType j = threadIdx.y;
        IndexType j2 = j + nwarp;
        sdata[col + j*num_cols] += sdata[col + j2*num_cols];
      }
    }
    nwarp /= 2;
    __syncthreads();
  }

  if (threadIdx.y == 0) {
    for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x) {
      y[col + num_cols * blockIdx.x] = sdata[col];
    }
  }
}
template <typename IndexType, typename ValueType>
__global__ void col_reduce_kernel(const IndexType num_rows,
                                  const IndexType num_cols,
                                  const ValueType * temp,
                                  ValueType * y)
{
  extern __shared__ int sh[];
  volatile ValueType * const sdata = (ValueType*) sh;

  IndexType tid = threadIdx.x + threadIdx.y*blockDim.x;

  for (IndexType col = threadIdx.y; col < num_cols; col += blockDim.y) {
    // Sum the per-block partials for this column.
    ValueType s = 0;
    for (IndexType row = threadIdx.x; row < num_rows; row += blockDim.x) {
      s += temp[col*num_rows+row];
    }
    sdata[tid] = s;

    // Warp-synchronous reduction across the 32 threads of this thread row.
    if (threadIdx.x+16 < blockDim.x) sdata[tid] += sdata[tid + 16];
    if (threadIdx.x+ 8 < blockDim.x) sdata[tid] += sdata[tid +  8];
    if (threadIdx.x+ 4 < blockDim.x) sdata[tid] += sdata[tid +  4];
    if (threadIdx.x+ 2 < blockDim.x) sdata[tid] += sdata[tid +  2];
    if (threadIdx.x+ 1 < blockDim.x) sdata[tid] += sdata[tid +  1];

    if (threadIdx.x == 0)
      y[col] = sdata[tid];
  }
}
template <typename Vector1, typename Vector2, typename Vector3>
void __spmm_MVdot(const int numRHS, const Vector1& A, const Vector2& x,
                  Vector3& y, cusp::row_major)
{
  CUSP_PROFILE_SCOPED();
  typedef typename Vector2::index_type IndexType;
  typedef typename Vector2::value_type ValueType;
  typedef typename Vector2::memory_space MemorySpace;

  const size_t BLOCK_SIZE = 32;
  const size_t ROWS_PER_BLOCK = 8;
  dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
  const size_t shared = block.y * numRHS * sizeof(ValueType);

  const size_t MAX_BLOCKS = 96;
  const size_t NUM_BLOCKS =
    std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, ROWS_PER_BLOCK));
  dim3 grid(NUM_BLOCKS, 1);

  // One partial dot product per block and right-hand side.
  cusp::array2d<ValueType, MemorySpace, cusp::row_major> temp(NUM_BLOCKS, A.num_cols, 0);

  row_spmm_MVdot_kernel<IndexType,ValueType> <<<grid, block, shared>>>
    (ROWS_PER_BLOCK, A.num_rows, A.num_cols,
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&x.values[0]),
     thrust::raw_pointer_cast(&temp.values[0]));

  // Fold the partial results until a single row remains.
  IndexType num_rows = NUM_BLOCKS;
  IndexType num_blocks = 16;
  cusp::array2d<ValueType, MemorySpace, cusp::row_major> sum_temp(num_blocks, numRHS, 0);
  while (num_blocks > 0) {
    dim3 sum_grid(num_blocks, 1);
    // Pick a power-of-two block height covering num_rows / num_blocks inputs.
    IndexType size = 1;
    while (size < num_rows / num_blocks) {
      size *= 2;
    }
    dim3 sum_block( 32, size);
    size_t sum_shared = sum_block.y * numRHS * sizeof(ValueType);
    sum_temp.resize(num_blocks, numRHS);
    row_reduce_kernel<IndexType,ValueType> <<<sum_grid, sum_block, sum_shared>>>
      (temp.num_rows, temp.num_cols, thrust::raw_pointer_cast(&temp.values[0]),
       thrust::raw_pointer_cast(&sum_temp.values[0]));
    cusp::copy(sum_temp, temp);
    num_rows = num_blocks;
    num_blocks /= 16;  // 16 -> 1 -> 0: one more pass reduces to a single row
  }
  for (IndexType i = 0; i < numRHS; i++) {
    y[i] = sum_temp(0, i);
  }
}
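/* Reduction sizes under the constants above (illustrative): with A.num_rows
   large, the dot kernel leaves NUM_BLOCKS = 96 partial rows in temp; the
   first row_reduce_kernel pass folds them into 16 rows (sum_block height 8),
   the second folds those into a single row, after which num_blocks reaches
   zero and y is read from sum_temp(0, i). */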
template <typename Vector1, typename Vector2, typename Vector3>
void __spmm_MVdot(const int numRHS, const Vector1& A, const Vector2& x,
                  Vector3& y, cusp::column_major)
{
#if 0
  // Simple per-column fallback: one cusp dot product per right-hand side.
  typedef typename Vector2::index_type IndexType;
  for (IndexType col=0; col<A.num_cols; ++col) {
    y[col] = cusp::blas::dotc(A.column(col), x.column(col));
  }
#else
  CUSP_PROFILE_SCOPED();
  typedef typename Vector2::index_type IndexType;
  typedef typename Vector2::value_type ValueType;
  typedef typename Vector2::memory_space MemorySpace;

  const size_t BLOCK_SIZE = 32;
  const size_t COLS_PER_BLOCK = 8;
  dim3 block(BLOCK_SIZE, COLS_PER_BLOCK);
  const size_t shared = block.x * block.y * sizeof(ValueType);

  const size_t MAX_BLOCKS = 96;
  const size_t NUM_BLOCKS =
    std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, BLOCK_SIZE));
  dim3 grid(NUM_BLOCKS, 1);

  // One partial dot product per block and column.
  cusp::array2d<ValueType, MemorySpace, cusp::column_major> temp(NUM_BLOCKS, A.num_cols, 0);

  col_spmm_MVdot_kernel<IndexType,ValueType> <<<grid, block, shared>>>
    (A.num_rows, A.num_cols,
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&x.values[0]),
     thrust::raw_pointer_cast(&temp.values[0]));

  // A single block folds the NUM_BLOCKS partials per column into y.
  dim3 sum_grid(1, 1);
  col_reduce_kernel<IndexType,ValueType> <<<sum_grid, block, shared>>>
    (NUM_BLOCKS, A.num_cols,
     thrust::raw_pointer_cast(&temp.values[0]),
     thrust::raw_pointer_cast(&y[0]));
#endif
}
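/* Unlike the row-major path, which may need several reduction passes, the
   column-major path always finishes in one extra launch: a single block of
   col_reduce_kernel sums the NUM_BLOCKS (at most 96) partials per column
   directly into y. */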
template <typename Vector1, typename Vector2, typename Vector3>
void __spmm_dense_diag(const Vector1& A, const Vector2& x, Vector3& y,
                       cusp::row_major)
{
  CUSP_PROFILE_SCOPED();
  typedef typename Vector3::index_type IndexType;
  typedef typename Vector3::value_type ValueType;
  typedef typename Vector3::memory_space MemorySpace;

  const size_t BLOCK_SIZE = 32;
  const size_t ROWS_PER_BLOCK = 8;
  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(
    row_spmm_dense_diag_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
  const size_t NUM_BLOCKS =
    std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, ROWS_PER_BLOCK));
  dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
  dim3 grid(NUM_BLOCKS, 1);

  row_spmm_dense_diag_kernel<IndexType,ValueType> <<<grid, block>>>
    (A.num_rows, A.num_cols,
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&x[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}
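/* Here the grid size comes from the occupancy calculator rather than a
   hard-coded MAX_BLOCKS: max_active_blocks reports how many blocks of this
   kernel can be resident at once, so the grid-stride loops see just enough
   blocks to fill the device without oversubscribing it. */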
template <typename Vector1, typename Vector2, typename Vector3>
void __spmm_dense_diag(const Vector1& A, const Vector2& x, Vector3& y,
                       cusp::column_major)
{
#if 0
  // Simple per-column fallback: y.column(col) = x[col] * A.column(col).
  typedef typename Vector1::index_type IndexType;
  typedef typename Vector1::value_type ValueType;
  for (IndexType col=0; col<A.num_cols; ++col) {
    cusp::blas::axpby(A.column(col), A.column(col), y.column(col),
                      x[col], ValueType(0.0));
  }
#else
  CUSP_PROFILE_SCOPED();
  typedef typename Vector3::index_type IndexType;
  typedef typename Vector3::value_type ValueType;
  typedef typename Vector3::memory_space MemorySpace;

  const size_t BLOCK_SIZE = 32;
  const size_t COLS_PER_BLOCK = 8;
  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(
    col_spmm_dense_diag_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
  const size_t NUM_BLOCKS =
    std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, BLOCK_SIZE));
  dim3 block(BLOCK_SIZE, COLS_PER_BLOCK, 1);
  dim3 grid(NUM_BLOCKS, 1);

  col_spmm_dense_diag_kernel<IndexType,ValueType> <<<grid, block>>>
    (A.num_rows, A.num_cols,
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&x[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
#endif
}
template <typename ValueType, typename Vector1, typename Vector2>
void __spmm_axpby(const ValueType& a, const Vector1& x, const ValueType& b,
                  const Vector1& y, Vector2& z, cusp::row_major)
{
  CUSP_PROFILE_SCOPED();
  typedef typename Vector2::index_type IndexType;
  typedef typename Vector2::memory_space MemorySpace;

  const size_t BLOCK_SIZE = 32;
  const size_t ROWS_PER_BLOCK = 6;
  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(
    row_axpby_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
  const size_t NUM_BLOCKS =
    std::min(MAX_BLOCKS, DIVIDE_INTO(x.num_rows, ROWS_PER_BLOCK));
  dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
  dim3 grid(NUM_BLOCKS, 1);

  row_axpby_kernel<IndexType,ValueType> <<<grid, block>>>
    (a, b, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&x.values[0]),
     thrust::raw_pointer_cast(&(y.values)[0]),
     thrust::raw_pointer_cast(&(z.values)[0]));
}
template <typename ValueType, typename Vector1, typename Vector2>
void __spmm_axpby(const ValueType& a, const Vector1& x, const ValueType& b,
                  const Vector1& y, Vector2& z, cusp::column_major)
{
#if 0
  // Simple per-column fallback: z.column(col) = a*x.column(col) + b*y.column(col).
  typedef typename Vector1::index_type IndexType;
  for (IndexType col=0; col<x.num_cols; ++col) {
    cusp::blas::axpby(x.column(col), y.column(col), z.column(col), a, b);
  }
#else
  CUSP_PROFILE_SCOPED();
  typedef typename Vector2::index_type IndexType;
  typedef typename Vector2::memory_space MemorySpace;

  const size_t BLOCK_SIZE = 32;
  const size_t COLS_PER_BLOCK = 6;
  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(
    col_axpby_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
  const size_t NUM_BLOCKS =
    std::min(MAX_BLOCKS, DIVIDE_INTO(x.num_rows, BLOCK_SIZE));
  dim3 block(BLOCK_SIZE, COLS_PER_BLOCK, 1);
  dim3 grid(NUM_BLOCKS, 1);

  col_axpby_kernel<IndexType,ValueType> <<<grid, block>>>
    (a, b, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&x.values[0]),
     thrust::raw_pointer_cast(&(y.values)[0]),
     thrust::raw_pointer_cast(&(z.values)[0]));
#endif
}
template <typename Vector1, typename Vector2, typename Vector3>
void spmm_MVdot(const Vector1& A, const Vector2& x, Vector3& y)
{
  __spmm_MVdot(A.num_cols, A, x, y, typename Vector2::orientation());
}

template <typename Vector1, typename Vector2, typename Vector3>
void spmm_dense_diag(const Vector1& A, const Vector2& x, Vector3& y)
{
  __spmm_dense_diag(A, x, y, typename Vector2::orientation());
}

template <typename ValueType, typename Vector1, typename Vector2>
void spmm_axpby(const ValueType& a, const Vector1& x, const ValueType& b,
                const Vector1& y, Vector2& z)
{
  __spmm_axpby(a, x, b, y, z, typename Vector1::orientation());
}
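/* Usage sketch (hypothetical sizes n, k and types, not from the original
   source): per-column dot products and an axpby update on row-major device
   multivectors.

     typedef cusp::array2d<double, cusp::device_memory, cusp::row_major> MV;
     MV A(n, k), X(n, k), Z(n, k);
     cusp::array1d<double, cusp::host_memory> dots(k);
     // ... fill A and X ...
     spmm_MVdot(A, X, dots);            // dots[j] = dotc(A.column(j), X.column(j))
     spmm_axpby(2.0, A, -1.0, X, Z);    // Z = 2*A - X in one kernel launch
*/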