#include <cusp/array2d.h>
#include <cusp/array1d.h>

template <typename IndexType, typename ValueType>
__global__ void row_axpby_kernel(const ValueType a, const ValueType b,
                                 const IndexType num_rows,
                                 const IndexType num_cols,
                                 const ValueType* x, const ValueType* y,
                                 ValueType* z)
{
  // z = a*x + b*y for a row-major multivector: blockIdx.x/threadIdx.y
  // stride over rows, threadIdx.x strides over the entries of each row.
  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < num_rows;
       row += gridDim.x * blockDim.y)
    for (IndexType j = threadIdx.x; j < num_cols; j += blockDim.x)
      z[j + num_cols*row] = a * x[j + num_cols*row] + b * y[j + num_cols*row];
}

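// Launch shape used by __spmm_axpby below: block (BLOCK_SIZE=32,
// ROWS_PER_BLOCK=6) with the grid clipped to the occupancy limit, so each
// block processes 6 rows per step and strides forward by gridDim.x * 6.
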
template <typename IndexType, typename ValueType>
__global__ void col_axpby_kernel(const ValueType a, const ValueType b,
                                 const IndexType num_rows,
                                 const IndexType num_cols,
                                 const ValueType* x, const ValueType* y,
                                 ValueType* z)
{
  // Column-major variant: consecutive threads (threadIdx.x) touch
  // consecutive rows of one column, keeping global accesses coalesced.
  for (IndexType j = threadIdx.y; j < num_cols; j += blockDim.y)
    for (IndexType row = threadIdx.x + blockDim.x * blockIdx.x; row < num_rows;
         row += gridDim.x * blockDim.x)
      z[j*num_rows + row] = a * x[j*num_rows + row] + b * y[j*num_rows + row];
}

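// Layout note: with num_rows = 3 and num_cols = 2, entry (row=1, j=1) sits
// at index j + num_cols*row = 3 in the row-major kernel above but at
// j*num_rows + row = 4 here, which is why each layout gets its own kernel.
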
template <typename IndexType, typename ValueType>
__global__ void row_spmm_dense_diag_kernel(const IndexType Anum_rows,
                                           const IndexType Anum_cols,
                                           const ValueType* Aval,
                                           const ValueType* x,
                                           ValueType* y)
{
  // y = A * diag(x): entry (row, j) of y is A(row, j) scaled by x[j].
  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < Anum_rows;
       row += gridDim.x * blockDim.y)
    for (IndexType j = threadIdx.x; j < Anum_cols; j += blockDim.x)
      y[j + Anum_cols*row] = x[j] * Aval[j + Anum_cols*row];
}

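// For example, with row-major A = [1 2; 3 4] and x = [10, 100],
// y = A * diag(x) = [10 200; 30 400]: column j of A is scaled by x[j].
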
template <typename IndexType, typename ValueType>
__global__ void col_spmm_dense_diag_kernel(const IndexType Anum_rows,
                                           const IndexType Anum_cols,
                                           const ValueType* Aval,
                                           const ValueType* x,
                                           ValueType* y)
{
  for (IndexType j = threadIdx.y; j < Anum_cols; j += blockDim.y)
    for (IndexType row = threadIdx.x + blockDim.x * blockIdx.x; row < Anum_rows;
         row += gridDim.x * blockDim.x)
      y[j*Anum_rows + row] = x[j] * Aval[j*Anum_rows + row];
}

template <typename IndexType, typename ValueType>
__global__ void row_spmm_MVdot_kernel(IndexType ROWS_PER_BLOCK,
                                      const IndexType Anum_rows,
                                      const IndexType Anum_cols,
                                      const ValueType* Aval,
                                      const ValueType* x,
                                      ValueType* temp)
{
  // Shared scratch: one partial dot product per (column, thread-row) pair.
  // ROWS_PER_BLOCK matches blockDim.y at the call site.
  extern __shared__ int sh[];
  ValueType* const sdata = (ValueType*) sh;

  for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x) {
    sdata[col + Anum_cols*threadIdx.y] = 0;
  }

  // Each thread-row accumulates elementwise products over its rows of A.
  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < Anum_rows;
       row += gridDim.x * blockDim.y) {
    for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x) {
      sdata[col + Anum_cols*threadIdx.y] +=
        Aval[col + Anum_cols * row] * x[col + Anum_cols * row];
    }
  }
  __syncthreads();

  // Tree-reduce the blockDim.y partial sums down to thread-row 0,
  // halving the number of active thread-rows each pass.
  IndexType nwarp = blockDim.y / 2;
  while (nwarp > 0) {
    for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x) {
      IndexType j = threadIdx.y;
      if (j < nwarp) {
        IndexType j2 = j + nwarp;
        sdata[col + j*Anum_cols] += sdata[col + j2*Anum_cols];
      }
    }
    nwarp /= 2;
    __syncthreads();
  }

  // Write this block's per-column partial sums to temp.
  for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x) {
    temp[Anum_cols * blockIdx.x + col] = sdata[col];
  }
}

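// The host sizes this kernel's dynamic shared memory as
// block.y * numRHS * sizeof(ValueType); e.g. a (32, 8) block with 16
// double right-hand sides needs 8 * 16 * 8 = 1024 bytes.
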
template <typename IndexType, typename ValueType>
__global__ void col_spmm_MVdot_kernel(const IndexType Anum_rows,
                                      const IndexType Anum_cols,
                                      const ValueType* Aval,
                                      const ValueType* x,
                                      ValueType* temp)
{
  extern __shared__ int sh[];
  volatile ValueType* const sdata = (ValueType*) sh;

  IndexType tid = threadIdx.x + threadIdx.y*blockDim.x;

  for (IndexType col = threadIdx.y; col < Anum_cols; col += blockDim.y) {
    // Accumulate this thread's slice of the column dot product in a register.
    ValueType s = 0;
    for (IndexType row = threadIdx.x + blockDim.x*blockIdx.x; row < Anum_rows;
         row += gridDim.x*blockDim.x) {
      s += Aval[col*Anum_rows + row] * x[col*Anum_rows + row];
    }
    sdata[tid] = s;

    // Warp-synchronous reduction across threadIdx.x (blockDim.x <= 32);
    // the volatile qualifier keeps each step's store visible to the warp.
    if (threadIdx.x + 16 < blockDim.x) sdata[tid] += sdata[tid + 16];
    if (threadIdx.x +  8 < blockDim.x) sdata[tid] += sdata[tid +  8];
    if (threadIdx.x +  4 < blockDim.x) sdata[tid] += sdata[tid +  4];
    if (threadIdx.x +  2 < blockDim.x) sdata[tid] += sdata[tid +  2];
    if (threadIdx.x +  1 < blockDim.x) sdata[tid] += sdata[tid +  1];

    if (threadIdx.x == 0)
      temp[col*gridDim.x + blockIdx.x] = sdata[tid];
  }
}

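// A sketch of the same per-warp sum using warp shuffles instead of
// volatile shared memory (illustration only, not part of this file;
// assumes blockDim.x == 32 with the full warp active):
//   ValueType v = s;
//   for (int off = 16; off > 0; off >>= 1)
//     v += __shfl_down_sync(0xffffffffu, v, off);
//   if (threadIdx.x == 0) temp[col*gridDim.x + blockIdx.x] = v;
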
template <typename IndexType, typename ValueType>
__global__ void row_reduce_kernel(const IndexType num_rows,
                                  const IndexType num_cols,
                                  const ValueType* temp,
                                  ValueType* y)
{
  // Same shape as row_spmm_MVdot_kernel, but it only sums the rows of
  // temp (the per-block partials) instead of forming products.
  extern __shared__ int sh[];
  ValueType* const sdata = (ValueType*) sh;

  for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x) {
    sdata[col + num_cols*threadIdx.y] = 0;
  }
  for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < num_rows;
       row += gridDim.x * blockDim.y) {
    for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x) {
      sdata[col + num_cols*threadIdx.y] += temp[col + num_cols * row];
    }
  }
  __syncthreads();

  // Tree-reduce the blockDim.y partial sums down to thread-row 0.
  IndexType nwarp = blockDim.y / 2;
  while (nwarp > 0) {
    for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x) {
      IndexType j = threadIdx.y;
      if (j < nwarp) {
        IndexType j2 = j + nwarp;
        sdata[col + j*num_cols] += sdata[col + j2*num_cols];
      }
    }
    nwarp /= 2;
    __syncthreads();
  }

  for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x) {
    y[col + num_cols * blockIdx.x] = sdata[col];
  }
}

template <typename IndexType, typename ValueType>
__global__ void col_reduce_kernel(const IndexType num_rows,
                                  const IndexType num_cols,
                                  const ValueType* temp,
                                  ValueType* y)
{
  extern __shared__ int sh[];
  volatile ValueType* const sdata = (ValueType*) sh;

  IndexType tid = threadIdx.x + threadIdx.y*blockDim.x;

  for (IndexType col = threadIdx.y; col < num_cols; col += blockDim.y) {
    ValueType s = 0;
    for (IndexType row = threadIdx.x; row < num_rows; row += blockDim.x) {
      s += temp[col*num_rows + row];
    }
    sdata[tid] = s;

    // Warp-synchronous reduction, as in col_spmm_MVdot_kernel.
    if (threadIdx.x + 16 < blockDim.x) sdata[tid] += sdata[tid + 16];
    if (threadIdx.x +  8 < blockDim.x) sdata[tid] += sdata[tid +  8];
    if (threadIdx.x +  4 < blockDim.x) sdata[tid] += sdata[tid +  4];
    if (threadIdx.x +  2 < blockDim.x) sdata[tid] += sdata[tid +  2];
    if (threadIdx.x +  1 < blockDim.x) sdata[tid] += sdata[tid +  1];

    // The host below launches this kernel with a single block, so the
    // warp leader can write the fully reduced sum for this column.
    if (threadIdx.x == 0)
      y[col] = sdata[tid];
  }
}

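// Shared-memory contract: row_reduce_kernel needs blockDim.y * num_cols
// ValueType entries and col_reduce_kernel needs blockDim.x * blockDim.y;
// the host launches below size their dynamic allocations accordingly.
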
template <typename Vector1, typename Vector2, typename Vector3>
void __spmm_MVdot(const int numRHS, const Vector1& A, const Vector2& x,
                  Vector3& y, cusp::row_major)
{
  CUSP_PROFILE_SCOPED();
  typedef typename Vector2::index_type IndexType;
  typedef typename Vector2::value_type ValueType;
  typedef typename Vector2::memory_space MemorySpace;

  const size_t BLOCK_SIZE = 32;
  const size_t ROWS_PER_BLOCK = 8;
  dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
  const size_t shared = block.y * numRHS * sizeof(ValueType);

  const size_t MAX_BLOCKS = 96;
  const size_t NUM_BLOCKS =
    std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, ROWS_PER_BLOCK));
  dim3 grid(NUM_BLOCKS, 1);

  // Stage 1: one row of per-column partial dot products per block.
  cusp::array2d<ValueType, MemorySpace, cusp::row_major>
    temp(NUM_BLOCKS, A.num_cols, 0);

  row_spmm_MVdot_kernel<IndexType,ValueType> <<<grid, block, shared>>>
    (ROWS_PER_BLOCK, A.num_rows, A.num_cols,
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&x.values[0]),
     thrust::raw_pointer_cast(&temp.values[0]));

  // Stage 2: repeatedly fold the partial sums until one row remains.
  IndexType num_rows = NUM_BLOCKS;
  IndexType num_blocks = 16;
  cusp::array2d<ValueType, MemorySpace, cusp::row_major>
    sum_temp(num_blocks, numRHS, 0);

  while (num_blocks > 0) {
    dim3 sum_grid(num_blocks, 1);
    // Pick a power-of-two block height that covers num_rows / num_blocks.
    IndexType size = 2;
    while (size < num_rows / num_blocks) {
      size *= 2;
    }
    dim3 sum_block(32, size);
    size_t sum_shared = sum_block.y * numRHS * sizeof(ValueType);
    sum_temp.resize(num_blocks, numRHS);
    row_reduce_kernel<IndexType,ValueType> <<<sum_grid, sum_block, sum_shared>>>
      (temp.num_rows, temp.num_cols,
       thrust::raw_pointer_cast(&temp.values[0]),
       thrust::raw_pointer_cast(&sum_temp.values[0]));
    cusp::copy(sum_temp, temp);
    num_rows = num_blocks;
    num_blocks /= 2;  // halve the grid for the next pass
  }

  // The fully reduced dot products sit in row 0 of sum_temp.
  for (IndexType i = 0; i < numRHS; i++) {
    y[i] = sum_temp(0, i);
  }
}

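// With the defaults above (NUM_BLOCKS up to 96, num_blocks starting at 16),
// the fold visits 96 -> 16 -> 8 -> 4 -> 2 -> 1 rows of partial sums before
// row 0 of sum_temp is read out.
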
template <typename Vector1, typename Vector2, typename Vector3>
void __spmm_MVdot(const int numRHS, const Vector1& A, const Vector2& x,
                  Vector3& y, cusp::column_major)
{
  // Column-at-a-time alternative, one cusp::blas::dotc per column:
  //   typedef typename Vector2::index_type IndexType;
  //   for (IndexType col = 0; col < A.num_cols; ++col) {
  //     y[col] = cusp::blas::dotc(A.column(col), x.column(col));
  //   }
  CUSP_PROFILE_SCOPED();
  typedef typename Vector2::index_type IndexType;
  typedef typename Vector2::value_type ValueType;
  typedef typename Vector2::memory_space MemorySpace;

  const size_t BLOCK_SIZE = 32;
  const size_t COLS_PER_BLOCK = 8;
  dim3 block(BLOCK_SIZE, COLS_PER_BLOCK);
  const size_t shared = block.x * block.y * sizeof(ValueType);

  const size_t MAX_BLOCKS = 96;
  const size_t NUM_BLOCKS =
    std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, BLOCK_SIZE));
  dim3 grid(NUM_BLOCKS, 1);

  // Stage 1: per-block partial dot products, one row of temp per block.
  cusp::array2d<ValueType, MemorySpace, cusp::column_major>
    temp(NUM_BLOCKS, A.num_cols, 0);

  col_spmm_MVdot_kernel<IndexType,ValueType> <<<grid, block, shared>>>
    (A.num_rows, A.num_cols,
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&x.values[0]),
     thrust::raw_pointer_cast(&temp.values[0]));

  // Stage 2: a single block folds the NUM_BLOCKS partials per column.
  dim3 sum_grid(1, 1);
  col_reduce_kernel<IndexType,ValueType> <<<sum_grid, block, shared>>>
    (NUM_BLOCKS, A.num_cols,
     thrust::raw_pointer_cast(&temp.values[0]),
     thrust::raw_pointer_cast(&y[0]));
}

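// Unlike the row-major path, one col_reduce_kernel launch suffices here:
// each block already reduced across its rows, so at most MAX_BLOCKS (96)
// partials per column remain.
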
template <typename Vector1, typename Vector2, typename Vector3>
void __spmm_dense_diag(const Vector1& A, const Vector2& x, Vector3& y,
                       cusp::row_major)
{
  CUSP_PROFILE_SCOPED();
  typedef typename Vector3::index_type IndexType;
  typedef typename Vector3::value_type ValueType;
  typedef typename Vector3::memory_space MemorySpace;

  const size_t BLOCK_SIZE = 32;
  const size_t ROWS_PER_BLOCK = 8;
  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(
    row_spmm_dense_diag_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
  const size_t NUM_BLOCKS =
    std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, ROWS_PER_BLOCK));
  dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
  dim3 grid(NUM_BLOCKS, 1);

  row_spmm_dense_diag_kernel<IndexType,ValueType> <<<grid, block>>>
    (A.num_rows, A.num_cols,
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&x[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}

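// Example sizing: for a 1000 x 16 row-major A, DIVIDE_INTO(1000, 8) = 125
// candidate blocks are clipped to the occupancy-derived MAX_BLOCKS.
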
template <typename Vector1, typename Vector2, typename Vector3>
void __spmm_dense_diag(const Vector1& A, const Vector2& x, Vector3& y,
                       cusp::column_major)
{
  // Column-at-a-time alternative: scale column col of A by x[col], e.g.
  //   typedef typename Vector1::index_type IndexType;
  //   for (IndexType col = 0; col < A.num_cols; ++col) {
  //     cusp::blas::axpby(A.column(col), y.column(col), y.column(col),
  //                       x[col], ValueType(0.0));
  //   }
  CUSP_PROFILE_SCOPED();
  typedef typename Vector3::index_type IndexType;
  typedef typename Vector3::value_type ValueType;
  typedef typename Vector3::memory_space MemorySpace;

  const size_t BLOCK_SIZE = 32;
  const size_t COLS_PER_BLOCK = 8;
  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(
    col_spmm_dense_diag_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
  const size_t NUM_BLOCKS =
    std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, BLOCK_SIZE));
  dim3 block(BLOCK_SIZE, COLS_PER_BLOCK, 1);
  dim3 grid(NUM_BLOCKS, 1);

  col_spmm_dense_diag_kernel<IndexType,ValueType> <<<grid, block>>>
    (A.num_rows, A.num_cols,
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&x[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}

template <typename ValueType, typename Vector1, typename Vector2>
void __spmm_axpby(const ValueType& a, const Vector1& x, const ValueType& b,
                  const Vector1& y, Vector2& z, cusp::row_major)
{
  CUSP_PROFILE_SCOPED();
  typedef typename Vector2::index_type IndexType;
  typedef typename Vector2::memory_space MemorySpace;

  const size_t BLOCK_SIZE = 32;
  const size_t ROWS_PER_BLOCK = 6;
  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(
    row_axpby_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
  const size_t NUM_BLOCKS =
    std::min(MAX_BLOCKS, DIVIDE_INTO(x.num_rows, ROWS_PER_BLOCK));
  dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
  dim3 grid(NUM_BLOCKS, 1);

  row_axpby_kernel<IndexType,ValueType> <<<grid, block>>>
    (a, b, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&x.values[0]),
     thrust::raw_pointer_cast(&(y.values)[0]),
     thrust::raw_pointer_cast(&(z.values)[0]));
}

template <typename ValueType, typename Vector1, typename Vector2>
void __spmm_axpby(const ValueType& a, const Vector1& x, const ValueType& b,
                  const Vector1& y, Vector2& z, cusp::column_major)
{
  // Column-at-a-time alternative, one cusp::blas::axpby per column:
  //   typedef typename Vector1::index_type IndexType;
  //   for (IndexType col = 0; col < x.num_cols; ++col) {
  //     cusp::blas::axpby(x.column(col), y.column(col), z.column(col), a, b);
  //   }
  CUSP_PROFILE_SCOPED();
  typedef typename Vector2::index_type IndexType;
  typedef typename Vector2::memory_space MemorySpace;

  const size_t BLOCK_SIZE = 32;
  const size_t COLS_PER_BLOCK = 6;
  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(
    col_axpby_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
  const size_t NUM_BLOCKS =
    std::min(MAX_BLOCKS, DIVIDE_INTO(x.num_rows, BLOCK_SIZE));
  dim3 block(BLOCK_SIZE, COLS_PER_BLOCK, 1);
  dim3 grid(NUM_BLOCKS, 1);

  col_axpby_kernel<IndexType,ValueType> <<<grid, block>>>
    (a, b, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&x.values[0]),
     thrust::raw_pointer_cast(&(y.values)[0]),
     thrust::raw_pointer_cast(&(z.values)[0]));
}

template <typename Vector1, typename Vector2, typename Vector3>
void spmm_MVdot(const Vector1& A, const Vector2& x, Vector3& y)
{
  __spmm_MVdot(A.num_cols, A, x, y, typename Vector2::orientation());
}

template <typename Vector1, typename Vector2, typename Vector3>
void spmm_dense_diag(const Vector1& A, const Vector2& x, Vector3& y)
{
  __spmm_dense_diag(A, x, y, typename Vector1::orientation());
}

template <typename ValueType, typename Vector1, typename Vector2>
void spmm_axpby(const ValueType& a, const Vector1& x, const ValueType& b,
                const Vector1& y, Vector2& z)
{
  __spmm_axpby(a, x, b, y, z, typename Vector1::orientation());
}

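// Usage sketch (the names here are illustrative, not from this file):
//   typedef cusp::array2d<double, cusp::device_memory, cusp::row_major> MV;
//   MV A(m, k), X(m, k), Z(m, k);
//   cusp::array1d<double, cusp::device_memory> d(k);
//   cusp::array1d<double, cusp::host_memory> dots(k);
//   spmm_MVdot(A, X, dots);           // dots[j] = sum_i A(i,j) * X(i,j)
//   spmm_dense_diag(A, d, Z);         // Z = A * diag(d)
//   spmm_axpby(2.0, A, -1.0, X, Z);   // Z = 2*A - X, elementwise
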