10 #ifndef STOKHOS_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
11 #define STOKHOS_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
13 #include "Kokkos_Core.hpp"
28 template<
typename ValueType,
class ExecutionSpace >
40 #elif defined(__MIC__)
49 #if defined( KOKKOS_ENABLE_CUDA )
50 std::is_same<ExecutionSpace,Kokkos::Cuda>::value;
61 typedef Kokkos::View< value_type[], execution_space >
vec_type;
64 typedef Kokkos::View< size_type[][2], Kokkos::LayoutLeft, execution_space >
coord2_array_type;
66 typedef Kokkos::View< size_type*, execution_space >
i_size_type;
67 typedef Kokkos::View< size_type*, execution_space >
num_j_type;
69 typedef Kokkos::View< size_type**, execution_space >
j_size_type;
70 typedef Kokkos::View< size_type**, execution_space >
num_k_type;
71 typedef Kokkos::View< size_type***, execution_space >
k_begin_type;
72 typedef Kokkos::View< size_type***, execution_space >
k_size_type;
74 typedef Kokkos::View< size_type****, Kokkos::LayoutRight, execution_space >
row_map_type;
75 typedef Kokkos::View< size_type****, Kokkos::LayoutRight, execution_space >
num_entry_type;
102 template <
typename coord_t>
188 KOKKOS_INLINE_FUNCTION
192 KOKKOS_INLINE_FUNCTION
196 KOKKOS_INLINE_FUNCTION
200 KOKKOS_INLINE_FUNCTION
204 KOKKOS_INLINE_FUNCTION
208 KOKKOS_INLINE_FUNCTION
212 KOKKOS_INLINE_FUNCTION
218 KOKKOS_INLINE_FUNCTION
224 KOKKOS_INLINE_FUNCTION
229 KOKKOS_INLINE_FUNCTION
236 KOKKOS_INLINE_FUNCTION
243 KOKKOS_INLINE_FUNCTION
250 KOKKOS_INLINE_FUNCTION
257 KOKKOS_INLINE_FUNCTION
264 KOKKOS_INLINE_FUNCTION
270 KOKKOS_INLINE_FUNCTION
276 KOKKOS_INLINE_FUNCTION
282 KOKKOS_INLINE_FUNCTION
287 KOKKOS_INLINE_FUNCTION
292 KOKKOS_INLINE_FUNCTION
296 KOKKOS_INLINE_FUNCTION
299 template <
typename OrdinalType>
311 typedef typename Cijk_type::i_iterator i_iterator;
312 typedef typename Cijk_type::ik_iterator ik_iterator;
313 typedef typename Cijk_type::ikj_iterator ikj_iterator;
315 const size_type i_tile_size = params.
get<OrdinalType>(
"Tile Size");
320 i_iterator i_end = Cijk.
i_end();
321 for (i_iterator i_it=i_begin; i_it!=i_end; ++i_it) {
322 OrdinalType i = index(i_it);
324 ik_iterator k_end = Cijk.
k_end(i_it);
325 for (ik_iterator k_it = k_begin; k_it != k_end; ++k_it) {
326 OrdinalType k = index(k_it);
328 ikj_iterator j_end = Cijk.
j_end(k_it);
329 for (ikj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
330 OrdinalType
j = index(j_it);
332 ValueType c = Stokhos::value(j_it);
333 Cijk_sym.add_term(i, j, k, c);
338 Cijk_sym.fillComplete();
343 size_type num_i_parts = (basis_size + i_tile_size-1) / i_tile_size;
347 for (
size_type i=0; i<num_i_parts; ++i) {
348 i_tiles[i].lower = i*its;
349 i_tiles[i].upper =
std::min(basis_size, (i+1)*its);
350 i_tiles[i].parts.
resize(1);
351 i_tiles[i].parts[0].lower = basis_size;
352 i_tiles[i].parts[0].upper = 0;
357 for (i_iterator i_it=Cijk_sym.i_begin(); i_it!=Cijk_sym.i_end(); ++i_it) {
358 OrdinalType i = index(i_it);
362 while (idx < num_i_parts && i >= i_tiles[idx].lower) ++idx;
366 ik_iterator
k_begin = Cijk_sym.k_begin(i_it);
367 ik_iterator k_end = Cijk_sym.k_end(i_it);
368 for (ik_iterator k_it = k_begin; k_it != k_end; ++k_it) {
369 OrdinalType
j = index(k_it);
371 if (j < i_tiles[idx].parts[0].lower)
372 i_tiles[idx].parts[0].lower =
j;
373 if (j > i_tiles[idx].parts[0].upper)
374 i_tiles[idx].parts[0].upper =
j;
377 for (
size_type idx=0; idx<num_i_parts; ++idx) {
378 size_type lower = i_tiles[idx].parts[0].lower;
379 size_type upper = i_tiles[idx].parts[0].upper;
381 size_type num_j_parts = (range + j_tile_size-1) / j_tile_size;
384 max_jk_tile_size =
std::max(max_jk_tile_size, jts);
387 j_tiles[
j].lower = lower +
j*jts;
388 j_tiles[
j].upper =
std::min(upper+1, lower + (
j+1)*jts);
390 j_tiles[
j].parts[0].lower = basis_size;
391 j_tiles[
j].parts[0].upper = 0;
393 i_tiles[idx].parts.
swap(j_tiles);
397 for (i_iterator i_it=Cijk_sym.i_begin(); i_it!=Cijk_sym.i_end(); ++i_it) {
398 OrdinalType i = index(i_it);
402 while (idx < num_i_parts && i >= i_tiles[idx].lower) ++idx;
406 ik_iterator
k_begin = Cijk_sym.k_begin(i_it);
407 ik_iterator k_end = Cijk_sym.k_end(i_it);
408 for (ik_iterator k_it = k_begin; k_it != k_end; ++k_it) {
409 OrdinalType
j = index(k_it);
414 while (jdx < num_j_parts && j >= i_tiles[idx].parts[jdx].lower) ++jdx;
418 ikj_iterator
j_begin = Cijk_sym.j_begin(k_it);
419 ikj_iterator j_end = Cijk_sym.j_end(k_it);
420 for (ikj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
421 OrdinalType k = index(j_it);
422 ValueType
cijk = Stokhos::value(j_it);
425 coord.
i = i; coord.
j =
j; coord.
k = k; coord.
cijk =
cijk;
426 i_tiles[idx].parts[jdx].parts[0].parts.
push_back(coord);
427 if (k < i_tiles[idx].parts[jdx].parts[0].lower)
428 i_tiles[idx].parts[jdx].parts[0].lower = k;
429 if (k > i_tiles[idx].parts[jdx].parts[0].upper)
430 i_tiles[idx].parts[jdx].parts[0].upper = k;
438 for (
size_type idx=0; idx<num_i_parts; ++idx) {
440 for (
size_type jdx=0; jdx<num_j_parts; ++jdx) {
441 size_type lower = i_tiles[idx].parts[jdx].parts[0].lower;
442 size_type upper = i_tiles[idx].parts[jdx].parts[0].upper;
444 size_type num_k_parts = (range + j_tile_size-1) / j_tile_size;
447 max_jk_tile_size =
std::max(max_jk_tile_size, kts);
449 for (
size_type k=0; k<num_k_parts; ++k) {
450 k_tiles[k].lower = lower + k*kts;
451 k_tiles[k].upper =
std::min(upper+1, lower + (k+1)*kts);
453 size_type num_k = i_tiles[idx].parts[jdx].parts[0].parts.
size();
455 size_type i = i_tiles[idx].parts[jdx].parts[0].parts[l].i;
456 size_type j = i_tiles[idx].parts[jdx].parts[0].parts[l].j;
457 size_type k = i_tiles[idx].parts[jdx].parts[0].parts[l].k;
462 while (kdx < num_k_parts && k >= k_tiles[kdx].lower) ++kdx;
467 coord.
i = i; coord.
j =
j; coord.
k = k; coord.
cijk =
cijk;
470 if (j != k) ++num_coord;
475 for (
size_type k=0; k<num_k_parts; ++k) {
476 if (k_tiles[k].parts.
size() > 0)
479 i_tiles[idx].parts[jdx].parts.
swap(k_tiles2);
486 size_type max_num_j_parts = 0, max_num_k_parts = 0;
488 for (
size_type idx=0; idx<num_i_parts; ++idx) {
490 max_num_j_parts =
std::max(max_num_j_parts, num_j_parts);
491 coord_work[idx].
resize(num_j_parts);
492 for (
size_type jdx=0; jdx<num_j_parts; ++jdx) {
493 size_type num_k_parts = i_tiles[idx].parts[jdx].parts.
size();
494 max_num_k_parts =
std::max(max_num_k_parts, num_k_parts);
495 coord_work[idx][jdx].
resize(num_k_parts);
496 for (
size_type kdx=0; kdx<num_k_parts; ++kdx) {
497 size_type num_rows = i_tiles[idx].upper - i_tiles[idx].lower + 1;
498 total_num_rows += num_rows;
499 max_num_rows =
std::max(max_num_rows, num_rows);
500 coord_work[idx][jdx][kdx].
resize(num_rows, 0);
502 size_type nc = i_tiles[idx].parts[jdx].parts[kdx].parts.
size();
504 size_type i = i_tiles[idx].parts[jdx].parts[kdx].parts[c].i;
506 ++(coord_work[idx][jdx][kdx][i-
i_begin]);
514 for (
size_type idx=0; idx<num_i_parts; ++idx) {
516 for (
size_type jdx=0; jdx<num_j_parts; ++jdx) {
517 size_type num_k_parts = i_tiles[idx].parts[jdx].parts.
size();
518 for (
size_type kdx=0; kdx<num_k_parts; ++kdx) {
521 const size_t rem = coord_work[idx][jdx][kdx][i] %
tensor_align;
524 coord_work[idx][jdx][kdx][i] += pad;
548 max_num_j_parts, max_num_k_parts,
551 max_num_j_parts, max_num_k_parts,
559 typename value_array_type::HostMirror host_value =
561 typename coord_array_type::HostMirror host_coord =
563 typename coord2_array_type::HostMirror host_coord2 =
565 typename i_begin_type::HostMirror host_i_begin =
567 typename i_size_type::HostMirror host_i_size =
569 typename num_j_type::HostMirror host_num_j =
571 typename j_begin_type::HostMirror host_j_begin =
573 typename j_size_type::HostMirror host_j_size =
575 typename num_k_type::HostMirror host_num_k =
577 typename k_begin_type::HostMirror host_k_begin =
579 typename k_size_type::HostMirror host_k_size =
581 typename row_map_type::HostMirror host_row_map =
583 typename num_entry_type::HostMirror host_num_entry =
588 for (
size_type idx=0; idx<num_i_parts; ++idx) {
590 for (
size_type jdx=0; jdx<num_j_parts; ++jdx) {
591 size_type num_k_parts = i_tiles[idx].parts[jdx].parts.
size();
592 for (
size_type kdx=0; kdx<num_k_parts; ++kdx) {
594 host_row_map(idx,jdx,kdx,0) =
sum;
596 sum += coord_work[idx][jdx][kdx][t];
597 host_row_map(idx,jdx,kdx,t+1) =
sum;
598 host_num_entry(idx,jdx,kdx,t) = 0;
605 for (
size_type idx=0; idx<num_i_parts; ++idx) {
607 for (
size_type jdx=0; jdx<num_j_parts; ++jdx) {
608 size_type num_k_parts = i_tiles[idx].parts[jdx].parts.
size();
609 for (
size_type kdx=0; kdx<num_k_parts; ++kdx) {
612 coord_work[idx][jdx][kdx][t] = host_row_map(idx,jdx,kdx,t);
619 for (
size_type idx=0; idx<num_i_parts; ++idx) {
620 host_i_begin(idx) = i_tiles[idx].lower;
621 host_i_size(idx) = i_tiles[idx].upper - i_tiles[idx].lower;
624 host_num_j(idx) = num_j_parts;
625 for (
size_type jdx=0; jdx<num_j_parts; ++jdx) {
626 host_j_begin(idx,jdx) = i_tiles[idx].parts[jdx].lower;
627 host_j_size(idx,jdx) = i_tiles[idx].parts[jdx].upper -
628 i_tiles[idx].parts[jdx].lower;
630 size_type num_k_parts = i_tiles[idx].parts[jdx].parts.
size();
631 host_num_k(idx,jdx) = num_k_parts;
632 for (
size_type kdx=0; kdx<num_k_parts; ++kdx) {
633 host_k_begin(idx,jdx,kdx) = i_tiles[idx].parts[jdx].parts[kdx].lower;
634 host_k_size(idx,jdx,kdx) = i_tiles[idx].parts[jdx].parts[kdx].upper -
635 i_tiles[idx].parts[jdx].parts[kdx].lower;
638 size_type nc = i_tiles[idx].parts[jdx].parts[kdx].parts.
size();
640 Coord s = i_tiles[idx].parts[jdx].parts[kdx].parts[t];
646 const size_type row = i - host_i_begin(idx);
647 const size_type n = coord_work[idx][jdx][kdx][row];
648 ++coord_work[idx][jdx][kdx][row];
650 host_value(n) = (j != k) ? c : 0.5*c;
651 host_coord2(n,0) = j - host_j_begin(idx,jdx);
652 host_coord2(n,1) = k - host_k_begin(idx,jdx,kdx);
653 host_coord(n) = (host_coord2(n,1) << 16) | host_coord2(n,0);
655 ++host_num_entry(idx,jdx,kdx,row);
678 for (
size_type idx=0; idx<num_i_parts; ++idx) {
680 for (
size_type jdx=0; jdx<num_j_parts; ++jdx) {
681 size_type num_k_parts = i_tiles[idx].parts[jdx].parts.
size();
682 for (
size_type kdx=0; kdx<num_k_parts; ++kdx) {
683 for (
size_type i = 0; i < host_i_size(idx); ++i) {
684 tensor.
m_flops += 5*host_num_entry(idx,jdx,kdx,i) + 1;
694 template<
class Device,
typename OrdinalType,
typename ValueType >
695 SimpleTiledCrsProductTensor<ValueType, Device>
702 basis, Cijk, params);
705 template <
typename ValueType,
typename Device >
713 template<
typename MatrixValue ,
typename VectorValue >
714 KOKKOS_INLINE_FUNCTION
716 const MatrixValue *
const a ,
717 const VectorValue *
const x ,
718 VectorValue *
const y )
724 for (
size_type i_tile = 0; i_tile<n_i_tile; ++i_tile) {
729 for (
size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
734 for (
size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
741 tensor.
num_entry(i_tile,j_tile,k_tile,i);
744 const size_type iEntryEnd = iEntryBeg + nEntry;
747 VectorValue ytmp = 0 ;
750 if (block_size > 1) {
751 const size_type nBlock = nEntry / block_size;
752 const size_type nEntryB = nBlock * block_size;
753 const size_type iEnd = iEntryBeg + nEntryB;
757 int j[block_size], k[block_size];
759 for ( ; iEntry < iEnd ; iEntry += block_size ) {
761 for (
size_type ii=0; ii<block_size; ++ii) {
762 j[ii] = tensor.
coord(iEntry+ii,0) + j_begin;
763 k[ii] = tensor.
coord(iEntry+ii,1) + k_begin;
765 TV aj(a, j), ak(a, k), xj(x, j), xk(x, k),
766 c(&(tensor.
value(iEntry)));
781 for ( ; iEntry<iEntryEnd; ++iEntry) {
785 ytmp += tensor.
value(iEntry) * ( a[
j] * x[k] + a[k] * x[
j] );
788 y[i+i_begin] += ytmp;
796 KOKKOS_INLINE_FUNCTION
800 KOKKOS_INLINE_FUNCTION
KOKKOS_INLINE_FUNCTION size_type entry_end(const size_type i, const size_type j, const size_type k, const size_type r) const
End entries for tile (i,j,k) and row r.
size_type m_max_i_tile_size
static KOKKOS_INLINE_FUNCTION size_type matrix_size(const tensor_type &tensor)
KOKKOS_INLINE_FUNCTION const size_type & coord(const size_type entry, const size_type c) const
Coordinates of an entry.
KOKKOS_INLINE_FUNCTION size_type num_flops() const
Number flop's per multiply-add.
k_iterator k_begin() const
Iterator pointing to first k entry.
SimpleTiledCrsProductTensor(const SimpleTiledCrsProductTensor &rhs)
Data structure storing a sparse 3-tensor C(i,j,k) in a a compressed format.
Device::size_type size_type
Kokkos::View< size_type[][2], Kokkos::LayoutLeft, execution_space > coord2_array_type
KOKKOS_INLINE_FUNCTION const size_type & coord(const size_type entry) const
Coordinates of an entry.
T & get(ParameterList &l, const std::string &name)
Kokkos::View< size_type **, execution_space > j_begin_type
kj_iterator j_begin(const k_iterator &k) const
Iterator pointing to first j entry for given k.
static const size_type tensor_align
KOKKOS_INLINE_FUNCTION size_type num_i_tiles() const
Number i-tiles.
KOKKOS_INLINE_FUNCTION size_type k_size(const size_type i, const size_type j, const size_type k) const
Number of entries with for tile 'i,j'.
KOKKOS_INLINE_FUNCTION size_type entry_count() const
Number of sparse entries.
Kokkos::View< value_type[], execution_space > value_array_type
Kokkos::View< value_type[], execution_space > vec_type
Teuchos::Array< coord_t > parts
KOKKOS_INLINE_FUNCTION size_type j_size(const size_type i, const size_type j) const
Number of entries with for tile 'i,j'.
Kokkos::View< size_type **, execution_space > j_size_type
static const bool is_cuda
SimpleTiledCrsProductTensor & operator=(const SimpleTiledCrsProductTensor &rhs)
kj_iterator j_end(const k_iterator &k) const
Iterator pointing to last j entry for given k.
Kokkos::View< size_type ***, execution_space > k_size_type
KOKKOS_INLINE_FUNCTION size_type max_i_tile_size() const
Max size of any i tile.
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
static KOKKOS_INLINE_FUNCTION size_type vector_size(const tensor_type &tensor)
Kokkos::View< size_type *, execution_space > i_size_type
~SimpleTiledCrsProductTensor()
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
num_entry_type m_num_entry
static KOKKOS_INLINE_FUNCTION void apply(const tensor_type &tensor, const MatrixValue *const a, const VectorValue *const x, VectorValue *const y)
size_type m_max_jk_tile_size
Kokkos::View< size_type ***, execution_space > k_begin_type
KOKKOS_INLINE_FUNCTION PCE< Storage > max(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
static const size_type vectorsize
Kokkos::View< size_type *, execution_space > i_begin_type
KOKKOS_INLINE_FUNCTION size_type num_k_tiles(const size_type i, const size_type j) const
Number k-tiles.
static SimpleTiledCrsProductTensor create(const Stokhos::ProductBasis< OrdinalType, ValueType > &basis, const Stokhos::Sparse3Tensor< OrdinalType, ValueType > &Cijk, const Teuchos::ParameterList ¶ms)
Kokkos::View< size_type[], execution_space > coord_array_type
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
SimpleTiledCrsProductTensor< ValueType, Device > create_simple_tiled_product_tensor(const Stokhos::ProductBasis< OrdinalType, ValueType > &basis, const Stokhos::Sparse3Tensor< OrdinalType, ValueType > &Cijk, const Teuchos::ParameterList ¶ms)
ExecutionSpace execution_space
friend friend void swap(Array< T2 > &a1, Array< T2 > &a2)
void resize(size_type new_size, const value_type &x=value_type())
KOKKOS_INLINE_FUNCTION size_type entry_begin(const size_type i, const size_type j, const size_type k, const size_type r) const
Begin entries for tile (i,j,k) and row r.
k_iterator k_end() const
Iterator pointing to last k entry.
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< view_type >::value, typename CijkType< view_type >::type >::type cijk(const view_type &view)
Stokhos::Sparse3Tensor< int, double > Cijk_type
KOKKOS_INLINE_FUNCTION size_type j_begin(const size_type i, const size_type j) const
Begin entries with for tile 'i,j'.
KOKKOS_INLINE_FUNCTION size_type i_begin(const size_type i) const
Begin entries with for tile 'i'.
Kokkos::View< size_type **, execution_space > num_k_type
void push_back(const value_type &x)
Kokkos::View< size_type *, execution_space > num_j_type
KOKKOS_INLINE_FUNCTION size_type num_j_tiles(const size_type i) const
Number j-tiles.
static const size_type cuda_vectorsize
SimpleTiledCrsProductTensor< ValueType, Device > tensor_type
KOKKOS_INLINE_FUNCTION const value_type & value(const size_type entry) const
Value of an entry.
static const size_type host_vectorsize
SimpleTiledCrsProductTensor()
coord2_array_type m_coord2
#define TEUCHOS_ASSERT(assertion_test)
KOKKOS_INLINE_FUNCTION size_type max_jk_tile_size() const
Max size of any j/k tile.
KOKKOS_INLINE_FUNCTION size_type num_entry(const size_type i, const size_type j, const size_type k, const size_type r) const
Number of entries for tile (i,j,k) and row r.
kji_iterator i_begin(const kj_iterator &j) const
Iterator pointing to first i entry for given j and k.
Kokkos::View< size_type ****, Kokkos::LayoutRight, execution_space > num_entry_type
KOKKOS_INLINE_FUNCTION size_type num_non_zeros() const
Number of non-zero's.
kji_iterator i_end(const kj_iterator &j) const
Iterator pointing to last i entry for given j and k.
KOKKOS_INLINE_FUNCTION size_type k_begin(const size_type i, const size_type j, const size_type k) const
Begin entries with for tile 'i,j,k'.
KOKKOS_INLINE_FUNCTION size_type i_size(const size_type i) const
Number of entries with for tile 'i'.
std::enable_if< Kokkos::is_view_uq_pce< Kokkos::View< RD, RP...> >::value &&Kokkos::is_view_uq_pce< Kokkos::View< XD, XP...> >::value >::type sum(const Kokkos::View< RD, RP...> &r, const Kokkos::View< XD, XP...> &x)
Kokkos::View< size_type ****, Kokkos::LayoutRight, execution_space > row_map_type
virtual ordinal_type size() const =0
Return total size of basis.
static const bool use_intrinsics
KOKKOS_INLINE_FUNCTION size_type dimension() const
Dimension of the tensor.
ordinal_type num_entries() const
Return number of non-zero entries.
Stokhos::CrsMatrix< ValueType, Device, Layout >::HostMirror create_mirror_view(const Stokhos::CrsMatrix< ValueType, Device, Layout > &A)