42 #ifndef STOKHOS_TILED_CRS_PRODUCT_TENSOR_HPP
43 #define STOKHOS_TILED_CRS_PRODUCT_TENSOR_HPP
45 #include "Kokkos_Core.hpp"
60 template<
typename ValueType,
class ExecutionSpace >
72 #elif defined(__MIC__)
81 #if defined( KOKKOS_ENABLE_CUDA )
82 std::is_same<ExecutionSpace,Kokkos::Cuda>::value;
94 typedef Kokkos::View< value_type[], execution_space >
vec_type;
96 typedef Kokkos::View< size_type[][2], Kokkos::LayoutLeft, execution_space >
coord2_array_type;
179 KOKKOS_INLINE_FUNCTION
183 KOKKOS_INLINE_FUNCTION
188 KOKKOS_INLINE_FUNCTION
193 KOKKOS_INLINE_FUNCTION
198 KOKKOS_INLINE_FUNCTION
203 KOKKOS_INLINE_FUNCTION
208 KOKKOS_INLINE_FUNCTION
213 KOKKOS_INLINE_FUNCTION
218 KOKKOS_INLINE_FUNCTION
223 KOKKOS_INLINE_FUNCTION
228 KOKKOS_INLINE_FUNCTION
233 KOKKOS_INLINE_FUNCTION
238 KOKKOS_INLINE_FUNCTION
243 KOKKOS_INLINE_FUNCTION
248 KOKKOS_INLINE_FUNCTION
253 KOKKOS_INLINE_FUNCTION
258 KOKKOS_INLINE_FUNCTION
263 KOKKOS_INLINE_FUNCTION
267 template <
typename OrdinalType>
276 const size_type max_tiles = params.
get<
int>(
"Max Tiles");
284 typedef typename rcb_type::Box box_type;
285 rcb_type rcb(tile_size, max_tiles, coordinate_list());
288 size_type num_parts = rcb.get_num_parts();
293 for (
size_type part=0; part<num_parts; ++part) {
298 coord_work[part].
resize(num_rows, 0);
302 size_type i = box->coords[c](0) - box->xmin;
303 ++(coord_work[part][i]);
309 for (
size_type part=0; part<num_parts; ++part) {
315 coord_work[part][i] += pad;
344 typename coord_array_type::HostMirror host_coord =
346 typename coord2_array_type::HostMirror host_coord2 =
348 typename coord_offset_type::HostMirror host_coord_offset =
350 typename coord_range_type::HostMirror host_coord_range =
352 typename value_array_type::HostMirror host_value =
354 typename entry_array_type::HostMirror host_num_entry =
356 typename row_map_array_type::HostMirror host_row_map =
358 typename num_row_array_type::HostMirror host_num_rows =
363 for (
size_type part=0; part<num_parts; ++part) {
365 host_row_map(part,0) =
sum;
367 sum += coord_work[part][t];
368 host_row_map(part,t+1) =
sum;
373 for (
size_type part=0; part<num_parts; ++part) {
376 coord_work[part][t] = host_row_map(part,t);
381 for (
size_type part=0; part<num_parts; ++part) {
384 host_coord_offset(part,0) = box->xmin;
385 host_coord_offset(part,1) = box->ymin;
386 host_coord_offset(part,2) = box->zmin;
388 host_coord_range(part,0) = box->delta_x;
389 host_coord_range(part,1) = box->delta_y;
390 host_coord_range(part,2) = box->delta_z;
392 host_num_rows(part) = coord_work[part].
size();
403 ++coord_work[part][row];
406 host_coord2(n,0) = j - box->ymin;
407 host_coord2(n,1) = k - box->zmin;
408 host_coord(n) = ( host_coord2(n,1) << 16 ) | host_coord2(n,0);
410 ++host_num_entry(part,row);
427 for (
size_type part=0; part<num_parts; ++part) {
428 for (
size_type i = 0; i < host_num_rows(part); ++i ) {
430 host_num_entry(part,i) );
431 tensor.
m_flops += 5*host_num_entry(part,i) + 1;
439 template<
class Device,
typename OrdinalType,
typename ValueType >
440 TiledCrsProductTensor<ValueType, Device>
447 basis, Cijk, params );
450 template <
typename ValueType,
typename Device >
458 template<
typename MatrixValue ,
typename VectorValue >
459 KOKKOS_INLINE_FUNCTION
461 const MatrixValue *
const a ,
462 const VectorValue *
const x ,
463 VectorValue *
const y )
470 for (
size_type tile = 0 ; tile < n_tile ; ++tile ) {
478 for (
size_type i = 0 ; i < n_row ; ++i ) {
482 const size_type iEntryEnd = iEntryBeg + nEntry;
485 VectorValue ytmp = 0 ;
488 if (block_size > 1) {
489 const size_type nBlock = nEntry / block_size;
490 const size_type nEntryB = nBlock * block_size;
491 const size_type iEnd = iEntryBeg + nEntryB;
495 int j[block_size], k[block_size];
497 for ( ; iEntry < iEnd ; iEntry += block_size ) {
499 for (
size_type ii=0; ii<block_size; ++ii) {
500 j[ii] = tensor.
coord(iEntry+ii,0) + j_offset;
501 k[ii] = tensor.
coord(iEntry+ii,1) + k_offset;
503 TV aj(a, j), ak(a, k), xj(x, j), xk(x, k),
504 c(&(tensor.
value(iEntry)));
519 for ( ; iEntry<iEntryEnd; ++iEntry) {
523 ytmp += tensor.
value(iEntry) * ( a[
j] * x[k] + a[k] * x[
j] );
526 y[i+i_offset] += ytmp ;
532 KOKKOS_INLINE_FUNCTION
536 KOKKOS_INLINE_FUNCTION
coord_range_type m_coord_range
static const size_type host_vectorsize
KOKKOS_INLINE_FUNCTION size_type num_tiles() const
Number tiles.
TiledCrsProductTensor< ValueType, Device > tensor_type
static const bool use_intrinsics
KOKKOS_INLINE_FUNCTION const size_type & range(const size_type entry, const size_type c) const
Coordinate range.
Data structure storing a sparse 3-tensor C(i,j,k) in a a compressed format.
TiledCrsProductTensor & operator=(const TiledCrsProductTensor &rhs)
T & get(ParameterList &l, const std::string &name)
KOKKOS_INLINE_FUNCTION size_type dimension() const
Dimension of the tensor.
KOKKOS_INLINE_FUNCTION const size_type & num_entry(size_type tile, size_type i) const
Number of entries with a coordinate 'i'.
KOKKOS_INLINE_FUNCTION size_type tile_size() const
Number tiles.
KOKKOS_INLINE_FUNCTION size_type entry_count() const
Number of sparse entries.
Kokkos::View< size_type[][3], execution_space > coord_range_type
static KOKKOS_INLINE_FUNCTION size_type vector_size(const tensor_type &tensor)
KOKKOS_INLINE_FUNCTION size_type entry_maximum() const
Maximum sparse entries for any coordinate.
TiledCrsProductTensor< ValueType, Device > create_tiled_product_tensor(const Stokhos::ProductBasis< OrdinalType, ValueType > &basis, const Stokhos::Sparse3Tensor< OrdinalType, ValueType > &Cijk, const Teuchos::ParameterList ¶ms)
Kokkos::View< value_type[], execution_space > vec_type
Kokkos::View< size_type[][2], Kokkos::LayoutLeft, execution_space > coord2_array_type
static TiledCrsProductTensor create(const Stokhos::ProductBasis< OrdinalType, ValueType > &basis, const Stokhos::Sparse3Tensor< OrdinalType, ValueType > &Cijk, const Teuchos::ParameterList ¶ms)
coord2_array_type m_coord2
KOKKOS_INLINE_FUNCTION PCE< Storage > max(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
static const size_type vectorsize
Kokkos::View< size_type[][3], execution_space > coord_offset_type
KOKKOS_INLINE_FUNCTION const size_type & coord(const size_type entry) const
Coordinates of an entry.
Kokkos::View< size_type[], execution_space > coord_array_type
KOKKOS_INLINE_FUNCTION const size_type * row_map_ptr() const
Return row_map ptr.
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
TiledCrsProductTensor(const TiledCrsProductTensor &rhs)
void resize(size_type new_size, const value_type &x=value_type())
Kokkos::View< value_type[], execution_space > value_array_type
KOKKOS_INLINE_FUNCTION const size_type & offset(const size_type entry, const size_type c) const
Coordinate offset.
coord_offset_type m_coord_offset
Kokkos::View< size_type **, layout_type, execution_space > row_map_array_type
KOKKOS_INLINE_FUNCTION size_type num_non_zeros() const
Number of non-zero's.
Device::size_type size_type
static KOKKOS_INLINE_FUNCTION size_type matrix_size(const tensor_type &tensor)
entry_array_type m_num_entry
Kokkos::LayoutRight layout_type
static const bool is_cuda
KOKKOS_INLINE_FUNCTION size_type entry_end(size_type tile, size_type i) const
End entries with a coordinate 'i'.
KOKKOS_INLINE_FUNCTION size_type num_flops() const
Number flop's per multiply-add.
KOKKOS_INLINE_FUNCTION const value_type & value(const size_type entry) const
Value of an entry.
KOKKOS_INLINE_FUNCTION size_type num_rows(size_type tile) const
Number of rows in given tile.
KOKKOS_INLINE_FUNCTION const size_type & entry_begin(size_type tile, size_type i) const
Begin entries with a coordinate 'i'.
Kokkos::View< size_type[], execution_space > num_row_array_type
Kokkos::View< size_type **, layout_type, execution_space > entry_array_type
Teuchos::ArrayRCP< CijkData< ordinal_type, scalar_type > > build_cijk_coordinate_list(const Sparse3Tensor< ordinal_type, scalar_type > &Cijk, CijkSymmetryType symmetry_type)
num_row_array_type m_num_rows
static const size_type cuda_vectorsize
KOKKOS_INLINE_FUNCTION size_type max_num_rows() const
Maximum number of rows in any tile.
static KOKKOS_INLINE_FUNCTION void apply(const tensor_type &tensor, const MatrixValue *const a, const VectorValue *const x, VectorValue *const y)
std::enable_if< Kokkos::is_view_uq_pce< Kokkos::View< RD, RP...> >::value &&Kokkos::is_view_uq_pce< Kokkos::View< XD, XP...> >::value >::type sum(const Kokkos::View< RD, RP...> &r, const Kokkos::View< XD, XP...> &x)
virtual ordinal_type size() const =0
Return total size of basis.
KOKKOS_INLINE_FUNCTION const size_type & coord(const size_type entry, const size_type c) const
Coordinates of an entry.
static const size_type tensor_align
ExecutionSpace execution_space
Stokhos::CrsMatrix< ValueType, Device, Layout >::HostMirror create_mirror_view(const Stokhos::CrsMatrix< ValueType, Device, Layout > &A)
row_map_array_type m_row_map