Stokhos_SimpleTiledCrsProductTensor.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef STOKHOS_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
11 #define STOKHOS_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
12 
13 #include "Kokkos_Core.hpp"
14 
15 #include "Stokhos_Multiply.hpp"
16 #include "Stokhos_ProductBasis.hpp"
17 #include "Stokhos_Sparse3Tensor.hpp"
18 #include "Teuchos_ParameterList.hpp"
19 #include "Teuchos_Array.hpp"
20 #include "Stokhos_TinyVec.hpp"
21 
22 
23 //----------------------------------------------------------------------------
24 //----------------------------------------------------------------------------
25 
26 namespace Stokhos {
27 
28 template< typename ValueType, class ExecutionSpace >
29 class SimpleTiledCrsProductTensor {
30 public:
31 
32  typedef ExecutionSpace execution_space;
33  typedef int size_type;
34  typedef ValueType value_type;
35 
36 // Vectorsize used in multiply algorithm
37 #if defined(__AVX__)
38  static const size_type host_vectorsize = 32/sizeof(value_type);
39  static const bool use_intrinsics = true;
40 #elif defined(__MIC__)
41  static const size_type host_vectorsize = 16;
42  static const bool use_intrinsics = true;
43 #else
44  static const size_type host_vectorsize = 2;
45  static const bool use_intrinsics = false;
46 #endif
47  static const size_type cuda_vectorsize = 32;
48  static const bool is_cuda =
49 #if defined( KOKKOS_ENABLE_CUDA )
50  std::is_same<ExecutionSpace,Kokkos::Cuda>::value;
51 #else
52  false ;
53 #endif
54  static const size_type vectorsize = is_cuda ? cuda_vectorsize : host_vectorsize;
55 
56  // Alignment in terms of number of entries of CRS rows
57  static const size_type tensor_align = vectorsize;
58 
59 private:
60 
61  typedef Kokkos::View< value_type[], execution_space > vec_type;
62  typedef Kokkos::View< value_type[], execution_space > value_array_type;
63  typedef Kokkos::View< size_type[], execution_space > coord_array_type;
64  typedef Kokkos::View< size_type[][2], Kokkos::LayoutLeft, execution_space > coord2_array_type;
65  typedef Kokkos::View< size_type*, execution_space > i_begin_type;
66  typedef Kokkos::View< size_type*, execution_space > i_size_type;
67  typedef Kokkos::View< size_type*, execution_space > num_j_type;
68  typedef Kokkos::View< size_type**, execution_space > j_begin_type;
69  typedef Kokkos::View< size_type**, execution_space > j_size_type;
70  typedef Kokkos::View< size_type**, execution_space > num_k_type;
71  typedef Kokkos::View< size_type***, execution_space > k_begin_type;
72  typedef Kokkos::View< size_type***, execution_space > k_size_type;
73 
74  typedef Kokkos::View< size_type****, Kokkos::LayoutRight, execution_space > row_map_type;
75  typedef Kokkos::View< size_type****, Kokkos::LayoutRight, execution_space > num_entry_type;
76 
77  value_array_type m_value;
78  coord_array_type m_coord;
79  coord2_array_type m_coord2;
80  i_begin_type m_i_begin;
81  i_size_type m_i_size;
82  num_j_type m_num_j;
83  j_begin_type m_j_begin;
84  j_size_type m_j_size;
85  num_k_type m_num_k;
86  k_begin_type m_k_begin;
87  k_size_type m_k_size;
88  row_map_type m_row_map;
89  num_entry_type m_num_entry;
90  size_type m_dimension;
91  size_type m_max_i_tile_size;
92  size_type m_max_jk_tile_size;
93  size_type m_num_i;
94  size_type m_nnz;
95  size_type m_flops;
96 
97  struct Coord {
98  size_type i, j, k;
99  value_type cijk;
100  };
101 
102  template <typename coord_t>
103  struct Tile {
104  size_type lower, upper;
105  Teuchos::Array<coord_t> parts;
106  };
107 
108  typedef Tile<Coord> KTile;
109  typedef Tile<KTile> JTile;
110  typedef Tile<JTile> ITile;
111 
112 public:
113 
114  inline
115  ~SimpleTiledCrsProductTensor() {}
116 
117  inline
118  SimpleTiledCrsProductTensor() :
119  m_value(),
120  m_coord(),
121  m_coord2(),
122  m_i_begin(),
123  m_i_size(),
124  m_num_j(),
125  m_j_begin(),
126  m_j_size(),
127  m_num_k(),
128  m_k_begin(),
129  m_k_size(),
130  m_row_map(),
131  m_num_entry(),
132  m_dimension(0),
133  m_max_i_tile_size(0),
134  m_max_jk_tile_size(0),
135  m_num_i(0),
136  m_nnz(0),
137  m_flops(0) {}
138 
139  inline
140  SimpleTiledCrsProductTensor(const SimpleTiledCrsProductTensor & rhs) :
141  m_value(rhs.m_value),
142  m_coord(rhs.m_coord),
143  m_coord2(rhs.m_coord2),
144  m_i_begin(rhs.m_i_begin),
145  m_i_size(rhs.m_i_size),
146  m_num_j(rhs.m_num_j),
147  m_j_begin(rhs.m_j_begin),
148  m_j_size(rhs.m_j_size),
149  m_num_k(rhs.m_num_k),
150  m_k_begin(rhs.m_k_begin),
151  m_k_size(rhs.m_k_size),
152  m_row_map(rhs.m_row_map),
153  m_num_entry(rhs.m_num_entry),
154  m_dimension(rhs.m_dimension),
155  m_max_i_tile_size(rhs.m_max_i_tile_size),
156  m_max_jk_tile_size(rhs.m_max_jk_tile_size),
157  m_num_i(rhs.m_num_i),
158  m_nnz(rhs.m_nnz),
159  m_flops(rhs.m_flops) {}
160 
161  inline
162  SimpleTiledCrsProductTensor & operator=(
163  const SimpleTiledCrsProductTensor & rhs)
164  {
165  m_value = rhs.m_value;
166  m_coord = rhs.m_coord;
167  m_coord2 = rhs.m_coord2;
168  m_i_begin = rhs.m_i_begin;
169  m_i_size = rhs.m_i_size;
170  m_num_j = rhs.m_num_j;
171  m_j_begin = rhs.m_j_begin;
172  m_j_size = rhs.m_j_size;
173  m_num_k = rhs.m_num_k;
174  m_k_begin = rhs.m_k_begin;
175  m_k_size = rhs.m_k_size;
176  m_row_map = rhs.m_row_map;
177  m_num_entry = rhs.m_num_entry;
178  m_dimension = rhs.m_dimension;
179  m_max_i_tile_size = rhs.m_max_i_tile_size;
180  m_max_jk_tile_size = rhs.m_max_jk_tile_size;
181  m_num_i = rhs.m_num_i;
182  m_nnz = rhs.m_nnz;
183  m_flops = rhs.m_flops;
184  return *this;
185  }
186 
187  //! Dimension of the tensor
188  KOKKOS_INLINE_FUNCTION
189  size_type dimension() const { return m_dimension; }
190 
191  //! Number of sparse entries
192  KOKKOS_INLINE_FUNCTION
193  size_type entry_count() const { return m_coord.extent(0); }
194 
195  //! Number of i-tiles
196  KOKKOS_INLINE_FUNCTION
197  size_type num_i_tiles() const { return m_num_i; }
198 
199  //! Begin entries for tile 'i'
200  KOKKOS_INLINE_FUNCTION
201  size_type i_begin(const size_type i) const { return m_i_begin(i); }
202 
203  //! Number of entries for tile 'i'
204  KOKKOS_INLINE_FUNCTION
205  size_type i_size(const size_type i) const { return m_i_size(i); }
206 
207  //! Number of j-tiles for tile 'i'
208  KOKKOS_INLINE_FUNCTION
209  size_type num_j_tiles(const size_type i) const { return m_num_j(i); }
210 
211  //! Begin entries for tile 'i,j'
212  KOKKOS_INLINE_FUNCTION
213  size_type j_begin(const size_type i, const size_type j) const {
214  return m_j_begin(i,j);
215  }
216 
217  //! Number of entries for tile 'i,j'
218  KOKKOS_INLINE_FUNCTION
219  size_type j_size(const size_type i, const size_type j) const {
220  return m_j_size(i,j);
221  }
222 
223  //! Number of k-tiles for tile 'i,j'
224  KOKKOS_INLINE_FUNCTION
225  size_type num_k_tiles(const size_type i, const size_type j) const {
226  return m_num_k(i,j); }
227 
228  //! Begin entries for tile 'i,j,k'
229  KOKKOS_INLINE_FUNCTION
230  size_type k_begin(const size_type i, const size_type j,
231  const size_type k) const {
232  return m_k_begin(i,j,k);
233  }
234 
235  //! Number of entries for tile 'i,j,k'
236  KOKKOS_INLINE_FUNCTION
237  size_type k_size(const size_type i, const size_type j,
238  const size_type k) const {
239  return m_k_size(i,j,k);
240  }
241 
242  //! Number of entries for tile (i,j,k) and row r
243  KOKKOS_INLINE_FUNCTION
244  size_type num_entry(const size_type i, const size_type j,
245  const size_type k, const size_type r) const {
246  return m_num_entry(i,j,k,r);
247  }
248 
249  //! Begin entries for tile (i,j,k) and row r
250  KOKKOS_INLINE_FUNCTION
251  size_type entry_begin(const size_type i, const size_type j,
252  const size_type k, const size_type r) const {
253  return m_row_map(i,j,k,r);
254  }
255 
256  //! End entries for tile (i,j,k) and row r
257  KOKKOS_INLINE_FUNCTION
258  size_type entry_end(const size_type i, const size_type j,
259  const size_type k, const size_type r) const {
260  return m_row_map(i,j,k,r) + m_num_entry(i,j,k,r);
261  }
262 
263  //! Coordinates of an entry
264  KOKKOS_INLINE_FUNCTION
265  const size_type& coord(const size_type entry, const size_type c) const {
266  return m_coord2(entry, c);
267  }
268 
269  //! Packed coordinates of an entry
270  KOKKOS_INLINE_FUNCTION
271  const size_type& coord(const size_type entry) const {
272  return m_coord(entry);
273  }
274 
275  //! Value of an entry
276  KOKKOS_INLINE_FUNCTION
277  const value_type & value(const size_type entry) const {
278  return m_value(entry);
279  }
280 
281  //! Number of non-zeros
282  KOKKOS_INLINE_FUNCTION
283  size_type num_non_zeros() const
284  { return m_nnz; }
285 
286  //! Number of flops per multiply-add
287  KOKKOS_INLINE_FUNCTION
288  size_type num_flops() const
289  { return m_flops; }
290 
291  //! Max size of any i tile
292  KOKKOS_INLINE_FUNCTION
293  size_type max_i_tile_size() const { return m_max_i_tile_size; }
294 
295  //! Max size of any j/k tile
296  KOKKOS_INLINE_FUNCTION
297  size_type max_jk_tile_size() const { return m_max_jk_tile_size; }
298 
299  template <typename OrdinalType>
300  static SimpleTiledCrsProductTensor
301  create(const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
302  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
303  const Teuchos::ParameterList& params)
304  {
305  using Teuchos::rcp;
306  using Teuchos::RCP;
308  using Teuchos::Array;
309 
310  typedef Stokhos::Sparse3Tensor<OrdinalType,ValueType> Cijk_type;
311  typedef typename Cijk_type::i_iterator i_iterator;
312  typedef typename Cijk_type::ik_iterator ik_iterator;
313  typedef typename Cijk_type::ikj_iterator ikj_iterator;
314 
315  const size_type i_tile_size = params.get<OrdinalType>("Tile Size");
316 
317  // Build 2-way symmetric Cijk tensor
318  Cijk_type Cijk_sym;
319  i_iterator i_begin = Cijk.i_begin();
320  i_iterator i_end = Cijk.i_end();
321  for (i_iterator i_it=i_begin; i_it!=i_end; ++i_it) {
322  OrdinalType i = index(i_it);
323  ik_iterator k_begin = Cijk.k_begin(i_it);
324  ik_iterator k_end = Cijk.k_end(i_it);
325  for (ik_iterator k_it = k_begin; k_it != k_end; ++k_it) {
326  OrdinalType k = index(k_it);
327  ikj_iterator j_begin = Cijk.j_begin(k_it);
328  ikj_iterator j_end = Cijk.j_end(k_it);
329  for (ikj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
330  OrdinalType j = index(j_it);
331  if (k <= j) {
332  ValueType c = Stokhos::value(j_it);
333  Cijk_sym.add_term(i, j, k, c);
334  }
335  }
336  }
337  }
338  Cijk_sym.fillComplete();
339 
340  // First partition based on i
341  size_type j_tile_size = i_tile_size / 2;
342  size_type basis_size = basis.size();
343  size_type num_i_parts = (basis_size + i_tile_size-1) / i_tile_size;
344  //size_type its = basis_size / num_i_parts;
345  size_type its = i_tile_size;
346  Array<ITile> i_tiles(num_i_parts);
347  for (size_type i=0; i<num_i_parts; ++i) {
348  i_tiles[i].lower = i*its;
349  i_tiles[i].upper = std::min(basis_size, (i+1)*its);
350  i_tiles[i].parts.resize(1);
351  i_tiles[i].parts[0].lower = basis_size;
352  i_tiles[i].parts[0].upper = 0;
353  }
354 
355  // Next partition j
356  size_type max_jk_tile_size = 0;
357  for (i_iterator i_it=Cijk_sym.i_begin(); i_it!=Cijk_sym.i_end(); ++i_it) {
358  OrdinalType i = index(i_it);
359 
360  // Find which part i belongs to
361  size_type idx = 0;
362  while (idx < num_i_parts && i >= i_tiles[idx].lower) ++idx;
363  --idx;
364  TEUCHOS_ASSERT(idx >= 0 && idx < num_i_parts);
365 
366  ik_iterator k_begin = Cijk_sym.k_begin(i_it);
367  ik_iterator k_end = Cijk_sym.k_end(i_it);
368  for (ik_iterator k_it = k_begin; k_it != k_end; ++k_it) {
369  OrdinalType j = index(k_it); // using symmetry to interchange j and k
370 
371  if (j < i_tiles[idx].parts[0].lower)
372  i_tiles[idx].parts[0].lower = j;
373  if (j > i_tiles[idx].parts[0].upper)
374  i_tiles[idx].parts[0].upper = j;
375  }
376  }
377  for (size_type idx=0; idx<num_i_parts; ++idx) {
378  size_type lower = i_tiles[idx].parts[0].lower;
379  size_type upper = i_tiles[idx].parts[0].upper;
380  size_type range = upper - lower + 1;
381  size_type num_j_parts = (range + j_tile_size-1) / j_tile_size;
382  //size_type jts = range / num_j_parts;
383  size_type jts = j_tile_size;
384  max_jk_tile_size = std::max(max_jk_tile_size, jts);
385  Array<JTile> j_tiles(num_j_parts);
386  for (size_type j=0; j<num_j_parts; ++j) {
387  j_tiles[j].lower = lower + j*jts;
388  j_tiles[j].upper = std::min(upper+1, lower + (j+1)*jts);
389  j_tiles[j].parts.resize(1);
390  j_tiles[j].parts[0].lower = basis_size;
391  j_tiles[j].parts[0].upper = 0;
392  }
393  i_tiles[idx].parts.swap(j_tiles);
394  }
395 
396  // Now partition k
397  for (i_iterator i_it=Cijk_sym.i_begin(); i_it!=Cijk_sym.i_end(); ++i_it) {
398  OrdinalType i = index(i_it);
399 
400  // Find which part i belongs to
401  size_type idx = 0;
402  while (idx < num_i_parts && i >= i_tiles[idx].lower) ++idx;
403  --idx;
404  TEUCHOS_ASSERT(idx >= 0 && idx < num_i_parts);
405 
406  ik_iterator k_begin = Cijk_sym.k_begin(i_it);
407  ik_iterator k_end = Cijk_sym.k_end(i_it);
408  for (ik_iterator k_it = k_begin; k_it != k_end; ++k_it) {
409  OrdinalType j = index(k_it); // using symmetry to interchange j and k
410 
411  // Find which part j belongs to
412  size_type num_j_parts = i_tiles[idx].parts.size();
413  size_type jdx = 0;
414  while (jdx < num_j_parts && j >= i_tiles[idx].parts[jdx].lower) ++jdx;
415  --jdx;
416  TEUCHOS_ASSERT(jdx >= 0 && jdx < num_j_parts);
417 
418  ikj_iterator j_begin = Cijk_sym.j_begin(k_it);
419  ikj_iterator j_end = Cijk_sym.j_end(k_it);
420  for (ikj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
421  OrdinalType k = index(j_it); // using symmetry to interchange j and k
422  ValueType cijk = Stokhos::value(j_it);
423  if (k >= j) {
424  Coord coord;
425  coord.i = i; coord.j = j; coord.k = k; coord.cijk = cijk;
426  i_tiles[idx].parts[jdx].parts[0].parts.push_back(coord);
427  if (k < i_tiles[idx].parts[jdx].parts[0].lower)
428  i_tiles[idx].parts[jdx].parts[0].lower = k;
429  if (k > i_tiles[idx].parts[jdx].parts[0].upper)
430  i_tiles[idx].parts[jdx].parts[0].upper = k;
431  }
432  }
433  }
434  }
435 
436  // Now need to divide up k-parts based on lower/upper bounds
437  size_type num_coord = 0;
438  for (size_type idx=0; idx<num_i_parts; ++idx) {
439  size_type num_j_parts = i_tiles[idx].parts.size();
440  for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
441  size_type lower = i_tiles[idx].parts[jdx].parts[0].lower;
442  size_type upper = i_tiles[idx].parts[jdx].parts[0].upper;
443  size_type range = upper - lower + 1;
444  size_type num_k_parts = (range + j_tile_size-1) / j_tile_size;
445  //size_type kts = range / num_k_parts;
446  size_type kts = j_tile_size;
447  max_jk_tile_size = std::max(max_jk_tile_size, kts);
448  Array<KTile> k_tiles(num_k_parts);
449  for (size_type k=0; k<num_k_parts; ++k) {
450  k_tiles[k].lower = lower + k*kts;
451  k_tiles[k].upper = std::min(upper+1, lower + (k+1)*kts);
452  }
453  size_type num_k = i_tiles[idx].parts[jdx].parts[0].parts.size();
454  for (size_type l=0; l<num_k; ++l) {
455  size_type i = i_tiles[idx].parts[jdx].parts[0].parts[l].i;
456  size_type j = i_tiles[idx].parts[jdx].parts[0].parts[l].j;
457  size_type k = i_tiles[idx].parts[jdx].parts[0].parts[l].k;
458  value_type cijk = i_tiles[idx].parts[jdx].parts[0].parts[l].cijk;
459 
460  // Find which part k belongs to
461  size_type kdx = 0;
462  while (kdx < num_k_parts && k >= k_tiles[kdx].lower) ++kdx;
463  --kdx;
464  TEUCHOS_ASSERT(kdx >= 0 && kdx < num_k_parts);
465 
466  Coord coord;
467  coord.i = i; coord.j = j; coord.k = k; coord.cijk = cijk;
468  k_tiles[kdx].parts.push_back(coord);
469  ++num_coord;
470  if (j != k) ++num_coord;
471  }
472 
473  // Eliminate parts with zero size
474  Array<KTile> k_tiles2;
475  for (size_type k=0; k<num_k_parts; ++k) {
476  if (k_tiles[k].parts.size() > 0)
477  k_tiles2.push_back(k_tiles[k]);
478  }
479  i_tiles[idx].parts[jdx].parts.swap(k_tiles2);
480  }
481  }
482  TEUCHOS_ASSERT(num_coord == Cijk.num_entries());
483 
484  // Compute number of non-zeros for each row in each part
485  size_type total_num_rows = 0, max_num_rows = 0, entry_count = 0;
486  size_type max_num_j_parts = 0, max_num_k_parts = 0;
487  Array< Array< Array< Array<size_type> > > > coord_work(num_i_parts);
488  for (size_type idx=0; idx<num_i_parts; ++idx) {
489  size_type num_j_parts = i_tiles[idx].parts.size();
490  max_num_j_parts = std::max(max_num_j_parts, num_j_parts);
491  coord_work[idx].resize(num_j_parts);
492  for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
493  size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
494  max_num_k_parts = std::max(max_num_k_parts, num_k_parts);
495  coord_work[idx][jdx].resize(num_k_parts);
496  for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
497  size_type num_rows = i_tiles[idx].upper - i_tiles[idx].lower + 1;
498  total_num_rows += num_rows;
499  max_num_rows = std::max(max_num_rows, num_rows);
500  coord_work[idx][jdx][kdx].resize(num_rows, 0);
501 
502  size_type nc = i_tiles[idx].parts[jdx].parts[kdx].parts.size();
503  for (size_type c=0; c<nc; ++c) {
504  size_type i = i_tiles[idx].parts[jdx].parts[kdx].parts[c].i;
505  size_type i_begin = i_tiles[idx].lower;
506  ++(coord_work[idx][jdx][kdx][i-i_begin]);
507  ++entry_count;
508  }
509  }
510  }
511  }
512 
513  // Pad each row to have size divisible by alignment size
514  for (size_type idx=0; idx<num_i_parts; ++idx) {
515  size_type num_j_parts = i_tiles[idx].parts.size();
516  for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
517  size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
518  for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
519  size_type sz = coord_work[idx][jdx][kdx].size();
520  for (size_type i = 0; i < sz; ++i) {
521  const size_t rem = coord_work[idx][jdx][kdx][i] % tensor_align;
522  if (rem > 0) {
523  const size_t pad = tensor_align - rem;
524  coord_work[idx][jdx][kdx][i] += pad;
525  entry_count += pad;
526  }
527  }
528  }
529  }
530  }
531 
532  // Allocate tensor data
533  SimpleTiledCrsProductTensor tensor;
534  tensor.m_value = value_array_type("value", entry_count);
535  tensor.m_coord = coord_array_type("coord", entry_count);
536  tensor.m_coord2 = coord2_array_type("coord2", entry_count);
537  tensor.m_i_begin = i_begin_type("i_begin", num_i_parts);
538  tensor.m_i_size = i_size_type("i_size", num_i_parts);
539  tensor.m_num_j = num_j_type("num_j", num_i_parts);
540  tensor.m_j_begin = j_begin_type("j_begin", num_i_parts, max_num_j_parts);
541  tensor.m_j_size = j_size_type("j_size", num_i_parts, max_num_j_parts);
542  tensor.m_num_k = num_k_type("num_k", num_i_parts, max_num_j_parts);
543  tensor.m_k_begin = k_begin_type("k_begin", num_i_parts, max_num_j_parts,
544  max_num_k_parts);
545  tensor.m_k_size = k_size_type("k_size", num_i_parts, max_num_j_parts,
546  max_num_k_parts);
547  tensor.m_row_map = row_map_type("row_map", num_i_parts,
548  max_num_j_parts, max_num_k_parts,
549  max_num_rows+1);
550  tensor.m_num_entry = num_entry_type("num_entry", num_i_parts,
551  max_num_j_parts, max_num_k_parts,
552  max_num_rows);
553  tensor.m_dimension = basis.size();
554  tensor.m_max_i_tile_size = i_tile_size;
555  tensor.m_max_jk_tile_size = max_jk_tile_size;
556  tensor.m_num_i = num_i_parts;
557 
558  // Create host mirrors (these are just views if the data already lives in host memory)
559  typename value_array_type::HostMirror host_value =
560  Kokkos::create_mirror_view(tensor.m_value);
561  typename coord_array_type::HostMirror host_coord =
562  Kokkos::create_mirror_view(tensor.m_coord);
563  typename coord2_array_type::HostMirror host_coord2 =
564  Kokkos::create_mirror_view(tensor.m_coord2);
565  typename i_begin_type::HostMirror host_i_begin =
566  Kokkos::create_mirror_view(tensor.m_i_begin);
567  typename i_size_type::HostMirror host_i_size =
568  Kokkos::create_mirror_view(tensor.m_i_size);
569  typename num_j_type::HostMirror host_num_j =
570  Kokkos::create_mirror_view(tensor.m_num_j);
571  typename j_begin_type::HostMirror host_j_begin =
572  Kokkos::create_mirror_view(tensor.m_j_begin);
573  typename j_size_type::HostMirror host_j_size =
574  Kokkos::create_mirror_view(tensor.m_j_size);
575  typename num_k_type::HostMirror host_num_k =
576  Kokkos::create_mirror_view(tensor.m_num_k);
577  typename k_begin_type::HostMirror host_k_begin =
578  Kokkos::create_mirror_view(tensor.m_k_begin);
579  typename k_size_type::HostMirror host_k_size =
580  Kokkos::create_mirror_view(tensor.m_k_size);
581  typename row_map_type::HostMirror host_row_map =
582  Kokkos::create_mirror_view(tensor.m_row_map);
583  typename num_entry_type::HostMirror host_num_entry =
584  Kokkos::create_mirror_view(tensor.m_num_entry);
585 
586  // Compute row map
587  size_type sum = 0;
588  for (size_type idx=0; idx<num_i_parts; ++idx) {
589  size_type num_j_parts = i_tiles[idx].parts.size();
590  for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
591  size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
592  for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
593  size_type nc = coord_work[idx][jdx][kdx].size();
594  host_row_map(idx,jdx,kdx,0) = sum;
595  for (size_type t=0; t<nc; ++t) {
596  sum += coord_work[idx][jdx][kdx][t];
597  host_row_map(idx,jdx,kdx,t+1) = sum;
598  host_num_entry(idx,jdx,kdx,t) = 0;
599  }
600  }
601  }
602  }
603 
604  // Copy per part row offsets back into coord_work
605  for (size_type idx=0; idx<num_i_parts; ++idx) {
606  size_type num_j_parts = i_tiles[idx].parts.size();
607  for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
608  size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
609  for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
610  size_type nc = coord_work[idx][jdx][kdx].size();
611  for (size_type t=0; t<nc; ++t) {
612  coord_work[idx][jdx][kdx][t] = host_row_map(idx,jdx,kdx,t);
613  }
614  }
615  }
616  }
617 
618  // Fill in coordinate and value arrays
619  for (size_type idx=0; idx<num_i_parts; ++idx) {
620  host_i_begin(idx) = i_tiles[idx].lower;
621  host_i_size(idx) = i_tiles[idx].upper - i_tiles[idx].lower;
622  TEUCHOS_ASSERT(host_i_size(idx) <= i_tile_size);
623  size_type num_j_parts = i_tiles[idx].parts.size();
624  host_num_j(idx) = num_j_parts;
625  for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
626  host_j_begin(idx,jdx) = i_tiles[idx].parts[jdx].lower;
627  host_j_size(idx,jdx) = i_tiles[idx].parts[jdx].upper -
628  i_tiles[idx].parts[jdx].lower;
629  TEUCHOS_ASSERT(host_j_size(idx,jdx) <= max_jk_tile_size);
630  size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
631  host_num_k(idx,jdx) = num_k_parts;
632  for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
633  host_k_begin(idx,jdx,kdx) = i_tiles[idx].parts[jdx].parts[kdx].lower;
634  host_k_size(idx,jdx,kdx) = i_tiles[idx].parts[jdx].parts[kdx].upper -
635  i_tiles[idx].parts[jdx].parts[kdx].lower;
636  TEUCHOS_ASSERT(host_k_size(idx,jdx,kdx) <= max_jk_tile_size);
637 
638  size_type nc = i_tiles[idx].parts[jdx].parts[kdx].parts.size();
639  for (size_type t=0; t<nc; ++t) {
640  Coord s = i_tiles[idx].parts[jdx].parts[kdx].parts[t];
641  const size_type i = s.i;
642  const size_type j = s.j;
643  const size_type k = s.k;
644  const value_type c = s.cijk;
645 
646  const size_type row = i - host_i_begin(idx);
647  const size_type n = coord_work[idx][jdx][kdx][row];
648  ++coord_work[idx][jdx][kdx][row];
649 
650  host_value(n) = (j != k) ? c : 0.5*c;
651  host_coord2(n,0) = j - host_j_begin(idx,jdx);
652  host_coord2(n,1) = k - host_k_begin(idx,jdx,kdx);
653  host_coord(n) = (host_coord2(n,1) << 16) | host_coord2(n,0);
654 
655  ++host_num_entry(idx,jdx,kdx,row);
656  ++tensor.m_nnz;
657  }
658  }
659  }
660  }
661 
662  // Copy data to device if necessary
663  Kokkos::deep_copy(tensor.m_value, host_value);
664  Kokkos::deep_copy(tensor.m_coord, host_coord);
665  Kokkos::deep_copy(tensor.m_coord2, host_coord2);
666  Kokkos::deep_copy(tensor.m_i_begin, host_i_begin);
667  Kokkos::deep_copy(tensor.m_i_size, host_i_size);
668  Kokkos::deep_copy(tensor.m_num_j, host_num_j);
669  Kokkos::deep_copy(tensor.m_j_begin, host_j_begin);
670  Kokkos::deep_copy(tensor.m_j_size, host_j_size);
671  Kokkos::deep_copy(tensor.m_num_k, host_num_k);
672  Kokkos::deep_copy(tensor.m_k_begin, host_k_begin);
673  Kokkos::deep_copy(tensor.m_k_size, host_k_size);
674  Kokkos::deep_copy(tensor.m_row_map, host_row_map);
675  Kokkos::deep_copy(tensor.m_num_entry, host_num_entry);
676 
677  tensor.m_flops = 0;
678  for (size_type idx=0; idx<num_i_parts; ++idx) {
679  size_type num_j_parts = i_tiles[idx].parts.size();
680  for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
681  size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
682  for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
683  for (size_type i = 0; i < host_i_size(idx); ++i) {
684  tensor.m_flops += 5*host_num_entry(idx,jdx,kdx,i) + 1;
685  }
686  }
687  }
688  }
689 
690  return tensor;
691  }
692 };
693 
694 template< class Device, typename OrdinalType, typename ValueType >
695 SimpleTiledCrsProductTensor<ValueType, Device>
696 create_simple_tiled_product_tensor(
697  const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
698  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
699  const Teuchos::ParameterList& params)
700 {
701  return SimpleTiledCrsProductTensor<ValueType, Device>::create(
702  basis, Cijk, params);
703 }
704 
705 template < typename ValueType, typename Device >
706 class BlockMultiply< SimpleTiledCrsProductTensor< ValueType , Device > >
707 {
708 public:
709 
710  typedef typename Device::size_type size_type ;
711  typedef SimpleTiledCrsProductTensor< ValueType , Device > tensor_type ;
712 
713  template< typename MatrixValue , typename VectorValue >
714  KOKKOS_INLINE_FUNCTION
715  static void apply( const tensor_type & tensor ,
716  const MatrixValue * const a ,
717  const VectorValue * const x ,
718  VectorValue * const y )
719  {
720  const size_type block_size = 2;
721  typedef TinyVec<ValueType,block_size,false> TV;
722 
723  const size_type n_i_tile = tensor.num_i_tiles();
724  for (size_type i_tile = 0; i_tile<n_i_tile; ++i_tile) {
725  const size_type i_begin = tensor.i_begin(i_tile);
726  const size_type i_size = tensor.i_size(i_tile);
727 
728  const size_type n_j_tile = tensor.num_j_tiles(i_tile);
729  for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
730  const size_type j_begin = tensor.j_begin(i_tile, j_tile);
731  //const size_type j_size = tensor.j_size(i_tile, j_tile);
732 
733  const size_type n_k_tile = tensor.num_k_tiles(i_tile, j_tile);
734  for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
735  const size_type k_begin = tensor.k_begin(i_tile, j_tile, k_tile);
736  //const size_type k_size = tensor.k_size(i_tile, j_tile, k_tile);
737 
738  for (size_type i=0; i<i_size; ++i) {
739 
740  const size_type nEntry =
741  tensor.num_entry(i_tile,j_tile,k_tile,i);
742  const size_type iEntryBeg =
743  tensor.entry_begin(i_tile,j_tile,k_tile,i);
744  const size_type iEntryEnd = iEntryBeg + nEntry;
745  size_type iEntry = iEntryBeg;
746 
747  VectorValue ytmp = 0 ;
748 
749  // Do entries with a blocked loop of size block_size
750  if (block_size > 1) {
751  const size_type nBlock = nEntry / block_size;
752  const size_type nEntryB = nBlock * block_size;
753  const size_type iEnd = iEntryBeg + nEntryB;
754 
755  TV vy;
756  vy.zero();
757  int j[block_size], k[block_size];
758 
759  for ( ; iEntry < iEnd ; iEntry += block_size ) {
760 
761  for (size_type ii=0; ii<block_size; ++ii) {
762  j[ii] = tensor.coord(iEntry+ii,0) + j_begin;
763  k[ii] = tensor.coord(iEntry+ii,1) + k_begin;
764  }
765  TV aj(a, j), ak(a, k), xj(x, j), xk(x, k),
766  c(&(tensor.value(iEntry)));
767 
768  // vy += c * ( aj * xk + ak * xj)
769  aj.times_equal(xk);
770  ak.times_equal(xj);
771  aj.plus_equal(ak);
772  c.times_equal(aj);
773  vy.plus_equal(c);
774 
775  }
776 
777  ytmp += vy.sum();
778  }
779 
780  // Do remaining entries with a scalar loop
781  for ( ; iEntry<iEntryEnd; ++iEntry) {
782  const size_type j = tensor.coord(iEntry,0) + j_begin;
783  const size_type k = tensor.coord(iEntry,1) + k_begin;
784 
785  ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
786  }
787 
788  y[i+i_begin] += ytmp;
789 
790  }
791  }
792  }
793  }
794  }
795 
796  KOKKOS_INLINE_FUNCTION
797  static size_type matrix_size( const tensor_type & tensor )
798  { return tensor.dimension(); }
799 
800  KOKKOS_INLINE_FUNCTION
801  static size_type vector_size( const tensor_type & tensor )
802  { return tensor.dimension(); }
803 };
804 
805 } /* namespace Stokhos */
806 
807 //----------------------------------------------------------------------------
808 //----------------------------------------------------------------------------
809 
810 #endif /* #ifndef STOKHOS_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP */
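
A minimal usage sketch (not part of the header): it assumes a Stokhos::ProductBasis<int,double> named basis and its Stokhos::Sparse3Tensor<int,double> named Cijk have already been constructed elsewhere, and that a, x, and y are raw host arrays (double*) of length basis.size(). Only the "Tile Size" parameter name is taken from create() above; all other names and values are illustrative.

  // Illustrative driver: build the tiled tensor and perform one block multiply,
  // y(i) += sum_{j,k} C(i,j,k) * a(j) * x(k).
  typedef Kokkos::DefaultHostExecutionSpace Device;
  typedef Stokhos::SimpleTiledCrsProductTensor<double,Device> tensor_type;

  Teuchos::ParameterList params;
  params.set("Tile Size", 128);   // i-tile size; j/k tiles are roughly half this

  tensor_type tensor =
    Stokhos::create_simple_tiled_product_tensor<Device>(basis, Cijk, params);

  // Each stored entry contributes value*(a[j]*x[k] + a[k]*x[j]); diagonal (j == k)
  // entries were halved in create(), so the symmetric sum comes out correctly.
  Stokhos::BlockMultiply<tensor_type>::apply(tensor, a, x, y);

The flop count accumulated in m_flops (5 per entry plus 1 per row) matches this kernel: two multiplies and one add inside the parentheses, one multiply by the stored value, one accumulation into the row temporary, and a final add into y per row.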