Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Stokhos_SimpleTiledCrsProductTensor.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #ifndef STOKHOS_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
43 #define STOKHOS_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
44 
45 #include "Kokkos_Core.hpp"
46 
47 #include "Stokhos_Multiply.hpp"
48 #include "Stokhos_ProductBasis.hpp"
52 #include "Stokhos_TinyVec.hpp"
53 
54 
55 //----------------------------------------------------------------------------
56 //----------------------------------------------------------------------------
57 
58 namespace Stokhos {
59 
60 template< typename ValueType, class ExecutionSpace >
62 public:
63 
64  typedef ExecutionSpace execution_space;
65  typedef int size_type;
66  typedef ValueType value_type;
67 
68 // Vectorsize used in multiply algorithm
69 #if defined(__AVX__)
70  static const size_type host_vectorsize = 32/sizeof(value_type);
71  static const bool use_intrinsics = true;
72 #elif defined(__MIC__)
73  static const size_type host_vectorsize = 16;
74  static const bool use_intrinsics = true;
75 #else
76  static const size_type host_vectorsize = 2;
77  static const bool use_intrinsics = false;
78 #endif
79  static const size_type cuda_vectorsize = 32;
80  static const bool is_cuda =
81 #if defined( KOKKOS_ENABLE_CUDA )
82  Kokkos::Impl::is_same<ExecutionSpace,Kokkos::Cuda>::value;
83 #else
84  false ;
85 #endif
87 
88  // Alignment in terms of number of entries of CRS rows
90 
91 private:
92 
93  typedef Kokkos::View< value_type[], execution_space > vec_type;
94  typedef Kokkos::View< value_type[], execution_space > value_array_type;
95  typedef Kokkos::View< size_type[], execution_space > coord_array_type;
96  typedef Kokkos::View< size_type[][2], Kokkos::LayoutLeft, execution_space > coord2_array_type;
97  typedef Kokkos::View< size_type*, execution_space > i_begin_type;
98  typedef Kokkos::View< size_type*, execution_space > i_size_type;
99  typedef Kokkos::View< size_type*, execution_space > num_j_type;
100  typedef Kokkos::View< size_type**, execution_space > j_begin_type;
101  typedef Kokkos::View< size_type**, execution_space > j_size_type;
102  typedef Kokkos::View< size_type**, execution_space > num_k_type;
103  typedef Kokkos::View< size_type***, execution_space > k_begin_type;
104  typedef Kokkos::View< size_type***, execution_space > k_size_type;
105 
106  typedef Kokkos::View< size_type****, Kokkos::LayoutRight, execution_space > row_map_type;
107  typedef Kokkos::View< size_type****, Kokkos::LayoutRight, execution_space > num_entry_type;
108 
128 
129  struct Coord {
132  };
133 
134  template <typename coord_t>
135  struct Tile {
138  };
139 
143 
144 public:
145 
146  inline
148 
149  inline
151  m_value(),
152  m_coord(),
153  m_coord2(),
154  m_i_begin(),
155  m_i_size(),
156  m_num_j(),
157  m_j_begin(),
158  m_j_size(),
159  m_num_k(),
160  m_k_begin(),
161  m_k_size(),
162  m_row_map(),
163  m_num_entry(),
164  m_dimension(0),
167  m_num_i(0),
168  m_nnz(0),
169  m_flops(0) {}
170 
171  inline
173  m_value(rhs.m_value),
174  m_coord(rhs.m_coord),
175  m_coord2(rhs.m_coord2),
176  m_i_begin(rhs.m_i_begin),
177  m_i_size(rhs.m_i_size),
178  m_num_j(rhs.m_num_j),
179  m_j_begin(rhs.m_j_begin),
180  m_j_size(rhs.m_j_size),
181  m_num_k(rhs.m_num_k),
182  m_k_begin(rhs.m_k_begin),
183  m_k_size(rhs.m_k_size),
184  m_row_map(rhs.m_row_map),
189  m_num_i(rhs.m_num_i),
190  m_nnz(rhs.m_nnz),
191  m_flops(rhs.m_flops) {}
192 
193  inline
195  const SimpleTiledCrsProductTensor & rhs)
196  {
197  m_value = rhs.m_value;
198  m_coord = rhs.m_coord;
199  m_coord2 = rhs.m_coord2;
200  m_i_begin = rhs.m_i_begin;
201  m_i_size = rhs.m_i_size;
202  m_num_j = rhs.m_num_j;
203  m_j_begin = rhs.m_j_begin;
204  m_j_size = rhs.m_j_size;
205  m_num_k = rhs.m_num_k;
206  m_k_begin = rhs.m_k_begin;
207  m_k_size = rhs.m_k_size;
208  m_row_map = rhs.m_row_map;
209  m_num_entry = rhs.m_num_entry;
210  m_dimension = rhs.m_dimension;
213  m_num_i = rhs.m_num_i;
214  m_nnz = rhs.m_nnz;
215  m_flops = rhs.m_flops;
216  return *this;
217  }
218 
220  KOKKOS_INLINE_FUNCTION
221  size_type dimension() const { return m_dimension; }
222 
224  KOKKOS_INLINE_FUNCTION
225  size_type entry_count() const { return m_coord.extent(0); }
226 
228  KOKKOS_INLINE_FUNCTION
229  size_type num_i_tiles() const { return m_num_i; }
230 
232  KOKKOS_INLINE_FUNCTION
233  size_type i_begin(const size_type i) const { return m_i_begin(i); }
234 
236  KOKKOS_INLINE_FUNCTION
237  size_type i_size(const size_type i) const { return m_i_size(i); }
238 
240  KOKKOS_INLINE_FUNCTION
241  size_type num_j_tiles(const size_type i) const { return m_num_j(i); }
242 
244  KOKKOS_INLINE_FUNCTION
245  size_type j_begin(const size_type i, const size_type j) const {
246  return m_j_begin(i,j);
247  }
248 
250  KOKKOS_INLINE_FUNCTION
251  size_type j_size(const size_type i, const size_type j) const {
252  return m_j_size(i,j);
253  }
254 
256  KOKKOS_INLINE_FUNCTION
257  size_type num_k_tiles(const size_type i, const size_type j) const {
258  return m_num_k(i,j); }
259 
261  KOKKOS_INLINE_FUNCTION
263  const size_type k) const {
264  return m_k_begin(i,j,k);
265  }
266 
268  KOKKOS_INLINE_FUNCTION
270  const size_type k) const {
271  return m_k_size(i,j,k);
272  }
273 
275  KOKKOS_INLINE_FUNCTION
277  const size_type k, const size_type r) const {
278  return m_num_entry(i,j,k,r);
279  }
280 
282  KOKKOS_INLINE_FUNCTION
284  const size_type k, const size_type r) const {
285  return m_row_map(i,j,k,r);
286  }
287 
289  KOKKOS_INLINE_FUNCTION
291  const size_type k, const size_type r) const {
292  return m_row_map(i,j,k,r) + m_num_entry(i,j,k,r);
293  }
294 
296  KOKKOS_INLINE_FUNCTION
297  const size_type& coord(const size_type entry, const size_type c) const {
298  return m_coord2(entry, c);
299  }
300 
302  KOKKOS_INLINE_FUNCTION
303  const size_type& coord(const size_type entry) const {
304  return m_coord(entry);
305  }
306 
308  KOKKOS_INLINE_FUNCTION
309  const value_type & value(const size_type entry) const {
310  return m_value(entry);
311  }
312 
314  KOKKOS_INLINE_FUNCTION
316  { return m_nnz; }
317 
319  KOKKOS_INLINE_FUNCTION
321  { return m_flops; }
322 
324  KOKKOS_INLINE_FUNCTION
326 
328  KOKKOS_INLINE_FUNCTION
330 
331  template <typename OrdinalType>
335  const Teuchos::ParameterList& params)
336  {
337  using Teuchos::rcp;
338  using Teuchos::RCP;
340  using Teuchos::Array;
341 
343  typedef typename Cijk_type::i_iterator i_iterator;
344  typedef typename Cijk_type::ik_iterator ik_iterator;
345  typedef typename Cijk_type::ikj_iterator ikj_iterator;
346 
347  const size_type i_tile_size = params.get<OrdinalType>("Tile Size");
348 
349  // Build 2-way symmetric Cijk tensor
350  Cijk_type Cijk_sym;
351  i_iterator i_begin = Cijk.i_begin();
352  i_iterator i_end = Cijk.i_end();
353  for (i_iterator i_it=i_begin; i_it!=i_end; ++i_it) {
354  OrdinalType i = index(i_it);
355  ik_iterator k_begin = Cijk.k_begin(i_it);
356  ik_iterator k_end = Cijk.k_end(i_it);
357  for (ik_iterator k_it = k_begin; k_it != k_end; ++k_it) {
358  OrdinalType k = index(k_it);
359  ikj_iterator j_begin = Cijk.j_begin(k_it);
360  ikj_iterator j_end = Cijk.j_end(k_it);
361  for (ikj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
362  OrdinalType j = index(j_it);
363  if (k <= j) {
364  ValueType c = Stokhos::value(j_it);
365  Cijk_sym.add_term(i, j, k, c);
366  }
367  }
368  }
369  }
370  Cijk_sym.fillComplete();
371 
372  // First partition based on i
373  size_type j_tile_size = i_tile_size / 2;
374  size_type basis_size = basis.size();
375  size_type num_i_parts = (basis_size + i_tile_size-1) / i_tile_size;
376  //size_type its = basis_size / num_i_parts;
377  size_type its = i_tile_size;
378  Array<ITile> i_tiles(num_i_parts);
379  for (size_type i=0; i<num_i_parts; ++i) {
380  i_tiles[i].lower = i*its;
381  i_tiles[i].upper = std::min(basis_size, (i+1)*its);
382  i_tiles[i].parts.resize(1);
383  i_tiles[i].parts[0].lower = basis_size;
384  i_tiles[i].parts[0].upper = 0;
385  }
386 
387  // Next partition j
389  for (i_iterator i_it=Cijk_sym.i_begin(); i_it!=Cijk_sym.i_end(); ++i_it) {
390  OrdinalType i = index(i_it);
391 
392  // Find which part i belongs to
393  size_type idx = 0;
394  while (idx < num_i_parts && i >= i_tiles[idx].lower) ++idx;
395  --idx;
396  TEUCHOS_ASSERT(idx >= 0 && idx < num_i_parts);
397 
398  ik_iterator k_begin = Cijk_sym.k_begin(i_it);
399  ik_iterator k_end = Cijk_sym.k_end(i_it);
400  for (ik_iterator k_it = k_begin; k_it != k_end; ++k_it) {
401  OrdinalType j = index(k_it); // using symmetry to interchange j and k
402 
403  if (j < i_tiles[idx].parts[0].lower)
404  i_tiles[idx].parts[0].lower = j;
405  if (j > i_tiles[idx].parts[0].upper)
406  i_tiles[idx].parts[0].upper = j;
407  }
408  }
409  for (size_type idx=0; idx<num_i_parts; ++idx) {
410  size_type lower = i_tiles[idx].parts[0].lower;
411  size_type upper = i_tiles[idx].parts[0].upper;
412  size_type range = upper - lower + 1;
413  size_type num_j_parts = (range + j_tile_size-1) / j_tile_size;
414  //size_type jts = range / num_j_parts;
415  size_type jts = j_tile_size;
416  max_jk_tile_size = std::max(max_jk_tile_size, jts);
417  Array<JTile> j_tiles(num_j_parts);
418  for (size_type j=0; j<num_j_parts; ++j) {
419  j_tiles[j].lower = lower + j*jts;
420  j_tiles[j].upper = std::min(upper+1, lower + (j+1)*jts);
421  j_tiles[j].parts.resize(1);
422  j_tiles[j].parts[0].lower = basis_size;
423  j_tiles[j].parts[0].upper = 0;
424  }
425  i_tiles[idx].parts.swap(j_tiles);
426  }
427 
428  // Now partition k
429  for (i_iterator i_it=Cijk_sym.i_begin(); i_it!=Cijk_sym.i_end(); ++i_it) {
430  OrdinalType i = index(i_it);
431 
432  // Find which part i belongs to
433  size_type idx = 0;
434  while (idx < num_i_parts && i >= i_tiles[idx].lower) ++idx;
435  --idx;
436  TEUCHOS_ASSERT(idx >= 0 && idx < num_i_parts);
437 
438  ik_iterator k_begin = Cijk_sym.k_begin(i_it);
439  ik_iterator k_end = Cijk_sym.k_end(i_it);
440  for (ik_iterator k_it = k_begin; k_it != k_end; ++k_it) {
441  OrdinalType j = index(k_it); // using symmetry to interchange j and k
442 
443  // Find which part j belongs to
444  size_type num_j_parts = i_tiles[idx].parts.size();
445  size_type jdx = 0;
446  while (jdx < num_j_parts && j >= i_tiles[idx].parts[jdx].lower) ++jdx;
447  --jdx;
448  TEUCHOS_ASSERT(jdx >= 0 && jdx < num_j_parts);
449 
450  ikj_iterator j_begin = Cijk_sym.j_begin(k_it);
451  ikj_iterator j_end = Cijk_sym.j_end(k_it);
452  for (ikj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
453  OrdinalType k = index(j_it); // using symmetry to interchange j and k
454  ValueType cijk = Stokhos::value(j_it);
455  if (k >= j) {
456  Coord coord;
457  coord.i = i; coord.j = j; coord.k = k; coord.cijk = cijk;
458  i_tiles[idx].parts[jdx].parts[0].parts.push_back(coord);
459  if (k < i_tiles[idx].parts[jdx].parts[0].lower)
460  i_tiles[idx].parts[jdx].parts[0].lower = k;
461  if (k > i_tiles[idx].parts[jdx].parts[0].upper)
462  i_tiles[idx].parts[jdx].parts[0].upper = k;
463  }
464  }
465  }
466  }
467 
468  // Now need to divide up k-parts based on lower/upper bounds
469  size_type num_coord = 0;
470  for (size_type idx=0; idx<num_i_parts; ++idx) {
471  size_type num_j_parts = i_tiles[idx].parts.size();
472  for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
473  size_type lower = i_tiles[idx].parts[jdx].parts[0].lower;
474  size_type upper = i_tiles[idx].parts[jdx].parts[0].upper;
475  size_type range = upper - lower + 1;
476  size_type num_k_parts = (range + j_tile_size-1) / j_tile_size;
477  //size_type kts = range / num_k_parts;
478  size_type kts = j_tile_size;
479  max_jk_tile_size = std::max(max_jk_tile_size, kts);
480  Array<KTile> k_tiles(num_k_parts);
481  for (size_type k=0; k<num_k_parts; ++k) {
482  k_tiles[k].lower = lower + k*kts;
483  k_tiles[k].upper = std::min(upper+1, lower + (k+1)*kts);
484  }
485  size_type num_k = i_tiles[idx].parts[jdx].parts[0].parts.size();
486  for (size_type l=0; l<num_k; ++l) {
487  size_type i = i_tiles[idx].parts[jdx].parts[0].parts[l].i;
488  size_type j = i_tiles[idx].parts[jdx].parts[0].parts[l].j;
489  size_type k = i_tiles[idx].parts[jdx].parts[0].parts[l].k;
490  value_type cijk = i_tiles[idx].parts[jdx].parts[0].parts[l].cijk;
491 
492  // Find which part k belongs to
493  size_type kdx = 0;
494  while (kdx < num_k_parts && k >= k_tiles[kdx].lower) ++kdx;
495  --kdx;
496  TEUCHOS_ASSERT(kdx >= 0 && kdx < num_k_parts);
497 
498  Coord coord;
499  coord.i = i; coord.j = j; coord.k = k; coord.cijk = cijk;
500  k_tiles[kdx].parts.push_back(coord);
501  ++num_coord;
502  if (j != k) ++num_coord;
503  }
504 
505  // Eliminate parts with zero size
506  Array<KTile> k_tiles2;
507  for (size_type k=0; k<num_k_parts; ++k) {
508  if (k_tiles[k].parts.size() > 0)
509  k_tiles2.push_back(k_tiles[k]);
510  }
511  i_tiles[idx].parts[jdx].parts.swap(k_tiles2);
512  }
513  }
514  TEUCHOS_ASSERT(num_coord == Cijk.num_entries());
515 
516  // Compute number of non-zeros for each row in each part
517  size_type total_num_rows = 0, max_num_rows = 0, entry_count = 0;
518  size_type max_num_j_parts = 0, max_num_k_parts = 0;
519  Array< Array< Array< Array<size_type> > > > coord_work(num_i_parts);
520  for (size_type idx=0; idx<num_i_parts; ++idx) {
521  size_type num_j_parts = i_tiles[idx].parts.size();
522  max_num_j_parts = std::max(max_num_j_parts, num_j_parts);
523  coord_work[idx].resize(num_j_parts);
524  for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
525  size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
526  max_num_k_parts = std::max(max_num_k_parts, num_k_parts);
527  coord_work[idx][jdx].resize(num_k_parts);
528  for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
529  size_type num_rows = i_tiles[idx].upper - i_tiles[idx].lower + 1;
530  total_num_rows += num_rows;
531  max_num_rows = std::max(max_num_rows, num_rows);
532  coord_work[idx][jdx][kdx].resize(num_rows, 0);
533 
534  size_type nc = i_tiles[idx].parts[jdx].parts[kdx].parts.size();
535  for (size_type c=0; c<nc; ++c) {
536  size_type i = i_tiles[idx].parts[jdx].parts[kdx].parts[c].i;
537  size_type i_begin = i_tiles[idx].lower;
538  ++(coord_work[idx][jdx][kdx][i-i_begin]);
539  ++entry_count;
540  }
541  }
542  }
543  }
544 
545  // Pad each row to have size divisible by alignment size
546  for (size_type idx=0; idx<num_i_parts; ++idx) {
547  size_type num_j_parts = i_tiles[idx].parts.size();
548  for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
549  size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
550  for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
551  size_type sz = coord_work[idx][jdx][kdx].size();
552  for (size_type i = 0; i < sz; ++i) {
553  const size_t rem = coord_work[idx][jdx][kdx][i] % tensor_align;
554  if (rem > 0) {
555  const size_t pad = tensor_align - rem;
556  coord_work[idx][jdx][kdx][i] += pad;
557  entry_count += pad;
558  }
559  }
560  }
561  }
562  }
563 
564  // Allocate tensor data
566  tensor.m_value = value_array_type("value", entry_count);
567  tensor.m_coord = coord_array_type("coord", entry_count);
568  tensor.m_coord2 = coord2_array_type("coord2", entry_count);
569  tensor.m_i_begin = i_begin_type("i_begin", num_i_parts);
570  tensor.m_i_size = i_size_type("i_size", num_i_parts);
571  tensor.m_num_j = num_j_type("num_j", num_i_parts);
572  tensor.m_j_begin = j_begin_type("j_begin", num_i_parts, max_num_j_parts);
573  tensor.m_j_size = j_size_type("j_size", num_i_parts, max_num_j_parts);
574  tensor.m_num_k = num_k_type("num_k", num_i_parts, max_num_j_parts);
575  tensor.m_k_begin = k_begin_type("k_begin", num_i_parts, max_num_j_parts,
576  max_num_k_parts);
577  tensor.m_k_size = k_size_type("k_size", num_i_parts, max_num_j_parts,
578  max_num_k_parts);
579  tensor.m_row_map = row_map_type("row_map", num_i_parts,
580  max_num_j_parts, max_num_k_parts,
581  max_num_rows+1);
582  tensor.m_num_entry = num_entry_type("num_entry", num_i_parts,
583  max_num_j_parts, max_num_k_parts,
584  max_num_rows);
585  tensor.m_dimension = basis.size();
586  tensor.m_max_i_tile_size = i_tile_size;
588  tensor.m_num_i = num_i_parts;
589 
590  // Create mirror, is a view if is host memory
591  typename value_array_type::HostMirror host_value =
593  typename coord_array_type::HostMirror host_coord =
595  typename coord2_array_type::HostMirror host_coord2 =
597  typename i_begin_type::HostMirror host_i_begin =
599  typename i_size_type::HostMirror host_i_size =
601  typename num_j_type::HostMirror host_num_j =
603  typename j_begin_type::HostMirror host_j_begin =
605  typename j_size_type::HostMirror host_j_size =
607  typename num_k_type::HostMirror host_num_k =
609  typename k_begin_type::HostMirror host_k_begin =
611  typename k_size_type::HostMirror host_k_size =
613  typename row_map_type::HostMirror host_row_map =
615  typename num_entry_type::HostMirror host_num_entry =
617 
618  // Compute row map
619  size_type sum = 0;
620  for (size_type idx=0; idx<num_i_parts; ++idx) {
621  size_type num_j_parts = i_tiles[idx].parts.size();
622  for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
623  size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
624  for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
625  size_type nc = coord_work[idx][jdx][kdx].size();
626  host_row_map(idx,jdx,kdx,0) = sum;
627  for (size_type t=0; t<nc; ++t) {
628  sum += coord_work[idx][jdx][kdx][t];
629  host_row_map(idx,jdx,kdx,t+1) = sum;
630  host_num_entry(idx,jdx,kdx,t) = 0;
631  }
632  }
633  }
634  }
635 
636  // Copy per part row offsets back into coord_work
637  for (size_type idx=0; idx<num_i_parts; ++idx) {
638  size_type num_j_parts = i_tiles[idx].parts.size();
639  for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
640  size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
641  for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
642  size_type nc = coord_work[idx][jdx][kdx].size();
643  for (size_type t=0; t<nc; ++t) {
644  coord_work[idx][jdx][kdx][t] = host_row_map(idx,jdx,kdx,t);
645  }
646  }
647  }
648  }
649 
650  // Fill in coordinate and value arrays
651  for (size_type idx=0; idx<num_i_parts; ++idx) {
652  host_i_begin(idx) = i_tiles[idx].lower;
653  host_i_size(idx) = i_tiles[idx].upper - i_tiles[idx].lower;
654  TEUCHOS_ASSERT(host_i_size(idx) <= i_tile_size);
655  size_type num_j_parts = i_tiles[idx].parts.size();
656  host_num_j(idx) = num_j_parts;
657  for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
658  host_j_begin(idx,jdx) = i_tiles[idx].parts[jdx].lower;
659  host_j_size(idx,jdx) = i_tiles[idx].parts[jdx].upper -
660  i_tiles[idx].parts[jdx].lower;
661  TEUCHOS_ASSERT(host_j_size(idx,jdx) <= max_jk_tile_size);
662  size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
663  host_num_k(idx,jdx) = num_k_parts;
664  for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
665  host_k_begin(idx,jdx,kdx) = i_tiles[idx].parts[jdx].parts[kdx].lower;
666  host_k_size(idx,jdx,kdx) = i_tiles[idx].parts[jdx].parts[kdx].upper -
667  i_tiles[idx].parts[jdx].parts[kdx].lower;
668  TEUCHOS_ASSERT(host_k_size(idx,jdx,kdx) <= max_jk_tile_size);
669 
670  size_type nc = i_tiles[idx].parts[jdx].parts[kdx].parts.size();
671  for (size_type t=0; t<nc; ++t) {
672  Coord s = i_tiles[idx].parts[jdx].parts[kdx].parts[t];
673  const size_type i = s.i;
674  const size_type j = s.j;
675  const size_type k = s.k;
676  const value_type c = s.cijk;
677 
678  const size_type row = i - host_i_begin(idx);
679  const size_type n = coord_work[idx][jdx][kdx][row];
680  ++coord_work[idx][jdx][kdx][row];
681 
682  host_value(n) = (j != k) ? c : 0.5*c;
683  host_coord2(n,0) = j - host_j_begin(idx,jdx);
684  host_coord2(n,1) = k - host_k_begin(idx,jdx,kdx);
685  host_coord(n) = (host_coord2(n,1) << 16) | host_coord2(n,0);
686 
687  ++host_num_entry(idx,jdx,kdx,row);
688  ++tensor.m_nnz;
689  }
690  }
691  }
692  }
693 
694  // Copy data to device if necessary
695  Kokkos::deep_copy(tensor.m_value, host_value);
696  Kokkos::deep_copy(tensor.m_coord, host_coord);
697  Kokkos::deep_copy(tensor.m_coord2, host_coord2);
698  Kokkos::deep_copy(tensor.m_i_begin, host_i_begin);
699  Kokkos::deep_copy(tensor.m_i_size, host_i_size);
700  Kokkos::deep_copy(tensor.m_num_j, host_num_j);
701  Kokkos::deep_copy(tensor.m_j_begin, host_j_begin);
702  Kokkos::deep_copy(tensor.m_j_size, host_j_size);
703  Kokkos::deep_copy(tensor.m_num_k, host_num_k);
704  Kokkos::deep_copy(tensor.m_k_begin, host_k_begin);
705  Kokkos::deep_copy(tensor.m_k_size, host_k_size);
706  Kokkos::deep_copy(tensor.m_row_map, host_row_map);
707  Kokkos::deep_copy(tensor.m_num_entry, host_num_entry);
708 
709  tensor.m_flops = 0;
710  for (size_type idx=0; idx<num_i_parts; ++idx) {
711  size_type num_j_parts = i_tiles[idx].parts.size();
712  for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
713  size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
714  for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
715  for (size_type i = 0; i < host_i_size(idx); ++i) {
716  tensor.m_flops += 5*host_num_entry(idx,jdx,kdx,i) + 1;
717  }
718  }
719  }
720  }
721 
722  return tensor;
723  }
724 };
725 
726 template< class Device, typename OrdinalType, typename ValueType >
727 SimpleTiledCrsProductTensor<ValueType, Device>
731  const Teuchos::ParameterList& params)
732 {
734  basis, Cijk, params);
735 }
736 
737 template < typename ValueType, typename Device >
738 class BlockMultiply< SimpleTiledCrsProductTensor< ValueType , Device > >
739 {
740 public:
741 
742  typedef typename Device::size_type size_type ;
744 
745  template< typename MatrixValue , typename VectorValue >
746  KOKKOS_INLINE_FUNCTION
747  static void apply( const tensor_type & tensor ,
748  const MatrixValue * const a ,
749  const VectorValue * const x ,
750  VectorValue * const y )
751  {
752  const size_type block_size = 2;
754 
755  const size_type n_i_tile = tensor.num_i_tiles();
756  for (size_type i_tile = 0; i_tile<n_i_tile; ++i_tile) {
757  const size_type i_begin = tensor.i_begin(i_tile);
758  const size_type i_size = tensor.i_size(i_tile);
759 
760  const size_type n_j_tile = tensor.num_j_tiles(i_tile);
761  for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
762  const size_type j_begin = tensor.j_begin(i_tile, j_tile);
763  //const size_type j_size = tensor.j_size(i_tile, j_tile);
764 
765  const size_type n_k_tile = tensor.num_k_tiles(i_tile, j_tile);
766  for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
767  const size_type k_begin = tensor.k_begin(i_tile, j_tile, k_tile);
768  //const size_type k_size = tensor.k_size(i_tile, j_tile, k_tile);
769 
770  for (size_type i=0; i<i_size; ++i) {
771 
772  const size_type nEntry =
773  tensor.num_entry(i_tile,j_tile,k_tile,i);
774  const size_type iEntryBeg =
775  tensor.entry_begin(i_tile,j_tile,k_tile,i);
776  const size_type iEntryEnd = iEntryBeg + nEntry;
777  size_type iEntry = iEntryBeg;
778 
779  VectorValue ytmp = 0 ;
780 
781  // Do entries with a blocked loop of size block_size
782  if (block_size > 1) {
783  const size_type nBlock = nEntry / block_size;
784  const size_type nEntryB = nBlock * block_size;
785  const size_type iEnd = iEntryBeg + nEntryB;
786 
787  TV vy;
788  vy.zero();
789  int j[block_size], k[block_size];
790 
791  for ( ; iEntry < iEnd ; iEntry += block_size ) {
792 
793  for (size_type ii=0; ii<block_size; ++ii) {
794  j[ii] = tensor.coord(iEntry+ii,0) + j_begin;
795  k[ii] = tensor.coord(iEntry+ii,1) + k_begin;
796  }
797  TV aj(a, j), ak(a, k), xj(x, j), xk(x, k),
798  c(&(tensor.value(iEntry)));
799 
800  // vy += c * ( aj * xk + ak * xj)
801  aj.times_equal(xk);
802  ak.times_equal(xj);
803  aj.plus_equal(ak);
804  c.times_equal(aj);
805  vy.plus_equal(c);
806 
807  }
808 
809  ytmp += vy.sum();
810  }
811 
812  // Do remaining entries with a scalar loop
813  for ( ; iEntry<iEntryEnd; ++iEntry) {
814  const size_type j = tensor.coord(iEntry,0) + j_begin;
815  const size_type k = tensor.coord(iEntry,1) + k_begin;
816 
817  ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
818  }
819 
820  y[i+i_begin] += ytmp;
821 
822  }
823  }
824  }
825  }
826  }
827 
828  KOKKOS_INLINE_FUNCTION
829  static size_type matrix_size( const tensor_type & tensor )
830  { return tensor.dimension(); }
831 
832  KOKKOS_INLINE_FUNCTION
833  static size_type vector_size( const tensor_type & tensor )
834  { return tensor.dimension(); }
835 };
836 
837 } /* namespace Stokhos */
838 
839 //----------------------------------------------------------------------------
840 //----------------------------------------------------------------------------
841 
842 #endif /* #ifndef STOKHOS_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP */
KOKKOS_INLINE_FUNCTION size_type entry_end(const size_type i, const size_type j, const size_type k, const size_type r) const
End entries for tile (i,j,k) and row r.
static KOKKOS_INLINE_FUNCTION size_type matrix_size(const tensor_type &tensor)
KOKKOS_INLINE_FUNCTION const size_type & coord(const size_type entry, const size_type c) const
Coordinates of an entry.
KOKKOS_INLINE_FUNCTION size_type num_flops() const
Number flop&#39;s per multiply-add.
k_iterator k_begin() const
Iterator pointing to first k entry.
SimpleTiledCrsProductTensor(const SimpleTiledCrsProductTensor &rhs)
Data structure storing a sparse 3-tensor C(i,j,k) in a a compressed format.
Kokkos::View< size_type[][2], Kokkos::LayoutLeft, execution_space > coord2_array_type
KOKKOS_INLINE_FUNCTION const size_type & coord(const size_type entry) const
Coordinates of an entry.
T & get(ParameterList &l, const std::string &name)
Kokkos::View< size_type **, execution_space > j_begin_type
kj_iterator j_begin(const k_iterator &k) const
Iterator pointing to first j entry for given k.
KOKKOS_INLINE_FUNCTION size_type num_i_tiles() const
Number i-tiles.
KOKKOS_INLINE_FUNCTION size_type k_size(const size_type i, const size_type j, const size_type k) const
Number of entries with for tile &#39;i,j&#39;.
KOKKOS_INLINE_FUNCTION size_type entry_count() const
Number of sparse entries.
Kokkos::View< value_type[], execution_space > value_array_type
Kokkos::View< value_type[], execution_space > vec_type
KOKKOS_INLINE_FUNCTION size_type j_size(const size_type i, const size_type j) const
Number of entries with for tile &#39;i,j&#39;.
Kokkos::View< size_type **, execution_space > j_size_type
SimpleTiledCrsProductTensor & operator=(const SimpleTiledCrsProductTensor &rhs)
kj_iterator j_end(const k_iterator &k) const
Iterator pointing to last j entry for given k.
Kokkos::View< size_type ***, execution_space > k_size_type
KOKKOS_INLINE_FUNCTION size_type max_i_tile_size() const
Max size of any i tile.
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
static KOKKOS_INLINE_FUNCTION size_type vector_size(const tensor_type &tensor)
Kokkos::View< size_type *, execution_space > i_size_type
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
static KOKKOS_INLINE_FUNCTION void apply(const tensor_type &tensor, const MatrixValue *const a, const VectorValue *const x, VectorValue *const y)
Kokkos::View< size_type ***, execution_space > k_begin_type
KOKKOS_INLINE_FUNCTION PCE< Storage > max(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
Kokkos::View< size_type *, execution_space > i_begin_type
KOKKOS_INLINE_FUNCTION size_type num_k_tiles(const size_type i, const size_type j) const
Number k-tiles.
static SimpleTiledCrsProductTensor create(const Stokhos::ProductBasis< OrdinalType, ValueType > &basis, const Stokhos::Sparse3Tensor< OrdinalType, ValueType > &Cijk, const Teuchos::ParameterList &params)
Kokkos::View< size_type[], execution_space > coord_array_type
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
SimpleTiledCrsProductTensor< ValueType, Device > create_simple_tiled_product_tensor(const Stokhos::ProductBasis< OrdinalType, ValueType > &basis, const Stokhos::Sparse3Tensor< OrdinalType, ValueType > &Cijk, const Teuchos::ParameterList &params)
friend friend void swap(Array< T2 > &a1, Array< T2 > &a2)
void resize(size_type new_size, const value_type &x=value_type())
KOKKOS_INLINE_FUNCTION size_type entry_begin(const size_type i, const size_type j, const size_type k, const size_type r) const
Begin entries for tile (i,j,k) and row r.
k_iterator k_end() const
Iterator pointing to last k entry.
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< view_type >::value, typename CijkType< view_type >::type >::type cijk(const view_type &view)
Stokhos::Sparse3Tensor< int, double > Cijk_type
KOKKOS_INLINE_FUNCTION size_type j_begin(const size_type i, const size_type j) const
Begin entries with for tile &#39;i,j&#39;.
KOKKOS_INLINE_FUNCTION size_type i_begin(const size_type i) const
Begin entries with for tile &#39;i&#39;.
Kokkos::View< size_type **, execution_space > num_k_type
void push_back(const value_type &x)
Kokkos::View< size_type *, execution_space > num_j_type
KOKKOS_INLINE_FUNCTION size_type num_j_tiles(const size_type i) const
Number j-tiles.
size_type size() const
KOKKOS_INLINE_FUNCTION const value_type & value(const size_type entry) const
Value of an entry.
#define TEUCHOS_ASSERT(assertion_test)
KOKKOS_INLINE_FUNCTION size_type max_jk_tile_size() const
Max size of any j/k tile.
KOKKOS_INLINE_FUNCTION size_type num_entry(const size_type i, const size_type j, const size_type k, const size_type r) const
Number of entries for tile (i,j,k) and row r.
kji_iterator i_begin(const kj_iterator &j) const
Iterator pointing to first i entry for given j and k.
Kokkos::View< size_type ****, Kokkos::LayoutRight, execution_space > num_entry_type
KOKKOS_INLINE_FUNCTION size_type num_non_zeros() const
Number of non-zero&#39;s.
kji_iterator i_end(const kj_iterator &j) const
Iterator pointing to last i entry for given j and k.
int n
KOKKOS_INLINE_FUNCTION size_type k_begin(const size_type i, const size_type j, const size_type k) const
Begin entries with for tile &#39;i,j,k&#39;.
KOKKOS_INLINE_FUNCTION size_type i_size(const size_type i) const
Number of entries with for tile &#39;i&#39;.
std::enable_if< Kokkos::is_view_uq_pce< Kokkos::View< RD, RP...> >::value &&Kokkos::is_view_uq_pce< Kokkos::View< XD, XP...> >::value >::type sum(const Kokkos::View< RD, RP...> &r, const Kokkos::View< XD, XP...> &x)
Kokkos::View< size_type ****, Kokkos::LayoutRight, execution_space > row_map_type
virtual ordinal_type size() const =0
Return total size of basis.
KOKKOS_INLINE_FUNCTION size_type dimension() const
Dimension of the tensor.
ordinal_type num_entries() const
Return number of non-zero entries.
Stokhos::CrsMatrix< ValueType, Device, Layout >::HostMirror create_mirror_view(const Stokhos::CrsMatrix< ValueType, Device, Layout > &A)