Stokhos Package Browser (Single Doxygen Collection), Version of the Day
Stokhos_DynamicThreadedStorage_cuda.hpp
// @HEADER
// *****************************************************************************
//                           Stokhos Package
//
// Copyright 2009 NTESS and the Stokhos contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#if defined( __CUDA_ARCH__ )

namespace Stokhos {

  //! Specialization of DynamicThreadedStorage for Kokkos::Cuda: each thread
  //! block shares one coefficient array, interleaved across the block's threads
  template <typename ordinal_t, typename value_t>
  class DynamicThreadedStorage<ordinal_t, value_t, Kokkos::Cuda> {
  public:

    static const bool is_static = false;
    static const int static_size = 0;
    static const bool supports_reset = true;

    typedef ordinal_t ordinal_type;
    typedef value_t value_type;
    typedef Kokkos::Cuda execution_space;
    typedef value_type& reference;
    typedef volatile value_type& volatile_reference;
    typedef const value_type& const_reference;
    typedef const volatile value_type& const_volatile_reference;
    typedef value_type* pointer;
    typedef volatile value_type* volatile_pointer;
    typedef const value_type* const_pointer;
    typedef const volatile value_type* const_volatile_pointer;
    typedef Stokhos::DynArrayTraits<value_type,execution_space> ds;

    //! Turn DynamicThreadedStorage into a meta-function class usable with mpl::apply
    template <typename ord_t, typename val_t = value_t, typename dev_t = Kokkos::Cuda>
    struct apply {
      typedef DynamicThreadedStorage<ord_t,val_t,dev_t> type;
    };

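    // Hedged sketch of the metafunction above (type names are illustrative,
    // not from this file): apply rebinds the storage to new parameters, e.g.
    //   typedef DynamicThreadedStorage<int,double,Kokkos::Cuda> storage_d;
    //   typedef storage_d::apply<int,float>::type storage_f;  // float-valued variant
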
    //! Constructor
    __device__
    DynamicThreadedStorage(const ordinal_type& sz = 1,
                           const value_type& x = value_type(0.0)) :
      sz_(sz), stride_(num_threads()), total_sz_(sz_*stride_) {
      allocate_coeff_array(coeff_, is_owned_, total_sz_, x);
    }

    //! Constructor from an array of values
    __device__
    DynamicThreadedStorage(const ordinal_type& sz, const value_type* x) :
      sz_(sz), stride_(num_threads()), total_sz_(sz_*stride_) {
      allocate_coeff_array(coeff_, is_owned_, total_sz_, x);
    }

    //! Constructor wrapping supplied memory
    __device__
    DynamicThreadedStorage(const ordinal_type& sz, pointer v, bool owned) :
      coeff_(v), sz_(sz), stride_(num_threads()), total_sz_(sz_*stride_),
      is_owned_(owned) {}

    //! Copy constructor
    __device__
    DynamicThreadedStorage(const DynamicThreadedStorage& s) :
      sz_(s.sz_), stride_(s.stride_), total_sz_(s.total_sz_) {
      allocate_coeff_array(coeff_, is_owned_, total_sz_);
      for (ordinal_type i=0; i<total_sz_; i+=stride_)
        coeff_[i] = s.coeff_[i];
    }

    //! Copy constructor from a volatile source
    __device__
    DynamicThreadedStorage(const volatile DynamicThreadedStorage& s) :
      sz_(s.sz_), stride_(s.stride_), total_sz_(s.total_sz_) {
      allocate_coeff_array(coeff_, is_owned_, total_sz_);
      for (ordinal_type i=0; i<total_sz_; i+=stride_)
        coeff_[i] = s.coeff_[i];
    }

    //! Destructor
    __device__
    ~DynamicThreadedStorage() {
      destroy_coeff_array(coeff_, is_owned_, total_sz_);
    }

    //! Assignment operator
    __device__
    DynamicThreadedStorage& operator=(const DynamicThreadedStorage& s) {
      if (&s != this) {
        if (s.sz_ != sz_) {
          destroy_coeff_array(coeff_, is_owned_, total_sz_);
          sz_ = s.sz_;
          stride_ = s.stride_;
          total_sz_ = sz_*stride_;
          allocate_coeff_array(coeff_, is_owned_, total_sz_);
          for (ordinal_type i=0; i<total_sz_; i+=stride_)
            coeff_[i] = s.coeff_[i];
        }
        else {
          for (ordinal_type i=0; i<total_sz_; i+=stride_)
            coeff_[i] = s.coeff_[i];
        }
      }
      return *this;
    }

    //! Assignment operator from a volatile source
    __device__
    DynamicThreadedStorage&
    operator=(const volatile DynamicThreadedStorage& s) {
      if (&s != this) {
        if (s.sz_ != sz_) {
          destroy_coeff_array(coeff_, is_owned_, total_sz_);
          sz_ = s.sz_;
          stride_ = s.stride_;
          total_sz_ = sz_*stride_;
          allocate_coeff_array(coeff_, is_owned_, total_sz_);
          for (ordinal_type i=0; i<total_sz_; i+=stride_)
            coeff_[i] = s.coeff_[i];
        }
        else {
          for (ordinal_type i=0; i<total_sz_; i+=stride_)
            coeff_[i] = s.coeff_[i];
        }
      }
      return *this;
    }

    //! Assignment operator for a volatile destination
    __device__
    volatile DynamicThreadedStorage&
    operator=(const DynamicThreadedStorage& s) volatile {
      if (&s != this) {
        if (s.sz_ != sz_) {
          destroy_coeff_array(coeff_, is_owned_, total_sz_);
          sz_ = s.sz_;
          stride_ = s.stride_;
          total_sz_ = sz_*stride_;
          allocate_coeff_array(coeff_, is_owned_, total_sz_);
          for (ordinal_type i=0; i<total_sz_; i+=stride_)
            coeff_[i] = s.coeff_[i];
        }
        else {
          for (ordinal_type i=0; i<total_sz_; i+=stride_)
            coeff_[i] = s.coeff_[i];
        }
      }
      return *this;
    }

    //! Volatile-to-volatile assignment operator
    __device__
    volatile DynamicThreadedStorage&
    operator=(const volatile DynamicThreadedStorage& s) volatile {
      if (&s != this) {
        if (s.sz_ != sz_) {
          destroy_coeff_array(coeff_, is_owned_, total_sz_);
          sz_ = s.sz_;
          stride_ = s.stride_;
          total_sz_ = sz_*stride_;
          allocate_coeff_array(coeff_, is_owned_, total_sz_);
          for (ordinal_type i=0; i<total_sz_; i+=stride_)
            coeff_[i] = s.coeff_[i];
        }
        else {
          for (ordinal_type i=0; i<total_sz_; i+=stride_)
            coeff_[i] = s.coeff_[i];
        }
      }
      return *this;
    }

    //! Initialize values to a constant value
    __device__
    void init(const_reference v) {
      for (ordinal_type i=0; i<total_sz_; i+=stride_)
        coeff_[i] = v;
    }

    //! Initialize values to a constant value
    __device__
    void init(const_reference v) volatile {
      for (ordinal_type i=0; i<total_sz_; i+=stride_)
        coeff_[i] = v;
    }

    //! Initialize values to an array of values
    __device__
    void init(const_pointer v, const ordinal_type& sz = 0) {
      ordinal_type my_sz = stride_*sz;
      if (sz == 0)
        my_sz = total_sz_;
      for (ordinal_type i=0; i<my_sz; i+=stride_)
        coeff_[i] = v[i];
    }

    //! Initialize values to an array of values
    __device__
    void init(const_pointer v, const ordinal_type& sz = 0) volatile {
      ordinal_type my_sz = stride_*sz;
      if (sz == 0)
        my_sz = total_sz_;
      for (ordinal_type i=0; i<my_sz; i+=stride_)
        coeff_[i] = v[i];
    }

    //! Load coefficients from an array of values
    __device__
    void load(pointer v) {
      for (ordinal_type i=0; i<total_sz_; i+=stride_)
        coeff_[i] = v[i];
    }

    //! Load coefficients from an array of values
    __device__
    void load(pointer v) volatile {
      for (ordinal_type i=0; i<total_sz_; i+=stride_)
        coeff_[i] = v[i];
    }

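    // Note on the sz == 0 convention above (calls are illustrative):
    //   s.init(v);     // default sz = 0: copies all sz_ coefficients per thread
    //   s.init(v, 2);  // copies only the first 2 coefficients per thread
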
    //! Resize to new size (values are preserved)
    __device__
    void resize(const ordinal_type& sz) {
      if (sz != sz_) {
        value_type *coeff_new;
        bool owned_new;
        ordinal_type total_sz_new = sz*stride_;
        allocate_coeff_array(coeff_new, owned_new, total_sz_new);
        ordinal_type my_tsz = total_sz_;
        if (total_sz_ > total_sz_new)
          my_tsz = total_sz_new;
        for (ordinal_type i=0; i<my_tsz; i+=stride_)
          coeff_new[i] = coeff_[i];
        destroy_coeff_array(coeff_, is_owned_, total_sz_);
        coeff_ = coeff_new;
        sz_ = sz;
        total_sz_ = total_sz_new;
        is_owned_ = owned_new;
      }
    }

    //! Resize to new size (values are preserved)
    __device__
    void resize(const ordinal_type& sz) volatile {
      if (sz != sz_) {
        value_type *coeff_new;
        bool owned_new;
        ordinal_type total_sz_new = sz*stride_;
        allocate_coeff_array(coeff_new, owned_new, total_sz_new);
        ordinal_type my_tsz = total_sz_;
        if (total_sz_ > total_sz_new)
          my_tsz = total_sz_new;
        for (ordinal_type i=0; i<my_tsz; i+=stride_)
          coeff_new[i] = coeff_[i];
        destroy_coeff_array(coeff_, is_owned_, total_sz_);
        coeff_ = coeff_new;
        sz_ = sz;
        total_sz_ = total_sz_new;
        is_owned_ = owned_new;
      }
    }

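    // Illustrative consequence of the resize logic above: growing a size-4
    // storage with s.resize(8) preserves coefficients 0..3 and zero-fills
    // 4..7, since allocate_coeff_array's default fill value value_type(0.0)
    // applies to the newly allocated array before the copy loop runs.
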
    //! Reset storage to given array, size, and stride
    __device__
    void shallowReset(pointer v, const ordinal_type& sz,
                      const ordinal_type& stride, bool owned) {
      destroy_coeff_array(coeff_, is_owned_, total_sz_);
      coeff_ = v;
      sz_ = sz;
      stride_ = stride;
      total_sz_ = sz_*stride_;
      is_owned_ = owned;
    }

    //! Reset storage to given array, size, and stride
    __device__
    void shallowReset(pointer v, const ordinal_type& sz,
                      const ordinal_type& stride, bool owned) volatile {
      destroy_coeff_array(coeff_, is_owned_, total_sz_);
      coeff_ = v;
      sz_ = sz;
      stride_ = stride;
      total_sz_ = sz_*stride_;
      is_owned_ = owned;
    }

    //! Return size
    __device__
    ordinal_type size() const { return sz_; }

    //! Return size
    __device__
    ordinal_type size() const volatile { return sz_; }

    //! Coefficient access (avoid if possible)
    KOKKOS_INLINE_FUNCTION
    const_reference operator[] (const ordinal_type& i) const {
      return coeff_[i*stride_];
    }

    //! Coefficient access (avoid if possible)
    KOKKOS_INLINE_FUNCTION
    const_volatile_reference operator[] (const ordinal_type& i) const volatile {
      return coeff_[i*stride_];
    }

    //! Coefficient access (avoid if possible)
    KOKKOS_INLINE_FUNCTION
    reference operator[] (const ordinal_type& i) {
      return coeff_[i*stride_];
    }

    //! Coefficient access (avoid if possible)
    KOKKOS_INLINE_FUNCTION
    volatile_reference operator[] (const ordinal_type& i) volatile {
      return coeff_[i*stride_];
    }

    template <int i>
    KOKKOS_INLINE_FUNCTION
    reference getCoeff() { return coeff_[i*stride_]; }

    template <int i>
    KOKKOS_INLINE_FUNCTION
    volatile_reference getCoeff() volatile { return coeff_[i*stride_]; }

    template <int i>
    KOKKOS_INLINE_FUNCTION
    const_reference getCoeff() const { return coeff_[i*stride_]; }

    template <int i>
    KOKKOS_INLINE_FUNCTION
    const_volatile_reference getCoeff() const volatile { return coeff_[i*stride_]; }

    //! Get coefficients
    KOKKOS_INLINE_FUNCTION
    const_volatile_pointer coeff() const volatile { return coeff_; }

    //! Get coefficients
    KOKKOS_INLINE_FUNCTION
    const_pointer coeff() const { return coeff_; }

    //! Get coefficients
    KOKKOS_INLINE_FUNCTION
    volatile_pointer coeff() volatile { return coeff_; }

    //! Get coefficients
    KOKKOS_INLINE_FUNCTION
    pointer coeff() { return coeff_; }

  protected:

    //! Compute number of threads in each block
    __device__
    ordinal_type num_threads() const {
      return blockDim.x*blockDim.y*blockDim.z;
    }

    //! Compute number of threads in each block
    __device__
    ordinal_type num_threads() const volatile {
      return blockDim.x*blockDim.y*blockDim.z;
    }

    //! Compute thread index within a block
    __device__
    ordinal_type thread_index() const {
      return threadIdx.x + (threadIdx.y + threadIdx.z*blockDim.y)*blockDim.x;
    }

    //! Compute thread index within a block
    __device__
    ordinal_type thread_index() const volatile {
      return threadIdx.x + (threadIdx.y + threadIdx.z*blockDim.y)*blockDim.x;
    }

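    // Worked example of the index math above (illustrative values): for
    // blockDim = (8,4,2), thread (x,y,z) = (3,2,1) gets
    //   thread_index() = 3 + (2 + 1*4)*8 = 51,  num_threads() = 8*4*2 = 64,
    // so that thread owns the interleaved slots ptr[51 + k*64] of the
    // block's shared coefficient array allocated below.
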
    //! Allocate coefficient array, filling with a constant value
    __device__
    void allocate_coeff_array(pointer& c, bool& owned,
                              ordinal_type total_size,
                              const value_type& x = value_type(0.0)) {

      // Allocate coefficient array on thread 0; the __shared__ pointer
      // broadcasts the allocation to the rest of the block after the sync
      __shared__ pointer ptr;
      ordinal_type tidx = thread_index();
      if (tidx == 0) {
        ptr = ds::get_and_fill(total_size,x);
        owned = true;
      }
      else
        owned = false;
      __syncthreads();

      // Give each thread its portion of the array
      c = ptr + tidx;
    }

    //! Allocate coefficient array, filling with a constant value
    __device__
    void allocate_coeff_array(pointer& c, bool& owned,
                              ordinal_type total_size,
                              const value_type& x = value_type(0.0)) volatile {

      // Allocate coefficient array on thread 0; the __shared__ pointer
      // broadcasts the allocation to the rest of the block after the sync
      __shared__ pointer ptr;
      ordinal_type tidx = thread_index();
      if (tidx == 0) {
        ptr = ds::get_and_fill(total_size,x);
        owned = true;
      }
      else
        owned = false;
      __syncthreads();

      // Give each thread its portion of the array
      c = ptr + tidx;
    }

    //! Allocate coefficient array, copying from an array of values
    __device__
    void allocate_coeff_array(pointer& c, bool& owned,
                              ordinal_type total_size,
                              const value_type* x) {

      // Allocate coefficient array on thread 0; the __shared__ pointer
      // broadcasts the allocation to the rest of the block after the sync
      __shared__ pointer ptr;
      ordinal_type tidx = thread_index();
      if (tidx == 0) {
        ptr = ds::get_and_fill(x, total_size);
        owned = true;
      }
      else
        owned = false;
      __syncthreads();

      // Give each thread its portion of the array
      c = ptr + tidx;
    }

    //! Destroy coefficient array; only the owning thread releases it
    __device__
    void destroy_coeff_array(pointer c, bool owned, ordinal_type total_size) {
      __syncthreads();
      if (owned)
        ds::destroy_and_release(c, total_size);
    }

    //! Destroy coefficient array; only the owning thread releases it
    __device__
    void destroy_coeff_array(pointer c, bool owned, ordinal_type total_size) volatile {
      __syncthreads();
      if (owned)
        ds::destroy_and_release(c, total_size);
    }

  private:

    //! Coefficient values
    pointer coeff_;

    //! Size of coefficient array per thread
    ordinal_type sz_;

    //! Stride between coefficients (number of threads per block)
    ordinal_type stride_;

    //! Total size of allocated array (sz_*stride_)
    ordinal_type total_sz_;

    //! Do we own the coefficient array
    bool is_owned_;

  };

}

#endif
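A minimal usage sketch, not part of this header: because allocate_coeff_array and destroy_coeff_array call __syncthreads(), the storage must be constructed and destroyed cooperatively by every thread of a CUDA block. The kernel below is hypothetical, and the include assumes the umbrella header Stokhos_DynamicThreadedStorage.hpp that selects this specialization; it also assumes the device heap is large enough for the per-block allocation made through ds::get_and_fill.

// Hypothetical kernel exercising the specialization above (not from the
// Stokhos sources)
#include "Stokhos_DynamicThreadedStorage.hpp"

__global__ void block_sum_example(double* block_sums, int sz)
{
  // All threads of the block construct (and later destroy) the storage
  // together: allocation and deallocation synchronize the whole block
  Stokhos::DynamicThreadedStorage<int, double, Kokkos::Cuda> s(sz, 1.0);

  // operator[] is strided: thread t reads interleaved slot t + i*stride
  double sum = 0.0;
  for (int i = 0; i < s.size(); ++i)
    sum += s[i];                      // each thread sums sz copies of 1.0

  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
    block_sums[blockIdx.x] = sum;
}  // ~DynamicThreadedStorage(): block-wide sync, then the owning thread frees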