Sacado_DynamicArrayTraits.hpp
// @HEADER
// ***********************************************************************
//
//                           Sacado Package
//                 Copyright (2006) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// This library is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as
// published by the Free Software Foundation; either version 2.1 of the
// License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
// USA
// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
// (etphipp@sandia.gov).
//
// ***********************************************************************
// @HEADER

#ifndef SACADO_DYNAMICARRAYTRAITS_HPP
#define SACADO_DYNAMICARRAYTRAITS_HPP

#include <new>
#include <cstring>
#include <stdint.h>

#include "Sacado_Traits.hpp"
#if defined(HAVE_SACADO_KOKKOSCORE)
#include "Kokkos_Core.hpp"
#if defined(KOKKOS_ENABLE_CUDA)
#include "Cuda/Kokkos_Cuda_Vectorization.hpp"
#endif
#if !defined(SACADO_DISABLE_CUDA_IN_KOKKOS)
#include "Kokkos_MemoryPool.hpp"
#endif
#endif

namespace Sacado {

  // Default (no-op) implementations of the global memory pool hooks; they
  // are overloaded below for specific execution spaces when enabled.
  template <typename ExecSpace>
  void createGlobalMemoryPool(const ExecSpace& space
                            , const size_t min_total_alloc_size
                            , const uint32_t min_block_alloc_size
                            , const uint32_t max_block_alloc_size
                            , const uint32_t min_superblock_size
                            ) {}

  template <typename ExecSpace>
  void destroyGlobalMemoryPool(const ExecSpace& space) {}

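  // Illustrative usage sketch (added commentary, not part of the original
  // header; the functor name "work_functor", the range size "N", and the
  // pool sizes below are hypothetical).  When a Kokkos memory pool is
  // enabled, host code can create the global pool before launching kernels
  // that allocate derivative arrays on the device and destroy it afterwards:
  //
  //   Kokkos::Cuda space;
  //   Sacado::createGlobalMemoryPool(space,
  //                                  1u<<28,  // min_total_alloc_size
  //                                  64u,     // min_block_alloc_size
  //                                  4096u,   // max_block_alloc_size
  //                                  1u<<18); // min_superblock_size
  //   Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,N), work_functor);
  //   Kokkos::fence();
  //   Sacado::destroyGlobalMemoryPool(space);
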
#if 0 && defined(HAVE_SACADO_KOKKOSCORE) && defined(KOKKOS_ENABLE_OPENMP)
  namespace Impl {
    extern const Kokkos::MemoryPool<Kokkos::OpenMP>* global_sacado_openmp_memory_pool;
  }

  inline void
  createGlobalMemoryPool(const Kokkos::OpenMP& space
                       , const size_t min_total_alloc_size
                       , const uint32_t min_block_alloc_size
                       , const uint32_t max_block_alloc_size
                       , const uint32_t min_superblock_size
                       )
  {
    typedef Kokkos::MemoryPool<Kokkos::OpenMP> pool_t;
    Impl::global_sacado_openmp_memory_pool =
      new pool_t(typename Kokkos::OpenMP::memory_space(),
                 min_total_alloc_size,
                 min_block_alloc_size,
                 max_block_alloc_size,
                 min_superblock_size);
  }

  inline void destroyGlobalMemoryPool(const Kokkos::OpenMP& space)
  {
    delete Impl::global_sacado_openmp_memory_pool;
  }
#endif

#if defined(HAVE_SACADO_KOKKOSCORE) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__)

  namespace Impl {

    extern const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_host;
    extern const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_device;
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
    extern __device__ const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_on_device;
#else
    __device__ const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_on_device = 0;
#endif

    // Functor run on the device to store the device-side copy of the pool
    // in the __device__ global pointer above
    struct SetMemoryPoolPtr {
      Kokkos::MemoryPool<Kokkos::Cuda>* pool_device;
      __device__ inline void operator()(int) const {
        global_sacado_cuda_memory_pool_on_device = pool_device;
      };
    };

  }

  // For some reason we get memory errors if these functions are defined in
  // Sacado_DynamicArrayTraits.cpp
  inline void
  createGlobalMemoryPool(const Kokkos::Cuda& space
                       , const size_t min_total_alloc_size
                       , const uint32_t min_block_alloc_size
                       , const uint32_t max_block_alloc_size
                       , const uint32_t min_superblock_size
                       )
  {
    // Construct the pool on the host, copy it to the device, and record its
    // device address via a one-element parallel_for
    typedef Kokkos::MemoryPool<Kokkos::Cuda> pool_t;
    pool_t* pool =
      new pool_t(typename Kokkos::Cuda::memory_space(),
                 min_total_alloc_size,
                 min_block_alloc_size,
                 max_block_alloc_size,
                 min_superblock_size);
    Impl::SetMemoryPoolPtr f;
    CUDA_SAFE_CALL( cudaMalloc( &f.pool_device, sizeof(pool_t) ) );
    CUDA_SAFE_CALL( cudaMemcpy( f.pool_device, pool,
                                sizeof(pool_t),
                                cudaMemcpyHostToDevice ) );
    Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,1),f);
    Impl::global_sacado_cuda_memory_pool_host = pool;
    Impl::global_sacado_cuda_memory_pool_device = f.pool_device;
  }

  inline void destroyGlobalMemoryPool(const Kokkos::Cuda& space)
  {
    CUDA_SAFE_CALL( cudaFree( (void*) Impl::global_sacado_cuda_memory_pool_device ) );
    delete Impl::global_sacado_cuda_memory_pool_host;
  }

#endif

#if !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__)

  namespace Impl {

    // Compute warp lane/thread index
    __device__ inline int warpLane(const int warp_size = 32) {
      return ( threadIdx.x + (threadIdx.y + threadIdx.z*blockDim.y)*blockDim.x ) % warp_size;
    }

    // Reduce y across the warp and broadcast to all lanes
    template <typename T>
    __device__ inline T warpReduce(T y, const int warp_size = 32) {
      for (int i=1; i<warp_size; i*=2) {
        y += Kokkos::shfl_down(y, i, warp_size);
      }
      y = Kokkos::shfl(y, 0, warp_size);
      return y;
    }

    // Non-inclusive (exclusive) plus-scan up the warp, replacing the first
    // entry with 0
    template <typename T>
    __device__ inline int warpScan(T y, const int warp_size = 32) {
      const int lane = warpLane();
      y = Kokkos::shfl_up(y, 1, warp_size);
      if (lane == 0)
        y = T(0);
      for (int i=1; i<warp_size; i*=2) {
        T t = Kokkos::shfl_up(y, i, warp_size);
        if (lane > i)
          y += t;
      }
      return y;
    }

    // Broadcast y from lane id to all lanes in the warp
    template <typename T>
    __device__ inline T warpBcast(T y, int id, const int warp_size = 32) {
      return Kokkos::shfl(y, id, warp_size);
    }

  }

#endif

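  // Worked example (added for clarity, not in the original source): how the
  // warp utilities above combine in the pooled device allocation in ds_alloc
  // below.  Suppose the first four lanes of a warp request sz = 3, 2, 4, 1
  // entries and the remaining lanes request 0:
  //   warpReduce(sz) -> 10 on every lane (size of the single pool request)
  //   warpScan(sz)   -> 0, 3, 5, 9, ...  (each lane's offset into that block)
  //   warpBcast(m,0) -> lane 0's pointer broadcast to the whole warp
  // so lane 0 makes one allocation of 10*sizeof(T) entries and every lane
  // ends up pointing at its own contiguous sub-array.
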
  namespace Impl {

    // Allocate a block of sz entries of type T using CUDA UVM, the Kokkos
    // memory pool, or plain operator new, depending on configuration
    template <typename T>
    KOKKOS_INLINE_FUNCTION
    static T* ds_alloc(const int sz) {
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_ENABLE_CUDA_UVM) && !defined( __CUDA_ARCH__ )
      T* m;
      CUDA_SAFE_CALL( cudaMallocManaged( (void**) &m, sz*sizeof(T), cudaMemAttachGlobal ) );
#elif defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__)
      // On the device the whole warp shares one pool allocation:  only lane 0
      // allocates, the pointer is broadcast, and each lane takes its offset
      T* m = 0;
      const int total_sz = warpReduce(sz);
      const int lane = warpLane();
      if (total_sz > 0 && lane == 0) {
        m = static_cast<T*>(global_sacado_cuda_memory_pool_on_device->allocate(total_sz*sizeof(T)));
        if (m == 0)
          Kokkos::abort("Allocation failed.  Kokkos memory pool is out of memory");
      }
      m = warpBcast(m,0);
      m += warpScan(sz);
#elif 0 && defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && defined(KOKKOS_ENABLE_OPENMP)
      T* m = 0;
      if (sz > 0) {
        if (global_sacado_openmp_memory_pool != 0) {
          m = static_cast<T*>(global_sacado_openmp_memory_pool->allocate(sz*sizeof(T)));
          if (m == 0)
            Kokkos::abort("Allocation failed.  Kokkos memory pool is out of memory");
        }
        else
          m = static_cast<T* >(operator new(sz*sizeof(T)));
      }
#else
      T* m = static_cast<T* >(operator new(sz*sizeof(T)));
#if defined(HAVE_SACADO_KOKKOSCORE)
      if (m == 0)
        Kokkos::abort("Allocation failed.");
#endif
#endif
      return m;
    }

    // Free memory allocated by ds_alloc
    template <typename T>
    KOKKOS_INLINE_FUNCTION
    static void ds_free(T* m, int sz) {
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_ENABLE_CUDA_UVM) && !defined( __CUDA_ARCH__ )
      if (sz > 0)
        CUDA_SAFE_CALL( cudaFree(m) );
#elif defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__)
      const int total_sz = warpReduce(sz);
      const int lane = warpLane();
      if (total_sz > 0 && lane == 0) {
        global_sacado_cuda_memory_pool_on_device->deallocate((void*) m, total_sz*sizeof(T));
      }
#elif 0 && defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && defined(KOKKOS_ENABLE_OPENMP)
      if (sz > 0) {
        if (global_sacado_openmp_memory_pool != 0)
          global_sacado_openmp_memory_pool->deallocate((void*) m, sz*sizeof(T));
        else
          operator delete((void*) m);
      }
#else
      if (sz > 0)
        operator delete((void*) m);
#endif
    }

  }

  /*!
   * \brief Dynamic array allocation class that works for any type
   */
  template <typename T, bool isScalar = IsScalarType<T>::value>
  struct ds_array {

    //! Get memory for new array of length \c sz
    KOKKOS_INLINE_FUNCTION
    static T* get(int sz) {
      if (sz > 0) {
        T* m = Impl::ds_alloc<T>(sz);
        T* p = m;
        for (int i=0; i<sz; ++i)
          new (p++) T();
        return m;
      }
      return NULL;
    }

    //! Get memory for new array of length \c sz and fill with zeros
    KOKKOS_INLINE_FUNCTION
    static T* get_and_fill(int sz) {
      if (sz > 0) {
        T* m = Impl::ds_alloc<T>(sz);
        T* p = m;
        for (int i=0; i<sz; ++i)
          new (p++) T(0.0);
        return m;
      }
      return NULL;
    }

    /*!
     * \brief Get memory for new array of length \c sz and fill with
     * entries from \c src
     */
    KOKKOS_INLINE_FUNCTION
    static T* get_and_fill(const T* src, int sz) {
      if (sz > 0) {
        T* m = Impl::ds_alloc<T>(sz);
        T* p = m;
        for (int i=0; i<sz; ++i)
          new (p++) T(*(src++));
        return m;
      }
      return NULL;
    }

    /*!
     * \brief Get memory for new array of length \c sz and fill with
     * strided entries from \c src
     */
    KOKKOS_INLINE_FUNCTION
    static T* strided_get_and_fill(const T* src, int stride, int sz) {
      if (sz > 0) {
        T* m = Impl::ds_alloc<T>(sz);
        T* p = m;
        for (int i=0; i<sz; ++i) {
          new (p++) T(*(src));
          src += stride;
        }
        return m;
      }
      return NULL;
    }

    //! Copy array from \c src to \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void copy(const T* src, T* dest, int sz) {
      for (int i=0; i<sz; ++i)
        *(dest++) = *(src++);
    }

    //! Copy strided array from \c src to \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void strided_copy(const T* src, int src_stride,
                             T* dest, int dest_stride, int sz) {
      for (int i=0; i<sz; ++i) {
        *(dest) = *(src);
        dest += dest_stride;
        src += src_stride;
      }
    }

    //! Zero out array \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void zero(T* dest, int sz) {
      for (int i=0; i<sz; ++i)
        *(dest++) = T(0.);
    }

    //! Zero out strided array \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void strided_zero(T* dest, int stride, int sz) {
      for (int i=0; i<sz; ++i) {
        *(dest) = T(0.);
        dest += stride;
      }
    }

    //! Destroy array elements and release memory
    KOKKOS_INLINE_FUNCTION
    static void destroy_and_release(T* m, int sz) {
      T* e = m+sz;
      for (T* b = m; b!=e; b++)
        b->~T();
      Impl::ds_free(m, sz);
    }
  };

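  // Usage sketch (added commentary, not part of the original header):
  // Sacado's dynamically-sized forward AD types manage their derivative
  // arrays through this traits class roughly as follows:
  //
  //   double seed[3] = { 1.0, 2.0, 3.0 };
  //   double* dx = Sacado::ds_array<double>::get_and_fill(seed, 3); // copy of seed
  //   Sacado::ds_array<double>::zero(dx, 3);                        // reset to zero
  //   Sacado::ds_array<double>::destroy_and_release(dx, 3);         // free storage
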
#if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__)

  namespace Impl {

    template <typename T>
    KOKKOS_INLINE_FUNCTION
    static T* ds_strided_alloc(const int sz) {
      T* m = 0;
      // Only do strided memory allocations when we are doing hierarchical
      // parallelism with a vector dimension of 32.  The limitation on the
      // memory pool allowing only a single thread in a warp to allocate
      // makes it too difficult to do otherwise.
      if (blockDim.x == 32) {
        //const int lane = warpLane();
        const int lane = threadIdx.x;
        if (sz > 0 && lane == 0) {
#if defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL)
          m = static_cast<T*>(global_sacado_cuda_memory_pool_on_device->allocate(sz*sizeof(T)));
          if (m == 0)
            Kokkos::abort("Allocation failed.  Kokkos memory pool is out of memory");
#else
          m = static_cast<T* >(operator new(sz*sizeof(T)));
#if defined(HAVE_SACADO_KOKKOSCORE)
          if (m == 0)
            Kokkos::abort("Allocation failed.");
#endif
#endif
        }
        m = warpBcast(m,0,blockDim.x);
      }
      else {
        if (sz > 0) {
          m = static_cast<T* >(operator new(sz*sizeof(T)));
#if defined(HAVE_SACADO_KOKKOSCORE)
          if (m == 0)
            Kokkos::abort("Allocation failed.");
#endif
        }
      }

      return m;
    }

    template <typename T>
    KOKKOS_INLINE_FUNCTION
    static void ds_strided_free(T* m, int sz) {
      if (blockDim.x == 32) {
        // const int lane = warpLane();
        const int lane = threadIdx.x;
        if (sz > 0 && lane == 0) {
#if defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL)
          global_sacado_cuda_memory_pool_on_device->deallocate((void*) m, sz*sizeof(T));
#else
          operator delete((void*) m);
#endif
        }
      }
      else {
        if (sz > 0)
          operator delete((void*) m);
      }
    }

  }

  /*!
   * \brief Dynamic array allocation class that is specialized for scalar
   * i.e., fundamental or built-in types (float, double, etc...).
   */
  template <typename T>
  struct ds_array<T,true> {

    //! Get memory for new array of length \c sz
    KOKKOS_INLINE_FUNCTION
    static T* get(int sz) {
      if (sz > 0) {
        T* m = Impl::ds_strided_alloc<T>(sz);
        return m;
      }
      return NULL;
    }

    //! Get memory for new array of length \c sz and fill with zeros
    KOKKOS_INLINE_FUNCTION
    static T* get_and_fill(int sz) {
      if (sz > 0) {
        T* m = Impl::ds_strided_alloc<T>(sz);
        for (int i=threadIdx.x; i<sz; i+=blockDim.x)
          m[i] = 0.0;
        return m;
      }
      return NULL;
    }

    /*!
     * \brief Get memory for new array of length \c sz and fill with
     * entries from \c src
     */
    KOKKOS_INLINE_FUNCTION
    static T* get_and_fill(const T* src, int sz) {
      if (sz > 0) {
        T* m = Impl::ds_strided_alloc<T>(sz);
        for (int i=threadIdx.x; i<sz; i+=blockDim.x)
          m[i] = src[i];
        return m;
      }
      return NULL;
    }

    /*!
     * \brief Get memory for new array of length \c sz and fill with
     * strided entries from \c src
     */
    KOKKOS_INLINE_FUNCTION
    static T* strided_get_and_fill(const T* src, int stride, int sz) {
      if (sz > 0) {
        T* m = Impl::ds_strided_alloc<T>(sz);
        for (int i=threadIdx.x; i<sz; i+=blockDim.x)
          m[i] = src[i*stride];
        return m;
      }
      return NULL;
    }

    //! Copy array from \c src to \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void copy(const T* src, T* dest, int sz) {
      if (sz > 0)
        for (int i=threadIdx.x; i<sz; i+=blockDim.x)
          dest[i] = src[i];
    }

    //! Copy strided array from \c src to \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void strided_copy(const T* src, int src_stride,
                             T* dest, int dest_stride, int sz) {
      for (int i=threadIdx.x; i<sz; i+=blockDim.x) {
        dest[i*dest_stride] = src[i*src_stride];
      }
    }

    //! Zero out array \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void zero(T* dest, int sz) {
      if (sz > 0)
        for (int i=threadIdx.x; i<sz; i+=blockDim.x)
          dest[i] = T(0.);
    }

    //! Zero out strided array \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void strided_zero(T* dest, int stride, int sz) {
      for (int i=threadIdx.x; i<sz; i+=blockDim.x) {
        dest[i*stride] = T(0.);
      }
    }

    //! Destroy array elements and release memory
    KOKKOS_INLINE_FUNCTION
    static void destroy_and_release(T* m, int sz) {
      Impl::ds_strided_free(m, sz);
    }
  };

#elif defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD_STRIDED) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__)

  namespace Impl {

    template <typename T>
    KOKKOS_INLINE_FUNCTION
    static T* ds_strided_alloc(const int sz) {
      T* m = 0;
      // Only do strided memory allocations when we are doing hierarchical
      // parallelism with a vector dimension of 32.  The limitation on the
      // memory pool allowing only a single thread in a warp to allocate
      // makes it too difficult to do otherwise.
      if (blockDim.x == 32) {
        // const int total_sz = warpReduce(sz);
        // const int lane = warpLane();
        const int total_sz = warpReduce(sz, blockDim.x);
        const int lane = threadIdx.x;
        if (total_sz > 0 && lane == 0) {
#if defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL)
          m = static_cast<T*>(global_sacado_cuda_memory_pool_on_device->allocate(total_sz*sizeof(T)));
          if (m == 0)
            Kokkos::abort("Allocation failed.  Kokkos memory pool is out of memory");
#else
          m = static_cast<T* >(operator new(total_sz*sizeof(T)));
#if defined(HAVE_SACADO_KOKKOSCORE)
          if (m == 0)
            Kokkos::abort("Allocation failed.");
#endif
#endif
        }
        m = warpBcast(m,0,blockDim.x);
        m += lane;
      }
      else {
        if (sz > 0) {
          m = static_cast<T* >(operator new(sz*sizeof(T)));
#if defined(HAVE_SACADO_KOKKOSCORE)
          if (m == 0)
            Kokkos::abort("Allocation failed.");
#endif
        }
      }

      return m;
    }

    template <typename T>
    KOKKOS_INLINE_FUNCTION
    static void ds_strided_free(T* m, int sz) {
      if (blockDim.x == 32) {
        // const int total_sz = warpReduce(sz);
        // const int lane = warpLane();
        const int total_sz = warpReduce(sz, blockDim.x);
        const int lane = threadIdx.x;
        if (total_sz > 0 && lane == 0) {
#if defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL)
          global_sacado_cuda_memory_pool_on_device->deallocate((void*) m, total_sz*sizeof(T));
#else
          operator delete((void*) m);
#endif
        }
      }
      else {
        if (sz > 0)
          operator delete((void*) m);
      }
    }

  }

  /*!
   * \brief Dynamic array allocation class that is specialized for scalar
   * i.e., fundamental or built-in types (float, double, etc...).
   */
  template <typename T>
  struct ds_array<T,true> {

    //! Get memory for new array of length \c sz
    KOKKOS_INLINE_FUNCTION
    static T* get(int sz) {
      if (sz > 0) {
        T* m = Impl::ds_strided_alloc<T>(sz);
        return m;
      }
      return NULL;
    }

    //! Get memory for new array of length \c sz and fill with zeros
    KOKKOS_INLINE_FUNCTION
    static T* get_and_fill(int sz) {
      if (sz > 0) {
        T* m = Impl::ds_strided_alloc<T>(sz);
        for (int i=0; i<sz; ++i)
          m[i*blockDim.x] = 0.0;
        return m;
      }
      return NULL;
    }

    /*!
     * \brief Get memory for new array of length \c sz and fill with
     * entries from \c src
     */
    KOKKOS_INLINE_FUNCTION
    static T* get_and_fill(const T* src, int sz) {
      if (sz > 0) {
        T* m = Impl::ds_strided_alloc<T>(sz);
        for (int i=0; i<sz; ++i)
          m[i*blockDim.x] = src[i*blockDim.x];
        return m;
      }
      return NULL;
    }

    /*!
     * \brief Get memory for new array of length \c sz and fill with
     * strided entries from \c src
     */
    KOKKOS_INLINE_FUNCTION
    static T* strided_get_and_fill(const T* src, int stride, int sz) {
      if (sz > 0) {
        T* m = Impl::ds_strided_alloc<T>(sz);
        for (int i=0; i<sz; ++i)
          m[i*blockDim.x] = src[i*stride];
        return m;
      }
      return NULL;
    }

    //! Copy array from \c src to \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void copy(const T* src, T* dest, int sz) {
      if (sz > 0)
        for (int i=0; i<sz; ++i)
          dest[i*blockDim.x] = src[i*blockDim.x];
    }

    //! Copy strided array from \c src to \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void strided_copy(const T* src, int src_stride,
                             T* dest, int dest_stride, int sz) {
      for (int i=0; i<sz; ++i) {
        *(dest) = *(src);
        dest += dest_stride;
        src += src_stride;
      }
    }

    //! Zero out array \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void zero(T* dest, int sz) {
      if (sz > 0)
        for (int i=0; i<sz; ++i)
          dest[i*blockDim.x] = T(0.);
    }

    //! Zero out strided array \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void strided_zero(T* dest, int stride, int sz) {
      for (int i=0; i<sz; ++i) {
        *(dest) = T(0.);
        dest += stride;
      }
    }

    //! Destroy array elements and release memory
    KOKKOS_INLINE_FUNCTION
    static void destroy_and_release(T* m, int sz) {
      Impl::ds_strided_free(m, sz);
    }
  };

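  // Layout illustration (added commentary, not in the original source): with
  // a vector dimension of blockDim.x == 32, the strided specialization above
  // places element i of warp lane l at offset l + i*32 within the warp's
  // single allocation,
  //
  //   [ l0_e0 l1_e0 ... l31_e0 | l0_e1 l1_e1 ... l31_e1 | ... ]
  //
  // whereas the SACADO_VIEW_CUDA_HIERARCHICAL_DFAD specialization earlier in
  // this file shares one array across the warp and has the lanes cooperate
  // on it with a blockDim.x stride.
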
#else

  /*!
   * \brief Dynamic array allocation class that is specialized for scalar
   * i.e., fundamental or built-in types (float, double, etc...).
   */
  template <typename T>
  struct ds_array<T,true> {

    //! Get memory for new array of length \c sz
    KOKKOS_INLINE_FUNCTION
    static T* get(int sz) {
      if (sz > 0) {
        T* m = Impl::ds_alloc<T>(sz);
        return m;
      }
      return NULL;
    }

    //! Get memory for new array of length \c sz and fill with zeros
    KOKKOS_INLINE_FUNCTION
    static T* get_and_fill(int sz) {
      if (sz > 0) {
        T* m = Impl::ds_alloc<T>(sz);
#ifdef __CUDACC__
        for (int i=0; i<sz; ++i)
          m[i] = 0.0;
#else
        std::memset(m,0,sz*sizeof(T));
#endif
        return m;
      }
      return NULL;
    }

    /*!
     * \brief Get memory for new array of length \c sz and fill with
     * entries from \c src
     */
    KOKKOS_INLINE_FUNCTION
    static T* get_and_fill(const T* src, int sz) {
      if (sz > 0) {
        T* m = Impl::ds_alloc<T>(sz);
        for (int i=0; i<sz; ++i)
          m[i] = src[i];
        return m;
      }
      return NULL;
    }

    /*!
     * \brief Get memory for new array of length \c sz and fill with
     * strided entries from \c src
     */
    KOKKOS_INLINE_FUNCTION
    static T* strided_get_and_fill(const T* src, int stride, int sz) {
      if (sz > 0) {
        T* m = Impl::ds_alloc<T>(sz);
        for (int i=0; i<sz; ++i)
          m[i] = src[i*stride];
        return m;
      }
      return NULL;
    }

    //! Copy array from \c src to \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void copy(const T* src, T* dest, int sz) {
      if (sz > 0 && dest != NULL && src != NULL)
#ifdef __CUDACC__
        for (int i=0; i<sz; ++i)
          dest[i] = src[i];
#else
        std::memcpy(dest,src,sz*sizeof(T));
#endif
    }

    //! Copy strided array from \c src to \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void strided_copy(const T* src, int src_stride,
                             T* dest, int dest_stride, int sz) {
      for (int i=0; i<sz; ++i) {
        *(dest) = *(src);
        dest += dest_stride;
        src += src_stride;
      }
    }

    //! Zero out array \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void zero(T* dest, int sz) {
      if (sz > 0 && dest != NULL)
#ifdef __CUDACC__
        for (int i=0; i<sz; ++i)
          dest[i] = T(0.);
#else
        std::memset(dest,0,sz*sizeof(T));
#endif
    }

    //! Zero out strided array \c dest of length \c sz
    KOKKOS_INLINE_FUNCTION
    static void strided_zero(T* dest, int stride, int sz) {
      for (int i=0; i<sz; ++i) {
        *(dest) = T(0.);
        dest += stride;
      }
    }

    //! Destroy array elements and release memory
    KOKKOS_INLINE_FUNCTION
    static void destroy_and_release(T* m, int sz) {
      Impl::ds_free(m, sz);
    }
  };

#endif

} // namespace Sacado

#endif // SACADO_DYNAMICARRAYTRAITS_HPP