10 #ifndef SACADO_DYNAMICARRAYTRAITS_HPP
11 #define SACADO_DYNAMICARRAYTRAITS_HPP
18 #if defined(HAVE_SACADO_KOKKOS)
19 #include "Kokkos_Core.hpp"
24 template <
typename ExecSpace>
26 ,
const size_t min_total_alloc_size
27 ,
const uint32_t min_block_alloc_size
28 ,
const uint32_t max_block_alloc_size
29 ,
const uint32_t min_superblock_size
32 template <
typename ExecSpace>
35 #if 0 && defined(HAVE_SACADO_KOKKOS) && defined(KOKKOS_ENABLE_OPENMP)
37 extern const Kokkos::MemoryPool<Kokkos::OpenMP>* global_sacado_openmp_memory_pool;
42 ,
const size_t min_total_alloc_size
43 ,
const uint32_t min_block_alloc_size
44 ,
const uint32_t max_block_alloc_size
45 ,
const uint32_t min_superblock_size
48 typedef Kokkos::MemoryPool<Kokkos::OpenMP> pool_t;
49 Impl::global_sacado_openmp_memory_pool =
50 new pool_t(
typename Kokkos::OpenMP::memory_space(),
59 delete Impl::global_sacado_openmp_memory_pool;
63 #if defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__)
67 extern const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_host;
68 extern const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_device;
69 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
70 extern __device__
const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_on_device;
72 __device__
const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_on_device = 0;
75 struct SetMemoryPoolPtr {
76 Kokkos::MemoryPool<Kokkos::Cuda>* pool_device;
77 __device__
inline void operator()(
int)
const {
78 global_sacado_cuda_memory_pool_on_device = pool_device;
88 ,
const size_t min_total_alloc_size
89 ,
const uint32_t min_block_alloc_size
90 ,
const uint32_t max_block_alloc_size
91 ,
const uint32_t min_superblock_size
94 typedef Kokkos::MemoryPool<Kokkos::Cuda> pool_t;
96 new pool_t(
typename Kokkos::Cuda::memory_space(),
100 min_superblock_size);
101 Impl::SetMemoryPoolPtr
f;
102 KOKKOS_IMPL_CUDA_SAFE_CALL( cudaMalloc( &f.pool_device,
sizeof(pool_t) ) );
103 KOKKOS_IMPL_CUDA_SAFE_CALL( cudaMemcpy( f.pool_device, pool,
105 cudaMemcpyHostToDevice ) );
106 Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,1),f);
107 Impl::global_sacado_cuda_memory_pool_host = pool;
108 Impl::global_sacado_cuda_memory_pool_device = f.pool_device;
113 KOKKOS_IMPL_CUDA_SAFE_CALL( cudaFree( (
void*) Impl::global_sacado_cuda_memory_pool_device ) );
114 delete Impl::global_sacado_cuda_memory_pool_host;
119 #if !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__)
124 __device__
inline int warpLane(
const int warp_size = 32) {
125 return ( threadIdx.x + (threadIdx.y + threadIdx.z*blockDim.y)*blockDim.x ) % warp_size;
129 template <
typename T>
130 __device__
inline T warpReduce(
T y,
const int warp_size = 32) {
131 for (
int i=1;
i<warp_size;
i*=2) {
132 y += Kokkos::shfl_down(y,
i, warp_size);
134 y = Kokkos::shfl(y, 0, warp_size);
139 template <
typename T>
140 __device__
inline int warpScan(
T y,
const int warp_size = 32) {
141 const int lane = warpLane();
142 y = Kokkos::shfl_up(y, 1, warp_size);
145 for (
int i=1;
i<warp_size;
i*=2) {
146 T t = Kokkos::shfl_up(y,
i, warp_size);
153 template <
typename T>
154 __device__
inline T warpBcast(
T y,
int id,
const int warp_size = 32) {
155 return Kokkos::shfl(y,
id, warp_size);
164 template <
typename T>
167 #if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_ENABLE_CUDA_UVM) && !defined( __CUDA_ARCH__ )
170 KOKKOS_IMPL_CUDA_SAFE_CALL( cudaMallocManaged( (
void**) &m, sz*
sizeof(
T), cudaMemAttachGlobal ) );
171 #elif defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__)
174 const int total_sz = warpReduce(sz);
175 const int lane = warpLane();
176 if (total_sz > 0 && lane == 0) {
177 m =
static_cast<T*
>(global_sacado_cuda_memory_pool_on_device->allocate(total_sz*
sizeof(
T)));
179 Kokkos::abort(
"Allocation failed. Kokkos memory pool is out of memory");
183 #elif 0 && defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && defined(KOKKOS_ENABLE_OPENMP)
186 if (global_sacado_openmp_memory_pool != 0) {
187 m =
static_cast<T*
>(global_sacado_openmp_memory_pool->allocate(sz*
sizeof(
T)));
189 Kokkos::abort(
"Allocation failed. Kokkos memory pool is out of memory");
192 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
197 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
198 #if defined(HAVE_SACADO_KOKKOS)
200 Kokkos::abort(
"Allocation failed.");
207 template <
typename T>
210 #if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_ENABLE_CUDA_UVM) && !defined( __CUDA_ARCH__ )
212 KOKKOS_IMPL_CUDA_SAFE_CALL( cudaFree(m) );
213 #elif defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__)
214 const int total_sz = warpReduce(sz);
215 const int lane = warpLane();
216 if (total_sz > 0 && lane == 0) {
217 global_sacado_cuda_memory_pool_on_device->deallocate((
void*) m, total_sz*
sizeof(
T));
219 #elif 0 && defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && defined(KOKKOS_ENABLE_OPENMP)
221 if (global_sacado_openmp_memory_pool != 0)
222 global_sacado_openmp_memory_pool->deallocate((
void*) m, sz*
sizeof(
T));
224 operator delete((
void*) m);
228 operator delete((
void*) m);
242 static T*
get(
int sz) {
243 T* m = Impl::ds_alloc<T>(sz);
245 for (
int i=0;
i<sz; ++
i)
253 T* m = Impl::ds_alloc<T>(sz);
255 for (
int i=0;
i<sz; ++
i)
266 T* m = Impl::ds_alloc<T>(sz);
268 for (
int i=0;
i<sz; ++
i)
269 new (p++)
T(*(src++));
279 T* m = Impl::ds_alloc<T>(sz);
281 for (
int i=0;
i<sz; ++
i) {
290 static void copy(
const T* src,
T* dest,
int sz) {
291 for (
int i=0;
i<sz; ++
i)
292 *(dest++) = *(src++);
298 T* dest,
int dest_stride,
int sz) {
299 for (
int i=0;
i<sz; ++
i) {
309 for (
int i=0;
i<sz; ++
i)
316 for (
int i=0;
i<sz; ++
i) {
326 for (
T* b = m; b!=e; b++)
332 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__)
336 template <
typename T>
338 static T* ds_strided_alloc(
const int sz) {
344 if (blockDim.x == 32) {
346 const int lane = threadIdx.x;
347 if (sz > 0 && lane == 0) {
348 #if defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL)
349 m =
static_cast<T*
>(global_sacado_cuda_memory_pool_on_device->allocate(sz*
sizeof(
T)));
351 Kokkos::abort(
"Allocation failed. Kokkos memory pool is out of memory");
353 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
354 #if defined(HAVE_SACADO_KOKKOS)
356 Kokkos::abort(
"Allocation failed.");
360 m = warpBcast(m,0,blockDim.x);
364 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
365 #if defined(HAVE_SACADO_KOKKOS)
367 Kokkos::abort(
"Allocation failed.");
375 template <
typename T>
377 static void ds_strided_free(
T* m,
int sz) {
378 if (blockDim.x == 32) {
380 const int lane = threadIdx.x;
381 if (sz > 0 && lane == 0) {
382 #if defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL)
383 global_sacado_cuda_memory_pool_on_device->deallocate((
void*) m, sz*
sizeof(
T));
385 operator delete((
void*) m);
391 operator delete((
void*) m);
402 template <
typename T>
403 struct ds_array<
T,
true> {
407 static T*
get(
int sz) {
408 T* m = Impl::ds_strided_alloc<T>(sz);
415 T* m = Impl::ds_strided_alloc<T>(sz);
416 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
427 T* m = Impl::ds_strided_alloc<T>(sz);
428 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
439 T* m = Impl::ds_strided_alloc<T>(sz);
440 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
441 m[
i] = src[
i*stride];
447 static void copy(
const T* src,
T* dest,
int sz) {
448 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
455 T* dest,
int dest_stride,
int sz) {
456 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x) {
457 dest[
i*dest_stride] = src[
i*src_stride];
463 static void zero(
T* dest,
int sz) {
464 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
471 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x) {
472 dest[
i*stride] =
T(0.);
479 Impl::ds_strided_free(m, sz);
483 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD_STRIDED) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__)
487 template <
typename T>
489 static T* ds_strided_alloc(
const int sz) {
495 if (blockDim.x == 32) {
498 const int total_sz = warpReduce(sz, blockDim.x);
499 const int lane = threadIdx.x;
500 if (total_sz > 0 && lane == 0) {
501 #if defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL)
502 m =
static_cast<T*
>(global_sacado_cuda_memory_pool_on_device->allocate(total_sz*
sizeof(
T)));
504 Kokkos::abort(
"Allocation failed. Kokkos memory pool is out of memory");
506 m =
static_cast<T*
>(
operator new(total_sz*
sizeof(
T)));
507 #if defined(HAVE_SACADO_KOKKOS)
509 Kokkos::abort(
"Allocation failed.");
513 m = warpBcast(m,0,blockDim.x);
518 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
519 #if defined(HAVE_SACADO_KOKKOS)
521 Kokkos::abort(
"Allocation failed.");
529 template <
typename T>
531 static void ds_strided_free(
T* m,
int sz) {
532 if (blockDim.x == 32) {
535 const int total_sz = warpReduce(sz, blockDim.x);
536 const int lane = threadIdx.x;
537 if (total_sz > 0 && lane == 0) {
538 #if defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL)
539 global_sacado_cuda_memory_pool_on_device->deallocate((
void*) m, total_sz*
sizeof(
T));
541 operator delete((
void*) m);
547 operator delete((
void*) m);
556 template <
typename T>
557 struct ds_array<
T,
true> {
561 static T*
get(
int sz) {
562 T* m = Impl::ds_strided_alloc<T>(sz);
569 T* m = Impl::ds_strided_alloc<T>(sz);
570 for (
int i=0;
i<sz; ++
i)
571 m[
i*blockDim.x] = 0.0;
581 T* m = Impl::ds_strided_alloc<T>(sz);
582 for (
int i=0;
i<sz; ++
i)
583 m[
i*blockDim.x] = src[
i*blockDim.x];
593 T* m = Impl::ds_strided_alloc<T>(sz);
594 for (
int i=0;
i<sz; ++
i)
595 m[
i*blockDim.x] = src[
i*stride];
601 static void copy(
const T* src,
T* dest,
int sz) {
602 for (
int i=0;
i<sz; ++
i)
603 dest[
i*blockDim.x] = src[
i*blockDim.x];
609 T* dest,
int dest_stride,
int sz) {
610 for (
int i=0;
i<sz; ++
i) {
619 static void zero(
T* dest,
int sz) {
620 for (
int i=0;
i<sz; ++
i)
621 dest[
i*blockDim.x] =
T(0.);
627 for (
int i=0;
i<sz; ++
i) {
636 Impl::ds_strided_free(m, sz);
646 template <
typename T>
651 static T*
get(
int sz) {
652 T* m = Impl::ds_alloc<T>(sz);
659 T* m = Impl::ds_alloc<T>(sz);
660 #if defined(__CUDACC__ ) || defined(__HIPCC__ )
661 for (
int i=0;
i<sz; ++
i)
665 std::memset(m,0,sz*
sizeof(
T));
676 T* m = Impl::ds_alloc<T>(sz);
677 for (
int i=0;
i<sz; ++
i)
688 T* m = Impl::ds_alloc<T>(sz);
689 for (
int i=0;
i<sz; ++
i)
690 m[
i] = src[
i*stride];
696 static void copy(
const T* src,
T* dest,
int sz) {
697 if (sz > 0 && dest != NULL && src != NULL)
698 #if defined( __CUDACC__) || defined(__HIPCC__ )
699 for (
int i=0;
i<sz; ++
i)
702 std::memcpy(dest,src,sz*
sizeof(
T));
709 T* dest,
int dest_stride,
int sz) {
710 for (
int i=0;
i<sz; ++
i) {
720 if (sz > 0 && dest != NULL)
721 #if defined(__CUDACC__ ) || defined(__HIPCC__ )
722 for (
int i=0;
i<sz; ++
i)
725 std::memset(dest,0,sz*
sizeof(
T));
732 for (
int i=0;
i<sz; ++
i) {
749 #endif // SACADO_DYNAMICARRAY_HPP
static SACADO_INLINE_FUNCTION void strided_copy(const T *src, int src_stride, T *dest, int dest_stride, int sz)
Copy array from src to dest of length sz.
static SACADO_INLINE_FUNCTION void strided_zero(T *dest, int stride, int sz)
Zero out array dest of length sz.
static SACADO_INLINE_FUNCTION void copy(const T *src, T *dest, int sz)
Copy array from src to dest of length sz.
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
static SACADO_INLINE_FUNCTION void zero(T *dest, int sz)
Zero out array dest of length sz.
static SACADO_INLINE_FUNCTION void destroy_and_release(T *m, int sz)
Destroy array elements and release memory.
static SACADO_INLINE_FUNCTION T * get_and_fill(const T *src, int sz)
Get memory for new array of length sz and fill with entries from src.
static SACADO_INLINE_FUNCTION void zero(T *dest, int sz)
Zero out array dest of length sz.
static SACADO_INLINE_FUNCTION T * strided_get_and_fill(const T *src, int stride, int sz)
Get memory for new array of length sz and fill with entries from src.
static SACADO_INLINE_FUNCTION void strided_copy(const T *src, int src_stride, T *dest, int dest_stride, int sz)
Copy array from src to dest of length sz.
static SACADO_INLINE_FUNCTION void ds_free(T *m, int sz)
static SACADO_INLINE_FUNCTION void destroy_and_release(T *m, int sz)
Destroy array elements and release memory.
static SACADO_INLINE_FUNCTION T * ds_alloc(const int sz)
static SACADO_INLINE_FUNCTION void copy(const T *src, T *dest, int sz)
Copy array from src to dest of length sz.
static SACADO_INLINE_FUNCTION T * strided_get_and_fill(const T *src, int stride, int sz)
Get memory for new array of length sz and fill with entries from src.
void destroyGlobalMemoryPool(const ExecSpace &space)
#define SACADO_INLINE_FUNCTION
static SACADO_INLINE_FUNCTION T * get_and_fill(int sz)
Get memory for new array of length sz and fill with zeros.
static SACADO_INLINE_FUNCTION T * get_and_fill(const T *src, int sz)
Get memory for new array of length sz and fill with entries from src.
Dynamic array allocation class that works for any type.
static SACADO_INLINE_FUNCTION void strided_zero(T *dest, int stride, int sz)
Zero out array dest of length sz.
static SACADO_INLINE_FUNCTION T * get_and_fill(int sz)
Get memory for new array of length sz and fill with zeros.