30 #ifndef SACADO_DYNAMICARRAYTRAITS_HPP
31 #define SACADO_DYNAMICARRAYTRAITS_HPP
38 #if defined(HAVE_SACADO_KOKKOS)
39 #include "Kokkos_Core.hpp"
// NOTE(review): garbled fragment -- the declaration line naming the function
// (presumably `void createGlobalMemoryPool(const ExecSpace& space` per the
// doxygen summary appended at the end of this file) is missing between the
// template header and the parameter list below.  TODO confirm against the
// upstream Sacado header.
44 template <
typename ExecSpace>
46 ,
const size_t min_total_alloc_size
47 ,
const uint32_t min_block_alloc_size
48 ,
const uint32_t max_block_alloc_size
49 ,
const uint32_t min_superblock_size
// NOTE(review): second template header; presumably the matching
// `destroyGlobalMemoryPool(const ExecSpace& space)` declaration followed
// here -- the line is missing from this fragment.
52 template <
typename ExecSpace>
// OpenMP memory-pool support.  The leading `#if 0 &&` permanently disables
// this entire section regardless of the other macros.
55 #if 0 && defined(HAVE_SACADO_KOKKOS) && defined(KOKKOS_ENABLE_OPENMP)
// Host-side pointer to the (disabled) OpenMP memory pool singleton.
57 extern const Kokkos::MemoryPool<Kokkos::OpenMP>* global_sacado_openmp_memory_pool;
// NOTE(review): fragment -- the OpenMP overload's signature line for
// createGlobalMemoryPool is missing; only its trailing parameters survive.
62 ,
const size_t min_total_alloc_size
63 ,
const uint32_t min_block_alloc_size
64 ,
const uint32_t max_block_alloc_size
65 ,
const uint32_t min_superblock_size
// Build the pool in the OpenMP memory space and stash it in the global.
68 typedef Kokkos::MemoryPool<Kokkos::OpenMP> pool_t;
69 Impl::global_sacado_openmp_memory_pool =
70 new pool_t(
typename Kokkos::OpenMP::memory_space(),
// NOTE(review): fragment -- this `delete` presumably belongs to the
// destroyGlobalMemoryPool(Kokkos::OpenMP) overload whose signature is missing.
79 delete Impl::global_sacado_openmp_memory_pool;
// CUDA memory-pool globals: one host-side view of the pool, one pointer to
// the device-resident copy, and a __device__ global used by device code.
83 #if defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__)
87 extern const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_host;
88 extern const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_device;
// With relocatable device code the __device__ global is only declared here
// (defined in one translation unit); otherwise each TU defines its own copy
// initialized to null.
89 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
90 extern __device__
const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_on_device;
92 __device__
const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_on_device = 0;
// Functor launched (see the RangePolicy<Kokkos::Cuda>(0,1) parallel_for
// below) to publish the device-resident pool pointer into the __device__
// global so device-side allocation code can reach it.
// NOTE(review): the closing braces of operator() and of the struct are
// missing from this fragment.
95 struct SetMemoryPoolPtr {
96 Kokkos::MemoryPool<Kokkos::Cuda>* pool_device;
97 __device__
inline void operator()(
int)
const {
98 global_sacado_cuda_memory_pool_on_device = pool_device;
// NOTE(review): fragment of the Kokkos::Cuda overload of
// createGlobalMemoryPool -- its signature line is missing; only the trailing
// sizing parameters survive.
108 ,
const size_t min_total_alloc_size
109 ,
const uint32_t min_block_alloc_size
110 ,
const uint32_t max_block_alloc_size
111 ,
const uint32_t min_superblock_size
// Build the host-side pool object in CUDA memory space.
// NOTE(review): the `pool_t* pool =` line preceding `new pool_t(` appears to
// be missing from this fragment.
114 typedef Kokkos::MemoryPool<Kokkos::Cuda> pool_t;
116 new pool_t(
typename Kokkos::Cuda::memory_space(),
117 min_total_alloc_size,
118 min_block_alloc_size,
119 max_block_alloc_size,
120 min_superblock_size);
// Copy the pool object to device memory and run the SetMemoryPoolPtr functor
// once so the __device__ global points at the device-resident copy.
121 Impl::SetMemoryPoolPtr
f;
122 KOKKOS_IMPL_CUDA_SAFE_CALL( cudaMalloc( &f.pool_device,
sizeof(pool_t) ) );
123 KOKKOS_IMPL_CUDA_SAFE_CALL( cudaMemcpy( f.pool_device, pool,
125 cudaMemcpyHostToDevice ) );
126 Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,1),f);
127 Impl::global_sacado_cuda_memory_pool_host = pool;
128 Impl::global_sacado_cuda_memory_pool_device = f.pool_device;
// NOTE(review): fragment of destroyGlobalMemoryPool(Kokkos::Cuda) -- its
// signature line is missing.  Frees the device copy and deletes the host pool.
133 KOKKOS_IMPL_CUDA_SAFE_CALL( cudaFree( (
void*) Impl::global_sacado_cuda_memory_pool_device ) );
134 delete Impl::global_sacado_cuda_memory_pool_host;
139 #if !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__)
144 __device__
inline int warpLane(
const int warp_size = 32) {
145 return ( threadIdx.x + (threadIdx.y + threadIdx.z*blockDim.y)*blockDim.x ) % warp_size;
149 template <
typename T>
150 __device__
inline T warpReduce(
T y,
const int warp_size = 32) {
151 for (
int i=1;
i<warp_size;
i*=2) {
152 y += Kokkos::shfl_down(y,
i, warp_size);
154 y = Kokkos::shfl(y, 0, warp_size);
// Warp-level prefix sum of `y` across lanes.  The initial shfl_up by 1
// suggests an exclusive scan -- TODO confirm; note the declared return type
// is int while the accumulated value is T.
// NOTE(review): interior lines are missing from this fragment (presumably the
// lane-0 initialization after the first shfl_up, the conditional accumulation
// of `t`, and the return) -- do not rely on this text as the full algorithm.
159 template <
typename T>
160 __device__
inline int warpScan(
T y,
const int warp_size = 32) {
161 const int lane = warpLane();
162 y = Kokkos::shfl_up(y, 1, warp_size);
// Log-step scan: each pass pulls the partial sum from `i` lanes below.
165 for (
int i=1;
i<warp_size;
i*=2) {
166 T t = Kokkos::shfl_up(y,
i, warp_size);
173 template <
typename T>
174 __device__
inline T warpBcast(
T y,
int id,
const int warp_size = 32) {
175 return Kokkos::shfl(y,
id, warp_size);
// ds_alloc<T>(sz): allocate uninitialized storage for sz objects of T,
// choosing the mechanism by build configuration (UVM managed memory on host,
// the Kokkos memory pool in device code, or plain operator new).
// NOTE(review): fragment -- the function signature line, `T* m = 0;`
// initialization, several closing braces and #else/#endif lines are missing.
184 template <
typename T>
// Host path with CUDA UVM: allocate managed memory visible to host+device.
187 #if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_ENABLE_CUDA_UVM) && !defined( __CUDA_ARCH__ )
190 KOKKOS_IMPL_CUDA_SAFE_CALL( cudaMallocManaged( (
void**) &m, sz*
sizeof(
T), cudaMemAttachGlobal ) );
// Device path: one pool allocation per warp -- warpReduce sums the per-lane
// sizes and lane 0 allocates the warp's total from the device-side pool.
191 #elif defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__)
194 const int total_sz = warpReduce(sz);
195 const int lane = warpLane();
196 if (total_sz > 0 && lane == 0) {
197 m =
static_cast<T*
>(global_sacado_cuda_memory_pool_on_device->allocate(total_sz*
sizeof(
T)));
// Abort (presumably when the pool returned null -- the guarding `if` line is
// missing from this fragment).
199 Kokkos::abort(
"Allocation failed. Kokkos memory pool is out of memory");
// OpenMP pool path: dead code, disabled by the leading `0 &&`.
203 #elif 0 && defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && defined(KOKKOS_ENABLE_OPENMP)
206 if (global_sacado_openmp_memory_pool != 0) {
207 m =
static_cast<T*
>(global_sacado_openmp_memory_pool->allocate(sz*
sizeof(
T)));
209 Kokkos::abort(
"Allocation failed. Kokkos memory pool is out of memory");
// Fallback inside the disabled OpenMP branch: raw operator new.
212 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
// Default path: raw operator new, with a Kokkos::abort on failure when
// Kokkos is available (the guarding `if` line is missing from the fragment).
217 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
218 #if defined(HAVE_SACADO_KOKKOS)
220 Kokkos::abort(
"Allocation failed.");
// ds_free<T>(m, sz): release storage obtained from ds_alloc, mirroring its
// configuration-dependent allocation paths.
// NOTE(review): fragment -- the signature line, closing braces and some
// #else/#endif lines are missing.
227 template <
typename T>
// Host UVM path: cudaFree the managed allocation.
230 #if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_ENABLE_CUDA_UVM) && !defined( __CUDA_ARCH__ )
232 KOKKOS_IMPL_CUDA_SAFE_CALL( cudaFree(m) );
// Device path: lane 0 returns the warp's combined allocation to the pool,
// matching the warp-aggregated allocate in ds_alloc.
233 #elif defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__)
234 const int total_sz = warpReduce(sz);
235 const int lane = warpLane();
236 if (total_sz > 0 && lane == 0) {
237 global_sacado_cuda_memory_pool_on_device->deallocate((
void*) m, total_sz*
sizeof(
T));
// OpenMP pool path: dead code, disabled by the leading `0 &&`.
239 #elif 0 && defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && defined(KOKKOS_ENABLE_OPENMP)
241 if (global_sacado_openmp_memory_pool != 0)
242 global_sacado_openmp_memory_pool->deallocate((
void*) m, sz*
sizeof(
T));
244 operator delete((
void*) m);
// Default path: plain operator delete.
248 operator delete((
void*) m);
// Generic ds_array methods (per the appended doxygen summaries: get,
// get_and_fill, strided_get_and_fill, copy, strided_copy, zero, strided_zero,
// destroy_and_release).  Element-wise loops over ds_alloc'd storage.
// NOTE(review): heavily fragmented -- most method signatures, loop bodies and
// closing braces are missing; comments below only describe what survives.
// get(sz): allocate sz elements, then presumably default-construct each
// (the loop body line is missing).
262 static T*
get(
int sz) {
263 T* m = Impl::ds_alloc<T>(sz);
265 for (
int i=0;
i<sz; ++
i)
// get_and_fill(sz): allocate then zero/value-initialize each element
// (loop body missing).
273 T* m = Impl::ds_alloc<T>(sz);
275 for (
int i=0;
i<sz; ++
i)
// get_and_fill(src, sz): allocate then placement-copy-construct from src.
286 T* m = Impl::ds_alloc<T>(sz);
288 for (
int i=0;
i<sz; ++
i)
289 new (p++)
T(*(src++));
// strided_get_and_fill(src, stride, sz): allocate then copy with a source
// stride (loop body missing).
299 T* m = Impl::ds_alloc<T>(sz);
301 for (
int i=0;
i<sz; ++
i) {
// copy(src, dest, sz): straight element-wise assignment.
310 static void copy(
const T* src,
T* dest,
int sz) {
311 for (
int i=0;
i<sz; ++
i)
312 *(dest++) = *(src++);
// strided_copy fragment: signature start (src, src_stride) lines missing.
318 T* dest,
int dest_stride,
int sz) {
319 for (
int i=0;
i<sz; ++
i) {
// zero(dest, sz) fragment (signature and body lines missing).
329 for (
int i=0;
i<sz; ++
i)
// strided_zero(dest, stride, sz) fragment.
336 for (
int i=0;
i<sz; ++
i) {
// destroy_and_release fragment: destroy each element in [m, e).
346 for (
T* b = m; b!=e; b++)
// Hierarchical-DFAD device specialization: allocation is done once per warp
// (lane/thread 0) and the pointer broadcast, while element loops are strided
// by threadIdx.x/blockDim.x so a warp cooperates on one logical array.
// NOTE(review): fragmented -- many signatures, else-branches and closing
// braces are missing.
352 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__)
// ds_strided_alloc<T>(sz): when the block is a single warp (blockDim.x==32),
// thread 0 allocates for the whole warp and the result is warp-broadcast.
356 template <
typename T>
358 static T* ds_strided_alloc(
const int sz) {
364 if (blockDim.x == 32) {
366 const int lane = threadIdx.x;
367 if (sz > 0 && lane == 0) {
368 #if defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL)
369 m =
static_cast<T*
>(global_sacado_cuda_memory_pool_on_device->allocate(sz*
sizeof(
T)));
371 Kokkos::abort(
"Allocation failed. Kokkos memory pool is out of memory");
// Fallback when the memory pool is not enabled: operator new on device.
373 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
374 #if defined(HAVE_SACADO_KOKKOS)
376 Kokkos::abort(
"Allocation failed.");
// Share lane 0's pointer with the rest of the warp.
380 m = warpBcast(m,0,blockDim.x);
// Non-warp-sized block: every thread allocates its own storage.
384 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
385 #if defined(HAVE_SACADO_KOKKOS)
387 Kokkos::abort(
"Allocation failed.");
// ds_strided_free<T>(m, sz): mirror of ds_strided_alloc -- lane 0 frees the
// warp's allocation; otherwise each thread deletes its own.
395 template <
typename T>
397 static void ds_strided_free(
T* m,
int sz) {
398 if (blockDim.x == 32) {
400 const int lane = threadIdx.x;
401 if (sz > 0 && lane == 0) {
402 #if defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL)
403 global_sacado_cuda_memory_pool_on_device->deallocate((
void*) m, sz*
sizeof(
T));
405 operator delete((
void*) m);
411 operator delete((
void*) m);
// ds_array<T,true>: warp-cooperative array ops; loops start at threadIdx.x
// and stride by blockDim.x so threads partition the elements.
422 template <
typename T>
423 struct ds_array<
T,
true> {
427 static T*
get(
int sz) {
428 T* m = Impl::ds_strided_alloc<T>(sz);
// get_and_fill(sz) fragment: zero-fill loop body line missing.
435 T* m = Impl::ds_strided_alloc<T>(sz);
436 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
// get_and_fill(src, sz) fragment: copy-fill loop body line missing.
447 T* m = Impl::ds_strided_alloc<T>(sz);
448 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
// strided_get_and_fill(src, stride, sz): gather from strided source.
459 T* m = Impl::ds_strided_alloc<T>(sz);
460 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
461 m[
i] = src[
i*stride];
// copy(src, dest, sz) fragment: assignment line missing.
467 static void copy(
const T* src,
T* dest,
int sz) {
468 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
// strided_copy fragment: signature start (src, src_stride) lines missing.
475 T* dest,
int dest_stride,
int sz) {
476 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x) {
477 dest[
i*dest_stride] = src[
i*src_stride];
// zero(dest, sz) fragment: body line missing.
483 static void zero(
T* dest,
int sz) {
484 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
// strided_zero(dest, stride, sz): zero a strided destination.
491 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x) {
492 dest[
i*stride] =
T(0.)
// destroy_and_release fragment: releases via ds_strided_free.
499 Impl::ds_strided_free(m, sz);
// Strided hierarchical-DFAD device specialization: like the section above,
// but elements are laid out with stride blockDim.x (index i*blockDim.x) and
// the warp allocation aggregates per-lane sizes with warpReduce.
// NOTE(review): fragmented -- many signatures, else-branches and closing
// braces are missing.
503 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD_STRIDED) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__)
// ds_strided_alloc<T>(sz): warp-aggregated allocation; lane 0 allocates the
// warp's summed size and (presumably, per the section above) broadcasts it.
507 template <
typename T>
509 static T* ds_strided_alloc(
const int sz) {
515 if (blockDim.x == 32) {
518 const int total_sz = warpReduce(sz, blockDim.x);
519 const int lane = threadIdx.x;
520 if (total_sz > 0 && lane == 0) {
521 #if defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL)
522 m =
static_cast<T*
>(global_sacado_cuda_memory_pool_on_device->allocate(total_sz*
sizeof(
T)));
524 Kokkos::abort(
"Allocation failed. Kokkos memory pool is out of memory");
// Pool disabled: operator new for the warp's total.
526 m =
static_cast<T*
>(
operator new(total_sz*
sizeof(
T)));
527 #if defined(HAVE_SACADO_KOKKOS)
529 Kokkos::abort(
"Allocation failed.");
// Share lane 0's pointer with the rest of the warp.
533 m = warpBcast(m,0,blockDim.x);
// Non-warp-sized block: per-thread allocation of sz elements.
538 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
539 #if defined(HAVE_SACADO_KOKKOS)
541 Kokkos::abort(
"Allocation failed.");
// ds_strided_free<T>(m, sz): mirror of the aggregated allocation above.
549 template <
typename T>
551 static void ds_strided_free(
T* m,
int sz) {
552 if (blockDim.x == 32) {
555 const int total_sz = warpReduce(sz, blockDim.x);
556 const int lane = threadIdx.x;
557 if (total_sz > 0 && lane == 0) {
558 #if defined(HAVE_SACADO_KOKKOS) && defined(SACADO_KOKKOS_USE_MEMORY_POOL)
559 global_sacado_cuda_memory_pool_on_device->deallocate((
void*) m, total_sz*
sizeof(
T));
561 operator delete((
void*) m);
567 operator delete((
void*) m);
// ds_array<T,true> (strided layout): per-thread loops index i*blockDim.x so
// consecutive logical elements are blockDim.x apart in memory.
576 template <
typename T>
577 struct ds_array<
T,
true> {
581 static T*
get(
int sz) {
582 T* m = Impl::ds_strided_alloc<T>(sz);
// get_and_fill(sz): zero-fill the strided layout.
589 T* m = Impl::ds_strided_alloc<T>(sz);
590 for (
int i=0;
i<sz; ++
i)
591 m[
i*blockDim.x] = 0.0;
// get_and_fill(src, sz): copy from an equally-strided source.
601 T* m = Impl::ds_strided_alloc<T>(sz);
602 for (
int i=0;
i<sz; ++
i)
603 m[
i*blockDim.x] = src[
i*blockDim.x];
// strided_get_and_fill(src, stride, sz): gather from a caller-strided source.
613 T* m = Impl::ds_strided_alloc<T>(sz);
614 for (
int i=0;
i<sz; ++
i)
615 m[
i*blockDim.x] = src[
i*stride];
// copy(src, dest, sz): both sides use the blockDim.x stride.
621 static void copy(
const T* src,
T* dest,
int sz) {
622 for (
int i=0;
i<sz; ++
i)
623 dest[
i*blockDim.x] = src[
i*blockDim.x];
// strided_copy fragment: signature start and assignment line missing.
629 T* dest,
int dest_stride,
int sz) {
630 for (
int i=0;
i<sz; ++
i) {
// zero(dest, sz): zero the strided layout.
639 static void zero(
T* dest,
int sz) {
640 for (
int i=0;
i<sz; ++
i)
641 dest[
i*blockDim.x] =
T(0.);
// strided_zero fragment: signature and assignment line missing.
647 for (
int i=0;
i<sz; ++
i) {
// destroy_and_release fragment: releases via ds_strided_free.
656 Impl::ds_strided_free(m, sz);
// Default ds_array specialization: plain element loops under CUDA/HIP
// (memset/memcpy are not usable in device code), std::memset/std::memcpy
// otherwise.  NOTE(review): fragmented -- method signatures for several
// members, loop bodies, #else/#endif lines and closing braces are missing.
666 template <
typename T>
// get(sz): raw allocation (no construction visible in this fragment).
671 static T*
get(
int sz) {
672 T* m = Impl::ds_alloc<T>(sz);
// get_and_fill(sz): zero via a loop on device builds, memset on host.
679 T* m = Impl::ds_alloc<T>(sz);
680 #if defined(__CUDACC__ ) || defined(__HIPCC__ )
681 for (
int i=0;
i<sz; ++
i)
685 std::memset(m,0,sz*
sizeof(
T));
// get_and_fill(src, sz) fragment: copy-fill loop body line missing.
696 T* m = Impl::ds_alloc<T>(sz);
697 for (
int i=0;
i<sz; ++
i)
// strided_get_and_fill(src, stride, sz): gather from a strided source.
708 T* m = Impl::ds_alloc<T>(sz);
709 for (
int i=0;
i<sz; ++
i)
710 m[
i] = src[
i*stride];
// copy(src, dest, sz): guarded against sz==0 / null pointers; loop on device,
// memcpy on host.
716 static void copy(
const T* src,
T* dest,
int sz) {
717 if (sz > 0 && dest != NULL && src != NULL)
718 #if defined( __CUDACC__) || defined(__HIPCC__ )
719 for (
int i=0;
i<sz; ++
i)
722 std::memcpy(dest,src,sz*
sizeof(
T));
// strided_copy fragment: signature start (src, src_stride) lines missing.
729 T* dest,
int dest_stride,
int sz) {
730 for (
int i=0;
i<sz; ++
i) {
// zero(dest, sz): guarded; loop on device, memset on host.
740 if (sz > 0 && dest != NULL)
741 #if defined(__CUDACC__ ) || defined(__HIPCC__ )
742 for (
int i=0;
i<sz; ++
i)
745 std::memset(dest,0,sz*
sizeof(
T));
// strided_zero fragment: signature and assignment line missing.
752 for (
int i=0;
i<sz; ++
i) {
769 #endif // SACADO_DYNAMICARRAYTRAITS_HPP
static SACADO_INLINE_FUNCTION void strided_copy(const T *src, int src_stride, T *dest, int dest_stride, int sz)
Copy array from src to dest of length sz.
static SACADO_INLINE_FUNCTION void strided_zero(T *dest, int stride, int sz)
Zero out array dest of length sz.
static SACADO_INLINE_FUNCTION void copy(const T *src, T *dest, int sz)
Copy array from src to dest of length sz.
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
static SACADO_INLINE_FUNCTION void zero(T *dest, int sz)
Zero out array dest of length sz.
static SACADO_INLINE_FUNCTION void destroy_and_release(T *m, int sz)
Destroy array elements and release memory.
static SACADO_INLINE_FUNCTION T * get_and_fill(const T *src, int sz)
Get memory for new array of length sz and fill with entries from src.
static SACADO_INLINE_FUNCTION void zero(T *dest, int sz)
Zero out array dest of length sz.
static SACADO_INLINE_FUNCTION T * strided_get_and_fill(const T *src, int stride, int sz)
Get memory for new array of length sz and fill with entries from src.
static SACADO_INLINE_FUNCTION void strided_copy(const T *src, int src_stride, T *dest, int dest_stride, int sz)
Copy array from src to dest of length sz.
static SACADO_INLINE_FUNCTION void ds_free(T *m, int sz)
static SACADO_INLINE_FUNCTION void destroy_and_release(T *m, int sz)
Destroy array elements and release memory.
static SACADO_INLINE_FUNCTION T * ds_alloc(const int sz)
static SACADO_INLINE_FUNCTION void copy(const T *src, T *dest, int sz)
Copy array from src to dest of length sz.
static SACADO_INLINE_FUNCTION T * strided_get_and_fill(const T *src, int stride, int sz)
Get memory for new array of length sz and fill with entries from src.
void destroyGlobalMemoryPool(const ExecSpace &space)
#define SACADO_INLINE_FUNCTION
static SACADO_INLINE_FUNCTION T * get_and_fill(int sz)
Get memory for new array of length sz and fill with zeros.
static SACADO_INLINE_FUNCTION T * get_and_fill(const T *src, int sz)
Get memory for new array of length sz and fill with entries from src.
Dynamic array allocation class that works for any type.
static SACADO_INLINE_FUNCTION void strided_zero(T *dest, int stride, int sz)
Zero out array dest of length sz.
static SACADO_INLINE_FUNCTION T * get_and_fill(int sz)
Get memory for new array of length sz and fill with zeros.