45 #ifndef KOKKOS_MEMORYPOOL_HPP
46 #define KOKKOS_MEMORYPOOL_HPP
48 #include <Kokkos_Core_fwd.hpp>
51 #include <impl/Kokkos_ConcurrentBitset.hpp>
52 #include <impl/Kokkos_Error.hpp>
53 #include <impl/Kokkos_SharedAlloc.hpp>
65 void memory_pool_bounds_verification(
size_t min_block_alloc_size,
66 size_t max_block_alloc_size,
67 size_t min_superblock_size,
68 size_t max_superblock_size,
69 size_t max_block_per_superblock,
70 size_t min_total_alloc_size);
76 template <
typename DeviceType>
79 typedef typename Kokkos::Impl::concurrent_bitset CB;
81 enum : uint32_t { bits_per_int_lg2 = CB::bits_per_int_lg2 };
82 enum : uint32_t { state_shift = CB::state_shift };
83 enum : uint32_t { state_used_mask = CB::state_used_mask };
84 enum : uint32_t { state_header_mask = CB::state_header_mask };
85 enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
86 enum : uint32_t { max_bit_count = CB::max_bit_count };
88 enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
110 typedef typename DeviceType::memory_space base_memory_space;
114 base_memory_space>::accessible
117 typedef Kokkos::Impl::SharedAllocationTracker Tracker;
118 typedef Kokkos::Impl::SharedAllocationRecord<base_memory_space> Record;
121 uint32_t *m_sb_state_array;
122 uint32_t m_sb_state_size;
123 uint32_t m_sb_size_lg2;
124 uint32_t m_max_block_size_lg2;
125 uint32_t m_min_block_size_lg2;
127 int32_t m_hint_offset;
128 int32_t m_data_offset;
129 int32_t m_unused_padding;
132 using memory_space =
typename DeviceType::memory_space;
135 enum : uint32_t { max_superblock_size = 1LU << 31 };
136 enum : uint32_t { max_block_per_superblock = max_bit_count };
140 KOKKOS_INLINE_FUNCTION
141 bool operator==(MemoryPool
const &other)
const {
142 return m_sb_state_array == other.m_sb_state_array;
145 KOKKOS_INLINE_FUNCTION
146 size_t capacity() const noexcept {
147 return size_t(m_sb_count) << m_sb_size_lg2;
150 KOKKOS_INLINE_FUNCTION
151 size_t min_block_size() const noexcept {
152 return (1LU << m_min_block_size_lg2);
155 KOKKOS_INLINE_FUNCTION
156 size_t max_block_size() const noexcept {
157 return (1LU << m_max_block_size_lg2);
160 struct usage_statistics {
161 size_t capacity_bytes;
162 size_t superblock_bytes;
163 size_t max_block_bytes;
164 size_t min_block_bytes;
165 size_t capacity_superblocks;
166 size_t consumed_superblocks;
167 size_t consumed_blocks;
168 size_t consumed_bytes;
169 size_t reserved_blocks;
170 size_t reserved_bytes;
173 void get_usage_statistics(usage_statistics &stats)
const {
176 const size_t alloc_size = m_hint_offset *
sizeof(uint32_t);
178 uint32_t *
const sb_state_array =
179 accessible ? m_sb_state_array : (uint32_t *)host.
allocate(alloc_size);
182 Kokkos::Impl::DeepCopy<Kokkos::HostSpace, base_memory_space>(
183 sb_state_array, m_sb_state_array, alloc_size);
186 stats.superblock_bytes = (1LU << m_sb_size_lg2);
187 stats.max_block_bytes = (1LU << m_max_block_size_lg2);
188 stats.min_block_bytes = (1LU << m_min_block_size_lg2);
189 stats.capacity_bytes = stats.superblock_bytes * m_sb_count;
190 stats.capacity_superblocks = m_sb_count;
191 stats.consumed_superblocks = 0;
192 stats.consumed_blocks = 0;
193 stats.consumed_bytes = 0;
194 stats.reserved_blocks = 0;
195 stats.reserved_bytes = 0;
197 const uint32_t *sb_state_ptr = sb_state_array;
199 for (int32_t i = 0; i < m_sb_count; ++i, sb_state_ptr += m_sb_state_size) {
200 const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift;
202 if (block_count_lg2) {
203 const uint32_t block_count = 1u << block_count_lg2;
204 const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2;
205 const uint32_t block_size = 1u << block_size_lg2;
206 const uint32_t block_used = (*sb_state_ptr) & state_used_mask;
208 stats.consumed_superblocks++;
209 stats.consumed_blocks += block_used;
210 stats.consumed_bytes += block_used * block_size;
211 stats.reserved_blocks += block_count - block_used;
212 stats.reserved_bytes += (block_count - block_used) * block_size;
221 void print_state(std::ostream &s)
const {
224 const size_t alloc_size = m_hint_offset *
sizeof(uint32_t);
226 uint32_t *
const sb_state_array =
227 accessible ? m_sb_state_array : (uint32_t *)host.
allocate(alloc_size);
230 Kokkos::Impl::DeepCopy<Kokkos::HostSpace, base_memory_space>(
231 sb_state_array, m_sb_state_array, alloc_size);
234 const uint32_t *sb_state_ptr = sb_state_array;
236 s <<
"pool_size(" << (size_t(m_sb_count) << m_sb_size_lg2) <<
")"
237 <<
" superblock_size(" << (1LU << m_sb_size_lg2) <<
")" << std::endl;
239 for (int32_t i = 0; i < m_sb_count; ++i, sb_state_ptr += m_sb_state_size) {
241 const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift;
242 const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2;
243 const uint32_t block_count = 1u << block_count_lg2;
244 const uint32_t block_used = (*sb_state_ptr) & state_used_mask;
246 s <<
"Superblock[ " << i <<
" / " << m_sb_count <<
" ] {"
247 <<
" block_size(" << (1 << block_size_lg2) <<
")"
248 <<
" block_count( " << block_used <<
" / " << block_count <<
" )"
260 KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) =
default;
261 KOKKOS_DEFAULTED_FUNCTION MemoryPool(
const MemoryPool &) =
default;
262 KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) =
default;
263 KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(
const MemoryPool &) =
default;
265 KOKKOS_INLINE_FUNCTION MemoryPool()
267 m_sb_state_array(nullptr),
270 m_max_block_size_lg2(0),
271 m_min_block_size_lg2(0),
275 m_unused_padding(0) {}
291 MemoryPool(
const base_memory_space &memspace,
292 const size_t min_total_alloc_size,
size_t min_block_alloc_size = 0,
293 size_t max_block_alloc_size = 0,
size_t min_superblock_size = 0)
295 m_sb_state_array(nullptr),
298 m_max_block_size_lg2(0),
299 m_min_block_size_lg2(0),
303 m_unused_padding(0) {
304 const uint32_t int_align_lg2 = 3;
305 const uint32_t int_align_mask = (1u << int_align_lg2) - 1;
306 const uint32_t default_min_block_size = 1u << 6;
307 const uint32_t default_max_block_size = 1u << 12;
308 const uint32_t default_min_superblock_size = 1u << 20;
313 if (0 == min_block_alloc_size) {
316 min_superblock_size =
317 std::min(
size_t(default_min_superblock_size), min_total_alloc_size);
319 min_block_alloc_size =
320 std::min(
size_t(default_min_block_size), min_superblock_size);
322 max_block_alloc_size =
323 std::min(
size_t(default_max_block_size), min_superblock_size);
324 }
else if (0 == min_superblock_size) {
330 const size_t max_superblock =
331 min_block_alloc_size * max_block_per_superblock;
333 min_superblock_size =
334 std::min(max_superblock,
335 std::min(
size_t(max_superblock_size), min_total_alloc_size));
338 if (0 == max_block_alloc_size) {
339 max_block_alloc_size = min_superblock_size;
353 Kokkos::Impl::memory_pool_bounds_verification(
354 min_block_alloc_size, max_block_alloc_size, min_superblock_size,
355 max_superblock_size, max_block_per_superblock, min_total_alloc_size);
361 m_min_block_size_lg2 =
362 Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size);
364 m_max_block_size_lg2 =
365 Kokkos::Impl::integral_power_of_two_that_contains(max_block_alloc_size);
368 Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size);
374 const uint64_t sb_size_mask = (1LU << m_sb_size_lg2) - 1;
376 m_sb_count = (min_total_alloc_size + sb_size_mask) >> m_sb_size_lg2;
383 const uint32_t max_block_count_lg2 = m_sb_size_lg2 - m_min_block_size_lg2;
386 (CB::buffer_bound_lg2(max_block_count_lg2) + int_align_mask) &
392 const size_t all_sb_state_size =
393 (m_sb_count * m_sb_state_size + int_align_mask) & ~int_align_mask;
397 const int32_t number_block_sizes =
398 1 + m_max_block_size_lg2 - m_min_block_size_lg2;
403 const int32_t block_size_array_size =
404 (number_block_sizes + int_align_mask) & ~int_align_mask;
406 m_hint_offset = all_sb_state_size;
407 m_data_offset = m_hint_offset + block_size_array_size * HINT_PER_BLOCK_SIZE;
411 const size_t header_size = m_data_offset *
sizeof(uint32_t);
412 const size_t alloc_size =
413 header_size + (size_t(m_sb_count) << m_sb_size_lg2);
415 Record *rec = Record::allocate(memspace,
"MemoryPool", alloc_size);
417 m_tracker.assign_allocated_record_to_uninitialized(rec);
419 m_sb_state_array = (uint32_t *)rec->data();
423 uint32_t *
const sb_state_array =
424 accessible ? m_sb_state_array : (uint32_t *)host.
allocate(header_size);
426 for (int32_t i = 0; i < m_data_offset; ++i) sb_state_array[i] = 0;
430 for (int32_t i = 0; i < number_block_sizes; ++i) {
431 const uint32_t block_size_lg2 = i + m_min_block_size_lg2;
432 const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2;
433 const uint32_t block_state = block_count_lg2 << state_shift;
434 const uint32_t hint_begin = m_hint_offset + i * HINT_PER_BLOCK_SIZE;
440 const int32_t jbeg = (i * m_sb_count) / number_block_sizes;
441 const int32_t jend = ((i + 1) * m_sb_count) / number_block_sizes;
443 sb_state_array[hint_begin] = uint32_t(jbeg);
444 sb_state_array[hint_begin + 1] = uint32_t(jbeg);
446 for (int32_t j = jbeg; j < jend; ++j) {
447 sb_state_array[j * m_sb_state_size] = block_state;
454 Kokkos::Impl::DeepCopy<base_memory_space, Kokkos::HostSpace>(
455 m_sb_state_array, sb_state_array, header_size);
459 Kokkos::memory_fence();
469 KOKKOS_FORCEINLINE_FUNCTION
470 uint32_t get_block_size_lg2(uint32_t n)
const noexcept {
471 const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains(n);
473 return i < m_min_block_size_lg2 ? m_min_block_size_lg2 : i;
478 KOKKOS_INLINE_FUNCTION
479 uint32_t allocate_block_size(uint64_t alloc_size)
const noexcept {
480 return alloc_size <= (1UL << m_max_block_size_lg2)
481 ? (1UL << get_block_size_lg2(uint32_t(alloc_size)))
496 void *allocate(
size_t alloc_size, int32_t attempt_limit = 1) const noexcept {
497 if (
size_t(1LU << m_max_block_size_lg2) < alloc_size) {
499 "Kokkos MemoryPool allocation request exceeded specified maximum "
503 if (0 == alloc_size)
return nullptr;
507 const uint32_t block_size_lg2 = get_block_size_lg2(alloc_size);
512 const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2;
513 const uint32_t block_state = block_count_lg2 << state_shift;
514 const uint32_t block_count = 1u << block_count_lg2;
520 volatile uint32_t *
const hint_sb_id_ptr =
523 + HINT_PER_BLOCK_SIZE
524 * (block_size_lg2 - m_min_block_size_lg2);
526 const int32_t sb_id_begin = int32_t(hint_sb_id_ptr[1]);
532 const uint32_t block_id_hint =
533 (uint32_t)(Kokkos::Impl::clock_tic()
534 #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA)
537 + (threadIdx.x + blockDim.x * threadIdx.y)
542 uint32_t sb_state = block_state;
546 volatile uint32_t *sb_state_array =
nullptr;
548 while (attempt_limit) {
549 int32_t hint_sb_id = -1;
554 sb_id = hint_sb_id = int32_t(*hint_sb_id_ptr);
556 sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
563 if (sb_state == (state_header_mask & *sb_state_array)) {
568 const uint32_t count_lg2 = sb_state >> state_shift;
569 const uint32_t mask = (1u << count_lg2) - 1;
572 sb_state_array, count_lg2, block_id_hint & mask, sb_state);
579 if (0 <= result.
first) {
581 const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2;
585 p = ((
char *)(m_sb_state_array + m_data_offset)) +
586 (uint64_t(sb_id) << m_sb_size_lg2)
587 + (uint64_t(result.
first) << size_lg2);
590 printf(
" MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
591 , (uintptr_t)m_sb_state_array
615 sb_state = block_state;
618 bool update_hint =
false;
619 int32_t sb_id_empty = -1;
620 int32_t sb_id_large = -1;
621 uint32_t sb_state_large = 0;
623 sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size;
625 for (int32_t i = 0,
id = sb_id_begin; i < m_sb_count; ++i) {
630 const uint32_t full_state = *sb_state_array;
631 const uint32_t used = full_state & state_used_mask;
632 const uint32_t state = full_state & state_header_mask;
634 if (state == block_state) {
637 if (used < block_count) {
644 update_hint = used + 1 < block_count;
648 }
else if (0 == used) {
651 if (-1 == sb_id_empty) {
658 }
else if ((-1 == sb_id_empty ) &&
659 (-1 == sb_id_large ) &&
660 (state < block_state ) &&
662 (used < (1u << (state >> state_shift)))) {
668 sb_state_large = state;
673 if (++
id < m_sb_count) {
674 sb_state_array += m_sb_state_size;
677 sb_state_array = m_sb_state_array;
687 if (0 <= sb_id_empty) {
696 sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
701 const uint32_t state_empty = state_header_mask & *sb_state_array;
705 state_empty == Kokkos::atomic_compare_exchange(
706 sb_state_array, state_empty, block_state);
707 }
else if (0 <= sb_id_large) {
711 sb_state = sb_state_large;
713 sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
721 Kokkos::atomic_compare_exchange(hint_sb_id_ptr, uint32_t(hint_sb_id),
738 KOKKOS_INLINE_FUNCTION
739 void deallocate(
void *p,
size_t ) const noexcept {
740 if (
nullptr == p)
return;
744 ((
char *)p) - ((
char *)(m_sb_state_array + m_data_offset));
747 const int ok_contains =
748 (0 <= d) && (
size_t(d) < (size_t(m_sb_count) << m_sb_size_lg2));
750 int ok_block_aligned = 0;
751 int ok_dealloc_once = 0;
754 const int sb_id = d >> m_sb_size_lg2;
757 volatile uint32_t *
const sb_state_array =
758 m_sb_state_array + (sb_id * m_sb_state_size);
760 const uint32_t block_state = (*sb_state_array) & state_header_mask;
761 const uint32_t block_size_lg2 =
762 m_sb_size_lg2 - (block_state >> state_shift);
764 ok_block_aligned = 0 == (d & ((1UL << block_size_lg2) - 1));
766 if (ok_block_aligned) {
771 (d & (ptrdiff_t(1LU << m_sb_size_lg2) - 1)) >> block_size_lg2;
773 const int result = CB::release(sb_state_array, bit, block_state);
775 ok_dealloc_once = 0 <= result;
778 printf(
" MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
779 , (uintptr_t)m_sb_state_array
782 , (1u << block_size_lg2)
783 , (1u << (m_sb_size_lg2 - block_size_lg2))
790 if (!ok_contains || !ok_block_aligned || !ok_dealloc_once) {
792 printf(
" MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n"
793 , (uintptr_t)m_sb_state_array
796 ,
int(ok_block_aligned)
797 ,
int(ok_dealloc_once) );
799 Kokkos::abort(
"Kokkos MemoryPool::deallocate given erroneous pointer");
805 KOKKOS_INLINE_FUNCTION
806 int number_of_superblocks() const noexcept {
return m_sb_count; }
808 KOKKOS_INLINE_FUNCTION
809 void superblock_state(
int sb_id,
int &block_size,
int &block_count_capacity,
810 int &block_count_used)
const noexcept {
812 block_count_capacity = 0;
813 block_count_used = 0;
816 Kokkos::Impl::ActiveExecutionMemorySpace,
817 base_memory_space>::accessible) {
820 const uint32_t state =
821 ((uint32_t
volatile *)m_sb_state_array)[sb_id * m_sb_state_size];
823 const uint32_t block_count_lg2 = state >> state_shift;
824 const uint32_t block_used = state & state_used_mask;
826 block_size = 1LU << (m_sb_size_lg2 - block_count_lg2);
827 block_count_capacity = 1LU << block_count_lg2;
828 block_count_used = block_used;
void * allocate(const size_t arg_alloc_size) const
Allocate untracked memory in the space.
KOKKOS_INLINE_FUNCTION bool operator==(complex< RealType1 > const &x, complex< RealType2 > const &y) noexcept
Binary == operator for complex complex.
Replacement for std::pair that works on CUDA devices.
void deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const
Deallocate untracked memory in the space.
first_type first
The first element of the pair.
Memory management for host memory.
Declaration of parallel operators.
second_type second
The second element of the pair.
Access relationship between DstMemorySpace and SrcMemorySpace.