17 #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
18 #include <Kokkos_Macros.hpp>
20 "Including non-public Kokkos header files is not allowed.");
22 #ifndef KOKKOS_MEMORYPOOL_HPP
23 #define KOKKOS_MEMORYPOOL_HPP
25 #include <Kokkos_Core_fwd.hpp>
28 #include <impl/Kokkos_ConcurrentBitset.hpp>
29 #include <impl/Kokkos_Error.hpp>
30 #include <impl/Kokkos_SharedAlloc.hpp>
43 void memory_pool_bounds_verification(
size_t min_block_alloc_size,
44 size_t max_block_alloc_size,
45 size_t min_superblock_size,
46 size_t max_superblock_size,
47 size_t max_block_per_superblock,
48 size_t min_total_alloc_size);
56 void _print_memory_pool_state(std::ostream &s, uint32_t
const *sb_state_ptr,
57 int32_t sb_count, uint32_t sb_size_lg2,
58 uint32_t sb_state_size, uint32_t state_shift,
59 uint32_t state_used_mask);
63 template <
typename DeviceType>
66 using CB = Kokkos::Impl::concurrent_bitset;
68 enum : uint32_t { bits_per_int_lg2 = CB::bits_per_int_lg2 };
69 enum : uint32_t { state_shift = CB::state_shift };
70 enum : uint32_t { state_used_mask = CB::state_used_mask };
71 enum : uint32_t { state_header_mask = CB::state_header_mask };
72 enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
73 enum : uint32_t { max_bit_count = CB::max_bit_count };
75 enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
97 using base_memory_space =
typename DeviceType::memory_space;
101 base_memory_space>::accessible
104 using Tracker = Kokkos::Impl::SharedAllocationTracker;
105 using Record = Kokkos::Impl::SharedAllocationRecord<base_memory_space>;
108 uint32_t *m_sb_state_array;
109 uint32_t m_sb_state_size;
110 uint32_t m_sb_size_lg2;
111 uint32_t m_max_block_size_lg2;
112 uint32_t m_min_block_size_lg2;
114 int32_t m_hint_offset;
115 int32_t m_data_offset;
116 int32_t m_unused_padding;
119 using memory_space =
typename DeviceType::memory_space;
122 enum : uint32_t { max_superblock_size = 1LU << 31 };
123 enum : uint32_t { max_block_per_superblock = max_bit_count };
127 KOKKOS_INLINE_FUNCTION
128 bool operator==(MemoryPool
const &other)
const {
129 return m_sb_state_array == other.m_sb_state_array;
132 KOKKOS_INLINE_FUNCTION
133 size_t capacity() const noexcept {
134 return size_t(m_sb_count) << m_sb_size_lg2;
137 KOKKOS_INLINE_FUNCTION
138 size_t min_block_size() const noexcept {
139 return (1LU << m_min_block_size_lg2);
142 KOKKOS_INLINE_FUNCTION
143 size_t max_block_size() const noexcept {
144 return (1LU << m_max_block_size_lg2);
147 struct usage_statistics {
148 size_t capacity_bytes;
149 size_t superblock_bytes;
150 size_t max_block_bytes;
151 size_t min_block_bytes;
152 size_t capacity_superblocks;
153 size_t consumed_superblocks;
154 size_t consumed_blocks;
155 size_t consumed_bytes;
156 size_t reserved_blocks;
157 size_t reserved_bytes;
162 template <
typename ExecutionSpace = Kokkos::DefaultHostExecutionSpace>
163 void get_usage_statistics(usage_statistics &stats)
const {
166 std::is_same_v<ExecutionSpace, Kokkos::DefaultHostExecutionSpace>);
168 const size_t alloc_size = m_hint_offset *
sizeof(uint32_t);
170 uint32_t *
const sb_state_array =
171 accessible ? m_sb_state_array : (uint32_t *)host.
allocate(alloc_size);
174 Kokkos::Impl::DeepCopy<Kokkos::HostSpace, base_memory_space>(
175 ExecutionSpace{}, sb_state_array, m_sb_state_array, alloc_size);
177 "MemoryPool::get_usage_statistics(): fence after copying state "
178 "array to HostSpace");
181 stats.superblock_bytes = (1LU << m_sb_size_lg2);
182 stats.max_block_bytes = (1LU << m_max_block_size_lg2);
183 stats.min_block_bytes = (1LU << m_min_block_size_lg2);
184 stats.capacity_bytes = stats.superblock_bytes * m_sb_count;
185 stats.capacity_superblocks = m_sb_count;
186 stats.consumed_superblocks = 0;
187 stats.consumed_blocks = 0;
188 stats.consumed_bytes = 0;
189 stats.reserved_blocks = 0;
190 stats.reserved_bytes = 0;
192 const uint32_t *sb_state_ptr = sb_state_array;
194 for (int32_t i = 0; i < m_sb_count; ++i, sb_state_ptr += m_sb_state_size) {
195 const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift;
197 if (block_count_lg2) {
198 const uint32_t block_count = 1u << block_count_lg2;
199 const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2;
200 const uint32_t block_size = 1u << block_size_lg2;
201 const uint32_t block_used = (*sb_state_ptr) & state_used_mask;
203 stats.consumed_superblocks++;
204 stats.consumed_blocks += block_used;
205 stats.consumed_bytes +=
static_cast<size_t>(block_used) * block_size;
206 stats.reserved_blocks += block_count - block_used;
207 stats.reserved_bytes +=
208 static_cast<size_t>(block_count - block_used) * block_size;
219 template <
typename ExecutionSpace = Kokkos::DefaultHostExecutionSpace>
220 void print_state(std::ostream &s)
const {
223 std::is_same_v<ExecutionSpace, Kokkos::DefaultHostExecutionSpace>);
225 const size_t alloc_size = m_hint_offset *
sizeof(uint32_t);
227 uint32_t *
const sb_state_array =
228 accessible ? m_sb_state_array : (uint32_t *)host.
allocate(alloc_size);
231 Kokkos::Impl::DeepCopy<Kokkos::HostSpace, base_memory_space>(
232 ExecutionSpace{}, sb_state_array, m_sb_state_array, alloc_size);
234 "MemoryPool::print_state(): fence after copying state array to "
238 Impl::_print_memory_pool_state(s, sb_state_array, m_sb_count, m_sb_size_lg2,
239 m_sb_state_size, state_shift,
249 KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) =
default;
250 KOKKOS_DEFAULTED_FUNCTION MemoryPool(
const MemoryPool &) =
default;
251 KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) =
default;
252 KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(
const MemoryPool &) =
default;
254 KOKKOS_INLINE_FUNCTION MemoryPool()
256 m_sb_state_array(nullptr),
259 m_max_block_size_lg2(0),
260 m_min_block_size_lg2(0),
264 m_unused_padding(0) {}
280 MemoryPool(
const base_memory_space &memspace,
281 const size_t min_total_alloc_size,
size_t min_block_alloc_size = 0,
282 size_t max_block_alloc_size = 0,
size_t min_superblock_size = 0)
284 m_sb_state_array(nullptr),
287 m_max_block_size_lg2(0),
288 m_min_block_size_lg2(0),
292 m_unused_padding(0) {
293 const uint32_t int_align_lg2 = 3;
294 const uint32_t int_align_mask = (1u << int_align_lg2) - 1;
295 const uint32_t default_min_block_size = 1u << 6;
296 const uint32_t default_max_block_size = 1u << 12;
297 const uint32_t default_min_superblock_size = 1u << 20;
302 if (0 == min_block_alloc_size) {
305 min_superblock_size =
306 std::min(
size_t(default_min_superblock_size), min_total_alloc_size);
308 min_block_alloc_size =
309 std::min(
size_t(default_min_block_size), min_superblock_size);
311 max_block_alloc_size =
312 std::min(
size_t(default_max_block_size), min_superblock_size);
313 }
else if (0 == min_superblock_size) {
319 const size_t max_superblock =
320 min_block_alloc_size * max_block_per_superblock;
322 min_superblock_size =
323 std::min(max_superblock,
324 std::min(
size_t(max_superblock_size), min_total_alloc_size));
327 if (0 == max_block_alloc_size) {
328 max_block_alloc_size = min_superblock_size;
342 Kokkos::Impl::memory_pool_bounds_verification(
343 min_block_alloc_size, max_block_alloc_size, min_superblock_size,
344 max_superblock_size, max_block_per_superblock, min_total_alloc_size);
350 m_min_block_size_lg2 =
351 Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size);
353 m_max_block_size_lg2 =
354 Kokkos::Impl::integral_power_of_two_that_contains(max_block_alloc_size);
357 Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size);
363 const uint64_t sb_size_mask = (1LU << m_sb_size_lg2) - 1;
365 m_sb_count = (min_total_alloc_size + sb_size_mask) >> m_sb_size_lg2;
372 const uint32_t max_block_count_lg2 = m_sb_size_lg2 - m_min_block_size_lg2;
375 (CB::buffer_bound_lg2(max_block_count_lg2) + int_align_mask) &
381 const size_t all_sb_state_size =
382 (m_sb_count * m_sb_state_size + int_align_mask) & ~int_align_mask;
386 const int32_t number_block_sizes =
387 1 + m_max_block_size_lg2 - m_min_block_size_lg2;
392 const int32_t block_size_array_size =
393 (number_block_sizes + int_align_mask) & ~int_align_mask;
395 m_hint_offset = all_sb_state_size;
396 m_data_offset = m_hint_offset + block_size_array_size * HINT_PER_BLOCK_SIZE;
400 const size_t header_size = m_data_offset *
sizeof(uint32_t);
401 const size_t alloc_size =
402 header_size + (size_t(m_sb_count) << m_sb_size_lg2);
404 Record *rec = Record::allocate(memspace,
"Kokkos::MemoryPool", alloc_size);
406 m_tracker.assign_allocated_record_to_uninitialized(rec);
408 m_sb_state_array = (uint32_t *)rec->data();
412 uint32_t *
const sb_state_array =
413 accessible ? m_sb_state_array : (uint32_t *)host.
allocate(header_size);
415 for (int32_t i = 0; i < m_data_offset; ++i) sb_state_array[i] = 0;
419 for (int32_t i = 0; i < number_block_sizes; ++i) {
420 const uint32_t block_size_lg2 = i + m_min_block_size_lg2;
421 const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2;
422 const uint32_t block_state = block_count_lg2 << state_shift;
423 const uint32_t hint_begin = m_hint_offset + i * HINT_PER_BLOCK_SIZE;
429 const int32_t jbeg = (i * m_sb_count) / number_block_sizes;
430 const int32_t jend = ((i + 1) * m_sb_count) / number_block_sizes;
432 sb_state_array[hint_begin] = uint32_t(jbeg);
433 sb_state_array[hint_begin + 1] = uint32_t(jbeg);
435 for (int32_t j = jbeg; j < jend; ++j) {
436 sb_state_array[j * m_sb_state_size] = block_state;
443 Kokkos::Impl::DeepCopy<base_memory_space, Kokkos::HostSpace>(
444 typename base_memory_space::execution_space{}, m_sb_state_array,
445 sb_state_array, header_size);
447 "MemoryPool::MemoryPool(): fence after copying state array from "
452 Kokkos::memory_fence();
462 KOKKOS_FORCEINLINE_FUNCTION
463 uint32_t get_block_size_lg2(uint32_t n)
const noexcept {
464 const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains(n);
466 return i < m_min_block_size_lg2 ? m_min_block_size_lg2 : i;
471 KOKKOS_INLINE_FUNCTION
472 uint32_t allocate_block_size(uint64_t alloc_size)
const noexcept {
473 return alloc_size <= (1UL << m_max_block_size_lg2)
474 ? (1UL << get_block_size_lg2(uint32_t(alloc_size)))
489 void *allocate(
size_t alloc_size, int32_t attempt_limit = 1) const noexcept {
490 if (
size_t(1LU << m_max_block_size_lg2) < alloc_size) {
492 "Kokkos MemoryPool allocation request exceeded specified maximum "
496 if (0 == alloc_size)
return nullptr;
500 const uint32_t block_size_lg2 = get_block_size_lg2(alloc_size);
505 const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2;
506 const uint32_t block_state = block_count_lg2 << state_shift;
507 const uint32_t block_count = 1u << block_count_lg2;
513 volatile uint32_t *
const hint_sb_id_ptr =
516 + HINT_PER_BLOCK_SIZE
517 * (block_size_lg2 - m_min_block_size_lg2);
519 const int32_t sb_id_begin = int32_t(hint_sb_id_ptr[1]);
524 #if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU)
525 const uint32_t block_id_hint = alloc_size;
527 const uint32_t block_id_hint =
528 (uint32_t)(Kokkos::Impl::clock_tic()
529 #ifdef __CUDA_ARCH__ // FIXME_CUDA
532 + (threadIdx.x + blockDim.x * threadIdx.y)
538 uint32_t sb_state = block_state;
542 volatile uint32_t *sb_state_array =
nullptr;
544 while (attempt_limit) {
545 int32_t hint_sb_id = -1;
550 sb_id = hint_sb_id = int32_t(*hint_sb_id_ptr);
552 sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
559 if (sb_state == (state_header_mask & *sb_state_array)) {
564 const uint32_t count_lg2 = sb_state >> state_shift;
565 const uint32_t mask = (1u << count_lg2) - 1;
568 sb_state_array, count_lg2, block_id_hint & mask, sb_state);
575 if (0 <= result.
first) {
577 const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2;
581 p = ((
char *)(m_sb_state_array + m_data_offset)) +
582 (uint64_t(sb_id) << m_sb_size_lg2)
583 + (uint64_t(result.
first) << size_lg2);
598 sb_state = block_state;
601 bool update_hint =
false;
602 int32_t sb_id_empty = -1;
603 int32_t sb_id_large = -1;
604 uint32_t sb_state_large = 0;
606 sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size;
608 for (int32_t i = 0,
id = sb_id_begin; i < m_sb_count; ++i) {
613 const uint32_t full_state = *sb_state_array;
614 const uint32_t used = full_state & state_used_mask;
615 const uint32_t state = full_state & state_header_mask;
617 if (state == block_state) {
620 if (used < block_count) {
627 update_hint = used + 1 < block_count;
631 }
else if (0 == used) {
634 if (-1 == sb_id_empty) {
641 }
else if ((-1 == sb_id_empty ) &&
642 (-1 == sb_id_large ) &&
643 (state < block_state ) &&
645 (used < (1u << (state >> state_shift)))) {
651 sb_state_large = state;
656 if (++
id < m_sb_count) {
657 sb_state_array += m_sb_state_size;
660 sb_state_array = m_sb_state_array;
670 if (0 <= sb_id_empty) {
679 sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
684 const uint32_t state_empty = state_header_mask & *sb_state_array;
688 state_empty == Kokkos::atomic_compare_exchange(
689 sb_state_array, state_empty, block_state);
690 }
else if (0 <= sb_id_large) {
694 sb_state = sb_state_large;
696 sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
704 Kokkos::atomic_compare_exchange(hint_sb_id_ptr, uint32_t(hint_sb_id),
721 KOKKOS_INLINE_FUNCTION
722 void deallocate(
void *p,
size_t ) const noexcept {
723 if (
nullptr == p)
return;
727 static_cast<char *
>(p) -
728 reinterpret_cast<char *>(m_sb_state_array + m_data_offset);
731 const int ok_contains =
732 (0 <= d) && (
size_t(d) < (size_t(m_sb_count) << m_sb_size_lg2));
734 int ok_block_aligned = 0;
735 int ok_dealloc_once = 0;
738 const int sb_id = d >> m_sb_size_lg2;
741 volatile uint32_t *
const sb_state_array =
742 m_sb_state_array + (sb_id * m_sb_state_size);
744 const uint32_t block_state = (*sb_state_array) & state_header_mask;
745 const uint32_t block_size_lg2 =
746 m_sb_size_lg2 - (block_state >> state_shift);
748 ok_block_aligned = 0 == (d & ((1UL << block_size_lg2) - 1));
750 if (ok_block_aligned) {
755 (d & (ptrdiff_t(1LU << m_sb_size_lg2) - 1)) >> block_size_lg2;
757 const int result = CB::release(sb_state_array, bit, block_state);
759 ok_dealloc_once = 0 <= result;
763 if (!ok_contains || !ok_block_aligned || !ok_dealloc_once) {
764 Kokkos::abort(
"Kokkos MemoryPool::deallocate given erroneous pointer");
770 KOKKOS_INLINE_FUNCTION
771 int number_of_superblocks() const noexcept {
return m_sb_count; }
773 KOKKOS_INLINE_FUNCTION
774 void superblock_state(
int sb_id,
int &block_size,
int &block_count_capacity,
775 int &block_count_used)
const noexcept {
777 block_count_capacity = 0;
778 block_count_used = 0;
780 bool can_access_state_array = []() {
782 (
return SpaceAccessibility<DefaultHostExecutionSpace,
783 base_memory_space>::accessible;))
785 (return SpaceAccessibility<DefaultExecutionSpace,
786 base_memory_space>::accessible;))
789 if (can_access_state_array) {
792 const uint32_t state =
793 ((uint32_t
volatile *)m_sb_state_array)[sb_id * m_sb_state_size];
795 const uint32_t block_count_lg2 = state >> state_shift;
796 const uint32_t block_used = state & state_used_mask;
798 block_size = 1LU << (m_sb_size_lg2 - block_count_lg2);
799 block_count_capacity = 1LU << block_count_lg2;
800 block_count_used = block_used;
void * allocate(const ExecutionSpace &, const size_t arg_alloc_size) const
Allocate untracked memory in the space.
Replacement for std::pair that works on CUDA devices.
void deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const
Deallocate untracked memory in the space.
first_type first
The first element of the pair.
Memory management for host memory.
Declaration of parallel operators.
Access relationship between DstMemorySpace and SrcMemorySpace.