#ifndef KOKKOS_MEMORYPOOL_HPP
#define KOKKOS_MEMORYPOOL_HPP
#include <algorithm>
#include <ostream>

#include <Kokkos_Core_fwd.hpp>

#include <impl/Kokkos_ConcurrentBitset.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_SharedAlloc.hpp>
namespace Kokkos {
namespace Impl {

/* Verify the requested pool sizes are mutually consistent. */
void memory_pool_bounds_verification
  ( size_t min_block_alloc_size
  , size_t max_block_alloc_size
  , size_t min_superblock_size
  , size_t max_superblock_size
  , size_t max_block_per_superblock
  , size_t min_total_alloc_size );

} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------

namespace Kokkos {

template< typename DeviceType >
class MemoryPool {
private:
  typedef Kokkos::Impl::concurrent_bitset CB ;

  enum : uint32_t { bits_per_int_lg2  = CB::bits_per_int_lg2 };
  enum : uint32_t { state_shift       = CB::state_shift };
  enum : uint32_t { state_used_mask   = CB::state_used_mask };
  enum : uint32_t { state_header_mask = CB::state_header_mask };
  enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
  enum : uint32_t { max_bit_count     = CB::max_bit_count };

  enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
  typedef typename DeviceType::memory_space base_memory_space ;

  enum : bool { accessible =
           Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace
                                          , base_memory_space >::accessible };
  typedef Kokkos::Impl::SharedAllocationTracker                     Tracker ;
  typedef Kokkos::Impl::SharedAllocationRecord< base_memory_space > Record ;

  Tracker    m_tracker ;
  uint32_t * m_sb_state_array ;
  uint32_t   m_sb_state_size ;
  uint32_t   m_sb_size_lg2 ;
  uint32_t   m_max_block_size_lg2 ;
  uint32_t   m_min_block_size_lg2 ;
  int32_t    m_sb_count ;
  int32_t    m_hint_offset ;   // Offset to the block-size hint array
  int32_t    m_data_offset ;   // Offset to the 0th superblock's data
  int32_t    m_unused_padding ;
public:

  enum : uint32_t { max_superblock_size      = 1LU << 31 /* 2 GB */ };
  enum : uint32_t { max_block_per_superblock = max_bit_count };
  //--------------------------------------------------------------------------

  KOKKOS_INLINE_FUNCTION
  size_t capacity() const noexcept
    { return size_t(m_sb_count) << m_sb_size_lg2 ; }

  KOKKOS_INLINE_FUNCTION
  size_t min_block_size() const noexcept
    { return ( 1LU << m_min_block_size_lg2 ); }

  KOKKOS_INLINE_FUNCTION
  size_t max_block_size() const noexcept
    { return ( 1LU << m_max_block_size_lg2 ); }
  struct usage_statistics {
    size_t capacity_bytes ;       ///< Total pool capacity in bytes
    size_t superblock_bytes ;     ///< Superblock size in bytes
    size_t max_block_bytes ;      ///< Maximum block size in bytes
    size_t min_block_bytes ;      ///< Minimum block size in bytes
    size_t capacity_superblocks ; ///< Number of superblocks
    size_t consumed_superblocks ; ///< Superblocks assigned to a block size
    size_t consumed_blocks ;      ///< Number of allocated blocks
    size_t consumed_bytes ;       ///< Bytes held by allocated blocks
    size_t reserved_blocks ;      ///< Unallocated blocks in assigned superblocks
    size_t reserved_bytes ;       ///< Unallocated bytes in assigned superblocks
  };
  //--------------------------------------------------------------------------
  //  Aggregate the current usage of the pool into 'stats'.

  void get_usage_statistics( usage_statistics & stats ) const
    {
      Kokkos::HostSpace host ;

      const size_t alloc_size = m_hint_offset * sizeof(uint32_t);

      uint32_t * const sb_state_array =
        accessible ? m_sb_state_array : (uint32_t *) host.allocate(alloc_size);

      if ( ! accessible ) {
        Kokkos::Impl::DeepCopy< Kokkos::HostSpace , base_memory_space >
          ( sb_state_array , m_sb_state_array , alloc_size );
      }
      stats.superblock_bytes     = ( 1LU << m_sb_size_lg2 );
      stats.max_block_bytes      = ( 1LU << m_max_block_size_lg2 );
      stats.min_block_bytes      = ( 1LU << m_min_block_size_lg2 );
      stats.capacity_bytes       = stats.superblock_bytes * m_sb_count ;
      stats.capacity_superblocks = m_sb_count ;
      stats.consumed_superblocks = 0 ;
      stats.consumed_blocks      = 0 ;
      stats.consumed_bytes       = 0 ;
      stats.reserved_blocks      = 0 ;
      stats.reserved_bytes       = 0 ;

      const uint32_t * sb_state_ptr = sb_state_array ;
      for ( int32_t i = 0 ; i < m_sb_count
          ; ++i , sb_state_ptr += m_sb_state_size ) {

        const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;

        if ( block_count_lg2 ) {
          const uint32_t block_count    = 1u << block_count_lg2 ;
          const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2 ;
          const uint32_t block_size     = 1u << block_size_lg2 ;
          const uint32_t block_used     = (*sb_state_ptr) & state_used_mask ;

          stats.consumed_superblocks++ ;
          stats.consumed_blocks += block_used ;
          stats.consumed_bytes  += block_used * block_size ;
          stats.reserved_blocks += block_count - block_used ;
          stats.reserved_bytes  += ( block_count - block_used ) * block_size ;
        }
      }
      if ( ! accessible ) {
        host.deallocate( sb_state_array , alloc_size );
      }
    }
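  //--------------------------------------------------------------------------
  //  A minimal host-side sketch of querying pool occupancy through
  //  get_usage_statistics(); the pool size below is an assumption for the
  //  example only, not a requirement of the interface.
  //
  //    typedef Kokkos::DefaultExecutionSpace ExecSpace ;
  //
  //    Kokkos::MemoryPool< ExecSpace > pool( ExecSpace::memory_space() , 1u << 22 );
  //
  //    Kokkos::MemoryPool< ExecSpace >::usage_statistics stats ;
  //    pool.get_usage_statistics( stats );
  //
  //    std::cout << "consumed " << stats.consumed_bytes
  //              << " of "      << stats.capacity_bytes << " bytes\n" ;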
  //--------------------------------------------------------------------------
  //  Print the per-superblock state of the pool to the given stream.

  void print_state( std::ostream & s ) const
    {
      Kokkos::HostSpace host ;

      const size_t alloc_size = m_hint_offset * sizeof(uint32_t);

      uint32_t * const sb_state_array =
        accessible ? m_sb_state_array : (uint32_t *) host.allocate(alloc_size);

      if ( ! accessible ) {
        Kokkos::Impl::DeepCopy< Kokkos::HostSpace , base_memory_space >
          ( sb_state_array , m_sb_state_array , alloc_size );
      }
      const uint32_t * sb_state_ptr = sb_state_array ;

      s << "pool_size(" << ( size_t(m_sb_count) << m_sb_size_lg2 ) << ")"
        << " superblock_size(" << ( 1LU << m_sb_size_lg2 ) << ")" << std::endl ;
      for ( int32_t i = 0 ; i < m_sb_count
          ; ++i , sb_state_ptr += m_sb_state_size ) {

        if ( *sb_state_ptr ) {

          const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;
          const uint32_t block_size_lg2  = m_sb_size_lg2 - block_count_lg2 ;
          const uint32_t block_count     = 1u << block_count_lg2 ;
          const uint32_t block_used      = (*sb_state_ptr) & state_used_mask ;

          s << "Superblock[ " << i << " / " << m_sb_count << " ] {"
            << " block_size(" << ( 1 << block_size_lg2 ) << ")"
            << " block_count( " << block_used
            << " / " << block_count << " )"
            << " }" << std::endl ;
        }
      }

      if ( ! accessible ) {
        host.deallocate( sb_state_array , alloc_size );
      }
    }
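  //  A host-side debugging sketch: dump the per-superblock state of an
  //  existing pool ('pool' is assumed to be constructed elsewhere):
  //
  //    pool.print_state( std::cout );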
  //--------------------------------------------------------------------------

#ifdef KOKKOS_CUDA_9_DEFAULTED_BUG_WORKAROUND
  KOKKOS_INLINE_FUNCTION MemoryPool( MemoryPool && rhs )
    : m_tracker(std::move(rhs.m_tracker))
    , m_sb_state_array(std::move(rhs.m_sb_state_array))
    , m_sb_state_size(std::move(rhs.m_sb_state_size))
    , m_sb_size_lg2(std::move(rhs.m_sb_size_lg2))
    , m_max_block_size_lg2(std::move(rhs.m_max_block_size_lg2))
    , m_min_block_size_lg2(std::move(rhs.m_min_block_size_lg2))
    , m_sb_count(std::move(rhs.m_sb_count))
    , m_hint_offset(std::move(rhs.m_hint_offset))
    , m_data_offset(std::move(rhs.m_data_offset))
    , m_unused_padding(std::move(rhs.m_unused_padding))
    {}
  KOKKOS_INLINE_FUNCTION MemoryPool( const MemoryPool & rhs )
    : m_tracker(rhs.m_tracker)
    , m_sb_state_array(rhs.m_sb_state_array)
    , m_sb_state_size(rhs.m_sb_state_size)
    , m_sb_size_lg2(rhs.m_sb_size_lg2)
    , m_max_block_size_lg2(rhs.m_max_block_size_lg2)
    , m_min_block_size_lg2(rhs.m_min_block_size_lg2)
    , m_sb_count(rhs.m_sb_count)
    , m_hint_offset(rhs.m_hint_offset)
    , m_data_offset(rhs.m_data_offset)
    , m_unused_padding(rhs.m_unused_padding)
    {}
  KOKKOS_INLINE_FUNCTION MemoryPool & operator = ( MemoryPool && rhs )
    {
      m_tracker            = std::move(rhs.m_tracker);
      m_sb_state_array     = std::move(rhs.m_sb_state_array);
      m_sb_state_size      = std::move(rhs.m_sb_state_size);
      m_sb_size_lg2        = std::move(rhs.m_sb_size_lg2);
      m_max_block_size_lg2 = std::move(rhs.m_max_block_size_lg2);
      m_min_block_size_lg2 = std::move(rhs.m_min_block_size_lg2);
      m_sb_count           = std::move(rhs.m_sb_count);
      m_hint_offset        = std::move(rhs.m_hint_offset);
      m_data_offset        = std::move(rhs.m_data_offset);
      m_unused_padding     = std::move(rhs.m_unused_padding);
      return *this ;
    }
  KOKKOS_INLINE_FUNCTION MemoryPool & operator = ( const MemoryPool & rhs )
    {
      m_tracker            = rhs.m_tracker;
      m_sb_state_array     = rhs.m_sb_state_array;
      m_sb_state_size      = rhs.m_sb_state_size;
      m_sb_size_lg2        = rhs.m_sb_size_lg2;
      m_max_block_size_lg2 = rhs.m_max_block_size_lg2;
      m_min_block_size_lg2 = rhs.m_min_block_size_lg2;
      m_sb_count           = rhs.m_sb_count;
      m_hint_offset        = rhs.m_hint_offset;
      m_data_offset        = rhs.m_data_offset;
      m_unused_padding     = rhs.m_unused_padding;
      return *this ;
    }
#else
  KOKKOS_INLINE_FUNCTION MemoryPool( MemoryPool && ) = default ;
  KOKKOS_INLINE_FUNCTION MemoryPool( const MemoryPool & ) = default ;
  KOKKOS_INLINE_FUNCTION MemoryPool & operator = ( MemoryPool && ) = default ;
  KOKKOS_INLINE_FUNCTION MemoryPool & operator = ( const MemoryPool & ) = default ;
#endif
  KOKKOS_INLINE_FUNCTION MemoryPool()
    : m_tracker()
    , m_sb_state_array(0)
    , m_sb_state_size(0)
    , m_sb_size_lg2(0)
    , m_max_block_size_lg2(0)
    , m_min_block_size_lg2(0)
    , m_sb_count(0)
    , m_hint_offset(0)
    , m_data_offset(0)
    , m_unused_padding(0)
    {}
  //--------------------------------------------------------------------------
  //  Create a memory pool of at least 'min_total_alloc_size' bytes within
  //  'memspace'.  Zero-valued size arguments are replaced by the defaults
  //  chosen below.

  MemoryPool( const base_memory_space & memspace
            , const size_t min_total_alloc_size
            , size_t min_block_alloc_size = 0
            , size_t max_block_alloc_size = 0
            , size_t min_superblock_size  = 0
            )
    : m_tracker()
    , m_sb_state_array(0)
    , m_sb_state_size(0)
    , m_sb_size_lg2(0)
    , m_max_block_size_lg2(0)
    , m_min_block_size_lg2(0)
    , m_sb_count(0)
    , m_hint_offset(0)
    , m_data_offset(0)
    , m_unused_padding(0)
    {
      const uint32_t int_align_lg2  = 3 ; /* align as int[8] */
      const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;

      const uint32_t default_min_block_size      = 1u << 6 ;  /*  64 bytes */
      const uint32_t default_max_block_size      = 1u << 12 ; /*   4 KB    */
      const uint32_t default_min_superblock_size = 1u << 20 ; /*   1 MB    */
      //--------------------------------------------------
      // Default block and superblock sizes:

      if ( 0 == min_block_alloc_size ) {

        // Default all sizes:

        min_superblock_size =
          std::min( size_t(default_min_superblock_size)
                  , min_total_alloc_size );

        min_block_alloc_size =
          std::min( size_t(default_min_block_size)
                  , min_superblock_size );

        max_block_alloc_size =
          std::min( size_t(default_max_block_size)
                  , min_superblock_size );
      }
      else if ( 0 == min_superblock_size ) {

        // Choose the superblock size as the minimum of:
        //   max_block_per_superblock * min_block_alloc_size
        //   max_superblock_size
        //   min_total_alloc_size

        const size_t max_superblock =
          min_block_alloc_size * max_block_per_superblock ;

        min_superblock_size =
          std::min( max_superblock ,
          std::min( size_t(max_superblock_size)
                  , min_total_alloc_size ) );
      }
      if ( 0 == max_block_alloc_size ) {
        max_block_alloc_size = min_superblock_size ;
      }
      //--------------------------------------------------
      // Verify the input sizes are mutually consistent:

      Kokkos::Impl::memory_pool_bounds_verification
        ( min_block_alloc_size
        , max_block_alloc_size
        , min_superblock_size
        , max_superblock_size
        , max_block_per_superblock
        , min_total_alloc_size );
      //--------------------------------------------------
      // Block and superblock sizes are powers of two:

      m_min_block_size_lg2 =
        Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size);

      m_max_block_size_lg2 =
        Kokkos::Impl::integral_power_of_two_that_contains(max_block_alloc_size);

      m_sb_size_lg2 =
        Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size);

      // Number of superblocks required to cover the total allocation size:

      const uint64_t sb_size_mask = ( 1LU << m_sb_size_lg2 ) - 1 ;

      m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ;
      // Any superblock can be assigned to the smallest block size,
      // so size each superblock's bitset for the maximum block count:

      const uint32_t max_block_count_lg2 =
        m_sb_size_lg2 - m_min_block_size_lg2 ;

      m_sb_state_size =
        ( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ;

      // Array of all superblock states:

      const size_t all_sb_state_size =
        ( m_sb_count * m_sb_state_size + int_align_mask ) & ~int_align_mask ;
      // Number of distinct block sizes; the hint array holds
      // HINT_PER_BLOCK_SIZE entries per block size:

      const int32_t number_block_sizes =
        1 + m_max_block_size_lg2 - m_min_block_size_lg2 ;

      const int32_t block_size_array_size =
        ( number_block_sizes + int_align_mask ) & ~int_align_mask ;

      m_hint_offset = all_sb_state_size ;
      m_data_offset = m_hint_offset +
                      block_size_array_size * HINT_PER_BLOCK_SIZE ;
      // Allocation of the superblock state header and the pooled memory:

      const size_t header_size = m_data_offset * sizeof(uint32_t);
      const size_t alloc_size  = header_size +
                                 ( size_t(m_sb_count) << m_sb_size_lg2 );

      Record * rec = Record::allocate( memspace , "MemoryPool" , alloc_size );

      m_tracker.assign_allocated_record_to_uninitialized( rec );

      m_sb_state_array = (uint32_t *) rec->data();

      Kokkos::HostSpace host ;

      uint32_t * const sb_state_array =
        accessible ? m_sb_state_array
                   : (uint32_t *) host.allocate(header_size);

      for ( int32_t i = 0 ; i < m_data_offset ; ++i ) sb_state_array[i] = 0 ;
      // Initial assignment of empty superblocks to block sizes:

      for ( int32_t i = 0 ; i < number_block_sizes ; ++i ) {
        const uint32_t block_size_lg2  = i + m_min_block_size_lg2 ;
        const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
        const uint32_t block_state     = block_count_lg2 << state_shift ;
        const uint32_t hint_begin      = m_hint_offset + i * HINT_PER_BLOCK_SIZE ;

        // For block size index 'i':
        //   sb_state_array[ hint_begin ]     : dynamic superblock hint
        //   sb_state_array[ hint_begin + 1 ] : fixed search starting point

        const int32_t jbeg = ( i * m_sb_count ) / number_block_sizes ;
        const int32_t jend = ( ( i + 1 ) * m_sb_count ) / number_block_sizes ;

        sb_state_array[ hint_begin ]     = uint32_t(jbeg);
        sb_state_array[ hint_begin + 1 ] = uint32_t(jbeg);

        for ( int32_t j = jbeg ; j < jend ; ++j ) {
          sb_state_array[ j * m_sb_state_size ] = block_state ;
        }
      }
      // Write the initialized state out to the pool's memory space:

      if ( ! accessible ) {
        Kokkos::Impl::DeepCopy< base_memory_space , Kokkos::HostSpace >
          ( m_sb_state_array , sb_state_array , header_size );

        host.deallocate( sb_state_array, header_size );
      }

      Kokkos::memory_fence();
    }
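  //--------------------------------------------------------------------------
  //  A minimal construction sketch (illustrative only; the sizes below are
  //  assumptions for the example, not requirements).  When the optional
  //  arguments are zero the defaults above apply: 64 byte minimum blocks,
  //  4 KB maximum blocks, and 1 MB superblocks, each clipped to the total.
  //
  //    typedef Kokkos::DefaultExecutionSpace ExecSpace ;
  //
  //    // 4 MB pool with default block and superblock sizing:
  //    Kokkos::MemoryPool< ExecSpace > pool_a( ExecSpace::memory_space() , 1u << 22 );
  //
  //    // 8 MB pool restricted to 256..4096 byte blocks in 64 KB superblocks:
  //    Kokkos::MemoryPool< ExecSpace > pool_b( ExecSpace::memory_space()
  //                                          , 1u << 23 , 256 , 4096 , 1u << 16 );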
  //--------------------------------------------------------------------------
  //  Block size (log2) that would be used to satisfy a request of 'n' bytes.

  KOKKOS_FORCEINLINE_FUNCTION
  uint32_t get_block_size_lg2( uint32_t n ) const noexcept
    {
      const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains( n );

      return i < m_min_block_size_lg2 ? m_min_block_size_lg2 : i ;
    }
  //  Block size in bytes that would be allocated for 'alloc_size' bytes,
  //  or zero if the request exceeds the maximum block size.

  KOKKOS_INLINE_FUNCTION
  uint32_t allocate_block_size( uint64_t alloc_size ) const noexcept
    {
      return alloc_size <= (1UL << m_max_block_size_lg2)
           ? ( 1UL << get_block_size_lg2( uint32_t(alloc_size) ) )
           : 0 ;
    }
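  //  Requests are rounded up to a power-of-two block no smaller than the
  //  pool's minimum block size.  A sketch of the rounding, assuming a pool
  //  built with the default 64 byte minimum and 4 KB maximum block sizes:
  //
  //    pool.allocate_block_size( 1 );    // 64   : clamped to min_block_size()
  //    pool.allocate_block_size( 100 );  // 128  : next power of two
  //    pool.allocate_block_size( 4096 ); // 4096
  //    pool.allocate_block_size( pool.max_block_size() + 1 ); // 0 : too large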
  //--------------------------------------------------------------------------
  //  Allocate a block of memory that is at least 'alloc_size' bytes.
  //  Under concurrent allocation and deallocation a single attempt may fail
  //  even when capacity remains; up to 'attempt_limit' attempts are made
  //  before returning null.

  KOKKOS_INLINE_FUNCTION
  void * allocate( size_t alloc_size
                 , int32_t attempt_limit = 1 ) const noexcept
    {
      if ( size_t(1LU << m_max_block_size_lg2) < alloc_size ) {
        Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
      }

      if ( 0 == alloc_size ) return (void*) 0 ;

      void * p = 0 ;
      const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );

      // The allocation will be satisfied from a superblock
      // whose blocks are of size ( 1 << block_size_lg2 ).

      const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
      const uint32_t block_state     = block_count_lg2 << state_shift ;
      const uint32_t block_count     = 1u << block_count_lg2 ;

      // Superblock hints for this block size:
      //   hint_sb_id_ptr[0] : dynamically updated superblock hint
      //   hint_sb_id_ptr[1] : fixed search starting point

      volatile uint32_t * const hint_sb_id_ptr
        = m_sb_state_array
        + m_hint_offset
        + HINT_PER_BLOCK_SIZE
          * ( block_size_lg2 - m_min_block_size_lg2 );
      const int32_t sb_id_begin = int32_t( hint_sb_id_ptr[1] );

      // Pseudo-randomize the hint for which block within a superblock
      // to attempt to claim, to reduce contention between threads.

      const uint32_t block_id_hint =
        (uint32_t)( Kokkos::Impl::clock_tic()
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
          // Spread out potentially concurrent access
          // by threads within a warp or thread block.
          + ( threadIdx.x + blockDim.x * threadIdx.y )
#endif
          );

      // Expected superblock state for this block size:
      uint32_t sb_state = block_state ;

      int32_t sb_id = -1 ;

      volatile uint32_t * sb_state_array = 0 ;
      while ( attempt_limit ) {

        int32_t hint_sb_id = -1 ;

        if ( sb_id < 0 ) {

          // No superblock selected; try the hint for this block size.

          sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );

          sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
        }

        // If the superblock header matches the expected state
        // then attempt to claim a block from it.

        if ( sb_state == ( state_header_mask & *sb_state_array ) ) {
          const uint32_t count_lg2 = sb_state >> state_shift ;
          const uint32_t mask      = ( 1u << count_lg2 ) - 1 ;

          const Kokkos::pair<int,int> result =
            CB::acquire_bounded_lg2( sb_state_array
                                   , count_lg2
                                   , block_id_hint & mask
                                   , sb_state );

          // result.first < 0 means the acquire failed, either because
          // the superblock filled or its state changed concurrently.

          if ( 0 <= result.first ) {

            const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;

            // Pointer to the claimed block:

            p = ((char*)( m_sb_state_array + m_data_offset ))
              + ( uint64_t(sb_id) << m_sb_size_lg2 )
              + ( uint64_t(result.first) << size_lg2 );
#if 0
            printf( "  MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
                  , (uintptr_t)m_sb_state_array
                  , (uintptr_t)p
                  , alloc_size
                  , sb_id
                  , sb_state
                  , (1u << size_lg2)
                  , (1u << count_lg2)
                  , result.first
                  , result.second );
#endif

            break ; // Success
          }
        }
        //--------------------------------------------------
        // Failed to claim a block from the hinted superblock;
        // search for another superblock to use, preferring
        // (1) a part-full superblock of this block size,
        // (2) an empty superblock to claim for this block size,
        // (3) a superblock of a larger block size with room.

        sb_state = block_state ;
        sb_id    = -1 ;

        bool     update_hint    = false ;
        int32_t  sb_id_empty    = -1 ;
        int32_t  sb_id_large    = -1 ;
        uint32_t sb_state_large = 0 ;

        sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;

        for ( int32_t i = 0 , id = sb_id_begin ; i < m_sb_count ; ++i ) {
          // Query the candidate superblock's state; it may be
          // changing concurrently with this query.

          const uint32_t full_state = *sb_state_array ;
          const uint32_t used       = full_state & state_used_mask ;
          const uint32_t state      = full_state & state_header_mask ;
          if ( state == block_state ) {

            // Superblock is assigned to this block size.

            if ( used < block_count ) {

              // There is room to claim a block; use this superblock.

              sb_id = id ;

              // Update the hint only if more than this one block remains.

              update_hint = used + 1 < block_count ;

              break ;
            }
          }
          else if ( 0 == used ) {

            // Superblock is empty; remember the first one encountered
            // in case a part-full superblock is not found.

            if ( -1 == sb_id_empty ) {
              sb_id_empty = id ;
            }
          }
          else if ( ( -1 == sb_id_empty ) &&
                    ( -1 == sb_id_large ) &&
                    ( state < block_state ) &&
                    ( used < ( 1u << ( state >> state_shift ) ) ) ) {

            // First superblock of a larger block size that still has room.

            sb_id_large    = id ;
            sb_state_large = state ;
          }
          // Wrap the search around the superblock array:

          if ( ++id < m_sb_count ) {
            sb_state_array += m_sb_state_size ;
          }
          else {
            id = 0 ;
            sb_state_array = m_sb_state_array ;
          }
        }
        if ( sb_id < 0 ) {

          // Did not find a part-full superblock for this block size.

          if ( 0 <= sb_id_empty ) {

            // Found an empty superblock; attempt to claim it for this
            // block size.  If the claim fails assume another thread
            // claimed it for the same block size and try to use it anyway.

            sb_id = sb_id_empty ;

            sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );

            const uint32_t state_empty = state_header_mask & *sb_state_array ;

            // Update the hint only if this thread claimed the superblock.

            update_hint =
              state_empty ==
              Kokkos::atomic_compare_exchange
                (sb_state_array,state_empty,block_state);
          }
          else if ( 0 <= sb_id_large ) {

            // Found a superblock of a larger block size with room.

            sb_id    = sb_id_large ;
            sb_state = sb_state_large ;

            sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
          }
          else {
            // No usable superblock was found; consume an attempt.
            --attempt_limit ;
          }
        }
        if ( update_hint ) {
          Kokkos::atomic_compare_exchange
            ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
        }
      } // end allocation attempt loop

      return p ;
    }
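  //  Allocation may legitimately return null under contention or exhaustion,
  //  so the result must be checked.  A minimal call-site sketch; the retry
  //  count below is an illustrative assumption:
  //
  //    void * const ptr = pool.allocate( nbytes , 32 /* attempt_limit */ );
  //    if ( 0 == ptr ) { /* handle exhaustion or excessive contention */ }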
  //--------------------------------------------------------------------------
  //  Return an allocated block of memory to the pool.

  KOKKOS_INLINE_FUNCTION
  void deallocate( void * p , size_t ) const noexcept
    {
      if ( 0 == p ) return ;

      // Determine which superblock and block the pointer belongs to:

      const ptrdiff_t d =
        ((char*)p) - ((char*)( m_sb_state_array + m_data_offset ));
      // Verify that the pointer lies within the pool's superblocks:

      const int ok_contains =
        ( 0 <= d ) && ( size_t(d) < ( size_t(m_sb_count) << m_sb_size_lg2 ) );

      int ok_block_aligned = 0 ;
      int ok_dealloc_once  = 0 ;
      if ( ok_contains ) {

        const int sb_id = d >> m_sb_size_lg2 ;

        // State array for the superblock containing the pointer:

        volatile uint32_t * const sb_state_array =
          m_sb_state_array + ( sb_id * m_sb_state_size );

        const uint32_t block_state    = (*sb_state_array) & state_header_mask ;
        const uint32_t block_size_lg2 =
          m_sb_size_lg2 - ( block_state >> state_shift );

        ok_block_aligned = 0 == ( d & ( ( 1UL << block_size_lg2 ) - 1 ) );

        if ( ok_block_aligned ) {

          // Map the pointer to the block's bit within the superblock:
          // mask into the superblock then shift down to the block index.

          const uint32_t bit =
            ( d & ( ptrdiff_t( 1LU << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;

          const int result =
            CB::release( sb_state_array , bit , block_state );

          ok_dealloc_once = 0 <= result ;
#if 0
          printf( "  MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
                , (uintptr_t)m_sb_state_array
                , (uintptr_t)p
                , sb_id
                , (1u << block_size_lg2)
                , (1u << (m_sb_size_lg2 - block_size_lg2))
                , bit
                , result );
#endif
        }
      }
      if ( ! ok_contains || ! ok_block_aligned || ! ok_dealloc_once ) {
        printf( "  MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n"
              , (uintptr_t)m_sb_state_array
              , (uintptr_t)p
              , int(ok_contains)
              , int(ok_block_aligned)
              , int(ok_dealloc_once) );
        Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer");
      }
    }
  //--------------------------------------------------------------------------

  KOKKOS_INLINE_FUNCTION
  int number_of_superblocks() const noexcept { return m_sb_count ; }
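  //  A host-side monitoring sketch using number_of_superblocks() together
  //  with superblock_state(), declared below.  The per-superblock counts
  //  are only meaningful when the pool's memory space is accessible from
  //  the calling execution space.
  //
  //    for ( int i = 0 ; i < pool.number_of_superblocks() ; ++i ) {
  //      int sb_block_size = 0 , sb_capacity = 0 , sb_used = 0 ;
  //      pool.superblock_state( i , sb_block_size , sb_capacity , sb_used );
  //      printf( "superblock %d : block_size %d used %d / %d\n"
  //            , i , sb_block_size , sb_used , sb_capacity );
  //    }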
  KOKKOS_INLINE_FUNCTION
  void superblock_state( int sb_id
                       , int & block_size
                       , int & block_count_capacity
                       , int & block_count_used ) const noexcept
    {
      block_size           = 0 ;
      block_count_capacity = 0 ;
      block_count_used     = 0 ;

      if ( Kokkos::Impl::MemorySpaceAccess
             < Kokkos::Impl::ActiveExecutionMemorySpace
             , base_memory_space >::accessible ) {
        // The pool's memory space is accessible from the active
        // execution space; query the superblock's state word.

        const uint32_t state =
          ((uint32_t volatile *)m_sb_state_array)[ sb_id * m_sb_state_size ];

        const uint32_t block_count_lg2 = state >> state_shift ;
        const uint32_t block_used      = state & state_used_mask ;

        block_size           = 1LU << ( m_sb_size_lg2 - block_count_lg2 );
        block_count_capacity = 1LU << block_count_lg2 ;
        block_count_used     = block_used ;
      }
    }
};

} // namespace Kokkos

#endif /* #ifndef KOKKOS_MEMORYPOOL_HPP */