44 #ifndef KOKKOS_MEMORYPOOL_HPP
45 #define KOKKOS_MEMORYPOOL_HPP
47 #include <Kokkos_Core_fwd.hpp>
50 #include <impl/Kokkos_ConcurrentBitset.hpp>
51 #include <impl/Kokkos_Error.hpp>
52 #include <impl/Kokkos_SharedAlloc.hpp>
64 void memory_pool_bounds_verification
65 (
size_t min_block_alloc_size
66 ,
size_t max_block_alloc_size
67 ,
size_t min_superblock_size
68 ,
size_t max_superblock_size
69 ,
size_t max_block_per_superblock
70 ,
size_t min_total_alloc_size
77 template<
typename DeviceType >
81 typedef typename Kokkos::Impl::concurrent_bitset CB ;
83 enum : uint32_t { bits_per_int_lg2 = CB::bits_per_int_lg2 };
84 enum : uint32_t { state_shift = CB::state_shift };
85 enum : uint32_t { state_used_mask = CB::state_used_mask };
86 enum : uint32_t { state_header_mask = CB::state_header_mask };
87 enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
88 enum : uint32_t { max_bit_count = CB::max_bit_count };
90 enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
112 typedef typename DeviceType::memory_space base_memory_space ;
116 , base_memory_space >::accessible };
118 typedef Kokkos::Impl::SharedAllocationTracker Tracker ;
119 typedef Kokkos::Impl::SharedAllocationRecord
120 < base_memory_space > Record ;
123 uint32_t * m_sb_state_array ;
124 uint32_t m_sb_state_size ;
125 uint32_t m_sb_size_lg2 ;
126 uint32_t m_max_block_size_lg2 ;
127 uint32_t m_min_block_size_lg2 ;
129 int32_t m_hint_offset ;
130 int32_t m_data_offset ;
131 int32_t m_unused_padding ;
135 using memory_space =
typename DeviceType::memory_space;
138 enum : uint32_t { max_superblock_size = 1LU << 31 };
139 enum : uint32_t { max_block_per_superblock = max_bit_count };
143 KOKKOS_INLINE_FUNCTION
144 bool operator==(MemoryPool
const& other)
const
145 {
return m_sb_state_array == other.m_sb_state_array; }
147 KOKKOS_INLINE_FUNCTION
148 size_t capacity() const noexcept
149 {
return size_t(m_sb_count) << m_sb_size_lg2 ; }
151 KOKKOS_INLINE_FUNCTION
152 size_t min_block_size() const noexcept
153 {
return ( 1LU << m_min_block_size_lg2 ); }
155 KOKKOS_INLINE_FUNCTION
156 size_t max_block_size() const noexcept
157 {
return ( 1LU << m_max_block_size_lg2 ); }
159 struct usage_statistics {
160 size_t capacity_bytes ;
161 size_t superblock_bytes ;
162 size_t max_block_bytes ;
163 size_t min_block_bytes ;
164 size_t capacity_superblocks ;
165 size_t consumed_superblocks ;
166 size_t consumed_blocks ;
167 size_t consumed_bytes ;
168 size_t reserved_blocks ;
169 size_t reserved_bytes ;
172 void get_usage_statistics( usage_statistics & stats )
const
176 const size_t alloc_size = m_hint_offset *
sizeof(uint32_t);
178 uint32_t *
const sb_state_array =
179 accessible ? m_sb_state_array : (uint32_t *) host.
allocate(alloc_size);
181 if ( ! accessible ) {
182 Kokkos::Impl::DeepCopy< Kokkos::HostSpace , base_memory_space >
183 ( sb_state_array , m_sb_state_array , alloc_size );
186 stats.superblock_bytes = ( 1LU << m_sb_size_lg2 );
187 stats.max_block_bytes = ( 1LU << m_max_block_size_lg2 );
188 stats.min_block_bytes = ( 1LU << m_min_block_size_lg2 );
189 stats.capacity_bytes = stats.superblock_bytes * m_sb_count ;
190 stats.capacity_superblocks = m_sb_count ;
191 stats.consumed_superblocks = 0 ;
192 stats.consumed_blocks = 0 ;
193 stats.consumed_bytes = 0 ;
194 stats.reserved_blocks = 0 ;
195 stats.reserved_bytes = 0 ;
197 const uint32_t * sb_state_ptr = sb_state_array ;
199 for ( int32_t i = 0 ; i < m_sb_count
200 ; ++i , sb_state_ptr += m_sb_state_size ) {
202 const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;
204 if ( block_count_lg2 ) {
205 const uint32_t block_count = 1u << block_count_lg2 ;
206 const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2 ;
207 const uint32_t block_size = 1u << block_size_lg2 ;
208 const uint32_t block_used = (*sb_state_ptr) & state_used_mask ;
210 stats.consumed_superblocks++ ;
211 stats.consumed_blocks += block_used ;
212 stats.consumed_bytes += block_used * block_size ;
213 stats.reserved_blocks += block_count - block_used ;
214 stats.reserved_bytes += (block_count - block_used ) * block_size ;
218 if ( ! accessible ) {
219 host.
deallocate( sb_state_array, alloc_size );
223 void print_state( std::ostream & s )
const
227 const size_t alloc_size = m_hint_offset *
sizeof(uint32_t);
229 uint32_t *
const sb_state_array =
230 accessible ? m_sb_state_array : (uint32_t *) host.
allocate(alloc_size);
232 if ( ! accessible ) {
233 Kokkos::Impl::DeepCopy< Kokkos::HostSpace , base_memory_space >
234 ( sb_state_array , m_sb_state_array , alloc_size );
237 const uint32_t * sb_state_ptr = sb_state_array ;
239 s <<
"pool_size(" << ( size_t(m_sb_count) << m_sb_size_lg2 ) <<
")"
240 <<
" superblock_size(" << ( 1LU << m_sb_size_lg2 ) <<
")" << std::endl ;
242 for ( int32_t i = 0 ; i < m_sb_count
243 ; ++i , sb_state_ptr += m_sb_state_size ) {
245 if ( *sb_state_ptr ) {
247 const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;
248 const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2 ;
249 const uint32_t block_count = 1u << block_count_lg2 ;
250 const uint32_t block_used = (*sb_state_ptr) & state_used_mask ;
252 s <<
"Superblock[ " << i <<
" / " << m_sb_count <<
" ] {"
253 <<
" block_size(" << ( 1 << block_size_lg2 ) <<
")"
254 <<
" block_count( " << block_used
255 <<
" / " << block_count <<
" )"
260 if ( ! accessible ) {
261 host.
deallocate( sb_state_array, alloc_size );
267 #ifdef KOKKOS_CUDA_9_DEFAULTED_BUG_WORKAROUND
268 KOKKOS_INLINE_FUNCTION MemoryPool( MemoryPool && rhs )
269 : m_tracker(std::move(rhs.m_tracker))
270 , m_sb_state_array(std::move(rhs.m_sb_state_array))
271 , m_sb_state_size(std::move(rhs.m_sb_state_size))
272 , m_sb_size_lg2(std::move(rhs.m_sb_size_lg2))
273 , m_max_block_size_lg2(std::move(rhs.m_max_block_size_lg2))
274 , m_min_block_size_lg2(std::move(rhs.m_min_block_size_lg2))
275 , m_sb_count(std::move(rhs.m_sb_count))
276 , m_hint_offset(std::move(rhs.m_hint_offset))
277 , m_data_offset(std::move(rhs.m_data_offset))
280 KOKKOS_INLINE_FUNCTION MemoryPool(
const MemoryPool & rhs )
281 : m_tracker(rhs.m_tracker)
282 , m_sb_state_array(rhs.m_sb_state_array)
283 , m_sb_state_size(rhs.m_sb_state_size)
284 , m_sb_size_lg2(rhs.m_sb_size_lg2)
285 , m_max_block_size_lg2(rhs.m_max_block_size_lg2)
286 , m_min_block_size_lg2(rhs.m_min_block_size_lg2)
287 , m_sb_count(rhs.m_sb_count)
288 , m_hint_offset(rhs.m_hint_offset)
289 , m_data_offset(rhs.m_data_offset)
292 KOKKOS_INLINE_FUNCTION MemoryPool & operator = ( MemoryPool && rhs )
294 m_tracker = std::move(rhs.m_tracker);
295 m_sb_state_array = std::move(rhs.m_sb_state_array);
296 m_sb_state_size = std::move(rhs.m_sb_state_size);
297 m_sb_size_lg2 = std::move(rhs.m_sb_size_lg2);
298 m_max_block_size_lg2 = std::move(rhs.m_max_block_size_lg2);
299 m_min_block_size_lg2 = std::move(rhs.m_min_block_size_lg2);
300 m_sb_count = std::move(rhs.m_sb_count);
301 m_hint_offset = std::move(rhs.m_hint_offset);
302 m_data_offset = std::move(rhs.m_data_offset);
305 KOKKOS_INLINE_FUNCTION MemoryPool & operator = (
const MemoryPool & rhs )
307 m_tracker = rhs.m_tracker;
308 m_sb_state_array = rhs.m_sb_state_array;
309 m_sb_state_size = rhs.m_sb_state_size;
310 m_sb_size_lg2 = rhs.m_sb_size_lg2;
311 m_max_block_size_lg2 = rhs.m_max_block_size_lg2;
312 m_min_block_size_lg2 = rhs.m_min_block_size_lg2;
313 m_sb_count = rhs.m_sb_count;
314 m_hint_offset = rhs.m_hint_offset;
315 m_data_offset = rhs.m_data_offset;
319 KOKKOS_INLINE_FUNCTION MemoryPool( MemoryPool && ) = default ;
320 KOKKOS_INLINE_FUNCTION MemoryPool(
const MemoryPool & ) = default ;
321 KOKKOS_INLINE_FUNCTION MemoryPool & operator = ( MemoryPool && ) = default ;
322 KOKKOS_INLINE_FUNCTION MemoryPool & operator = (
const MemoryPool & ) = default ;
325 KOKKOS_INLINE_FUNCTION MemoryPool()
327 , m_sb_state_array(0)
330 , m_max_block_size_lg2(0)
331 , m_min_block_size_lg2(0)
335 , m_unused_padding(0)
352 MemoryPool(
const base_memory_space & memspace
353 ,
const size_t min_total_alloc_size
354 ,
size_t min_block_alloc_size = 0
355 ,
size_t max_block_alloc_size = 0
356 ,
size_t min_superblock_size = 0
359 , m_sb_state_array(0)
362 , m_max_block_size_lg2(0)
363 , m_min_block_size_lg2(0)
367 , m_unused_padding(0)
369 const uint32_t int_align_lg2 = 3 ;
370 const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
371 const uint32_t default_min_block_size = 1u << 6 ;
372 const uint32_t default_max_block_size = 1u << 12 ;
373 const uint32_t default_min_superblock_size = 1u << 20 ;
378 if ( 0 == min_block_alloc_size ) {
381 min_superblock_size =
382 std::min(
size_t(default_min_superblock_size)
383 , min_total_alloc_size );
385 min_block_alloc_size =
386 std::min(
size_t(default_min_block_size)
387 , min_superblock_size );
389 max_block_alloc_size =
390 std::min(
size_t(default_max_block_size)
391 , min_superblock_size );
393 else if ( 0 == min_superblock_size ) {
400 const size_t max_superblock =
401 min_block_alloc_size * max_block_per_superblock ;
403 min_superblock_size =
404 std::min( max_superblock ,
405 std::min(
size_t(max_superblock_size)
406 , min_total_alloc_size ) );
409 if ( 0 == max_block_alloc_size ) {
410 max_block_alloc_size = min_superblock_size ;
424 Kokkos::Impl::memory_pool_bounds_verification
425 ( min_block_alloc_size
426 , max_block_alloc_size
427 , min_superblock_size
428 , max_superblock_size
429 , max_block_per_superblock
430 , min_total_alloc_size
437 m_min_block_size_lg2 =
438 Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size);
440 m_max_block_size_lg2 =
441 Kokkos::Impl::integral_power_of_two_that_contains(max_block_alloc_size);
444 Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size);
450 const uint64_t sb_size_mask = ( 1LU << m_sb_size_lg2 ) - 1 ;
452 m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ;
459 const uint32_t max_block_count_lg2 =
460 m_sb_size_lg2 - m_min_block_size_lg2 ;
463 ( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ;
468 const size_t all_sb_state_size =
469 ( m_sb_count * m_sb_state_size + int_align_mask ) & ~int_align_mask ;
473 const int32_t number_block_sizes =
474 1 + m_max_block_size_lg2 - m_min_block_size_lg2 ;
479 const int32_t block_size_array_size =
480 ( number_block_sizes + int_align_mask ) & ~int_align_mask ;
482 m_hint_offset = all_sb_state_size ;
483 m_data_offset = m_hint_offset +
484 block_size_array_size * HINT_PER_BLOCK_SIZE ;
488 const size_t header_size = m_data_offset *
sizeof(uint32_t);
489 const size_t alloc_size = header_size +
490 ( size_t(m_sb_count) << m_sb_size_lg2 );
492 Record * rec = Record::allocate( memspace ,
"MemoryPool" , alloc_size );
494 m_tracker.assign_allocated_record_to_uninitialized( rec );
496 m_sb_state_array = (uint32_t *) rec->data();
500 uint32_t *
const sb_state_array =
501 accessible ? m_sb_state_array
502 : (uint32_t *) host.
allocate(header_size);
504 for ( int32_t i = 0 ; i < m_data_offset ; ++i ) sb_state_array[i] = 0 ;
508 for ( int32_t i = 0 ; i < number_block_sizes ; ++i ) {
509 const uint32_t block_size_lg2 = i + m_min_block_size_lg2 ;
510 const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
511 const uint32_t block_state = block_count_lg2 << state_shift ;
512 const uint32_t hint_begin = m_hint_offset + i * HINT_PER_BLOCK_SIZE ;
518 const int32_t jbeg = ( i * m_sb_count ) / number_block_sizes ;
519 const int32_t jend = ( ( i + 1 ) * m_sb_count ) / number_block_sizes ;
521 sb_state_array[ hint_begin ] = uint32_t(jbeg);
522 sb_state_array[ hint_begin + 1 ] = uint32_t(jbeg);
524 for ( int32_t j = jbeg ; j < jend ; ++j ) {
525 sb_state_array[ j * m_sb_state_size ] = block_state ;
531 if ( ! accessible ) {
532 Kokkos::Impl::DeepCopy< base_memory_space , Kokkos::HostSpace >
533 ( m_sb_state_array , sb_state_array , header_size );
535 host.
deallocate( sb_state_array, header_size );
538 Kokkos::memory_fence();
549 KOKKOS_FORCEINLINE_FUNCTION
550 uint32_t get_block_size_lg2( uint32_t n )
const noexcept
552 const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains( n );
554 return i < m_min_block_size_lg2 ? m_min_block_size_lg2 : i ;
560 KOKKOS_INLINE_FUNCTION
561 uint32_t allocate_block_size( uint64_t alloc_size )
const noexcept
563 return alloc_size <= (1UL << m_max_block_size_lg2)
564 ? ( 1UL << get_block_size_lg2( uint32_t(alloc_size) ) )
579 void * allocate(
size_t alloc_size
580 , int32_t attempt_limit = 1 ) const noexcept
582 if (
size_t(1LU << m_max_block_size_lg2) < alloc_size ) {
583 Kokkos::abort(
"Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
586 if ( 0 == alloc_size )
return (
void*) 0 ;
590 const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );
595 const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
596 const uint32_t block_state = block_count_lg2 << state_shift ;
597 const uint32_t block_count = 1u << block_count_lg2 ;
603 volatile uint32_t *
const hint_sb_id_ptr
606 + HINT_PER_BLOCK_SIZE
607 * ( block_size_lg2 - m_min_block_size_lg2 );
609 const int32_t sb_id_begin = int32_t( hint_sb_id_ptr[1] );
615 const uint32_t block_id_hint =
616 (uint32_t)( Kokkos::Impl::clock_tic()
617 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
620 + ( threadIdx.x + blockDim.x * threadIdx.y )
625 uint32_t sb_state = block_state ;
629 volatile uint32_t * sb_state_array = 0 ;
631 while ( attempt_limit ) {
633 int32_t hint_sb_id = -1 ;
639 sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );
641 sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
648 if ( sb_state == ( state_header_mask & *sb_state_array ) ) {
654 const uint32_t count_lg2 = sb_state >> state_shift ;
655 const uint32_t mask = ( 1u << count_lg2 ) - 1 ;
658 CB::acquire_bounded_lg2( sb_state_array
660 , block_id_hint & mask
669 if ( 0 <= result.
first ) {
671 const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;
675 p = ((
char*)( m_sb_state_array + m_data_offset ))
676 + ( uint64_t(sb_id) << m_sb_size_lg2 )
677 + ( uint64_t(result.
first) << size_lg2 );
680 printf(
" MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
681 , (uintptr_t)m_sb_state_array
705 sb_state = block_state ;
708 bool update_hint = false ;
709 int32_t sb_id_empty = -1 ;
710 int32_t sb_id_large = -1 ;
711 uint32_t sb_state_large = 0 ;
713 sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;
715 for ( int32_t i = 0 ,
id = sb_id_begin ; i < m_sb_count ; ++i ) {
721 const uint32_t full_state = *sb_state_array ;
722 const uint32_t used = full_state & state_used_mask ;
723 const uint32_t state = full_state & state_header_mask ;
725 if ( state == block_state ) {
729 if ( used < block_count ) {
737 update_hint = used + 1 < block_count ;
742 else if ( 0 == used ) {
746 if ( -1 == sb_id_empty ) {
755 else if ( ( -1 == sb_id_empty ) &&
756 ( -1 == sb_id_large ) &&
757 ( state < block_state ) &&
759 ( used < ( 1u << ( state >> state_shift ) ) ) ) {
765 sb_state_large = state ;
770 if ( ++
id < m_sb_count ) {
771 sb_state_array += m_sb_state_size ;
775 sb_state_array = m_sb_state_array ;
785 if ( 0 <= sb_id_empty ) {
793 sb_id = sb_id_empty ;
795 sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
800 const uint32_t state_empty = state_header_mask & *sb_state_array ;
805 Kokkos::atomic_compare_exchange
806 (sb_state_array,state_empty,block_state);
808 else if ( 0 <= sb_id_large ) {
812 sb_id = sb_id_large ;
813 sb_state = sb_state_large ;
815 sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
824 Kokkos::atomic_compare_exchange
825 ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
841 KOKKOS_INLINE_FUNCTION
842 void deallocate(
void * p ,
size_t ) const noexcept
844 if ( 0 == p ) return ;
848 ((
char*)p) - ((
char*)( m_sb_state_array + m_data_offset ));
851 const int ok_contains =
852 ( 0 <= d ) && (
size_t(d) < ( size_t(m_sb_count) << m_sb_size_lg2 ) );
854 int ok_block_aligned = 0 ;
855 int ok_dealloc_once = 0 ;
859 const int sb_id = d >> m_sb_size_lg2 ;
862 volatile uint32_t *
const sb_state_array =
863 m_sb_state_array + ( sb_id * m_sb_state_size );
865 const uint32_t block_state = (*sb_state_array) & state_header_mask ;
866 const uint32_t block_size_lg2 =
867 m_sb_size_lg2 - ( block_state >> state_shift );
869 ok_block_aligned = 0 == ( d & ( ( 1UL << block_size_lg2 ) - 1 ) );
871 if ( ok_block_aligned ) {
877 ( d & ( ptrdiff_t( 1LU << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;
880 CB::release( sb_state_array , bit , block_state );
882 ok_dealloc_once = 0 <= result ;
885 printf(
" MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
886 , (uintptr_t)m_sb_state_array
889 , (1u << block_size_lg2)
890 , (1u << (m_sb_size_lg2 - block_size_lg2))
897 if ( ! ok_contains || ! ok_block_aligned || ! ok_dealloc_once ) {
899 printf(
" MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n"
900 , (uintptr_t)m_sb_state_array
903 ,
int(ok_block_aligned)
904 ,
int(ok_dealloc_once) );
906 Kokkos::abort(
"Kokkos MemoryPool::deallocate given erroneous pointer");
912 KOKKOS_INLINE_FUNCTION
913 int number_of_superblocks() const noexcept {
return m_sb_count ; }
915 KOKKOS_INLINE_FUNCTION
916 void superblock_state(
int sb_id
918 ,
int & block_count_capacity
919 ,
int & block_count_used )
const noexcept
922 block_count_capacity = 0 ;
923 block_count_used = 0 ;
926 < Kokkos::Impl::ActiveExecutionMemorySpace
927 , base_memory_space >::accessible ) {
930 const uint32_t state =
931 ((uint32_t
volatile *)m_sb_state_array)[sb_id*m_sb_state_size];
933 const uint32_t block_count_lg2 = state >> state_shift ;
934 const uint32_t block_used = state & state_used_mask ;
936 block_size = 1LU << ( m_sb_size_lg2 - block_count_lg2 );
937 block_count_capacity = 1LU << block_count_lg2 ;
938 block_count_used = block_used ;
KOKKOS_INLINE_FUNCTION bool operator==(const complex< RealType1 > &x, const complex< RealType2 > &y)
Equality operator for two complex numbers.
void * allocate(const size_t arg_alloc_size) const
Allocate untracked memory in the space.
Replacement for std::pair that works on CUDA devices.
void deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const
Deallocate untracked memory in the space.
first_type first
The first element of the pair.
Memory management for host memory.
Declaration of parallel operators.
second_type second
The second element of the pair.
Access relationship between DstMemorySpace and SrcMemorySpace.