Kokkos Core Kernels Package  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Kokkos_MemoryPool.hpp
1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 2.0
6 // Copyright (2014) Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
39 //
40 // ************************************************************************
41 //@HEADER
42 */
43 
44 #ifndef KOKKOS_MEMORYPOOL_HPP
45 #define KOKKOS_MEMORYPOOL_HPP
46 
47 #include <Kokkos_Core_fwd.hpp>
48 #include <Kokkos_Parallel.hpp>
49 #include <Kokkos_Atomic.hpp>
50 #include <impl/Kokkos_ConcurrentBitset.hpp>
51 #include <impl/Kokkos_Error.hpp>
52 #include <impl/Kokkos_SharedAlloc.hpp>
53 
54 namespace Kokkos {
55 namespace Impl {
56 /* Report violation of size constraints:
57  * min_block_alloc_size <= max_block_alloc_size
58  * max_block_alloc_size <= min_superblock_size
59  * min_superblock_size <= max_superblock_size
60  * min_superblock_size <= min_total_alloc_size
61  * min_superblock_size <= min_block_alloc_size *
62  * max_block_per_superblock
63  */
64 void memory_pool_bounds_verification
65  ( size_t min_block_alloc_size
66  , size_t max_block_alloc_size
67  , size_t min_superblock_size
68  , size_t max_superblock_size
69  , size_t max_block_per_superblock
70  , size_t min_total_alloc_size
71  );
72 }
73 }
74 
75 namespace Kokkos {
76 
77 template< typename DeviceType >
78 class MemoryPool {
79 private:
80 
81  typedef typename Kokkos::Impl::concurrent_bitset CB ;
82 
83  enum : uint32_t { bits_per_int_lg2 = CB::bits_per_int_lg2 };
84  enum : uint32_t { state_shift = CB::state_shift };
85  enum : uint32_t { state_used_mask = CB::state_used_mask };
86  enum : uint32_t { state_header_mask = CB::state_header_mask };
87  enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
88  enum : uint32_t { max_bit_count = CB::max_bit_count };
89 
90  enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
91 
92  /* Each superblock has a concurrent bitset state
93  * which is an array of uint32_t integers.
94  * [ { block_count_lg2 : state_shift bits
95  * , used_block_count : ( 32 - state_shift ) bits
96  * }
97  * , { block allocation bit set }* ]
98  *
99  * As superblocks are assigned (allocated) to a block size
100  * and released (deallocated) back to empty the superblock state
101  * is concurrently updated.
102  */
103 
104  /* Mapping between block_size <-> block_state
105  *
106  * block_state = ( m_sb_size_lg2 - block_size_lg2 ) << state_shift
107  * block_size = m_sb_size_lg2 - ( block_state >> state_shift )
108  *
109  * Thus A_block_size < B_block_size <=> A_block_state > B_block_state
110  */
111 
112  typedef typename DeviceType::memory_space base_memory_space ;
113 
114  enum { accessible =
116  , base_memory_space >::accessible };
117 
118  typedef Kokkos::Impl::SharedAllocationTracker Tracker ;
119  typedef Kokkos::Impl::SharedAllocationRecord
120  < base_memory_space > Record ;
121 
122  Tracker m_tracker ;
123  uint32_t * m_sb_state_array ;
124  uint32_t m_sb_state_size ;
125  uint32_t m_sb_size_lg2 ;
126  uint32_t m_max_block_size_lg2 ;
127  uint32_t m_min_block_size_lg2 ;
128  int32_t m_sb_count ;
129  int32_t m_hint_offset ; // Offset to K * #block_size array of hints
130  int32_t m_data_offset ; // Offset to 0th superblock data
131  int32_t m_unused_padding ;
132 
133 public:
134 
135  using memory_space = typename DeviceType::memory_space;
136 
138  enum : uint32_t { max_superblock_size = 1LU << 31 /* 2 gigabytes */ };
139  enum : uint32_t { max_block_per_superblock = max_bit_count };
140 
141  //--------------------------------------------------------------------------
142 
143  KOKKOS_INLINE_FUNCTION
144  bool operator==(MemoryPool const& other) const
145  { return m_sb_state_array == other.m_sb_state_array; }
146 
147  KOKKOS_INLINE_FUNCTION
148  size_t capacity() const noexcept
149  { return size_t(m_sb_count) << m_sb_size_lg2 ; }
150 
151  KOKKOS_INLINE_FUNCTION
152  size_t min_block_size() const noexcept
153  { return ( 1LU << m_min_block_size_lg2 ); }
154 
155  KOKKOS_INLINE_FUNCTION
156  size_t max_block_size() const noexcept
157  { return ( 1LU << m_max_block_size_lg2 ); }
158 
159  struct usage_statistics {
160  size_t capacity_bytes ;
161  size_t superblock_bytes ;
162  size_t max_block_bytes ;
163  size_t min_block_bytes ;
164  size_t capacity_superblocks ;
165  size_t consumed_superblocks ;
166  size_t consumed_blocks ;
167  size_t consumed_bytes ;
168  size_t reserved_blocks ;
169  size_t reserved_bytes ;
170  };
171 
172  void get_usage_statistics( usage_statistics & stats ) const
173  {
174  Kokkos::HostSpace host ;
175 
176  const size_t alloc_size = m_hint_offset * sizeof(uint32_t);
177 
178  uint32_t * const sb_state_array =
179  accessible ? m_sb_state_array : (uint32_t *) host.allocate(alloc_size);
180 
181  if ( ! accessible ) {
182  Kokkos::Impl::DeepCopy< Kokkos::HostSpace , base_memory_space >
183  ( sb_state_array , m_sb_state_array , alloc_size );
184  }
185 
186  stats.superblock_bytes = ( 1LU << m_sb_size_lg2 );
187  stats.max_block_bytes = ( 1LU << m_max_block_size_lg2 );
188  stats.min_block_bytes = ( 1LU << m_min_block_size_lg2 );
189  stats.capacity_bytes = stats.superblock_bytes * m_sb_count ;
190  stats.capacity_superblocks = m_sb_count ;
191  stats.consumed_superblocks = 0 ;
192  stats.consumed_blocks = 0 ;
193  stats.consumed_bytes = 0 ;
194  stats.reserved_blocks = 0 ;
195  stats.reserved_bytes = 0 ;
196 
197  const uint32_t * sb_state_ptr = sb_state_array ;
198 
199  for ( int32_t i = 0 ; i < m_sb_count
200  ; ++i , sb_state_ptr += m_sb_state_size ) {
201 
202  const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;
203 
204  if ( block_count_lg2 ) {
205  const uint32_t block_count = 1u << block_count_lg2 ;
206  const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2 ;
207  const uint32_t block_size = 1u << block_size_lg2 ;
208  const uint32_t block_used = (*sb_state_ptr) & state_used_mask ;
209 
210  stats.consumed_superblocks++ ;
211  stats.consumed_blocks += block_used ;
212  stats.consumed_bytes += block_used * block_size ;
213  stats.reserved_blocks += block_count - block_used ;
214  stats.reserved_bytes += (block_count - block_used ) * block_size ;
215  }
216  }
217 
218  if ( ! accessible ) {
219  host.deallocate( sb_state_array, alloc_size );
220  }
221  }
222 
223  void print_state( std::ostream & s ) const
224  {
225  Kokkos::HostSpace host ;
226 
227  const size_t alloc_size = m_hint_offset * sizeof(uint32_t);
228 
229  uint32_t * const sb_state_array =
230  accessible ? m_sb_state_array : (uint32_t *) host.allocate(alloc_size);
231 
232  if ( ! accessible ) {
233  Kokkos::Impl::DeepCopy< Kokkos::HostSpace , base_memory_space >
234  ( sb_state_array , m_sb_state_array , alloc_size );
235  }
236 
237  const uint32_t * sb_state_ptr = sb_state_array ;
238 
239  s << "pool_size(" << ( size_t(m_sb_count) << m_sb_size_lg2 ) << ")"
240  << " superblock_size(" << ( 1LU << m_sb_size_lg2 ) << ")" << std::endl ;
241 
242  for ( int32_t i = 0 ; i < m_sb_count
243  ; ++i , sb_state_ptr += m_sb_state_size ) {
244 
245  if ( *sb_state_ptr ) {
246 
247  const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;
248  const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2 ;
249  const uint32_t block_count = 1u << block_count_lg2 ;
250  const uint32_t block_used = (*sb_state_ptr) & state_used_mask ;
251 
252  s << "Superblock[ " << i << " / " << m_sb_count << " ] {"
253  << " block_size(" << ( 1 << block_size_lg2 ) << ")"
254  << " block_count( " << block_used
255  << " / " << block_count << " )"
256  << std::endl ;
257  }
258  }
259 
260  if ( ! accessible ) {
261  host.deallocate( sb_state_array, alloc_size );
262  }
263  }
264 
265  //--------------------------------------------------------------------------
266 
267 #ifdef KOKKOS_CUDA_9_DEFAULTED_BUG_WORKAROUND
268  KOKKOS_INLINE_FUNCTION MemoryPool( MemoryPool && rhs )
269  : m_tracker(std::move(rhs.m_tracker))
270  , m_sb_state_array(std::move(rhs.m_sb_state_array))
271  , m_sb_state_size(std::move(rhs.m_sb_state_size))
272  , m_sb_size_lg2(std::move(rhs.m_sb_size_lg2))
273  , m_max_block_size_lg2(std::move(rhs.m_max_block_size_lg2))
274  , m_min_block_size_lg2(std::move(rhs.m_min_block_size_lg2))
275  , m_sb_count(std::move(rhs.m_sb_count))
276  , m_hint_offset(std::move(rhs.m_hint_offset))
277  , m_data_offset(std::move(rhs.m_data_offset))
278  {
279  }
280  KOKKOS_INLINE_FUNCTION MemoryPool( const MemoryPool & rhs )
281  : m_tracker(rhs.m_tracker)
282  , m_sb_state_array(rhs.m_sb_state_array)
283  , m_sb_state_size(rhs.m_sb_state_size)
284  , m_sb_size_lg2(rhs.m_sb_size_lg2)
285  , m_max_block_size_lg2(rhs.m_max_block_size_lg2)
286  , m_min_block_size_lg2(rhs.m_min_block_size_lg2)
287  , m_sb_count(rhs.m_sb_count)
288  , m_hint_offset(rhs.m_hint_offset)
289  , m_data_offset(rhs.m_data_offset)
290  {
291  }
292  KOKKOS_INLINE_FUNCTION MemoryPool & operator = ( MemoryPool && rhs )
293  {
294  m_tracker = std::move(rhs.m_tracker);
295  m_sb_state_array = std::move(rhs.m_sb_state_array);
296  m_sb_state_size = std::move(rhs.m_sb_state_size);
297  m_sb_size_lg2 = std::move(rhs.m_sb_size_lg2);
298  m_max_block_size_lg2 = std::move(rhs.m_max_block_size_lg2);
299  m_min_block_size_lg2 = std::move(rhs.m_min_block_size_lg2);
300  m_sb_count = std::move(rhs.m_sb_count);
301  m_hint_offset = std::move(rhs.m_hint_offset);
302  m_data_offset = std::move(rhs.m_data_offset);
303  return *this;
304  }
305  KOKKOS_INLINE_FUNCTION MemoryPool & operator = ( const MemoryPool & rhs )
306  {
307  m_tracker = rhs.m_tracker;
308  m_sb_state_array = rhs.m_sb_state_array;
309  m_sb_state_size = rhs.m_sb_state_size;
310  m_sb_size_lg2 = rhs.m_sb_size_lg2;
311  m_max_block_size_lg2 = rhs.m_max_block_size_lg2;
312  m_min_block_size_lg2 = rhs.m_min_block_size_lg2;
313  m_sb_count = rhs.m_sb_count;
314  m_hint_offset = rhs.m_hint_offset;
315  m_data_offset = rhs.m_data_offset;
316  return *this;
317  }
318 #else
319  KOKKOS_INLINE_FUNCTION MemoryPool( MemoryPool && ) = default ;
320  KOKKOS_INLINE_FUNCTION MemoryPool( const MemoryPool & ) = default ;
321  KOKKOS_INLINE_FUNCTION MemoryPool & operator = ( MemoryPool && ) = default ;
322  KOKKOS_INLINE_FUNCTION MemoryPool & operator = ( const MemoryPool & ) = default ;
323 #endif
324 
325  KOKKOS_INLINE_FUNCTION MemoryPool()
326  : m_tracker()
327  , m_sb_state_array(0)
328  , m_sb_state_size(0)
329  , m_sb_size_lg2(0)
330  , m_max_block_size_lg2(0)
331  , m_min_block_size_lg2(0)
332  , m_sb_count(0)
333  , m_hint_offset(0)
334  , m_data_offset(0)
335  , m_unused_padding(0)
336  {}
337 
352  MemoryPool( const base_memory_space & memspace
353  , const size_t min_total_alloc_size
354  , size_t min_block_alloc_size = 0
355  , size_t max_block_alloc_size = 0
356  , size_t min_superblock_size = 0
357  )
358  : m_tracker()
359  , m_sb_state_array(0)
360  , m_sb_state_size(0)
361  , m_sb_size_lg2(0)
362  , m_max_block_size_lg2(0)
363  , m_min_block_size_lg2(0)
364  , m_sb_count(0)
365  , m_hint_offset(0)
366  , m_data_offset(0)
367  , m_unused_padding(0)
368  {
369  const uint32_t int_align_lg2 = 3 ; /* align as int[8] */
370  const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
371  const uint32_t default_min_block_size = 1u << 6 ; /* 64 bytes */
372  const uint32_t default_max_block_size = 1u << 12 ;/* 4k bytes */
373  const uint32_t default_min_superblock_size = 1u << 20 ;/* 1M bytes */
374 
375  //--------------------------------------------------
376  // Default block and superblock sizes:
377 
378  if ( 0 == min_block_alloc_size ) {
379  // Default all sizes:
380 
381  min_superblock_size =
382  std::min( size_t(default_min_superblock_size)
383  , min_total_alloc_size );
384 
385  min_block_alloc_size =
386  std::min( size_t(default_min_block_size)
387  , min_superblock_size );
388 
389  max_block_alloc_size =
390  std::min( size_t(default_max_block_size)
391  , min_superblock_size );
392  }
393  else if ( 0 == min_superblock_size ) {
394 
395  // Choose superblock size as minimum of:
396  // max_block_per_superblock * min_block_size
397  // max_superblock_size
398  // min_total_alloc_size
399 
400  const size_t max_superblock =
401  min_block_alloc_size * max_block_per_superblock ;
402 
403  min_superblock_size =
404  std::min( max_superblock ,
405  std::min( size_t(max_superblock_size)
406  , min_total_alloc_size ) );
407  }
408 
409  if ( 0 == max_block_alloc_size ) {
410  max_block_alloc_size = min_superblock_size ;
411  }
412 
413  //--------------------------------------------------
414 
415  /* Enforce size constraints:
416  * min_block_alloc_size <= max_block_alloc_size
417  * max_block_alloc_size <= min_superblock_size
418  * min_superblock_size <= max_superblock_size
419  * min_superblock_size <= min_total_alloc_size
420  * min_superblock_size <= min_block_alloc_size *
421  * max_block_per_superblock
422  */
423 
424  Kokkos::Impl::memory_pool_bounds_verification
425  ( min_block_alloc_size
426  , max_block_alloc_size
427  , min_superblock_size
428  , max_superblock_size
429  , max_block_per_superblock
430  , min_total_alloc_size
431  );
432 
433  //--------------------------------------------------
434  // Block and superblock size is power of two:
435  // Maximum value is 'max_superblock_size'
436 
437  m_min_block_size_lg2 =
438  Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size);
439 
440  m_max_block_size_lg2 =
441  Kokkos::Impl::integral_power_of_two_that_contains(max_block_alloc_size);
442 
443  m_sb_size_lg2 =
444  Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size);
445 
446  {
447  // number of superblocks is multiple of superblock size that
448  // can hold min_total_alloc_size.
449 
450  const uint64_t sb_size_mask = ( 1LU << m_sb_size_lg2 ) - 1 ;
451 
452  m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ;
453  }
454 
455  {
456  // Any superblock can be assigned to the smallest size block
457  // Size the block bitset to maximum number of blocks
458 
459  const uint32_t max_block_count_lg2 =
460  m_sb_size_lg2 - m_min_block_size_lg2 ;
461 
462  m_sb_state_size =
463  ( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ;
464  }
465 
466  // Array of all superblock states
467 
468  const size_t all_sb_state_size =
469  ( m_sb_count * m_sb_state_size + int_align_mask ) & ~int_align_mask ;
470 
471  // Number of block sizes
472 
473  const int32_t number_block_sizes =
474  1 + m_max_block_size_lg2 - m_min_block_size_lg2 ;
475 
476  // Array length for possible block sizes
477  // Hint array is one uint32_t per block size
478 
479  const int32_t block_size_array_size =
480  ( number_block_sizes + int_align_mask ) & ~int_align_mask ;
481 
482  m_hint_offset = all_sb_state_size ;
483  m_data_offset = m_hint_offset +
484  block_size_array_size * HINT_PER_BLOCK_SIZE ;
485 
486  // Allocation:
487 
488  const size_t header_size = m_data_offset * sizeof(uint32_t);
489  const size_t alloc_size = header_size +
490  ( size_t(m_sb_count) << m_sb_size_lg2 );
491 
492  Record * rec = Record::allocate( memspace , "MemoryPool" , alloc_size );
493 
494  m_tracker.assign_allocated_record_to_uninitialized( rec );
495 
496  m_sb_state_array = (uint32_t *) rec->data();
497 
498  Kokkos::HostSpace host ;
499 
500  uint32_t * const sb_state_array =
501  accessible ? m_sb_state_array
502  : (uint32_t *) host.allocate(header_size);
503 
504  for ( int32_t i = 0 ; i < m_data_offset ; ++i ) sb_state_array[i] = 0 ;
505 
506  // Initial assignment of empty superblocks to block sizes:
507 
508  for ( int32_t i = 0 ; i < number_block_sizes ; ++i ) {
509  const uint32_t block_size_lg2 = i + m_min_block_size_lg2 ;
510  const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
511  const uint32_t block_state = block_count_lg2 << state_shift ;
512  const uint32_t hint_begin = m_hint_offset + i * HINT_PER_BLOCK_SIZE ;
513 
514  // for block size index 'i':
515  // sb_id_hint = sb_state_array[ hint_begin ];
516  // sb_id_begin = sb_state_array[ hint_begin + 1 ];
517 
518  const int32_t jbeg = ( i * m_sb_count ) / number_block_sizes ;
519  const int32_t jend = ( ( i + 1 ) * m_sb_count ) / number_block_sizes ;
520 
521  sb_state_array[ hint_begin ] = uint32_t(jbeg);
522  sb_state_array[ hint_begin + 1 ] = uint32_t(jbeg);
523 
524  for ( int32_t j = jbeg ; j < jend ; ++j ) {
525  sb_state_array[ j * m_sb_state_size ] = block_state ;
526  }
527  }
528 
529  // Write out initialized state:
530 
531  if ( ! accessible ) {
532  Kokkos::Impl::DeepCopy< base_memory_space , Kokkos::HostSpace >
533  ( m_sb_state_array , sb_state_array , header_size );
534 
535  host.deallocate( sb_state_array, header_size );
536  }
537  else {
538  Kokkos::memory_fence();
539  }
540  }
541 
542  //--------------------------------------------------------------------------
543 
544 private:
545 
546  /* Given a size 'n' get the block size in which it can be allocated.
547  * Restrict lower bound to minimum block size.
548  */
549  KOKKOS_FORCEINLINE_FUNCTION
550  uint32_t get_block_size_lg2( uint32_t n ) const noexcept
551  {
552  const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains( n );
553 
554  return i < m_min_block_size_lg2 ? m_min_block_size_lg2 : i ;
555  }
556 
557 public:
558 
559  /* Return 0 for invalid block size */
560  KOKKOS_INLINE_FUNCTION
561  uint32_t allocate_block_size( uint64_t alloc_size ) const noexcept
562  {
563  return alloc_size <= (1UL << m_max_block_size_lg2)
564  ? ( 1UL << get_block_size_lg2( uint32_t(alloc_size) ) )
565  : 0 ;
566  }
567 
568  //--------------------------------------------------------------------------
578  KOKKOS_FUNCTION
579  void * allocate( size_t alloc_size
580  , int32_t attempt_limit = 1 ) const noexcept
581  {
582  if ( size_t(1LU << m_max_block_size_lg2) < alloc_size ) {
583  Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
584  }
585 
586  if ( 0 == alloc_size ) return (void*) 0 ;
587 
588  void * p = 0 ;
589 
590  const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );
591 
592  // Allocation will fit within a superblock
593  // that has block sizes ( 1 << block_size_lg2 )
594 
595  const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
596  const uint32_t block_state = block_count_lg2 << state_shift ;
597  const uint32_t block_count = 1u << block_count_lg2 ;
598 
599  // Superblock hints for this block size:
600  // hint_sb_id_ptr[0] is the dynamically changing hint
601  // hint_sb_id_ptr[1] is the static start point
602 
603  volatile uint32_t * const hint_sb_id_ptr
604  = m_sb_state_array /* memory pool state array */
605  + m_hint_offset /* offset to hint portion of array */
606  + HINT_PER_BLOCK_SIZE /* number of hints per block size */
607  * ( block_size_lg2 - m_min_block_size_lg2 ); /* block size id */
608 
609  const int32_t sb_id_begin = int32_t( hint_sb_id_ptr[1] );
610 
611  // Fast query clock register 'tic' to pseudo-randomize
612  // the guess for which block within a superblock should
613  // be claimed. If not available then a search occurs.
614 
615  const uint32_t block_id_hint =
616  (uint32_t)( Kokkos::Impl::clock_tic()
617 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
618  // Spread out potentially concurrent access
619  // by threads within a warp or thread block.
620  + ( threadIdx.x + blockDim.x * threadIdx.y )
621 #endif
622  );
623 
624  // expected state of superblock for allocation
625  uint32_t sb_state = block_state ;
626 
627  int32_t sb_id = -1 ;
628 
629  volatile uint32_t * sb_state_array = 0 ;
630 
631  while ( attempt_limit ) {
632 
633  int32_t hint_sb_id = -1 ;
634 
635  if ( sb_id < 0 ) {
636 
637  // No superblock specified, try the hint for this block size
638 
639  sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );
640 
641  sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
642  }
643 
644  // Require:
645  // 0 <= sb_id
646  // sb_state_array == m_sb_state_array + m_sb_state_size * sb_id
647 
648  if ( sb_state == ( state_header_mask & *sb_state_array ) ) {
649 
650  // This superblock state is as expected, for the moment.
651  // Attempt to claim a bit. The attempt updates the state
652  // so have already made sure the state header is as expected.
653 
654  const uint32_t count_lg2 = sb_state >> state_shift ;
655  const uint32_t mask = ( 1u << count_lg2 ) - 1 ;
656 
657  const Kokkos::pair<int,int> result =
658  CB::acquire_bounded_lg2( sb_state_array
659  , count_lg2
660  , block_id_hint & mask
661  , sb_state
662  );
663 
664  // If result.first < 0 then failed to acquire
665  // due to either full or buffer was wrong state.
666  // Could be wrong state if a deallocation raced the
667  // superblock to empty before the acquire could succeed.
668 
669  if ( 0 <= result.first ) { // acquired a bit
670 
671  const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;
672 
673  // Set the allocated block pointer
674 
675  p = ((char*)( m_sb_state_array + m_data_offset ))
676  + ( uint64_t(sb_id) << m_sb_size_lg2 ) // superblock memory
677  + ( uint64_t(result.first) << size_lg2 ); // block memory
678 
679 #if 0
680  printf( " MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
681  , (uintptr_t)m_sb_state_array
682  , (uintptr_t)p
683  , alloc_size
684  , sb_id
685  , sb_state
686  , (1u << size_lg2)
687  , (1u << count_lg2)
688  , result.first
689  , result.second );
690 #endif
691 
692  break ; // Success
693  }
694  }
695  //------------------------------------------------------------------
696  // Arrive here if failed to acquire a block.
697  // Must find a new superblock.
698 
699  // Start searching at designated index for this block size.
700  // Look for superblock that, in preferential order,
701  // 1) part-full superblock of this block size
702  // 2) empty superblock to claim for this block size
703  // 3) part-full superblock of the next larger block size
704 
705  sb_state = block_state ; // Expect to find the desired state
706  sb_id = -1 ;
707 
708  bool update_hint = false ;
709  int32_t sb_id_empty = -1 ;
710  int32_t sb_id_large = -1 ;
711  uint32_t sb_state_large = 0 ;
712 
713  sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;
714 
715  for ( int32_t i = 0 , id = sb_id_begin ; i < m_sb_count ; ++i ) {
716 
717  // Query state of the candidate superblock.
718  // Note that the state may change at any moment
719  // as concurrent allocations and deallocations occur.
720 
721  const uint32_t full_state = *sb_state_array ;
722  const uint32_t used = full_state & state_used_mask ;
723  const uint32_t state = full_state & state_header_mask ;
724 
725  if ( state == block_state ) {
726 
727  // Superblock is assigned to this block size
728 
729  if ( used < block_count ) {
730 
731  // There is room to allocate one block
732 
733  sb_id = id ;
734 
735  // Is there room to allocate more than one block?
736 
737  update_hint = used + 1 < block_count ;
738 
739  break ;
740  }
741  }
742  else if ( 0 == used ) {
743 
744  // Superblock is empty
745 
746  if ( -1 == sb_id_empty ) {
747 
748  // Superblock is not assigned to this block size
749  // and is the first empty superblock encountered.
750  // Save this id to use if a partfull superblock is not found.
751 
752  sb_id_empty = id ;
753  }
754  }
755  else if ( ( -1 == sb_id_empty /* have not found an empty */ ) &&
756  ( -1 == sb_id_large /* have not found a larger */ ) &&
757  ( state < block_state /* a larger block */ ) &&
758  // is not full:
759  ( used < ( 1u << ( state >> state_shift ) ) ) ) {
760  // First superblock encountered that is
761  // larger than this block size and
762  // has room for an allocation.
763  // Save this id to use of partfull or empty superblock not found
764  sb_id_large = id ;
765  sb_state_large = state ;
766  }
767 
768  // Iterate around the superblock array:
769 
770  if ( ++id < m_sb_count ) {
771  sb_state_array += m_sb_state_size ;
772  }
773  else {
774  id = 0 ;
775  sb_state_array = m_sb_state_array ;
776  }
777  }
778 
779  // printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);
780 
781  if ( sb_id < 0 ) {
782 
783  // Did not find a partfull superblock for this block size.
784 
785  if ( 0 <= sb_id_empty ) {
786 
787  // Found first empty superblock following designated superblock
788  // Attempt to claim it for this block size.
789  // If the claim fails assume that another thread claimed it
790  // for this block size and try to use it anyway,
791  // but do not update hint.
792 
793  sb_id = sb_id_empty ;
794 
795  sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
796 
797  // If successfully changed assignment of empty superblock 'sb_id'
798  // to this block_size then update the hint.
799 
800  const uint32_t state_empty = state_header_mask & *sb_state_array ;
801 
802  // If this thread claims the empty block then update the hint
803  update_hint =
804  state_empty ==
805  Kokkos::atomic_compare_exchange
806  (sb_state_array,state_empty,block_state);
807  }
808  else if ( 0 <= sb_id_large ) {
809 
810  // Found a larger superblock with space available
811 
812  sb_id = sb_id_large ;
813  sb_state = sb_state_large ;
814 
815  sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
816  }
817  else {
818  // Did not find a potentially usable superblock
819  --attempt_limit ;
820  }
821  }
822 
823  if ( update_hint ) {
824  Kokkos::atomic_compare_exchange
825  ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
826  }
827  } // end allocation attempt loop
828  //--------------------------------------------------------------------
829 
830  return p ;
831  }
832  // end allocate
833  //--------------------------------------------------------------------------
834 
841  KOKKOS_INLINE_FUNCTION
842  void deallocate( void * p , size_t /* alloc_size */ ) const noexcept
843  {
844  if ( 0 == p ) return ;
845 
846  // Determine which superblock and block
847  const ptrdiff_t d =
848  ((char*)p) - ((char*)( m_sb_state_array + m_data_offset ));
849 
850  // Verify contained within the memory pool's superblocks:
851  const int ok_contains =
852  ( 0 <= d ) && ( size_t(d) < ( size_t(m_sb_count) << m_sb_size_lg2 ) );
853 
854  int ok_block_aligned = 0 ;
855  int ok_dealloc_once = 0 ;
856 
857  if ( ok_contains ) {
858 
859  const int sb_id = d >> m_sb_size_lg2 ;
860 
861  // State array for the superblock.
862  volatile uint32_t * const sb_state_array =
863  m_sb_state_array + ( sb_id * m_sb_state_size );
864 
865  const uint32_t block_state = (*sb_state_array) & state_header_mask ;
866  const uint32_t block_size_lg2 =
867  m_sb_size_lg2 - ( block_state >> state_shift );
868 
869  ok_block_aligned = 0 == ( d & ( ( 1UL << block_size_lg2 ) - 1 ) );
870 
871  if ( ok_block_aligned ) {
872 
873  // Map address to block's bit
874  // mask into superblock and then shift down for block index
875 
876  const uint32_t bit =
877  ( d & ( ptrdiff_t( 1LU << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;
878 
879  const int result =
880  CB::release( sb_state_array , bit , block_state );
881 
882  ok_dealloc_once = 0 <= result ;
883 
884 #if 0
885  printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
886  , (uintptr_t)m_sb_state_array
887  , (uintptr_t)p
888  , sb_id
889  , (1u << block_size_lg2)
890  , (1u << (m_sb_size_lg2 - block_size_lg2))
891  , bit
892  , result );
893 #endif
894  }
895  }
896 
897  if ( ! ok_contains || ! ok_block_aligned || ! ok_dealloc_once ) {
898 #if 0
899  printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n"
900  , (uintptr_t)m_sb_state_array
901  , (uintptr_t)p
902  , int(ok_contains)
903  , int(ok_block_aligned)
904  , int(ok_dealloc_once) );
905 #endif
906  Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer");
907  }
908  }
909  // end deallocate
910  //--------------------------------------------------------------------------
911 
912  KOKKOS_INLINE_FUNCTION
913  int number_of_superblocks() const noexcept { return m_sb_count ; }
914 
915  KOKKOS_INLINE_FUNCTION
916  void superblock_state( int sb_id
917  , int & block_size
918  , int & block_count_capacity
919  , int & block_count_used ) const noexcept
920  {
921  block_size = 0 ;
922  block_count_capacity = 0 ;
923  block_count_used = 0 ;
924 
926  < Kokkos::Impl::ActiveExecutionMemorySpace
927  , base_memory_space >::accessible ) {
928  // Can access the state array
929 
930  const uint32_t state =
931  ((uint32_t volatile *)m_sb_state_array)[sb_id*m_sb_state_size];
932 
933  const uint32_t block_count_lg2 = state >> state_shift ;
934  const uint32_t block_used = state & state_used_mask ;
935 
936  block_size = 1LU << ( m_sb_size_lg2 - block_count_lg2 );
937  block_count_capacity = 1LU << block_count_lg2 ;
938  block_count_used = block_used ;
939  }
940  }
941 };
942 
943 } // namespace Kokkos
944 
945 #endif /* #ifndef KOKKOS_MEMORYPOOL_HPP */
946 
KOKKOS_INLINE_FUNCTION bool operator==(const complex< RealType1 > &x, const complex< RealType2 > &y)
Equality operator for two complex numbers.
void * allocate(const size_t arg_alloc_size) const
Allocate untracked memory in the space.
Replacement for std::pair that works on CUDA devices.
Definition: Kokkos_Pair.hpp:64
void deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const
Deallocate untracked memory in the space.
first_type first
The first element of the pair.
Definition: Kokkos_Pair.hpp:72
Memory management for host memory.
Declaration of parallel operators.
Atomic functions.
second_type second
The second element of the pair.
Definition: Kokkos_Pair.hpp:74
Access relationship between DstMemorySpace and SrcMemorySpace.