Kokkos Core Kernels Package  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Kokkos_MemoryPool.hpp
1 //@HEADER
2 // ************************************************************************
3 //
4 // Kokkos v. 4.0
5 // Copyright (2022) National Technology & Engineering
6 // Solutions of Sandia, LLC (NTESS).
7 //
8 // Under the terms of Contract DE-NA0003525 with NTESS,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
12 // See https://kokkos.org/LICENSE for license information.
13 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
14 //
15 //@HEADER
16 
17 #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
18 #include <Kokkos_Macros.hpp>
19 static_assert(false,
20  "Including non-public Kokkos header files is not allowed.");
21 #endif
22 #ifndef KOKKOS_MEMORYPOOL_HPP
23 #define KOKKOS_MEMORYPOOL_HPP
24 
25 #include <Kokkos_Core_fwd.hpp>
26 #include <Kokkos_Parallel.hpp>
27 #include <Kokkos_Atomic.hpp>
28 #include <impl/Kokkos_ConcurrentBitset.hpp>
29 #include <impl/Kokkos_Error.hpp>
30 #include <impl/Kokkos_SharedAlloc.hpp>
31 
32 // NOLINTBEGIN(bugprone-implicit-widening-of-multiplication-result)
33 namespace Kokkos {
34 namespace Impl {
35 /* Report violation of size constraints:
36  * min_block_alloc_size <= max_block_alloc_size
37  * max_block_alloc_size <= min_superblock_size
38  * min_superblock_size <= max_superblock_size
39  * min_superblock_size <= min_total_alloc_size
40  * min_superblock_size <= min_block_alloc_size *
41  * max_block_per_superblock
42  */
43 void memory_pool_bounds_verification(size_t min_block_alloc_size,
44  size_t max_block_alloc_size,
45  size_t min_superblock_size,
46  size_t max_superblock_size,
47  size_t max_block_per_superblock,
48  size_t min_total_alloc_size);
49 } // namespace Impl
50 } // namespace Kokkos
51 
52 namespace Kokkos {
53 
54 namespace Impl {
55 
56 void _print_memory_pool_state(std::ostream &s, uint32_t const *sb_state_ptr,
57  int32_t sb_count, uint32_t sb_size_lg2,
58  uint32_t sb_state_size, uint32_t state_shift,
59  uint32_t state_used_mask);
60 
61 } // end namespace Impl
62 
63 template <typename DeviceType>
64 class MemoryPool {
65  private:
66  using CB = Kokkos::Impl::concurrent_bitset;
67 
68  enum : uint32_t { bits_per_int_lg2 = CB::bits_per_int_lg2 };
69  enum : uint32_t { state_shift = CB::state_shift };
70  enum : uint32_t { state_used_mask = CB::state_used_mask };
71  enum : uint32_t { state_header_mask = CB::state_header_mask };
72  enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
73  enum : uint32_t { max_bit_count = CB::max_bit_count };
74 
75  enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
76 
77  static KOKKOS_FUNCTION unsigned integral_power_of_two_that_contains(
78  const unsigned N) {
79  return N ? Kokkos::bit_width(N - 1) : 0;
80  }
81 
82  /* Each superblock has a concurrent bitset state
83  * which is an array of uint32_t integers.
84  * [ { block_count_lg2 : state_shift bits
85  * , used_block_count : ( 32 - state_shift ) bits
86  * }
87  * , { block allocation bit set }* ]
88  *
89  * As superblocks are assigned (allocated) to a block size
90  * and released (deallocated) back to empty the superblock state
91  * is concurrently updated.
92  */
93 
94  /* Mapping between block_size <-> block_state
95  *
96  * block_state = ( m_sb_size_lg2 - block_size_lg2 ) << state_shift
97  * block_size = m_sb_size_lg2 - ( block_state >> state_shift )
98  *
99  * Thus A_block_size < B_block_size <=> A_block_state > B_block_state
100  */
101 
102  using base_memory_space = typename DeviceType::memory_space;
103 
104  enum {
106  base_memory_space>::accessible
107  };
108 
109  using Tracker = Kokkos::Impl::SharedAllocationTracker;
110  using Record = Kokkos::Impl::SharedAllocationRecord<base_memory_space>;
111 
112  Tracker m_tracker;
113  uint32_t *m_sb_state_array;
114  uint32_t m_sb_state_size;
115  uint32_t m_sb_size_lg2;
116  uint32_t m_max_block_size_lg2;
117  uint32_t m_min_block_size_lg2;
118  int32_t m_sb_count;
119  int32_t m_hint_offset; // Offset to K * #block_size array of hints
120  int32_t m_data_offset; // Offset to 0th superblock data
121  int32_t m_unused_padding;
122 
123  public:
124  using memory_space = typename DeviceType::memory_space;
125 
127  enum : uint32_t { max_superblock_size = 1LU << 31 /* 2 gigabytes */ };
128  enum : uint32_t { max_block_per_superblock = max_bit_count };
129 
130  //--------------------------------------------------------------------------
131 
132  KOKKOS_INLINE_FUNCTION
133  bool operator==(MemoryPool const &other) const {
134  return m_sb_state_array == other.m_sb_state_array;
135  }
136 
137  KOKKOS_INLINE_FUNCTION
138  size_t capacity() const noexcept {
139  return size_t(m_sb_count) << m_sb_size_lg2;
140  }
141 
142  KOKKOS_INLINE_FUNCTION
143  size_t min_block_size() const noexcept {
144  return (1LU << m_min_block_size_lg2);
145  }
146 
147  KOKKOS_INLINE_FUNCTION
148  size_t max_block_size() const noexcept {
149  return (1LU << m_max_block_size_lg2);
150  }
151 
152  struct usage_statistics {
153  size_t capacity_bytes;
154  size_t superblock_bytes;
155  size_t max_block_bytes;
156  size_t min_block_bytes;
157  size_t capacity_superblocks;
158  size_t consumed_superblocks;
159  size_t consumed_blocks;
160  size_t consumed_bytes;
161  size_t reserved_blocks;
162  size_t reserved_bytes;
163  };
164 
165  // This function is templated to avoid needing a full definition of
166  // DefaultHostExecutionSpace at class instantiation
167  template <typename ExecutionSpace = Kokkos::DefaultHostExecutionSpace>
168  void get_usage_statistics(usage_statistics &stats) const {
169  Kokkos::HostSpace host;
170  static_assert(
171  std::is_same_v<ExecutionSpace, Kokkos::DefaultHostExecutionSpace>);
172 
173  const size_t alloc_size = m_hint_offset * sizeof(uint32_t);
174 
175  uint32_t *const sb_state_array =
176  accessible ? m_sb_state_array : (uint32_t *)host.allocate(alloc_size);
177 
178  if (!accessible) {
179  Kokkos::Impl::DeepCopy<Kokkos::HostSpace, base_memory_space>(
180  ExecutionSpace{}, sb_state_array, m_sb_state_array, alloc_size);
181  Kokkos::fence(
182  "MemoryPool::get_usage_statistics(): fence after copying state "
183  "array to HostSpace");
184  }
185 
186  stats.superblock_bytes = (1LU << m_sb_size_lg2);
187  stats.max_block_bytes = (1LU << m_max_block_size_lg2);
188  stats.min_block_bytes = (1LU << m_min_block_size_lg2);
189  stats.capacity_bytes = stats.superblock_bytes * m_sb_count;
190  stats.capacity_superblocks = m_sb_count;
191  stats.consumed_superblocks = 0;
192  stats.consumed_blocks = 0;
193  stats.consumed_bytes = 0;
194  stats.reserved_blocks = 0;
195  stats.reserved_bytes = 0;
196 
197  const uint32_t *sb_state_ptr = sb_state_array;
198 
199  for (int32_t i = 0; i < m_sb_count; ++i, sb_state_ptr += m_sb_state_size) {
200  const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift;
201 
202  if (block_count_lg2) {
203  const uint32_t block_count = 1u << block_count_lg2;
204  const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2;
205  const uint32_t block_size = 1u << block_size_lg2;
206  const uint32_t block_used = (*sb_state_ptr) & state_used_mask;
207 
208  stats.consumed_superblocks++;
209  stats.consumed_blocks += block_used;
210  stats.consumed_bytes += static_cast<size_t>(block_used) * block_size;
211  stats.reserved_blocks += block_count - block_used;
212  stats.reserved_bytes +=
213  static_cast<size_t>(block_count - block_used) * block_size;
214  }
215  }
216 
217  if (!accessible) {
218  host.deallocate(sb_state_array, alloc_size);
219  }
220  }
221 
222  // This function is templated to avoid needing a full definition of
223  // DefaultHostExecutionSpace at class instantiation
224  template <typename ExecutionSpace = Kokkos::DefaultHostExecutionSpace>
225  void print_state(std::ostream &s) const {
226  Kokkos::HostSpace host;
227  static_assert(
228  std::is_same_v<ExecutionSpace, Kokkos::DefaultHostExecutionSpace>);
229 
230  const size_t alloc_size = m_hint_offset * sizeof(uint32_t);
231 
232  uint32_t *const sb_state_array =
233  accessible ? m_sb_state_array : (uint32_t *)host.allocate(alloc_size);
234 
235  if (!accessible) {
236  Kokkos::Impl::DeepCopy<Kokkos::HostSpace, base_memory_space>(
237  ExecutionSpace{}, sb_state_array, m_sb_state_array, alloc_size);
238  Kokkos::fence(
239  "MemoryPool::print_state(): fence after copying state array to "
240  "HostSpace");
241  }
242 
243  Impl::_print_memory_pool_state(s, sb_state_array, m_sb_count, m_sb_size_lg2,
244  m_sb_state_size, state_shift,
245  state_used_mask);
246 
247  if (!accessible) {
248  host.deallocate(sb_state_array, alloc_size);
249  }
250  }
251 
252  //--------------------------------------------------------------------------
253 
254  KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default;
255  KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default;
256  KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default;
257  KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(const MemoryPool &) = default;
258 
259  KOKKOS_INLINE_FUNCTION MemoryPool()
260  : m_tracker(),
261  m_sb_state_array(nullptr),
262  m_sb_state_size(0),
263  m_sb_size_lg2(0),
264  m_max_block_size_lg2(0),
265  m_min_block_size_lg2(0),
266  m_sb_count(0),
267  m_hint_offset(0),
268  m_data_offset(0),
269  m_unused_padding(0) {}
270 
285  MemoryPool(const base_memory_space &memspace,
286  const size_t min_total_alloc_size, size_t min_block_alloc_size = 0,
287  size_t max_block_alloc_size = 0, size_t min_superblock_size = 0)
288  : m_tracker(),
289  m_sb_state_array(nullptr),
290  m_sb_state_size(0),
291  m_sb_size_lg2(0),
292  m_max_block_size_lg2(0),
293  m_min_block_size_lg2(0),
294  m_sb_count(0),
295  m_hint_offset(0),
296  m_data_offset(0),
297  m_unused_padding(0) {
298  const uint32_t int_align_lg2 = 3; /* align as int[8] */
299  const uint32_t int_align_mask = (1u << int_align_lg2) - 1;
300  const uint32_t default_min_block_size = 1u << 6; /* 64 bytes */
301  const uint32_t default_max_block_size = 1u << 12; /* 4k bytes */
302  const uint32_t default_min_superblock_size = 1u << 20; /* 1M bytes */
303 
304  //--------------------------------------------------
305  // Default block and superblock sizes:
306 
307  if (0 == min_block_alloc_size) {
308  // Default all sizes:
309 
310  min_superblock_size =
311  std::min(size_t(default_min_superblock_size), min_total_alloc_size);
312 
313  min_block_alloc_size =
314  std::min(size_t(default_min_block_size), min_superblock_size);
315 
316  max_block_alloc_size =
317  std::min(size_t(default_max_block_size), min_superblock_size);
318  } else if (0 == min_superblock_size) {
319  // Choose superblock size as minimum of:
320  // max_block_per_superblock * min_block_size
321  // max_superblock_size
322  // min_total_alloc_size
323 
324  const size_t max_superblock =
325  min_block_alloc_size * max_block_per_superblock;
326 
327  min_superblock_size =
328  std::min(max_superblock,
329  std::min(size_t(max_superblock_size), min_total_alloc_size));
330  }
331 
332  if (0 == max_block_alloc_size) {
333  max_block_alloc_size = min_superblock_size;
334  }
335 
336  //--------------------------------------------------
337 
338  /* Enforce size constraints:
339  * min_block_alloc_size <= max_block_alloc_size
340  * max_block_alloc_size <= min_superblock_size
341  * min_superblock_size <= max_superblock_size
342  * min_superblock_size <= min_total_alloc_size
343  * min_superblock_size <= min_block_alloc_size *
344  * max_block_per_superblock
345  */
346 
347  Kokkos::Impl::memory_pool_bounds_verification(
348  min_block_alloc_size, max_block_alloc_size, min_superblock_size,
349  max_superblock_size, max_block_per_superblock, min_total_alloc_size);
350 
351  //--------------------------------------------------
352  // Block and superblock size is power of two:
353  // Maximum value is 'max_superblock_size'
354 
355  m_min_block_size_lg2 =
356  integral_power_of_two_that_contains(min_block_alloc_size);
357 
358  m_max_block_size_lg2 =
359  integral_power_of_two_that_contains(max_block_alloc_size);
360 
361  m_sb_size_lg2 = integral_power_of_two_that_contains(min_superblock_size);
362 
363  {
364  // number of superblocks is multiple of superblock size that
365  // can hold min_total_alloc_size.
366 
367  const uint64_t sb_size_mask = (1LU << m_sb_size_lg2) - 1;
368 
369  m_sb_count = (min_total_alloc_size + sb_size_mask) >> m_sb_size_lg2;
370  }
371 
372  {
373  // Any superblock can be assigned to the smallest size block
374  // Size the block bitset to maximum number of blocks
375 
376  const uint32_t max_block_count_lg2 = m_sb_size_lg2 - m_min_block_size_lg2;
377 
378  m_sb_state_size =
379  (CB::buffer_bound_lg2(max_block_count_lg2) + int_align_mask) &
380  ~int_align_mask;
381  }
382 
383  // Array of all superblock states
384 
385  const size_t all_sb_state_size =
386  (m_sb_count * m_sb_state_size + int_align_mask) & ~int_align_mask;
387 
388  // Number of block sizes
389 
390  const int32_t number_block_sizes =
391  1 + m_max_block_size_lg2 - m_min_block_size_lg2;
392 
393  // Array length for possible block sizes
394  // Hint array is one uint32_t per block size
395 
396  const int32_t block_size_array_size =
397  (number_block_sizes + int_align_mask) & ~int_align_mask;
398 
399  m_hint_offset = all_sb_state_size;
400  m_data_offset = m_hint_offset + block_size_array_size * HINT_PER_BLOCK_SIZE;
401 
402  // Allocation:
403 
404  const size_t header_size = m_data_offset * sizeof(uint32_t);
405  const size_t alloc_size =
406  header_size + (size_t(m_sb_count) << m_sb_size_lg2);
407 
408  Record *rec = Record::allocate(memspace, "Kokkos::MemoryPool", alloc_size);
409 
410  m_tracker.assign_allocated_record_to_uninitialized(rec);
411 
412  m_sb_state_array = (uint32_t *)rec->data();
413 
414  Kokkos::HostSpace host;
415 
416  uint32_t *const sb_state_array =
417  accessible ? m_sb_state_array : (uint32_t *)host.allocate(header_size);
418 
419  for (int32_t i = 0; i < m_data_offset; ++i) sb_state_array[i] = 0;
420 
421  // Initial assignment of empty superblocks to block sizes:
422 
423  for (int32_t i = 0; i < number_block_sizes; ++i) {
424  const uint32_t block_size_lg2 = i + m_min_block_size_lg2;
425  const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2;
426  const uint32_t block_state = block_count_lg2 << state_shift;
427  const uint32_t hint_begin = m_hint_offset + i * HINT_PER_BLOCK_SIZE;
428 
429  // for block size index 'i':
430  // sb_id_hint = sb_state_array[ hint_begin ];
431  // sb_id_begin = sb_state_array[ hint_begin + 1 ];
432 
433  const int32_t jbeg = (i * m_sb_count) / number_block_sizes;
434  const int32_t jend = ((i + 1) * m_sb_count) / number_block_sizes;
435 
436  sb_state_array[hint_begin] = uint32_t(jbeg);
437  sb_state_array[hint_begin + 1] = uint32_t(jbeg);
438 
439  for (int32_t j = jbeg; j < jend; ++j) {
440  sb_state_array[j * m_sb_state_size] = block_state;
441  }
442  }
443 
444  // Write out initialized state:
445 
446  if (!accessible) {
447  Kokkos::Impl::DeepCopy<base_memory_space, Kokkos::HostSpace>(
448  typename base_memory_space::execution_space{}, m_sb_state_array,
449  sb_state_array, header_size);
450  Kokkos::fence(
451  "MemoryPool::MemoryPool(): fence after copying state array from "
452  "HostSpace");
453 
454  host.deallocate(sb_state_array, header_size);
455  } else {
456  Kokkos::memory_fence();
457  }
458  }
459 
460  //--------------------------------------------------------------------------
461 
462  private:
463  /* Given a size 'n' get the block size in which it can be allocated.
464  * Restrict lower bound to minimum block size.
465  */
466  KOKKOS_FORCEINLINE_FUNCTION
467  uint32_t get_block_size_lg2(uint32_t n) const noexcept {
468  const unsigned i = integral_power_of_two_that_contains(n);
469 
470  return i < m_min_block_size_lg2 ? m_min_block_size_lg2 : i;
471  }
472 
473  public:
474  /* Return 0 for invalid block size */
475  KOKKOS_INLINE_FUNCTION
476  uint32_t allocate_block_size(uint64_t alloc_size) const noexcept {
477  return alloc_size <= (uint64_t(1) << m_max_block_size_lg2)
478  ? (1UL << get_block_size_lg2(uint32_t(alloc_size)))
479  : 0;
480  }
481 
482  //--------------------------------------------------------------------------
492  KOKKOS_FUNCTION
493  void *allocate(size_t alloc_size, int32_t attempt_limit = 1) const noexcept {
494  if ((size_t(1) << m_max_block_size_lg2) < alloc_size) {
495  Kokkos::abort(
496  "Kokkos MemoryPool allocation request exceeded specified maximum "
497  "allocation size");
498  }
499 
500  if (0 == alloc_size) return nullptr;
501 
502  void *p = nullptr;
503 
504  const uint32_t block_size_lg2 = get_block_size_lg2(alloc_size);
505 
506  // Allocation will fit within a superblock
507  // that has block sizes ( 1 << block_size_lg2 )
508 
509  const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2;
510  const uint32_t block_state = block_count_lg2 << state_shift;
511  const uint32_t block_count = 1u << block_count_lg2;
512 
513  // Superblock hints for this block size:
514  // hint_sb_id_ptr[0] is the dynamically changing hint
515  // hint_sb_id_ptr[1] is the static start point
516 
517  volatile uint32_t *const hint_sb_id_ptr =
518  m_sb_state_array /* memory pool state array */
519  + m_hint_offset /* offset to hint portion of array */
520  + HINT_PER_BLOCK_SIZE /* number of hints per block size */
521  * (block_size_lg2 - m_min_block_size_lg2); /* block size id */
522 
523  const int32_t sb_id_begin = int32_t(hint_sb_id_ptr[1]);
524 
525  // Fast query clock register 'tic' to pseudo-randomize
526  // the guess for which block within a superblock should
527  // be claimed. If not available then a search occurs.
528 #if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU)
529  const uint32_t block_id_hint = alloc_size;
530 #else
531  const uint32_t block_id_hint =
532  (uint32_t)(Kokkos::Impl::clock_tic()
533 #ifdef __CUDA_ARCH__ // FIXME_CUDA
534  // Spread out potentially concurrent access
535  // by threads within a warp or thread block.
536  + (threadIdx.x + blockDim.x * threadIdx.y)
537 #endif
538  );
539 #endif
540 
541  // expected state of superblock for allocation
542  uint32_t sb_state = block_state;
543 
544  int32_t sb_id = -1;
545 
546  volatile uint32_t *sb_state_array = nullptr;
547 
548  while (attempt_limit) {
549  int32_t hint_sb_id = -1;
550 
551  if (sb_id < 0) {
552  // No superblock specified, try the hint for this block size
553 
554  sb_id = hint_sb_id = int32_t(*hint_sb_id_ptr);
555 
556  sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
557  }
558 
559  // Require:
560  // 0 <= sb_id
561  // sb_state_array == m_sb_state_array + m_sb_state_size * sb_id
562 
563  if (sb_state == (state_header_mask & *sb_state_array)) {
564  // This superblock state is as expected, for the moment.
565  // Attempt to claim a bit. The attempt updates the state
566  // so have already made sure the state header is as expected.
567 
568  const uint32_t count_lg2 = sb_state >> state_shift;
569  const uint32_t mask = (1u << count_lg2) - 1;
570 
571  const Kokkos::pair<int, int> result = CB::acquire_bounded_lg2(
572  sb_state_array, count_lg2, block_id_hint & mask, sb_state);
573 
574  // If result.first < 0 then failed to acquire
575  // due to either full or buffer was wrong state.
576  // Could be wrong state if a deallocation raced the
577  // superblock to empty before the acquire could succeed.
578 
579  if (0 <= result.first) { // acquired a bit
580 
581  const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2;
582 
583  // Set the allocated block pointer
584 
585  p = ((char *)(m_sb_state_array + m_data_offset)) +
586  (uint64_t(sb_id) << m_sb_size_lg2) // superblock memory
587  + (uint64_t(result.first) << size_lg2); // block memory
588 
589  break; // Success
590  }
591  }
592  //------------------------------------------------------------------
593  // Arrive here if failed to acquire a block.
594  // Must find a new superblock.
595 
596  // Start searching at designated index for this block size.
597  // Look for superblock that, in preferential order,
598  // 1) part-full superblock of this block size
599  // 2) empty superblock to claim for this block size
600  // 3) part-full superblock of the next larger block size
601 
602  sb_state = block_state; // Expect to find the desired state
603  sb_id = -1;
604 
605  bool update_hint = false;
606  int32_t sb_id_empty = -1;
607  int32_t sb_id_large = -1;
608  uint32_t sb_state_large = 0;
609 
610  sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size;
611 
612  for (int32_t i = 0, id = sb_id_begin; i < m_sb_count; ++i) {
613  // Query state of the candidate superblock.
614  // Note that the state may change at any moment
615  // as concurrent allocations and deallocations occur.
616 
617  const uint32_t full_state = *sb_state_array;
618  const uint32_t used = full_state & state_used_mask;
619  const uint32_t state = full_state & state_header_mask;
620 
621  if (state == block_state) {
622  // Superblock is assigned to this block size
623 
624  if (used < block_count) {
625  // There is room to allocate one block
626 
627  sb_id = id;
628 
629  // Is there room to allocate more than one block?
630 
631  update_hint = used + 1 < block_count;
632 
633  break;
634  }
635  } else if (0 == used) {
636  // Superblock is empty
637 
638  if (-1 == sb_id_empty) {
639  // Superblock is not assigned to this block size
640  // and is the first empty superblock encountered.
641  // Save this id to use if a partfull superblock is not found.
642 
643  sb_id_empty = id;
644  }
645  } else if ((-1 == sb_id_empty /* have not found an empty */) &&
646  (-1 == sb_id_large /* have not found a larger */) &&
647  (state < block_state /* a larger block */) &&
648  // is not full:
649  (used < (1u << (state >> state_shift)))) {
650  // First superblock encountered that is
651  // larger than this block size and
652  // has room for an allocation.
653  // Save this id to use of partfull or empty superblock not found
654  sb_id_large = id;
655  sb_state_large = state;
656  }
657 
658  // Iterate around the superblock array:
659 
660  if (++id < m_sb_count) {
661  sb_state_array += m_sb_state_size;
662  } else {
663  id = 0;
664  sb_state_array = m_sb_state_array;
665  }
666  }
667 
668  // printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d)
669  // sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);
670 
671  if (sb_id < 0) {
672  // Did not find a partfull superblock for this block size.
673 
674  if (0 <= sb_id_empty) {
675  // Found first empty superblock following designated superblock
676  // Attempt to claim it for this block size.
677  // If the claim fails assume that another thread claimed it
678  // for this block size and try to use it anyway,
679  // but do not update hint.
680 
681  sb_id = sb_id_empty;
682 
683  sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
684 
685  // If successfully changed assignment of empty superblock 'sb_id'
686  // to this block_size then update the hint.
687 
688  const uint32_t state_empty = state_header_mask & *sb_state_array;
689 
690  // If this thread claims the empty block then update the hint
691  update_hint =
692  state_empty == Kokkos::atomic_compare_exchange(
693  sb_state_array, state_empty, block_state);
694  } else if (0 <= sb_id_large) {
695  // Found a larger superblock with space available
696 
697  sb_id = sb_id_large;
698  sb_state = sb_state_large;
699 
700  sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
701  } else {
702  // Did not find a potentially usable superblock
703  --attempt_limit;
704  }
705  }
706 
707  if (update_hint) {
708  Kokkos::atomic_compare_exchange(hint_sb_id_ptr, uint32_t(hint_sb_id),
709  uint32_t(sb_id));
710  }
711  } // end allocation attempt loop
712  //--------------------------------------------------------------------
713 
714  return p;
715  }
716  // end allocate
717  //--------------------------------------------------------------------------
718 
725  KOKKOS_INLINE_FUNCTION
726  void deallocate(void *p, size_t /* alloc_size */) const noexcept {
727  if (nullptr == p) return;
728 
729  // Determine which superblock and block
730  const ptrdiff_t d =
731  static_cast<char *>(p) -
732  reinterpret_cast<char *>(m_sb_state_array + m_data_offset);
733 
734  // Verify contained within the memory pool's superblocks:
735  const int ok_contains =
736  (0 <= d) && (size_t(d) < (size_t(m_sb_count) << m_sb_size_lg2));
737 
738  int ok_block_aligned = 0;
739  int ok_dealloc_once = 0;
740 
741  if (ok_contains) {
742  const int sb_id = d >> m_sb_size_lg2;
743 
744  // State array for the superblock.
745  volatile uint32_t *const sb_state_array =
746  m_sb_state_array + (sb_id * m_sb_state_size);
747 
748  const uint32_t block_state = (*sb_state_array) & state_header_mask;
749  const uint32_t block_size_lg2 =
750  m_sb_size_lg2 - (block_state >> state_shift);
751 
752  ok_block_aligned = 0 == (d & ((1UL << block_size_lg2) - 1));
753 
754  if (ok_block_aligned) {
755  // Map address to block's bit
756  // mask into superblock and then shift down for block index
757 
758  const uint32_t bit =
759  (d & ((ptrdiff_t(1) << m_sb_size_lg2) - 1)) >> block_size_lg2;
760 
761  const int result = CB::release(sb_state_array, bit, block_state);
762 
763  ok_dealloc_once = 0 <= result;
764  }
765  }
766 
767  if (!ok_contains || !ok_block_aligned || !ok_dealloc_once) {
768  Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer");
769  }
770  }
771  // end deallocate
772  //--------------------------------------------------------------------------
773 
774  KOKKOS_INLINE_FUNCTION
775  int number_of_superblocks() const noexcept { return m_sb_count; }
776 
777  KOKKOS_INLINE_FUNCTION
778  void superblock_state(int sb_id, int &block_size, int &block_count_capacity,
779  int &block_count_used) const noexcept {
780  block_size = 0;
781  block_count_capacity = 0;
782  block_count_used = 0;
783 
784  bool can_access_state_array = []() {
785  KOKKOS_IF_ON_HOST(
786  (return SpaceAccessibility<DefaultHostExecutionSpace,
787  base_memory_space>::accessible;))
788  KOKKOS_IF_ON_DEVICE(
789  (return SpaceAccessibility<DefaultExecutionSpace,
790  base_memory_space>::accessible;))
791  }();
792 
793  if (can_access_state_array) {
794  // Can access the state array
795 
796  const uint32_t state =
797  ((uint32_t volatile *)m_sb_state_array)[sb_id * m_sb_state_size];
798 
799  const uint32_t block_count_lg2 = state >> state_shift;
800  const uint32_t block_used = state & state_used_mask;
801 
802  block_size = 1LU << (m_sb_size_lg2 - block_count_lg2);
803  block_count_capacity = 1LU << block_count_lg2;
804  block_count_used = block_used;
805  }
806  }
807 };
808 
809 } // namespace Kokkos
810  // NOLINTEND(bugprone-implicit-widening-of-multiplication-result)
811 
812 #endif /* #ifndef KOKKOS_MEMORYPOOL_HPP */
void * allocate(const ExecutionSpace &, const size_t arg_alloc_size) const
Allocate untracked memory in the space.
Replacement for std::pair that works on CUDA devices.
Definition: Kokkos_Pair.hpp:44
void deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const
Deallocate untracked memory in the space.
first_type first
The first element of the pair.
Definition: Kokkos_Pair.hpp:51
Memory management for host memory.
Declaration of parallel operators.
Atomic functions.
Access relationship between DstMemorySpace and SrcMemorySpace.