Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Stokhos_Cuda_DeviceProp.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef STOKHOS_CUDA_DEVICE_PROP_HPP
11 #define STOKHOS_CUDA_DEVICE_PROP_HPP
12 
13 #include "Kokkos_Core.hpp"
14 
16 
17 #include "cuda_runtime_api.h"
18 
19 namespace Stokhos {
20 
21  // Class encapsulating various device attributes
22  class DeviceProp {
23  public:
24 
25  typedef Kokkos::Cuda::size_type size_type;
26 
29 
42 
44  bool has_ldg;
45 
46  DeviceProp(int device_id = -1) :
56  warp_size(0),
58  max_regs_per_sm(0),
60  reg_bank_size(0),
61  has_shuffle(false),
62  has_ldg(false)
63  {
64  // If device_id is negative, use currently selected device
65  if (device_id < 0)
66  cudaGetDevice(&device_id);
67 
68  // Get compute capability
69  int major, minor;
70  cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor,
71  device_id);
72  cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor,
73  device_id);
76 
77  // Require compute capability >= 2
79  compute_capability_major < 2, std::logic_error,
80  "Cuda compute capability >= 2 is required!");
81 
82  // These come from the CUDA occupancy calculator
83  if (compute_capability_major == 7) {
84  if (compute_capability_minor == 0) {
85  shared_memory_capacity = 96 * 1024;
86  }
87  else {
88  shared_memory_capacity = 64 * 1024;
89  }
90 
91  max_shmem_per_block = 48 * 1024;
92  max_regs_per_block = 64 * 1024;
93  max_regs_per_sm = 64 * 1024;
95  max_threads_per_block = 1024;
96 
97  if (compute_capability_minor == 0) {
98  max_threads_per_sm = 2048;
99  max_warps_per_sm = 64;
100  max_blocks_per_sm = 32;
101  }
102  else {
103  max_threads_per_sm = 1024;
104  max_warps_per_sm = 32;
105  max_blocks_per_sm = 16;
106  }
107 
108  warp_size = 32;
109  warp_granularity = 4; // ??
110  reg_bank_size = 256;
111  has_shuffle = true;
112  has_ldg = true;
113  }
114 
115  else if (compute_capability_major == 6) {
116  if (compute_capability_minor == 1)
117  shared_memory_capacity = 96 * 1024;
118  else
119  shared_memory_capacity = 64 * 1024;
120 
122  max_regs_per_block = 64 * 1024;
123  else
124  max_regs_per_block = 32 * 1024;
125 
126  max_shmem_per_block = 48 * 1024;
127  max_regs_per_sm = 64 * 1024;
129  max_threads_per_block = 1024;
130 
131  if (compute_capability_minor == 2) {
132  max_threads_per_sm = 4096;
133  max_warps_per_sm = 128;
134  }
135  else {
136  max_threads_per_sm = 2048;
137  max_warps_per_sm = 64;
138  }
139  max_blocks_per_sm = 32;
140 
141  warp_size = 32;
142  if (compute_capability_minor == 0)
143  warp_granularity = 2;
144  else
145  warp_granularity = 4;
146  reg_bank_size = 256;
147  has_shuffle = true;
148  has_ldg = true;
149  }
150 
151  else if (compute_capability_major == 3) {
152  if (compute_capability_minor >= 7) {
153  shared_memory_capacity = 112 * 1024;
154  max_shmem_per_block = 48 * 1024;
155  max_regs_per_sm = 128 * 1024;
156  max_regs_per_block = 64 * 1024;
157  }
158  else {
159  shared_memory_capacity = 48 * 1024;
160  max_shmem_per_block = 48 * 1024;
161  max_regs_per_sm = 64 * 1024;
162  max_regs_per_block = 64 * 1024;
163  }
165  max_threads_per_block = 1024;
166  max_threads_per_sm = 2048;
167  max_blocks_per_sm = 16;
168  max_warps_per_sm = 64;
169  warp_size = 32;
170  warp_granularity = 4;
171  reg_bank_size = 256;
172  has_shuffle = true;
173  has_ldg = true;
174  }
175 
176  else if (compute_capability_major == 2) {
177  shared_memory_capacity = 48 * 1024;
179  max_shmem_per_block = 48 * 1024;
180  max_threads_per_block = 1024;
181  max_threads_per_sm = 1536;
182  max_blocks_per_sm = 8;
183  max_warps_per_sm = 48;
184  warp_size = 32;
185  warp_granularity = 2;
186  max_regs_per_sm = 32 * 1024;
187  max_regs_per_block = 32 * 1024;
188  reg_bank_size = 64;
189  has_shuffle = false;
190  has_ldg = false;
191  }
192 
193  else
195  true, std::logic_error,
196  "DeviceProp not configured for compute capability " <<
198  }
199 
200  // Returns number of registers per thread used by the given kernel
201  template <typename Kernel>
202  size_type
203  get_kernel_registers(Kernel kernel) {
204 #ifdef __CUDACC__
205  typedef void (*func_ptr_t)();
206  func_ptr_t func_ptr = reinterpret_cast<func_ptr_t>(kernel);
207  cudaFuncAttributes attrib;
208  cudaFuncGetAttributes(&attrib, func_ptr);
209  return attrib.numRegs;
210 #else
211  return 0;
212 #endif
213  }
214 
215  // Returns number of resident warps per sm for the given kernel
216  template <typename Kernel>
217  size_type
218  get_resident_warps_per_sm(Kernel kernel) {
219  const size_type regs_per_thread = get_kernel_registers(kernel);
220  const size_type regs_per_warp =
221  (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
222  const size_type warps_per_sm =
223  (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
224  return warps_per_sm;
225  }
226  };
227 
228 } // namespace Stokhos
229 
230 #endif /* #ifndef STOKHOS_CUDA_DEVICE_PROP_HPP */
Kokkos::Cuda::size_type size_type
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
size_type get_resident_warps_per_sm(Kernel kernel)
size_type get_kernel_registers(Kernel kernel)