Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Stokhos_Cuda_DeviceProp.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #ifndef STOKHOS_CUDA_DEVICE_PROP_HPP
43 #define STOKHOS_CUDA_DEVICE_PROP_HPP
44 
45 #include "Kokkos_Core.hpp"
46 
48 
49 #include "cuda_runtime_api.h"
50 
51 namespace Stokhos {
52 
53  // Class encapsulating various device attributes
54  class DeviceProp {
55  public:
56 
57  typedef Kokkos::Cuda::size_type size_type;
58 
61 
74 
76  bool has_ldg;
77 
78  DeviceProp(int device_id = -1) :
88  warp_size(0),
90  max_regs_per_sm(0),
92  reg_bank_size(0),
93  has_shuffle(false),
94  has_ldg(false)
95  {
96  // If device_id is negative, use currently selected device
97  if (device_id < 0)
98  cudaGetDevice(&device_id);
99 
100  // Get compute capability
101  int major, minor;
102  cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor,
103  device_id);
104  cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor,
105  device_id);
106  compute_capability_major = major;
107  compute_capability_minor = minor;
108 
109  // Require compute capability >= 2
111  compute_capability_major < 2, std::logic_error,
112  "Cuda compute capability >= 2 is required!");
113 
114  // These come from the CUDA occupancy calculator
115  if (compute_capability_major == 7) {
116  if (compute_capability_minor == 0) {
117  shared_memory_capacity = 96 * 1024;
118  }
119  else {
120  shared_memory_capacity = 64 * 1024;
121  }
122 
123  max_shmem_per_block = 48 * 1024;
124  max_regs_per_block = 64 * 1024;
125  max_regs_per_sm = 64 * 1024;
127  max_threads_per_block = 1024;
128 
129  if (compute_capability_minor == 0) {
130  max_threads_per_sm = 2048;
131  max_warps_per_sm = 64;
132  max_blocks_per_sm = 32;
133  }
134  else {
135  max_threads_per_sm = 1024;
136  max_warps_per_sm = 32;
137  max_blocks_per_sm = 16;
138  }
139 
140  warp_size = 32;
141  warp_granularity = 4; // ??
142  reg_bank_size = 256;
143  has_shuffle = true;
144  has_ldg = true;
145  }
146 
147  else if (compute_capability_major == 6) {
148  if (compute_capability_minor == 1)
149  shared_memory_capacity = 96 * 1024;
150  else
151  shared_memory_capacity = 64 * 1024;
152 
154  max_regs_per_block = 64 * 1024;
155  else
156  max_regs_per_block = 32 * 1024;
157 
158  max_shmem_per_block = 48 * 1024;
159  max_regs_per_sm = 64 * 1024;
161  max_threads_per_block = 1024;
162 
163  if (compute_capability_minor == 2) {
164  max_threads_per_sm = 4096;
165  max_warps_per_sm = 128;
166  }
167  else {
168  max_threads_per_sm = 2048;
169  max_warps_per_sm = 64;
170  }
171  max_blocks_per_sm = 32;
172 
173  warp_size = 32;
174  if (compute_capability_minor == 0)
175  warp_granularity = 2;
176  else
177  warp_granularity = 4;
178  reg_bank_size = 256;
179  has_shuffle = true;
180  has_ldg = true;
181  }
182 
183  else if (compute_capability_major == 3) {
184  if (compute_capability_minor >= 7) {
185  shared_memory_capacity = 112 * 1024;
186  max_shmem_per_block = 48 * 1024;
187  max_regs_per_sm = 128 * 1024;
188  max_regs_per_block = 64 * 1024;
189  }
190  else {
191  shared_memory_capacity = 48 * 1024;
192  max_shmem_per_block = 48 * 1024;
193  max_regs_per_sm = 64 * 1024;
194  max_regs_per_block = 64 * 1024;
195  }
197  max_threads_per_block = 1024;
198  max_threads_per_sm = 2048;
199  max_blocks_per_sm = 16;
200  max_warps_per_sm = 64;
201  warp_size = 32;
202  warp_granularity = 4;
203  reg_bank_size = 256;
204  has_shuffle = true;
205  has_ldg = true;
206  }
207 
208  else if (compute_capability_major == 2) {
209  shared_memory_capacity = 48 * 1024;
211  max_shmem_per_block = 48 * 1024;
212  max_threads_per_block = 1024;
213  max_threads_per_sm = 1536;
214  max_blocks_per_sm = 8;
215  max_warps_per_sm = 48;
216  warp_size = 32;
217  warp_granularity = 2;
218  max_regs_per_sm = 32 * 1024;
219  max_regs_per_block = 32 * 1024;
220  reg_bank_size = 64;
221  has_shuffle = false;
222  has_ldg = false;
223  }
224 
225  else
227  true, std::logic_error,
228  "DeviceProp not configured for compute capability " <<
230  }
231 
232  // Returns number of registers per thread used by the given kernel
233  template <typename Kernel>
234  size_type
235  get_kernel_registers(Kernel kernel) {
236 #ifdef __CUDACC__
237  typedef void (*func_ptr_t)();
238  func_ptr_t func_ptr = reinterpret_cast<func_ptr_t>(kernel);
239  cudaFuncAttributes attrib;
240  cudaFuncGetAttributes(&attrib, func_ptr);
241  return attrib.numRegs;
242 #else
243  return 0;
244 #endif
245  }
246 
247  // Returns number of resident warps per sm for the given kernel
248  template <typename Kernel>
249  size_type
250  get_resident_warps_per_sm(Kernel kernel) {
251  const size_type regs_per_thread = get_kernel_registers(kernel);
252  const size_type regs_per_warp =
253  (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
254  const size_type warps_per_sm =
255  (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
256  return warps_per_sm;
257  }
258  };
259 
260 } // namespace Stokhos
261 
262 #endif /* #ifndef STOKHOS_CUDA_DEVICE_PROP_HPP */
Kokkos::Cuda::size_type size_type
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
size_type get_resident_warps_per_sm(Kernel kernel)
size_type get_kernel_registers(Kernel kernel)