Kokkos_Parallel_MP_Vector.hpp
// @HEADER
// *****************************************************************************
// Stokhos Package
//
// Copyright 2009 NTESS and the Stokhos contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef KOKKOS_PARALLEL_MP_VECTOR_HPP
#define KOKKOS_PARALLEL_MP_VECTOR_HPP

#include "Sacado_MP_Vector.hpp"
#include "Kokkos_Core.hpp"

//----------------------------------------------------------------------------
// Kokkos execution policies useful for the Sacado::MP::Vector scalar type
//----------------------------------------------------------------------------
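//
// (Background, added for context: Sacado::MP::Vector is an "ensemble" scalar
// type that carries multiple sample values in each logical scalar. The work
// configuration below exposes that ensemble dimension to Kokkos as a
// team-like second index, so a backend that can thread or vectorize across
// it -- as the Cuda specialization below does -- may exploit it.)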

namespace Kokkos {

//! Team-based parallel work configuration for Sacado::MP::Vector
template< class ExecSpace, class Tag = void >
struct MPVectorWorkConfig {

  typedef MPVectorWorkConfig execution_policy ;
  typedef ExecSpace execution_space ;
  typedef Tag work_tag ;

  ExecSpace space_;
  size_t range;
  size_t team;
  size_t shared;

  //! Specify the work range, team size, and optional shared memory size
  //! in the provided execution space instance
  MPVectorWorkConfig( const execution_space & space,
                      const size_t range_,
                      const size_t team_,
                      const size_t shared_ = 0 ) :
    space_(space), range(range_), team(team_), shared(shared_) {}

  //! Specify the work range, team size, and optional shared memory size
  //! in the default execution space instance
  MPVectorWorkConfig( const size_t range_,
                      const size_t team_,
                      const size_t shared_ = 0 ) :
    MPVectorWorkConfig(execution_space(), range_, team_, shared_) {}

  ExecSpace space() const { return space_; }
};
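
// Example usage (an illustrative sketch, not part of this header -- the
// kernel functor, extents, and ensemble size below are hypothetical). The
// host-backend specializations in Impl below dispatch through a plain
// RangePolicy and call the functor with a single index, while the Cuda
// specialization also passes the intra-team (ensemble) index, so a portable
// functor would provide both call operators:
//
//   struct MyKernel {
//     KOKKOS_INLINE_FUNCTION
//     void operator()(const size_t i) const {
//       // host path: process all ensemble entries of work item i
//     }
//     KOKKOS_INLINE_FUNCTION
//     void operator()(const size_t i, const size_t j) const {
//       // Cuda path: process ensemble entry j of work item i
//     }
//   };
//
//   const size_t num_items = 1000, ensemble_size = 16;
//   Kokkos::MPVectorWorkConfig<Kokkos::DefaultExecutionSpace> config(
//     num_items, ensemble_size );
//   Kokkos::parallel_for( config, MyKernel() );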

namespace Impl {

#if defined( KOKKOS_ENABLE_THREADS )
// Specialization of ParallelFor<> for MPVectorWorkConfig and Threads.
// The default implementation ignores the team size and uses the standard
// work-range implementation. In the future we may try to use hyperthreads
// in a useful way. That would require:
// -- interpreting the team size as the Sacado ensemble size, rather than
//    as a thread-team size
// -- determining the vector size of the architecture
// -- laying out the threads differently to use hyperthreads across the
//    Sacado dimension
template< class FunctorType, class Tag >
class ParallelFor< FunctorType , MPVectorWorkConfig< Threads, Tag > > :
  public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, Threads > > {
  typedef Kokkos::RangePolicy< Tag, Threads > Policy ;
public:
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< Threads, Tag > & work_config ) :
    ParallelFor< FunctorType , Policy >( functor ,
                                         Policy( 0, work_config.range ) ) {}
};
#endif

#if defined( KOKKOS_ENABLE_OPENMP )
// Specialization of ParallelFor<> for MPVectorWorkConfig and OpenMP.
// The default implementation ignores the team size and uses the standard
// work-range implementation. In the future we may try to use hyperthreads
// in a useful way. That would require:
// -- interpreting the team size as the Sacado ensemble size, rather than
//    as a thread-team size
// -- determining the vector size of the architecture
// -- laying out the threads differently to use hyperthreads across the
//    Sacado dimension
template< class FunctorType, class Tag >
class ParallelFor< FunctorType , MPVectorWorkConfig< OpenMP, Tag > > :
  public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, OpenMP > > {
  typedef Kokkos::RangePolicy< Tag, OpenMP > Policy ;
public:
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< OpenMP, Tag > & work_config ) :
    ParallelFor< FunctorType , Policy >( functor ,
                                         Policy( 0, work_config.range ) ) {}
};
#endif

#if defined(KOKKOS_ENABLE_SERIAL)
// Specialization of ParallelFor<> for MPVectorWorkConfig and Serial.
// The default implementation ignores the team size and uses the standard
// work-range implementation. In the future we may try to use hyperthreads
// in a useful way. That would require:
// -- interpreting the team size as the Sacado ensemble size, rather than
//    as a thread-team size
// -- determining the vector size of the architecture
// -- laying out the threads differently to use hyperthreads across the
//    Sacado dimension
template< class FunctorType, class Tag >
class ParallelFor< FunctorType , MPVectorWorkConfig< Serial, Tag > > :
  public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, Serial > > {
  typedef Kokkos::RangePolicy< Tag, Serial > Policy ;
public:
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< Serial, Tag > & work_config ) :
    ParallelFor< FunctorType , Policy >( functor ,
                                         Policy( 0, work_config.range ) ) {}
};
#endif // defined(KOKKOS_ENABLE_SERIAL)

#if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )

// Specialization of ParallelFor<> for MPVectorWorkConfig on Cuda.
// Here we use threadIdx.x for each entry in the specified team size.
template< class FunctorType, class Tag >
class ParallelFor< FunctorType , MPVectorWorkConfig< Cuda, Tag > > {
public:

  typedef Kokkos::RangePolicy< Tag, Cuda > Policy;

  const FunctorType m_functor ;
  const MPVectorWorkConfig< Cuda, Tag > m_config;
  const Cuda::size_type m_work ;
  const Policy m_policy;

  template <class TagType>
  inline __device__
  typename std::enable_if<std::is_same<TagType, void>::value>::type
  exec_range(const Cuda::size_type i, const Cuda::size_type j) const {
    m_functor(i, j);
  }

  template <class TagType>
  inline __device__
  typename std::enable_if<!std::is_same<TagType, void>::value>::type
  exec_range(const Cuda::size_type i, const Cuda::size_type j) const {
    m_functor(TagType(), i, j);
  }

  Policy const& get_policy() const { return m_policy; }

  inline
  __device__
  void operator()(void) const
  {
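    // (Added note on the thread mapping, as implied by the launch
    // configuration in execute() below: threadIdx.x indexes the team
    // (Sacado ensemble) dimension, while threadIdx.y plus the block index
    // walk the work range in strides of blockDim.y * gridDim.x, so a range
    // larger than the grid is still fully covered by this loop.)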
    const Cuda::size_type work_stride = blockDim.y * gridDim.x ;

    for ( Cuda::size_type iwork = threadIdx.y + blockDim.y * blockIdx.x ;
          iwork < m_work ;
          iwork += work_stride ) {
      this->template exec_range<Tag>(iwork, threadIdx.x);
    }
  }

  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< Cuda, Tag > & work_config )
    : m_functor( functor ) ,
      m_config( work_config ) ,
      m_work( work_config.range ),
      m_policy()
  {
  }

  inline
  void execute() const
  {
    // To do: query the number of registers used by the functor and adjust
    // nwarp accordingly to get maximum occupancy

    auto const maxWarpCount = std::min<unsigned>(
      m_policy.space().cuda_device_prop().maxThreadsPerBlock / CudaTraits::WarpSize,
      CudaTraits::WarpSize);

    Cuda::size_type nwarp = 0;
    if (m_config.team > CudaTraits::WarpSize) {
      const Cuda::size_type warps_per_team =
        ( m_config.team + CudaTraits::WarpSize-1 ) / CudaTraits::WarpSize;
      nwarp = maxWarpCount / warps_per_team;
    }
    else {
      const Cuda::size_type teams_per_warp =
        CudaTraits::WarpSize / m_config.team ;
      nwarp = maxWarpCount * teams_per_warp;
    }
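    // (Added note: a worked example with the usual CudaTraits::WarpSize of
    // 32. With team = 8, teams_per_warp = 32 / 8 = 4, so nwarp =
    // maxWarpCount * 4; with team = 48, warps_per_team = (48 + 31) / 32 = 2,
    // so nwarp = maxWarpCount / 2. Since blockDim.x = team below, nwarp
    // counts rows of the block (blockDim.y) rather than literal 32-thread
    // warps; either way the block stays within maxThreadsPerBlock.)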
    const dim3 block( m_config.team , nwarp , 1 );

    const Cuda::size_type maxGridSizeX = m_policy.space().cuda_device_prop().maxGridSize[0];
    Cuda::size_type nblock =
      std::min( (m_work + block.y - 1 ) / block.y , maxGridSizeX );
    const dim3 grid( nblock , 1 , 1 );

    const Cuda::size_type shared = m_config.shared;
    CudaParallelLaunch< ParallelFor >( *this , grid , block , shared , m_policy.space().impl_internal_space_instance() );
  }
};

#endif

} // namespace Impl

} // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* #ifndef KOKKOS_PARALLEL_MP_VECTOR_HPP */