10 #ifndef KOKKOS_PARALLEL_MP_VECTOR_HPP
11 #define KOKKOS_PARALLEL_MP_VECTOR_HPP
14 #include "Kokkos_Core.hpp"
// Team-based parallel work configuration for Sacado::MP::Vector, parameterized
// on an execution space and an optional work tag (Tag defaults to void).
// NOTE(review): the struct header and member declarations are not visible in
// this excerpt; only the template parameter list and the trailing parameter of
// two constructors appear below.
25 template<
class ExecSpace,
class Tag =
void >
// Trailing constructor parameter: shared_ defaults to 0 when omitted.
42 const size_t shared_ = 0 ) :
// Second constructor fragment: same shared_ parameter with the same default.
48 const size_t shared_ = 0 ) :
56 #if defined( KOKKOS_ENABLE_THREADS )
// ParallelFor specialization for an MPVectorWorkConfig on the Threads
// execution space. It adds no behavior of its own: it derives from the
// ParallelFor specialization for Kokkos::RangePolicy< Tag, Threads > and
// forwards construction to it.
65 template<
class FunctorType,
class Tag >
66 class ParallelFor< FunctorType , MPVectorWorkConfig< Threads, Tag > > :
67 public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, Threads > > {
// Range policy type used to construct the base class.
68 typedef Kokkos::RangePolicy< Tag, Threads > Policy ;
// Builds the base-class ParallelFor over the half-open range
// [0, work_config.range). Only work_config.range is read here; no other
// field of the work configuration is used by this constructor.
70 ParallelFor(
const FunctorType & functor ,
71 const MPVectorWorkConfig< Threads, Tag > & work_config ) :
72 ParallelFor< FunctorType , Policy >( functor ,
73 Policy( 0, work_config.range ) ) {}
77 #if defined( KOKKOS_ENABLE_OPENMP )
// ParallelFor specialization for an MPVectorWorkConfig on the OpenMP
// execution space. It derives from the ParallelFor specialization for
// Kokkos::RangePolicy< Tag, OpenMP > and only forwards construction to it.
86 template<
class FunctorType,
class Tag >
87 class ParallelFor< FunctorType , MPVectorWorkConfig< OpenMP, Tag > > :
88 public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, OpenMP > > {
// Range policy type used to construct the base class.
89 typedef Kokkos::RangePolicy< Tag, OpenMP > Policy ;
// Builds the base-class ParallelFor over the range [0, work_config.range).
// Only work_config.range is read by this constructor.
91 ParallelFor(
const FunctorType & functor ,
92 const MPVectorWorkConfig< OpenMP, Tag > & work_config ) :
93 ParallelFor< FunctorType , Policy >( functor ,
94 Policy( 0, work_config.range ) ) {}
98 #if defined(KOKKOS_ENABLE_SERIAL)
// ParallelFor specialization for an MPVectorWorkConfig on the Serial
// execution space. It derives from the ParallelFor specialization for
// Kokkos::RangePolicy< Tag, Serial > and only forwards construction to it.
107 template<
class FunctorType,
class Tag >
108 class ParallelFor< FunctorType , MPVectorWorkConfig< Serial, Tag > > :
109 public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, Serial > > {
// Range policy type used to construct the base class.
110 typedef Kokkos::RangePolicy< Tag, Serial > Policy ;
// Builds the base-class ParallelFor over the range [0, work_config.range).
// Only work_config.range is read by this constructor.
112 ParallelFor(
const FunctorType & functor ,
113 const MPVectorWorkConfig< Serial, Tag > & work_config ) :
114 ParallelFor< FunctorType , Policy >( functor ,
115 Policy( 0, work_config.range ) ) {}
117 #endif // defined(KOKKOS_ENABLE_SERIAL)
119 #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
// ParallelFor specialization for an MPVectorWorkConfig on the Cuda execution
// space. Unlike the host specializations, this one does not derive from the
// RangePolicy-based ParallelFor: it stores the functor and configuration
// itself, provides the kernel body in operator(), and computes its own
// block/grid dimensions before calling CudaParallelLaunch.
// NOTE(review): several lines of the class (access specifiers, closing
// braces, a loop condition, an `else`, and a method header) are not visible in
// this excerpt; comments below describe only what is shown.
123 template<
class FunctorType,
class Tag >
124 class ParallelFor< FunctorType , MPVectorWorkConfig< Cuda, Tag > > {
// Range policy type; an instance is kept so the execution space instance
// can be queried through m_policy.space().
127 typedef Kokkos::RangePolicy< Tag, Cuda > Policy;
// Copy of the user functor invoked for each (work index, thread index) pair.
129 const FunctorType m_functor ;
// Copy of the work configuration; its team and shared fields are read when
// the launch dimensions are computed.
130 const MPVectorWorkConfig< Cuda, Tag > m_config;
// Number of work items, initialized from work_config.range.
131 const Cuda::size_type m_work ;
// Policy object used to reach the Cuda execution space instance.
132 const Policy m_policy;
// exec_range overload enabled only when TagType is void.
// NOTE(review): the body of this overload is not visible in this excerpt;
// presumably it calls m_functor(i, j) without a tag argument — confirm.
134 template <
class TagType>
136 typename std::enable_if<std::is_same<TagType, void>::value>::type
137 exec_range(
const Cuda::size_type i, Cuda::size_type
j)
const {
// exec_range overload enabled only when TagType is not void: a
// default-constructed TagType is passed as the functor's first argument.
141 template <
class TagType>
143 typename std::enable_if<!std::is_same<TagType, void>::value>::type
144 exec_range(
const Cuda::size_type i, Cuda::size_type
j)
const {
145 m_functor(TagType(), i, j);
// Accessor returning the stored policy by const reference.
148 Policy
const& get_policy()
const {
return m_policy; }
// Kernel body. Each (threadIdx.y, blockIdx.x) combination starts at work
// index threadIdx.y + blockDim.y * blockIdx.x and advances by
// blockDim.y * gridDim.x per iteration; threadIdx.x is passed to the functor
// as the second index.
// NOTE(review): the loop's termination condition is not visible in this
// excerpt; presumably it is iwork < m_work — confirm.
152 void operator()(
void)
const
154 const Cuda::size_type work_stride = blockDim.y * gridDim.x ;
156 for ( Cuda::size_type iwork = threadIdx.y + blockDim.y * blockIdx.x ;
158 iwork += work_stride ) {
159 this->
template exec_range<Tag>(iwork, threadIdx.x);
// Constructor: copies the functor and work configuration and records
// work_config.range as the number of work items.
// NOTE(review): the initializer for m_policy and the constructor body are
// not visible in this excerpt.
163 ParallelFor(
const FunctorType & functor ,
164 const MPVectorWorkConfig< Cuda, Tag > & work_config )
165 : m_functor( functor ) ,
166 m_config( work_config ) ,
167 m_work( work_config.range ),
// Launch-configuration code.
// NOTE(review): the header of the enclosing member function is not visible
// in this excerpt; presumably this is the execute() method — confirm.
//
// maxWarpCount is the smaller of (maxThreadsPerBlock / WarpSize) and
// WarpSize, taken from the device properties of the policy's space.
178 auto const maxWarpCount = std::min<unsigned>(
179 m_policy.space().cuda_device_prop().maxThreadsPerBlock / CudaTraits::WarpSize,
180 CudaTraits::WarpSize);
// nwarp becomes the block's y-dimension below.
182 Cuda::size_type nwarp = 0;
// Team larger than a warp: round the team size up to whole warps and divide
// the warp budget by the warps each team needs.
183 if (m_config.team > CudaTraits::WarpSize) {
184 const Cuda::size_type warps_per_team =
185 ( m_config.team + CudaTraits::WarpSize-1 ) / CudaTraits::WarpSize;
186 nwarp = maxWarpCount / warps_per_team;
// Team no larger than a warp: multiply the warp budget by the number of
// teams that fit in one warp.
// NOTE(review): the `else` introducing this branch is not visible in this
// excerpt. Also, m_config.team == 0 would divide by zero here — confirm
// callers never pass a zero team size.
189 const Cuda::size_type teams_per_warp =
190 CudaTraits::WarpSize / m_config.team ;
191 nwarp = maxWarpCount * teams_per_warp;
// Block shape: x = team size, y = nwarp, z = 1.
193 const dim3 block( m_config.team , nwarp , 1 );
// Number of blocks: ceil(m_work / block.y), capped at the device's maximum
// grid size in x. The stride loop in operator() advances by
// blockDim.y * gridDim.x, so a capped grid revisits later work indices.
195 const Cuda::size_type maxGridSizeX = m_policy.space().cuda_device_prop().maxGridSize[0];
196 Cuda::size_type nblock =
197 std::min( (m_work + block.y - 1 ) / block.y , maxGridSizeX );
198 const dim3 grid( nblock , 1 , 1 );
// Value of the configuration's shared field, forwarded to the launcher.
// NOTE(review): presumably the dynamic shared-memory size in bytes — confirm.
200 const Cuda::size_type shared = m_config.shared;
// Launch this object as the kernel functor on the policy's internal space
// instance.
201 CudaParallelLaunch< ParallelFor >( *this , grid , block , shared , m_policy.space().impl_internal_space_instance() );
MPVectorWorkConfig execution_policy
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
MPVectorWorkConfig(const execution_space &space, const size_t range_, const size_t team_, const size_t shared_=0)
in the provided execution space instance
MPVectorWorkConfig(const size_t range_, const size_t team_, const size_t shared_=0)
in the default execution space instance
Team-based parallel work configuration for Sacado::MP::Vector.
ExecSpace execution_space