#ifndef KOKKOS_PARALLEL_MP_VECTOR_HPP
#define KOKKOS_PARALLEL_MP_VECTOR_HPP

#include "Kokkos_Core.hpp"
/**
 * \brief Team-based parallel work configuration for Sacado::MP::Vector.
 *
 * Describes a 1-D iteration range of \c range work items where each
 * "team" of \c team threads cooperates across the MP::Vector ensemble
 * dimension, with an optional request of \c shared bytes of scratch
 * memory per team.
 */
template< class ExecSpace >
struct MPVectorWorkConfig {

  typedef MPVectorWorkConfig execution_policy ;
  typedef ExecSpace          execution_space ;

  size_t range ;   // total number of work items
  size_t team ;    // threads per team (ensemble/vector dimension)
  size_t shared ;  // shared-memory bytes requested per team

  MPVectorWorkConfig( const size_t range_ ,
                      const size_t team_ ,
                      const size_t shared_ = 0 ) :
    range( range_ ), team( team_ ), shared( shared_ ) {}

  // Default-constructed execution space instance for this configuration.
  ExecSpace space() const { return ExecSpace(); }
};
#if defined( KOKKOS_ENABLE_THREADS )
// Specialization of ParallelFor<> for MPVectorWorkConfig on Threads.
// The team size is ignored and the work is dispatched as a plain 1-D
// RangePolicy over [0, work_config.range); all team/shared settings
// are dropped on this backend.
template< class FunctorType >
class ParallelFor< FunctorType , MPVectorWorkConfig< Threads > > :
  public ParallelFor< FunctorType , Kokkos::RangePolicy< Threads > > {
  typedef Kokkos::RangePolicy< Threads > Policy ;
public:
  // Forward to the standard range-policy implementation.
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< Threads > & work_config ) :
    ParallelFor< FunctorType , Policy >( functor ,
                                         Policy( 0, work_config.range ) ) {}
};
#endif // defined( KOKKOS_ENABLE_THREADS )
#if defined( KOKKOS_ENABLE_OPENMP )
// Specialization of ParallelFor<> for MPVectorWorkConfig on OpenMP.
// The team size is ignored and the work is dispatched as a plain 1-D
// RangePolicy over [0, work_config.range); all team/shared settings
// are dropped on this backend.
template< class FunctorType >
class ParallelFor< FunctorType , MPVectorWorkConfig< OpenMP > > :
  public ParallelFor< FunctorType , Kokkos::RangePolicy< OpenMP > > {
  typedef Kokkos::RangePolicy< OpenMP > Policy ;
public:
  // Forward to the standard range-policy implementation.
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< OpenMP > & work_config ) :
    ParallelFor< FunctorType , Policy >( functor ,
                                         Policy( 0, work_config.range ) ) {}
};
#endif // defined( KOKKOS_ENABLE_OPENMP )
#if defined(KOKKOS_ENABLE_SERIAL)
// Specialization of ParallelFor<> for MPVectorWorkConfig on Serial.
// The team size is ignored and the work is dispatched as a plain 1-D
// RangePolicy over [0, work_config.range); all team/shared settings
// are dropped on this backend.
template< class FunctorType >
class ParallelFor< FunctorType , MPVectorWorkConfig< Serial > > :
  public ParallelFor< FunctorType , Kokkos::RangePolicy< Serial > > {
  typedef Kokkos::RangePolicy< Serial > Policy ;
public:
  // Forward to the standard range-policy implementation.
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< Serial > & work_config ) :
    ParallelFor< FunctorType , Policy >( functor ,
                                         Policy( 0, work_config.range ) ) {}
};
#endif // defined(KOKKOS_ENABLE_SERIAL)
#if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
// Specialization of ParallelFor<> for MPVectorWorkConfig on Cuda.
// threadIdx.x indexes the ensemble (team) dimension passed to the
// functor's second argument, while threadIdx.y / blockIdx.x enumerate
// the work items in [0, m_work).
template< class FunctorType >
class ParallelFor< FunctorType , MPVectorWorkConfig< Cuda > > {
public:

  typedef Kokkos::RangePolicy< Cuda > Policy ;

  const FunctorType m_functor ;                // user functor
  const MPVectorWorkConfig< Cuda > m_config ;  // team / shared configuration
  const Cuda::size_type m_work ;               // total number of work items

  inline
  __device__
  void operator()(void) const
  {
    // Stride over work items with the launch's y-extent; each thread's
    // threadIdx.x supplies the ensemble index to the functor.
    const Cuda::size_type work_stride = blockDim.y * gridDim.x ;

    for ( Cuda::size_type iwork = threadIdx.y + blockDim.y * blockIdx.x ;
          iwork < m_work ;
          iwork += work_stride ) {
      m_functor( iwork , threadIdx.x );
    }
  }

  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< Cuda > & work_config )
    : m_functor( functor ) ,
      m_config( work_config ) ,
      m_work( work_config.range )
  {
  }

  inline
  void execute() const
  {
    // Choose the number of warps per block: teams wider than a warp
    // span multiple warps, otherwise several teams are packed into
    // each warp.
    // NOTE(review): assumes m_config.team > 0 and, for the packed
    // case, that it evenly divides the warp size -- confirm callers.
    Cuda::size_type nwarp = 0;
    if (m_config.team > CudaTraits::WarpSize) {
      const Cuda::size_type warps_per_team =
        ( m_config.team + CudaTraits::WarpSize-1 ) / CudaTraits::WarpSize;
      nwarp = cuda_internal_maximum_warp_count() / warps_per_team;
    }
    else {
      const Cuda::size_type teams_per_warp =
        CudaTraits::WarpSize / m_config.team ;
      nwarp = cuda_internal_maximum_warp_count() * teams_per_warp;
    }
    const dim3 block( m_config.team , nwarp , 1 );

    // Enough blocks to cover the range, capped by the device limit.
    const Cuda::size_type nblock =
      std::min( (m_work + block.y - 1 ) / block.y ,
                cuda_internal_maximum_grid_count() );
    const dim3 grid( nblock , 1 , 1 );

    const Cuda::size_type shared = m_config.shared;
    CudaParallelLaunch< ParallelFor >( *this , grid , block , shared ,
                                       Policy().space().impl_internal_space_instance(),
                                       false );
  }
};
#endif // defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
// NOTE(review): the fragments below duplicate declarations found elsewhere in
// this file (MPVectorWorkConfig's constructor and typedefs, its doc sentence,
// and an unrelated Sacado PCE overload); they are not valid code at file
// scope, so they are preserved here as comments only.
// MPVectorWorkConfig(const size_t range_, const size_t team_, const size_t shared_=0)
// ExecSpace execution_space
// KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
// Team-based parallel work configuration for Sacado::MP::Vector.
// MPVectorWorkConfig execution_policy