42 #ifndef KOKKOS_PARALLEL_MP_VECTOR_HPP
43 #define KOKKOS_PARALLEL_MP_VECTOR_HPP
46 #include "Kokkos_Core.hpp"
// NOTE(review): garbled extraction — the body of the MPVectorWorkConfig
// struct (its members and the head of this constructor) has been lost;
// only the template header and the tail of a constructor parameter list
// survive below.  Presumably this declares the team-based parallel work
// configuration (range / team / shared sizes) used as an execution policy
// for Sacado::MP::Vector kernels — confirm against the original header.
57 template<
class ExecSpace >
70 const size_t shared_ = 0 ) :
76 #if defined( KOKKOS_ENABLE_THREADS )
// Specialization of ParallelFor for the MPVectorWorkConfig execution policy
// on the Threads backend.  The team/shared parts of the work configuration
// are not consulted here: the launch simply delegates to the flat
// RangePolicy<Threads> ParallelFor over the index range [0, work_config.range).
// NOTE(review): lines are missing from this extraction (e.g. an access
// label before the constructor and the closing "};" of the class) — restore
// from the original header before compiling.
85 template<
class FunctorType >
86 class ParallelFor< FunctorType , MPVectorWorkConfig< Threads > > :
87 public ParallelFor< FunctorType , Kokkos::RangePolicy< Threads > > {
88 typedef Kokkos::RangePolicy< Threads > Policy ;
// Construct by forwarding the functor to the base-class ParallelFor with a
// RangePolicy spanning [0, work_config.range); only 'range' is used.
90 ParallelFor(
const FunctorType & functor ,
91 const MPVectorWorkConfig< Threads > & work_config ) :
92 ParallelFor< FunctorType , Policy >( functor ,
93 Policy( 0, work_config.range ) ) {}
97 #if defined( KOKKOS_ENABLE_OPENMP )
// Specialization of ParallelFor for the MPVectorWorkConfig execution policy
// on the OpenMP backend.  Identical in structure to the Threads
// specialization: the team/shared configuration is ignored and the launch
// delegates to the flat RangePolicy<OpenMP> ParallelFor over
// [0, work_config.range).
// NOTE(review): lines are missing from this extraction (access label,
// closing "};") — restore from the original header before compiling.
106 template<
class FunctorType >
107 class ParallelFor< FunctorType , MPVectorWorkConfig< OpenMP > > :
108 public ParallelFor< FunctorType , Kokkos::RangePolicy< OpenMP > > {
109 typedef Kokkos::RangePolicy< OpenMP > Policy ;
// Construct by forwarding to the base-class ParallelFor with a RangePolicy
// spanning [0, work_config.range); only 'range' is used.
111 ParallelFor(
const FunctorType & functor ,
112 const MPVectorWorkConfig< OpenMP > & work_config ) :
113 ParallelFor< FunctorType , Policy >( functor ,
114 Policy( 0, work_config.range ) ) {}
118 #if defined(KOKKOS_ENABLE_SERIAL)
// Specialization of ParallelFor for the MPVectorWorkConfig execution policy
// on the Serial backend.  As with the Threads/OpenMP specializations, the
// team/shared configuration is ignored and the launch delegates to the
// flat RangePolicy<Serial> ParallelFor over [0, work_config.range).
// NOTE(review): lines are missing from this extraction (access label,
// closing "};") — restore from the original header before compiling.
127 template<
class FunctorType >
128 class ParallelFor< FunctorType , MPVectorWorkConfig< Serial > > :
129 public ParallelFor< FunctorType , Kokkos::RangePolicy< Serial > > {
130 typedef Kokkos::RangePolicy< Serial > Policy ;
// Construct by forwarding to the base-class ParallelFor with a RangePolicy
// spanning [0, work_config.range); only 'range' is used.
132 ParallelFor(
const FunctorType & functor ,
133 const MPVectorWorkConfig< Serial > & work_config ) :
134 ParallelFor< FunctorType , Policy >( functor ,
135 Policy( 0, work_config.range ) ) {}
137 #endif // defined(KOKKOS_ENABLE_SERIAL)
139 #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
// Specialization of ParallelFor for the MPVectorWorkConfig execution policy
// on the CUDA backend.  Unlike the host backends, this one actually uses
// the 'team' and 'shared' parts of the work configuration: threadIdx.x
// indexes within a team (the functor's second argument) and threadIdx.y /
// blockIdx.x together index the work range.
// NOTE(review): this extraction is missing several lines (access labels,
// the for-loop termination condition, an apparent else-branch in the
// launch-configuration logic, and the closing braces) — restore from the
// original header before compiling.
143 template<
class FunctorType >
144 class ParallelFor< FunctorType , MPVectorWorkConfig< Cuda > > {
147 typedef Kokkos::RangePolicy< Cuda > Policy;
// Captured launch state: the user functor, the full work configuration,
// and the flat work count (m_work == work_config.range).
149 const FunctorType m_functor ;
150 const MPVectorWorkConfig< Cuda > m_config;
151 const Cuda::size_type m_work ;
// Device-side kernel body.  Each thread starts at work index
// threadIdx.y + blockDim.y * blockIdx.x and strides by
// blockDim.y * gridDim.x; the functor is invoked as
// m_functor(work-index, threadIdx.x), so threadIdx.x is the
// within-team (vector-lane) index.
155 void operator()(
void)
const
157 const Cuda::size_type work_stride = blockDim.y * gridDim.x ;
159 for ( Cuda::size_type iwork = threadIdx.y + blockDim.y * blockIdx.x ;
// NOTE(review): the loop's termination condition was lost in extraction —
// presumably "iwork < m_work" given m_work above; confirm against original.
161 iwork += work_stride ) {
162 m_functor( iwork , threadIdx.x );
// Host-side constructor: captures the functor/config, then computes the
// launch configuration and launches the kernel.
166 ParallelFor(
const FunctorType & functor ,
167 const MPVectorWorkConfig< Cuda > & work_config )
168 : m_functor( functor ) ,
169 m_config( work_config ) ,
170 m_work( work_config.range )
// Choose blockDim.y (number of teams per block, counted in warps):
// if a team is larger than a warp, divide the maximum warp count by the
// warps needed per team; otherwise (branch keyword lost in extraction)
// pack multiple teams into each warp.
180 Cuda::size_type nwarp = 0;
181 if (m_config.team > CudaTraits::WarpSize) {
182 const Cuda::size_type warps_per_team =
183 ( m_config.team + CudaTraits::WarpSize-1 ) / CudaTraits::WarpSize;
184 nwarp = cuda_internal_maximum_warp_count() / warps_per_team;
187 const Cuda::size_type teams_per_warp =
188 CudaTraits::WarpSize / m_config.team ;
189 nwarp = cuda_internal_maximum_warp_count() * teams_per_warp;
// Block: x = team size (within-team index), y = teams per block.
191 const dim3 block( m_config.team , nwarp , 1 );
// Grid: ceil(m_work / block.y) blocks, capped at the device maximum.
193 Cuda::size_type nblock =
194 std::min( (m_work + block.y - 1 ) / block.y ,
195 cuda_internal_maximum_grid_count() );
196 const dim3 grid( nblock , 1 , 1 );
// Launch with the user-requested dynamic shared memory size.
198 const Cuda::size_type shared = m_config.shared;
199 CudaParallelLaunch< ParallelFor >( *this , grid , block , shared , Policy().space().impl_internal_space_instance(), false );
// Extraction residue — index/tooltip fragments that are not valid C++ at
// file scope; preserved here as a comment for reference:
//   MPVectorWorkConfig(const size_t range_, const size_t team_, const size_t shared_=0)
//   ExecSpace execution_space
//   KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
//   Team-based parallel work configuration for Sacado::MP::Vector.
//   MPVectorWorkConfig execution_policy