42 #ifndef KOKKOS_PARALLEL_MP_VECTOR_HPP
43 #define KOKKOS_PARALLEL_MP_VECTOR_HPP
46 #include "Kokkos_Core.hpp"
/*!
 * \brief Team-based parallel work configuration for Sacado::MP::Vector.
 *
 * Describes a flat work range of \c range entries where a "team" of
 * \c team threads cooperates on each entry (the vector/sacado dimension),
 * optionally requesting \c shared bytes of scratch memory per team on
 * device backends.
 */
template< class ExecSpace >
struct MPVectorWorkConfig {

  // Nested typedefs required by the Kokkos parallel-dispatch machinery
  // (see the ParallelFor specializations below, which key on this type).
  typedef MPVectorWorkConfig execution_policy ;
  typedef ExecSpace          execution_space ;

  size_t range ;   // number of outer work entries
  size_t team ;    // threads cooperating on each entry
  size_t shared ;  // requested shared-memory bytes (used by the Cuda launch)

  /// \param range_  number of outer work entries
  /// \param team_   threads cooperating on each entry
  /// \param shared_ shared-memory bytes per team (defaults to none)
  MPVectorWorkConfig( const size_t range_ ,
                      const size_t team_ ,
                      const size_t shared_ = 0 ) :
    range( range_ ), team( team_ ), shared( shared_ ) {}
};
#if defined( KOKKOS_ENABLE_THREADS )
// Specialization of ParallelFor<> for MPVectorWorkConfig on the Threads
// backend.  The host backends ignore the requested team size and simply
// dispatch a flat [0, work_config.range) RangePolicy to the base-class
// implementation.
template< class FunctorType >
class ParallelFor< FunctorType , MPVectorWorkConfig< Threads > > :
  public ParallelFor< FunctorType , Kokkos::RangePolicy< Threads > > {
  typedef Kokkos::RangePolicy< Threads > Policy ;
public:
  // Forward the functor with a flat range policy; team/shared are unused here.
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< Threads > & work_config ) :
    ParallelFor< FunctorType , Policy >( functor ,
                                         Policy( 0, work_config.range ) ) {}
};
#endif
#if defined( KOKKOS_ENABLE_OPENMP )
// Specialization of ParallelFor<> for MPVectorWorkConfig on the OpenMP
// backend.  As with Threads, the team size is ignored and the work is
// dispatched as a flat [0, work_config.range) RangePolicy.
template< class FunctorType >
class ParallelFor< FunctorType , MPVectorWorkConfig< OpenMP > > :
  public ParallelFor< FunctorType , Kokkos::RangePolicy< OpenMP > > {
  typedef Kokkos::RangePolicy< OpenMP > Policy ;
public:
  // Forward the functor with a flat range policy; team/shared are unused here.
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< OpenMP > & work_config ) :
    ParallelFor< FunctorType , Policy >( functor ,
                                         Policy( 0, work_config.range ) ) {}
};
#endif
#if defined(KOKKOS_ENABLE_SERIAL)
// Specialization of ParallelFor<> for MPVectorWorkConfig on the Serial
// backend.  The team size is ignored and the work is dispatched as a
// flat [0, work_config.range) RangePolicy.
template< class FunctorType >
class ParallelFor< FunctorType , MPVectorWorkConfig< Serial > > :
  public ParallelFor< FunctorType , Kokkos::RangePolicy< Serial > > {
  typedef Kokkos::RangePolicy< Serial > Policy ;
public:
  // Forward the functor with a flat range policy; team/shared are unused here.
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< Serial > & work_config ) :
    ParallelFor< FunctorType , Policy >( functor ,
                                         Policy( 0, work_config.range ) ) {}
};
#endif // defined(KOKKOS_ENABLE_SERIAL)
#if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
// Specialization of ParallelFor<> for MPVectorWorkConfig on the Cuda
// backend.  Here the team size is meaningful:  threadIdx.x indexes the
// team (vector) dimension of each entry, while the outer work range is
// strided across threadIdx.y and blockIdx.x.
template< class FunctorType >
class ParallelFor< FunctorType , MPVectorWorkConfig< Cuda > > {
public:

  const FunctorType                m_functor ;
  const MPVectorWorkConfig< Cuda > m_config ;
  const Cuda::size_type            m_work ;   // == m_config.range

  // Device-side kernel body:  each (threadIdx.y, blockIdx.x) pair walks the
  // outer work range with stride blockDim.y * gridDim.x, and threadIdx.x
  // supplies the team-dimension index to the user functor.
  inline
  __device__
  void operator()(void) const
  {
    const Cuda::size_type work_stride = blockDim.y * gridDim.x ;

    for ( Cuda::size_type iwork = threadIdx.y + blockDim.y * blockIdx.x ;
          iwork < m_work ;
          iwork += work_stride ) {
      m_functor( iwork , threadIdx.x );
    }
  }

  // Host-side constructor:  computes the launch configuration and launches
  // the kernel.  blockDim.x is the team size; blockDim.y holds as many teams
  // as fit within the maximum resident warp count.
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< Cuda > & work_config )
    : m_functor( functor ) ,
      m_config( work_config ) ,
      m_work( work_config.range )
  {
    Cuda::size_type nwarp = 0;
    if (m_config.team > CudaTraits::WarpSize) {
      // A team spans multiple warps:  round up, then fit fewer teams.
      const Cuda::size_type warps_per_team =
        ( m_config.team + CudaTraits::WarpSize-1 ) / CudaTraits::WarpSize;
      nwarp = cuda_internal_maximum_warp_count() / warps_per_team;
    }
    else {
      // Multiple teams fit inside a single warp.
      const Cuda::size_type teams_per_warp =
        CudaTraits::WarpSize / m_config.team ;
      nwarp = cuda_internal_maximum_warp_count() * teams_per_warp;
    }
    const dim3 block( m_config.team , nwarp , 1 );

    // One grid sweep covers block.y entries per block; cap at the maximum
    // supported grid count (the kernel loop strides over any remainder).
    Cuda::size_type nblock =
      std::min( (m_work + block.y - 1 ) / block.y ,
                cuda_internal_maximum_grid_count() );
    const dim3 grid( nblock , 1 , 1 );

    const Cuda::size_type shared = m_config.shared;
    CudaParallelLaunch< ParallelFor >( *this , grid , block , shared );
  }
};
#endif
MPVectorWorkConfig(const size_t range_, const size_t team_, const size_t shared_=0)
ExecSpace execution_space
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
Team-based parallel work configuration for Sacado::MP::Vector.
MPVectorWorkConfig execution_policy