10 #ifndef IFPACK2_BLOCKHELPER_IMPL_HPP
11 #define IFPACK2_BLOCKHELPER_IMPL_HPP
13 #include "Ifpack2_BlockHelper_Timers.hpp"
17 namespace BlockHelperDetails {
19 namespace KB = KokkosBatched;
24 using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
26 template <
typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
27 using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
28 MemoryTraitsType::is_random_access |
31 template <
typename ViewType>
32 using Unmanaged = Kokkos::View<
typename ViewType::data_type,
33 typename ViewType::array_layout,
34 typename ViewType::device_type,
35 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
36 template <
typename ViewType>
37 using Atomic = Kokkos::View<
typename ViewType::data_type,
38 typename ViewType::array_layout,
39 typename ViewType::device_type,
40 MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
41 template <
typename ViewType>
42 using Const = Kokkos::View<
typename ViewType::const_data_type,
43 typename ViewType::array_layout,
44 typename ViewType::device_type,
45 typename ViewType::memory_traits>;
46 template <
typename ViewType>
47 using ConstUnmanaged = Const<Unmanaged<ViewType> >;
49 template <
typename ViewType>
50 using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
// NOTE: Unmanaged<ViewType> is already declared earlier in this namespace.
// The token-for-token repeat of that alias template that used to sit here
// added nothing and relied on compilers tolerating a repeated alias-template
// declaration, so it has been removed. Users of Unmanaged are unaffected.
59 template <
typename ViewType>
60 using Scratch = Kokkos::View<
typename ViewType::data_type,
61 typename ViewType::array_layout,
62 typename ViewType::execution_space::scratch_memory_space,
63 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
70 template<
typename T> KOKKOS_INLINE_FUNCTION
71 static T getFlatIndex(
const T i,
const T j,
const T blksize) {
return i+j*blksize; }
73 template<>
struct TpetraLittleBlock<Kokkos::LayoutRight> {
74 template<
typename T> KOKKOS_INLINE_FUNCTION
75 static T getFlatIndex(
const T i,
const T j,
const T blksize) {
return i*blksize+j; }
82 #if defined(IFPACK2_BLOCKHELPER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
// Compile-time predicate. The primary template answers false; a type is
// reported as CUDA only through an explicit specialization.
template <typename Space>
struct is_cuda {
  enum : bool {
    value = false
  };
};
91 #if defined(KOKKOS_ENABLE_CUDA)
92 template<>
struct is_cuda<Kokkos::Cuda> {
enum :
bool { value =
true }; };
// Compile-time predicate. The primary template answers false; a type is
// reported as HIP only through an explicit specialization.
template <typename Space>
struct is_hip {
  enum : bool {
    value = false
  };
};
99 #if defined(KOKKOS_ENABLE_HIP)
100 template<>
struct is_hip<Kokkos::HIP> {
enum :
bool { value =
true }; };
// Compile-time predicate. The primary template answers false; a type is
// reported as SYCL only through an explicit specialization.
template <typename Space>
struct is_sycl {
  enum : bool {
    value = false
  };
};
107 #if defined(KOKKOS_ENABLE_SYCL)
108 template<>
struct is_sycl<Kokkos::Experimental::SYCL> {
enum :
bool { value =
true }; };
111 template<
typename T>
struct is_device {
enum :
bool { value = is_cuda<T>::value || is_hip<T>::value || is_sycl<T>::value }; };
119 static void createInstance(T &exec_instance) {
122 #if defined(KOKKOS_ENABLE_CUDA)
123 static void createInstance(
const cudaStream_t &s, T &exec_instance) {
129 #if defined(KOKKOS_ENABLE_CUDA)
132 static void createInstance(Kokkos::Cuda &exec_instance) {
133 exec_instance = Kokkos::Cuda();
135 static void createInstance(
const cudaStream_t &s, Kokkos::Cuda &exec_instance) {
136 exec_instance = Kokkos::Cuda(s);
141 #if defined(KOKKOS_ENABLE_HIP)
143 struct ExecutionSpaceFactory<Kokkos::HIP> {
144 static void createInstance(Kokkos::HIP &exec_instance) {
145 exec_instance = Kokkos::HIP();
150 #if defined(KOKKOS_ENABLE_SYCL)
152 struct ExecutionSpaceFactory<Kokkos::Experimental::SYCL> {
153 static void createInstance(Kokkos::Experimental::SYCL &exec_instance) {
154 exec_instance = Kokkos::Experimental::SYCL();
159 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKHELPER_ENABLE_PROFILE)
160 #define IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN \
161 KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
163 #define IFPACK2_BLOCKHELPER_PROFILER_REGION_END \
164 { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
166 #define IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN
168 #define IFPACK2_BLOCKHELPER_PROFILER_REGION_END
175 template<
typename CommPtrType>
176 std::string get_msg_prefix (
const CommPtrType &comm) {
177 const auto rank = comm->getRank();
178 const auto nranks = comm->getSize();
179 std::stringstream ss;
180 ss <<
"Rank " << rank <<
" of " << nranks <<
": ";
187 template<
typename T,
int N>
190 KOKKOS_INLINE_FUNCTION
192 for (
int i=0;i<N;++i)
195 KOKKOS_INLINE_FUNCTION
197 for (
int i=0;i<N;++i)
201 template<
typename T,
int N>
203 KOKKOS_INLINE_FUNCTION
207 for (
int i=0;i<N;++i)
214 template<
typename T,
int N,
typename ExecSpace>
218 typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
221 KOKKOS_INLINE_FUNCTION
224 KOKKOS_INLINE_FUNCTION
226 for (
int i=0;i<N;++i)
227 dst.v[i] += src.v[i];
229 KOKKOS_INLINE_FUNCTION
231 for (
int i=0;i<N;++i)
232 val.v[i] = Kokkos::reduction_identity<T>::sum();
234 KOKKOS_INLINE_FUNCTION
238 KOKKOS_INLINE_FUNCTION
239 result_view_type view()
const {
240 return result_view_type(value);
248 template <
typename MatrixType>
254 typedef typename MatrixType::scalar_type scalar_type;
255 typedef typename MatrixType::local_ordinal_type local_ordinal_type;
256 typedef typename MatrixType::global_ordinal_type global_ordinal_type;
257 typedef typename MatrixType::node_type node_type;
263 typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;
265 typedef typename BlockTridiagScalarType<impl_scalar_type>::type btdm_scalar_type;
266 typedef typename Kokkos::ArithTraits<btdm_scalar_type>::mag_type btdm_magnitude_type;
277 typedef typename node_device_type::execution_space node_execution_space;
278 typedef typename node_device_type::memory_space node_memory_space;
280 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKHELPER_USE_CUDA_SPACE)
281 typedef node_execution_space execution_space;
283 typedef typename std::conditional<std::is_same<node_memory_space,Kokkos::CudaUVMSpace>::value,
285 node_memory_space>::type memory_space;
286 typedef Kokkos::Device<execution_space,memory_space> device_type;
289 typedef node_execution_space execution_space;
290 typedef node_memory_space memory_space;
293 typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
294 typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
295 typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
296 typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
297 typedef Tpetra::CrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_crs_matrix_type;
298 typedef Tpetra::CrsGraph<local_ordinal_type,global_ordinal_type,node_type> tpetra_crs_graph_type;
299 typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
300 typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
301 typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
302 typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_device_type local_crs_graph_type;
307 template<
typename T,
int l>
using Vector = KB::Vector<T,l>;
308 template<
typename T>
using SIMD = KB::SIMD<T>;
309 template<
typename T,
typename M>
using DefaultVectorLength = KB::DefaultVectorLength<T,M>;
310 template<
typename T,
typename M>
using DefaultInternalVectorLength = KB::DefaultInternalVectorLength<T,M>;
312 static constexpr
int vector_length = DefaultVectorLength<btdm_scalar_type,memory_space>::value;
313 static constexpr
int internal_vector_length = DefaultInternalVectorLength<btdm_scalar_type,memory_space>::value;
321 typedef Kokkos::View<size_type**,device_type> size_type_2d_view;
322 typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
323 typedef Kokkos::View<local_ordinal_type**,device_type> local_ordinal_type_2d_view;
325 typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
326 typedef Kokkos::View<impl_scalar_type*,node_device_type> impl_scalar_type_1d_view_tpetra;
329 typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
330 typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,node_device_type> impl_scalar_type_2d_view_tpetra;
333 typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
334 typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
335 typedef Kokkos::View<vector_type****,Kokkos::LayoutRight,device_type> vector_type_4d_view;
336 typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
337 typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
338 typedef Kokkos::View<internal_vector_type*****,Kokkos::LayoutRight,device_type> internal_vector_type_5d_view;
339 typedef Kokkos::View<btdm_scalar_type**,Kokkos::LayoutRight,device_type> btdm_scalar_type_2d_view;
340 typedef Kokkos::View<btdm_scalar_type***,Kokkos::LayoutRight,device_type> btdm_scalar_type_3d_view;
341 typedef Kokkos::View<btdm_scalar_type****,Kokkos::LayoutRight,device_type> btdm_scalar_type_4d_view;
342 typedef Kokkos::View<btdm_scalar_type*****,Kokkos::LayoutRight,device_type> btdm_scalar_type_5d_view;
349 template<
typename MatrixType>
354 using magnitude_type =
typename impl_type::magnitude_type;
358 int sweep_step_, sweep_step_upper_bound_;
359 #ifdef HAVE_IFPACK2_MPI
360 MPI_Request mpi_request_;
363 magnitude_type work_[3];
370 sweep_step_upper_bound_ = 1;
371 collective_ = comm->getSize() > 1;
373 #ifdef HAVE_IFPACK2_MPI
374 const auto mpi_comm = Teuchos::rcp_dynamic_cast<
const Teuchos::MpiComm<int> >(comm);
376 comm_ = *mpi_comm->getRawMpiComm();
379 const magnitude_type zero(0), minus_one(-1);
382 work_[2] = minus_one;
386 void setCheckFrequency(
const int sweep_step) {
388 sweep_step_upper_bound_ = sweep_step;
392 // Get the buffer into which to store rank-local squared norms.
393 magnitude_type* getBuffer() { return &work_[0]; }
395 // Call MPI_Iallreduce to find the global squared norms.
396 void ireduce(const int sweep, const bool force = false) {
397 if ( ! force && sweep % sweep_step_) return;
399 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NormManager::Ireduce
", Ireduce);
402 #ifdef HAVE_IFPACK2_MPI
403 auto send_data = &work_[1];
404 auto recv_data = &work_[0];
406 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
407 MPI_Iallreduce(send_data, recv_data, 1,
408 Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
409 MPI_SUM, comm_, &mpi_request_);
411 MPI_Allreduce (send_data, recv_data, 1,
412 Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
419 // Check if the norm-based termination criterion is met. tol2 is the
420 // tolerance squared. Sweep is the sweep index. If not every iteration is
421 // being checked, this function immediately returns false. If a check must
422 // be done at this iteration, it waits for the reduction triggered by
423 // ireduce to complete, then checks the global norm against the tolerance.
424 bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) {
426 if (sweep <= 0) return false;
428 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NormManager::CheckDone
", CheckDone);
430 TEUCHOS_ASSERT(sweep >= 1);
431 if ( ! force && (sweep - 1) % sweep_step_) return false;
433 #ifdef HAVE_IFPACK2_MPI
434 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
435 MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
445 r_val = (work_[0] < tol2*work_[2]);
449 const auto adjusted_sweep_step = 2*sweep_step_;
450 if (adjusted_sweep_step < sweep_step_upper_bound_) {
451 sweep_step_ = adjusted_sweep_step;
453 sweep_step_ = sweep_step_upper_bound_;
458 // After termination has occurred, finalize the norms for use in
459 // getNorms0 and getNormsFinal.
461 work_[0] = std::sqrt(work_[0]); // after converged
463 work_[2] = std::sqrt(work_[2]); // first norm
464 // if work_[2] is minus one, then norm is not requested.
467 // Report norms to the caller.
468 const magnitude_type getNorms0 () const { return work_[2]; }
469 const magnitude_type getNormsFinal () const { return work_[0]; }
472 template<typename MatrixType>
473 void reduceVector(const ConstUnmanaged<typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view> zz,
474 /* */ typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type *vals) {
475 IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
476 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ReduceVector
", ReduceVector);
478 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
479 using local_ordinal_type = typename impl_type::local_ordinal_type;
480 using impl_scalar_type = typename impl_type::impl_scalar_type;
482 const auto norm2 = KokkosBlas::nrm1(zz);
484 impl_scalar_type norm2(0);
485 Kokkos::parallel_reduce
486 ("ReduceMultiVector::Device
",
487 Kokkos::RangePolicy<typename impl_type::execution_space>(0,zz.extent(0)),
488 KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) {
492 vals[0] = Kokkos::ArithTraits<impl_scalar_type>::abs(norm2);
494 IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
495 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename ImplType<MatrixType>::execution_space)
498 } // namespace BlockHelperDetails
500 } // namespace Ifpack2
node_type::device_type node_device_type
Definition: Ifpack2_BlockHelper.hpp:276
Definition: Ifpack2_BlockHelper.hpp:81
Kokkos::DefaultHostExecutionSpace host_execution_space
Definition: Ifpack2_BlockHelper.hpp:271
size_t size_type
Definition: Ifpack2_BlockHelper.hpp:253
Definition: Ifpack2_BlockHelper.hpp:118
Definition: Ifpack2_BlockHelper.hpp:90
KB::Vector< T, l > Vector
Definition: Ifpack2_BlockHelper.hpp:307
#define TEUCHOS_TEST_FOR_EXCEPT_MSG(throw_exception_test, msg)
Definition: Ifpack2_BlockHelper.hpp:98
Definition: Ifpack2_BlockHelper.hpp:350
Kokkos::Details::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition: Ifpack2_BlockHelper.hpp:262
Definition: Ifpack2_BlockHelper.hpp:188
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition: Ifpack2_BlockHelper.hpp:320
#define TEUCHOS_ASSERT(assertion_test)
Definition: Ifpack2_BlockHelper.hpp:215
Definition: Ifpack2_BlockHelper.hpp:249
Definition: Ifpack2_BlockHelper.hpp:106
Definition: Ifpack2_BlockHelper.hpp:68