43 #ifndef IFPACK2_BLOCKHELPER_IMPL_HPP
44 #define IFPACK2_BLOCKHELPER_IMPL_HPP
49 namespace BlockHelperDetails {
51 namespace KB = KokkosBatched;
56 using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
58 template <
typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
59 using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
60 MemoryTraitsType::is_random_access |
63 template <
typename ViewType>
64 using Unmanaged = Kokkos::View<
typename ViewType::data_type,
65 typename ViewType::array_layout,
66 typename ViewType::device_type,
67 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
68 template <
typename ViewType>
69 using Atomic = Kokkos::View<
typename ViewType::data_type,
70 typename ViewType::array_layout,
71 typename ViewType::device_type,
72 MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
73 template <
typename ViewType>
74 using Const = Kokkos::View<
typename ViewType::const_data_type,
75 typename ViewType::array_layout,
76 typename ViewType::device_type,
77 typename ViewType::memory_traits>;
78 template <
typename ViewType>
79 using ConstUnmanaged = Const<Unmanaged<ViewType> >;
81 template <
typename ViewType>
82 using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
84 template <
typename ViewType>
85 using Unmanaged = Kokkos::View<
typename ViewType::data_type,
86 typename ViewType::array_layout,
87 typename ViewType::device_type,
88 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
91 template <
typename ViewType>
92 using Scratch = Kokkos::View<
typename ViewType::data_type,
93 typename ViewType::array_layout,
94 typename ViewType::execution_space::scratch_memory_space,
95 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
102 template<
typename T> KOKKOS_INLINE_FUNCTION
103 static T getFlatIndex(
const T i,
const T j,
const T blksize) {
return i+j*blksize; }
105 template<>
struct TpetraLittleBlock<Kokkos::LayoutRight> {
106 template<
typename T> KOKKOS_INLINE_FUNCTION
107 static T getFlatIndex(
const T i,
const T j,
const T blksize) {
return i*blksize+j; }
114 #if defined(IFPACK2_BLOCKHELPER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
// Compile-time predicate for the CUDA execution space. This primary template
// reports false for every type; value is an enum constant of underlying type
// bool.
template <typename ExecSpace>
struct is_cuda {
  enum : bool { value = false };
};
123 #if defined(KOKKOS_ENABLE_CUDA)
124 template<>
struct is_cuda<Kokkos::Cuda> {
enum :
bool { value =
true }; };
// Compile-time predicate for the HIP execution space. This primary template
// reports false for every type; value is an enum constant of underlying type
// bool.
template <typename ExecSpace>
struct is_hip {
  enum : bool { value = false };
};
131 #if defined(KOKKOS_ENABLE_HIP)
132 template<>
struct is_hip<Kokkos::HIP> {
enum :
bool { value =
true }; };
// Compile-time predicate for the SYCL execution space. This primary template
// reports false for every type; value is an enum constant of underlying type
// bool.
template <typename ExecSpace>
struct is_sycl {
  enum : bool { value = false };
};
139 #if defined(KOKKOS_ENABLE_SYCL)
140 template<>
struct is_sycl<Kokkos::Experimental::SYCL> {
enum :
bool { value =
true }; };
143 template<
typename T>
struct is_device {
enum :
bool { value = is_cuda<T>::value || is_hip<T>::value || is_sycl<T>::value }; };
151 static void createInstance(T &exec_instance) {
154 #if defined(KOKKOS_ENABLE_CUDA)
155 static void createInstance(
const cudaStream_t &s, T &exec_instance) {
161 #if defined(KOKKOS_ENABLE_CUDA)
164 static void createInstance(Kokkos::Cuda &exec_instance) {
165 exec_instance = Kokkos::Cuda();
167 static void createInstance(
const cudaStream_t &s, Kokkos::Cuda &exec_instance) {
168 exec_instance = Kokkos::Cuda(s);
173 #if defined(KOKKOS_ENABLE_HIP)
175 struct ExecutionSpaceFactory<Kokkos::HIP> {
176 static void createInstance(Kokkos::HIP &exec_instance) {
177 exec_instance = Kokkos::HIP();
182 #if defined(KOKKOS_ENABLE_SYCL)
184 struct ExecutionSpaceFactory<Kokkos::Experimental::SYCL> {
185 static void createInstance(Kokkos::Experimental::SYCL &exec_instance) {
186 exec_instance = Kokkos::Experimental::SYCL();
192 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS)
193 #define IFPACK2_BLOCKHELPER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label);
194 #define IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) execution_space().fence();
196 #define IFPACK2_BLOCKHELPER_TIMER(label)
197 #define IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
200 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKHELPER_ENABLE_PROFILE)
201 #define IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN \
202 KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
204 #define IFPACK2_BLOCKHELPER_PROFILER_REGION_END \
205 { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
207 #define IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN
209 #define IFPACK2_BLOCKHELPER_PROFILER_REGION_END
216 template<
typename CommPtrType>
217 std::string get_msg_prefix (
const CommPtrType &comm) {
218 const auto rank = comm->getRank();
219 const auto nranks = comm->getSize();
220 std::stringstream ss;
221 ss <<
"Rank " << rank <<
" of " << nranks <<
": ";
228 template<
typename T,
int N>
231 KOKKOS_INLINE_FUNCTION
233 for (
int i=0;i<N;++i)
236 KOKKOS_INLINE_FUNCTION
238 for (
int i=0;i<N;++i)
242 template<
typename T,
int N>
244 KOKKOS_INLINE_FUNCTION
248 for (
int i=0;i<N;++i)
255 template<
typename T,
int N,
typename ExecSpace>
259 typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
262 KOKKOS_INLINE_FUNCTION
265 KOKKOS_INLINE_FUNCTION
267 for (
int i=0;i<N;++i)
268 dst.v[i] += src.v[i];
270 KOKKOS_INLINE_FUNCTION
272 for (
int i=0;i<N;++i)
273 val.v[i] = Kokkos::reduction_identity<T>::sum();
275 KOKKOS_INLINE_FUNCTION
279 KOKKOS_INLINE_FUNCTION
280 result_view_type view()
const {
281 return result_view_type(value);
289 template <
typename MatrixType>
295 typedef typename MatrixType::scalar_type scalar_type;
296 typedef typename MatrixType::local_ordinal_type local_ordinal_type;
297 typedef typename MatrixType::global_ordinal_type global_ordinal_type;
298 typedef typename MatrixType::node_type node_type;
304 typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;
306 typedef typename BlockTridiagScalarType<impl_scalar_type>::type btdm_scalar_type;
307 typedef typename Kokkos::ArithTraits<btdm_scalar_type>::mag_type btdm_magnitude_type;
318 typedef typename node_device_type::execution_space node_execution_space;
319 typedef typename node_device_type::memory_space node_memory_space;
321 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKHELPER_USE_CUDA_SPACE)
322 typedef node_execution_space execution_space;
324 typedef typename std::conditional<std::is_same<node_memory_space,Kokkos::CudaUVMSpace>::value,
326 node_memory_space>::type memory_space;
327 typedef Kokkos::Device<execution_space,memory_space> device_type;
330 typedef node_execution_space execution_space;
331 typedef node_memory_space memory_space;
334 typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
335 typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
336 typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
337 typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
338 typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
339 typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
340 typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
341 typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_device_type local_crs_graph_type;
346 template<
typename T,
int l>
using Vector = KB::Vector<T,l>;
347 template<
typename T>
using SIMD = KB::SIMD<T>;
348 template<
typename T,
typename M>
using DefaultVectorLength = KB::DefaultVectorLength<T,M>;
349 template<
typename T,
typename M>
using DefaultInternalVectorLength = KB::DefaultInternalVectorLength<T,M>;
351 static constexpr
int vector_length = DefaultVectorLength<btdm_scalar_type,memory_space>::value;
352 static constexpr
int internal_vector_length = DefaultInternalVectorLength<btdm_scalar_type,memory_space>::value;
360 typedef Kokkos::View<size_type**,device_type> size_type_2d_view;
361 typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
362 typedef Kokkos::View<local_ordinal_type**,device_type> local_ordinal_type_2d_view;
364 typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
365 typedef Kokkos::View<impl_scalar_type*,node_device_type> impl_scalar_type_1d_view_tpetra;
368 typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
369 typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,node_device_type> impl_scalar_type_2d_view_tpetra;
372 typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
373 typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
374 typedef Kokkos::View<vector_type****,Kokkos::LayoutRight,device_type> vector_type_4d_view;
375 typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
376 typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
377 typedef Kokkos::View<internal_vector_type*****,Kokkos::LayoutRight,device_type> internal_vector_type_5d_view;
378 typedef Kokkos::View<btdm_scalar_type***,Kokkos::LayoutRight,device_type> btdm_scalar_type_3d_view;
379 typedef Kokkos::View<btdm_scalar_type****,Kokkos::LayoutRight,device_type> btdm_scalar_type_4d_view;
380 typedef Kokkos::View<btdm_scalar_type*****,Kokkos::LayoutRight,device_type> btdm_scalar_type_5d_view;
387 template<
typename MatrixType>
392 using magnitude_type =
typename impl_type::magnitude_type;
396 int sweep_step_, sweep_step_upper_bound_;
397 #ifdef HAVE_IFPACK2_MPI
398 MPI_Request mpi_request_;
401 magnitude_type work_[3];
408 sweep_step_upper_bound_ = 1;
409 collective_ = comm->getSize() > 1;
411 #ifdef HAVE_IFPACK2_MPI
412 const auto mpi_comm = Teuchos::rcp_dynamic_cast<
const Teuchos::MpiComm<int> >(comm);
414 comm_ = *mpi_comm->getRawMpiComm();
417 const magnitude_type zero(0), minus_one(-1);
420 work_[2] = minus_one;
424 void setCheckFrequency(
const int sweep_step) {
426 sweep_step_upper_bound_ = sweep_step;
430 // Get the buffer into which to store rank-local squared norms.
431 magnitude_type* getBuffer() { return &work_[0]; }
433 // Call MPI_Iallreduce to find the global squared norms.
434 void ireduce(const int sweep, const bool force = false) {
435 if ( ! force && sweep % sweep_step_) return;
437 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NormManager::Ireduce
");
440 #ifdef HAVE_IFPACK2_MPI
441 auto send_data = &work_[1];
442 auto recv_data = &work_[0];
444 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
445 MPI_Iallreduce(send_data, recv_data, 1,
446 Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
447 MPI_SUM, comm_, &mpi_request_);
449 MPI_Allreduce (send_data, recv_data, 1,
450 Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
457 // Check if the norm-based termination criterion is met. tol2 is the
458 // tolerance squared. Sweep is the sweep index. If not every iteration is
459 // being checked, this function immediately returns false. If a check must
460 // be done at this iteration, it waits for the reduction triggered by
461 // ireduce to complete, then checks the global norm against the tolerance.
462 bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) {
464 if (sweep <= 0) return false;
466 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NormManager::CheckDone
");
468 TEUCHOS_ASSERT(sweep >= 1);
469 if ( ! force && (sweep - 1) % sweep_step_) return false;
471 #ifdef HAVE_IFPACK2_MPI
472 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
473 MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
483 r_val = (work_[0] < tol2*work_[2]);
487 const auto adjusted_sweep_step = 2*sweep_step_;
488 if (adjusted_sweep_step < sweep_step_upper_bound_) {
489 sweep_step_ = adjusted_sweep_step;
491 sweep_step_ = sweep_step_upper_bound_;
496 // After termination has occurred, finalize the norms for use in
497 // get_norms{0,final}.
499 work_[0] = std::sqrt(work_[0]); // after converged
501 work_[2] = std::sqrt(work_[2]); // first norm
502 // if work_[2] is minus one, then norm is not requested.
505 // Report norms to the caller.
506 const magnitude_type getNorms0 () const { return work_[2]; }
507 const magnitude_type getNormsFinal () const { return work_[0]; }
510 template<typename MatrixType>
511 void reduceVector(const ConstUnmanaged<typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view> zz,
512 /* */ typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type *vals) {
513 IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
514 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ReduceVector
");
516 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
517 using local_ordinal_type = typename impl_type::local_ordinal_type;
518 using impl_scalar_type = typename impl_type::impl_scalar_type;
520 const auto norm2 = KokkosBlas::nrm1(zz);
522 impl_scalar_type norm2(0);
523 Kokkos::parallel_reduce
524 ("ReduceMultiVector::Device
",
525 Kokkos::RangePolicy<typename impl_type::execution_space>(0,zz.extent(0)),
526 KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) {
530 vals[0] = Kokkos::ArithTraits<impl_scalar_type>::abs(norm2);
532 IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
533 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename ImplType<MatrixType>::execution_space)
536 } // namespace BlockHelperDetails
538 } // namespace Ifpack2
node_type::device_type node_device_type
Definition: Ifpack2_BlockHelper.hpp:317
Definition: Ifpack2_BlockHelper.hpp:113
Kokkos::DefaultHostExecutionSpace host_execution_space
Definition: Ifpack2_BlockHelper.hpp:312
size_t size_type
Definition: Ifpack2_BlockHelper.hpp:294
Definition: Ifpack2_BlockHelper.hpp:150
Definition: Ifpack2_BlockHelper.hpp:122
KB::Vector< T, l > Vector
Definition: Ifpack2_BlockHelper.hpp:346
#define TEUCHOS_TEST_FOR_EXCEPT_MSG(throw_exception_test, msg)
Definition: Ifpack2_BlockHelper.hpp:130
Definition: Ifpack2_BlockHelper.hpp:388
Kokkos::Details::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition: Ifpack2_BlockHelper.hpp:303
Definition: Ifpack2_BlockHelper.hpp:229
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition: Ifpack2_BlockHelper.hpp:359
#define TEUCHOS_ASSERT(assertion_test)
Definition: Ifpack2_BlockHelper.hpp:256
Definition: Ifpack2_BlockHelper.hpp:290
Definition: Ifpack2_BlockHelper.hpp:138
Definition: Ifpack2_BlockHelper.hpp:100