Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_BlockView.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_BLOCKVIEW_HPP
11 #define TPETRA_BLOCKVIEW_HPP
12 
20 
21 #include "TpetraCore_config.h"
22 #if KOKKOS_VERSION >= 40799
23 #include "KokkosKernels_ArithTraits.hpp"
24 #else
25 #include "Kokkos_ArithTraits.hpp"
26 #endif
27 #include "Kokkos_Complex.hpp"
28 
29 namespace Tpetra {
30 
35 
36 namespace Impl {
37 
44 template <class ViewType1,
45  class ViewType2,
46  const int rank1 = ViewType1::rank>
47 struct AbsMax {
48  static KOKKOS_INLINE_FUNCTION void
49  run(const ViewType2& Y, const ViewType1& X);
50 };
51 
56 template <class ViewType1,
57  class ViewType2>
58 struct AbsMax<ViewType1, ViewType2, 2> {
61  static KOKKOS_INLINE_FUNCTION void
62  run(const ViewType2& Y, const ViewType1& X) {
63  static_assert(static_cast<int>(ViewType1::rank) == static_cast<int>(ViewType2::rank),
64  "AbsMax: ViewType1 and ViewType2 must have the same rank.");
65  typedef typename std::remove_reference<decltype(Y(0, 0))>::type STY;
66  static_assert(!std::is_const<STY>::value,
67  "AbsMax: The type of each entry of Y must be nonconst.");
68  typedef typename std::decay<decltype(X(0, 0))>::type STX;
69  static_assert(std::is_same<STX, STY>::value,
70  "AbsMax: The type of each entry of X and Y must be the same.");
71 #if KOKKOS_VERSION >= 40799
72  typedef KokkosKernels::ArithTraits<STY> KAT;
73 #else
74  typedef Kokkos::ArithTraits<STY> KAT;
75 #endif
76 
77  const int numCols = Y.extent(1);
78  const int numRows = Y.extent(0);
79  for (int j = 0; j < numCols; ++j) {
80  for (int i = 0; i < numRows; ++i) {
81  STY& Y_ij = Y(i, j); // use ref here to avoid 2nd op() call on Y
82  const STX X_ij = X(i, j);
83  // NOTE: no std::max (not a CUDA __device__ function); must
84  // cast back up to complex.
85  const auto Y_ij_abs = KAT::abs(Y_ij);
86  const auto X_ij_abs = KAT::abs(X_ij);
87  Y_ij = (Y_ij_abs >= X_ij_abs) ? static_cast<STY>(Y_ij_abs) : static_cast<STY>(X_ij_abs);
88  }
89  }
90  }
91 };
92 
97 template <class ViewType1,
98  class ViewType2>
99 struct AbsMax<ViewType1, ViewType2, 1> {
102  static KOKKOS_INLINE_FUNCTION void
103  run(const ViewType2& Y, const ViewType1& X) {
104  static_assert(static_cast<int>(ViewType1::rank) == static_cast<int>(ViewType2::rank),
105  "AbsMax: ViewType1 and ViewType2 must have the same rank.");
106 
107  typedef typename std::remove_reference<decltype(Y(0))>::type STY;
108  static_assert(!std::is_const<STY>::value,
109  "AbsMax: The type of each entry of Y must be nonconst.");
110  typedef typename std::remove_const<typename std::remove_reference<decltype(X(0))>::type>::type STX;
111  static_assert(std::is_same<STX, STY>::value,
112  "AbsMax: The type of each entry of X and Y must be the same.");
113 #if KOKKOS_VERSION >= 40799
114  typedef KokkosKernels::ArithTraits<STY> KAT;
115 #else
116  typedef Kokkos::ArithTraits<STY> KAT;
117 #endif
118 
119  const int numRows = Y.extent(0);
120  for (int i = 0; i < numRows; ++i) {
121  STY& Y_i = Y(i); // use ref here to avoid 2nd op() call on Y
122  const STX X_i = X(i);
123  // NOTE: no std::max (not a CUDA __device__ function); must
124  // cast back up to complex.
125  const auto Y_i_abs = KAT::abs(Y_i);
126  const auto X_i_abs = KAT::abs(X_i);
127  Y_i = (Y_i_abs >= X_i_abs) ? static_cast<STY>(Y_i_abs) : static_cast<STY>(X_i_abs);
128  }
129  }
130 };
131 
138 template <class ViewType1, class ViewType2, const int rank = ViewType1::rank>
139 KOKKOS_INLINE_FUNCTION void
140 absMax(const ViewType2& Y, const ViewType1& X) {
141  static_assert(static_cast<int>(ViewType1::rank) == static_cast<int>(ViewType2::rank),
142  "absMax: ViewType1 and ViewType2 must have the same rank.");
144 }
145 
150 template <class ViewType,
151  class CoefficientType,
152  class IndexType = int,
153  const bool is_contiguous = false,
154  const int rank = ViewType::rank>
155 struct SCAL {
156  static KOKKOS_INLINE_FUNCTION void
157  run(const CoefficientType& alpha, const ViewType& x);
158 };
159 
162 template <class ViewType,
163  class CoefficientType,
164  class IndexType>
165 struct SCAL<ViewType, CoefficientType, IndexType, false, 1> {
167  static KOKKOS_INLINE_FUNCTION void
168  run(const CoefficientType& alpha, const ViewType& x) {
169  const IndexType numRows = static_cast<IndexType>(x.extent(0));
170 
172 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
173 #pragma unroll
174 #endif
175  for (IndexType i = 0; i < numRows; ++i)
176  x(i) = alpha * x(i);
177  }
178 };
181 template <class ViewType,
182  class CoefficientType,
183  class IndexType>
184 struct SCAL<ViewType, CoefficientType, IndexType, false, 2> {
186  static KOKKOS_INLINE_FUNCTION void
187  run(const CoefficientType& alpha, const ViewType& A) {
188  const IndexType numRows = static_cast<IndexType>(A.extent(0));
189  const IndexType numCols = static_cast<IndexType>(A.extent(1));
190 
191  for (IndexType j = 0; j < numCols; ++j)
192  for (IndexType i = 0; i < numRows; ++i)
193  A(i, j) = alpha * A(i, j);
194  }
195 };
196 template <class ViewType,
197  class CoefficientType,
198  class IndexType,
199  const int rank>
200 struct SCAL<ViewType, CoefficientType, IndexType, true, rank> {
202  static KOKKOS_INLINE_FUNCTION void
203  run(const CoefficientType& alpha, const ViewType& x) {
204  using x_value_type = typename std::decay<decltype(*x.data())>::type;
205  const IndexType span = static_cast<IndexType>(x.span());
206  x_value_type* __restrict x_ptr(x.data());
207 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
208 #pragma unroll
209 #endif
210  for (IndexType i = 0; i < span; ++i)
211  x_ptr[i] = alpha * x_ptr[i];
212  }
213 };
214 
219 template <class ViewType,
220  class InputType,
221  class IndexType = int,
222  const bool is_contiguous = false,
223  const int rank = ViewType::rank>
224 struct FILL {
225  static KOKKOS_INLINE_FUNCTION void
226  run(const ViewType& x, const InputType& val);
227 };
228 
231 template <class ViewType,
232  class InputType,
233  class IndexType>
234 struct FILL<ViewType, InputType, IndexType, false, 1> {
235  static KOKKOS_INLINE_FUNCTION void
236  run(const ViewType& x, const InputType& val) {
237  const IndexType numRows = static_cast<IndexType>(x.extent(0));
238  for (IndexType i = 0; i < numRows; ++i)
239  x(i) = val;
240  }
241 };
244 template <class ViewType,
245  class InputType,
246  class IndexType>
247 struct FILL<ViewType, InputType, IndexType, false, 2> {
248  static KOKKOS_INLINE_FUNCTION void
249  run(const ViewType& X, const InputType& val) {
250  const IndexType numRows = static_cast<IndexType>(X.extent(0));
251  const IndexType numCols = static_cast<IndexType>(X.extent(1));
252  for (IndexType j = 0; j < numCols; ++j)
253  for (IndexType i = 0; i < numRows; ++i)
254  X(i, j) = val;
255  }
256 };
257 template <class ViewType,
258  class InputType,
259  class IndexType,
260  const int rank>
261 struct FILL<ViewType, InputType, IndexType, true, rank> {
262  static KOKKOS_INLINE_FUNCTION void
263  run(const ViewType& x, const InputType& val) {
264  const IndexType span = static_cast<IndexType>(x.span());
265  auto x_ptr = x.data();
266 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
267 #pragma unroll
268 #endif
269  for (IndexType i = 0; i < span; ++i)
270  x_ptr[i] = val;
271  }
272 };
273 
278 template <class CoefficientType,
279  class ViewType1,
280  class ViewType2,
281  class IndexType = int,
282  const bool is_contiguous = false,
283  const int rank = ViewType1::rank>
284 struct AXPY {
285  static KOKKOS_INLINE_FUNCTION void
286  run(const CoefficientType& alpha,
287  const ViewType1& x,
288  const ViewType2& y);
289 };
290 
293 template <class CoefficientType,
294  class ViewType1,
295  class ViewType2,
296  class IndexType>
297 struct AXPY<CoefficientType, ViewType1, ViewType2, IndexType, false, 1> {
299  static KOKKOS_INLINE_FUNCTION void
300  run(const CoefficientType& alpha,
301  const ViewType1& x,
302  const ViewType2& y) {
303 #if KOKKOS_VERSION >= 40799
304  using KokkosKernels::ArithTraits;
305 #else
306  using Kokkos::ArithTraits;
307 #endif
308  static_assert(static_cast<int>(ViewType1::rank) == static_cast<int>(ViewType2::rank),
309  "AXPY: x and y must have the same rank.");
310 
311  const IndexType numRows = static_cast<IndexType>(y.extent(0));
312  if (alpha != ArithTraits<CoefficientType>::zero()) {
314  for (IndexType i = 0; i < numRows; ++i)
315  y(i) += alpha * x(i);
316  }
317  }
318 };
319 
322 template <class CoefficientType,
323  class ViewType1,
324  class ViewType2,
325  class IndexType>
326 struct AXPY<CoefficientType, ViewType1, ViewType2, IndexType, false, 2> {
328  static KOKKOS_INLINE_FUNCTION void
329  run(const CoefficientType& alpha,
330  const ViewType1& X,
331  const ViewType2& Y) {
332 #if KOKKOS_VERSION >= 40799
333  using KokkosKernels::ArithTraits;
334 #else
335  using Kokkos::ArithTraits;
336 #endif
337  static_assert(static_cast<int>(ViewType1::rank) == static_cast<int>(ViewType2::rank),
338  "AXPY: X and Y must have the same rank.");
339  const IndexType numRows = static_cast<IndexType>(Y.extent(0));
340  const IndexType numCols = static_cast<IndexType>(Y.extent(1));
341 
342  if (alpha != ArithTraits<CoefficientType>::zero()) {
343  for (IndexType j = 0; j < numCols; ++j)
344  for (IndexType i = 0; i < numRows; ++i)
345  Y(i, j) += alpha * X(i, j);
346  }
347  }
348 };
349 
350 template <class CoefficientType,
351  class ViewType1,
352  class ViewType2,
353  class IndexType,
354  const int rank>
355 struct AXPY<CoefficientType, ViewType1, ViewType2, IndexType, true, rank> {
357  static KOKKOS_INLINE_FUNCTION void
358  run(const CoefficientType& alpha,
359  const ViewType1& x,
360  const ViewType2& y) {
361 #if KOKKOS_VERSION >= 40799
362  using KokkosKernels::ArithTraits;
363 #else
364  using Kokkos::ArithTraits;
365 #endif
366  static_assert(static_cast<int>(ViewType1::rank) == static_cast<int>(ViewType2::rank),
367  "AXPY: x and y must have the same rank.");
368 
369  if (alpha != ArithTraits<CoefficientType>::zero()) {
370  using x_value_type = typename std::decay<decltype(*x.data())>::type;
371  using y_value_type = typename std::decay<decltype(*y.data())>::type;
372  const IndexType span = static_cast<IndexType>(y.span());
373  const x_value_type* __restrict x_ptr(x.data());
374  y_value_type* __restrict y_ptr(y.data());
375 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
376 #pragma unroll
377 #endif
378  for (IndexType i = 0; i < span; ++i)
379  y_ptr[i] += alpha * x_ptr[i];
380  }
381  }
382 };
383 
388 template <class ViewType1,
389  class ViewType2,
390  class IndexType = int,
391  const bool is_contiguous = false,
392  const int rank = ViewType1::rank>
393 struct COPY {
394  static KOKKOS_INLINE_FUNCTION void
395  run(const ViewType1& x, const ViewType2& y);
396 };
397 
400 template <class ViewType1,
401  class ViewType2,
402  class IndexType>
403 struct COPY<ViewType1, ViewType2, IndexType, false, 1> {
405  static KOKKOS_INLINE_FUNCTION void
406  run(const ViewType1& x, const ViewType2& y) {
407  const IndexType numRows = static_cast<IndexType>(x.extent(0));
409  for (IndexType i = 0; i < numRows; ++i)
410  y(i) = x(i);
411  }
412 };
413 
416 template <class ViewType1,
417  class ViewType2,
418  class IndexType>
419 struct COPY<ViewType1, ViewType2, IndexType, false, 2> {
421  static KOKKOS_INLINE_FUNCTION void
422  run(const ViewType1& X, const ViewType2& Y) {
423  const IndexType numRows = static_cast<IndexType>(Y.extent(0));
424  const IndexType numCols = static_cast<IndexType>(Y.extent(1));
426  for (IndexType j = 0; j < numCols; ++j)
427  for (IndexType i = 0; i < numRows; ++i)
428  Y(i, j) = X(i, j);
429  }
430 };
431 
432 template <class ViewType1,
433  class ViewType2,
434  class IndexType,
435  const int rank>
436 struct COPY<ViewType1, ViewType2, IndexType, true, rank> {
438  static KOKKOS_INLINE_FUNCTION void
439  run(const ViewType1& x, const ViewType2& y) {
440  const IndexType span = static_cast<IndexType>(x.span());
441  using x_value_type = typename std::decay<decltype(*x.data())>::type;
442  using y_value_type = typename std::decay<decltype(*y.data())>::type;
443  const x_value_type* __restrict x_ptr(x.data());
444  y_value_type* __restrict y_ptr(y.data());
445 
446 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
447 #pragma unroll
448 #endif
449  for (IndexType i = 0; i < span; ++i)
450  y_ptr[i] = x_ptr[i];
451  }
452 };
453 
454 template <class VecType1,
455  class BlkType,
456  class VecType2,
457  class CoeffType,
458  class IndexType = int,
459  bool is_contiguous = false,
460  class BlkLayoutType = typename BlkType::array_layout>
461 struct GEMV {
462  static KOKKOS_INLINE_FUNCTION void
463  run(const CoeffType& alpha,
464  const BlkType& A,
465  const VecType1& x,
466  const VecType2& y) {
467  static_assert(VecType1::rank == 1, "GEMV: VecType1 must have rank 1.");
468  static_assert(BlkType::rank == 2, "GEMV: BlkType must have rank 2.");
469  static_assert(VecType2::rank == 1, "GEMV: VecType2 must have rank 1.");
470 
471  const IndexType numRows = static_cast<IndexType>(A.extent(0));
472  const IndexType numCols = static_cast<IndexType>(A.extent(1));
473 
475  for (IndexType i = 0; i < numRows; ++i)
476  for (IndexType j = 0; j < numCols; ++j)
477  y(i) += alpha * A(i, j) * x(j);
478  }
479 };
480 
481 template <class VecType1,
482  class BlkType,
483  class VecType2,
484  class CoeffType,
485  class IndexType>
486 struct GEMV<VecType1, BlkType, VecType2, CoeffType, IndexType, true, Kokkos::LayoutLeft> {
487  static KOKKOS_INLINE_FUNCTION void
488  run(const CoeffType& alpha,
489  const BlkType& A,
490  const VecType1& x,
491  const VecType2& y) {
492  static_assert(VecType1::rank == 1, "GEMV: VecType1 must have rank 1.");
493  static_assert(BlkType::rank == 2, "GEMV: BlkType must have rank 2.");
494  static_assert(VecType2::rank == 1, "GEMV: VecType2 must have rank 1.");
495 
496  using A_value_type = typename std::decay<decltype(A(0, 0))>::type;
497  using x_value_type = typename std::decay<decltype(x(0))>::type;
498  using y_value_type = typename std::decay<decltype(y(0))>::type;
499 
500  const IndexType numRows = static_cast<IndexType>(A.extent(0));
501  const IndexType numCols = static_cast<IndexType>(A.extent(1));
502 
503  const A_value_type* __restrict A_ptr(A.data());
504  const IndexType as1(A.stride(1));
505  const x_value_type* __restrict x_ptr(x.data());
506  y_value_type* __restrict y_ptr(y.data());
507 
508 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
509 #pragma unroll
510 #endif
511  for (IndexType j = 0; j < numCols; ++j) {
512  const x_value_type x_at_j = alpha * x_ptr[j];
513  const A_value_type* __restrict A_at_j = A_ptr + j * as1;
514 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
515 #pragma unroll
516 #endif
517  for (IndexType i = 0; i < numRows; ++i)
518  y_ptr[i] += A_at_j[i] * x_at_j;
519  }
520  }
521 };
522 
523 template <class VecType1,
524  class BlkType,
525  class VecType2,
526  class CoeffType,
527  class IndexType>
528 struct GEMV<VecType1, BlkType, VecType2, CoeffType, IndexType, true, Kokkos::LayoutRight> {
529  static KOKKOS_INLINE_FUNCTION void
530  run(const CoeffType& alpha,
531  const BlkType& A,
532  const VecType1& x,
533  const VecType2& y) {
534  static_assert(VecType1::rank == 1, "GEMV: VecType1 must have rank 1.");
535  static_assert(BlkType::rank == 2, "GEMV: BlkType must have rank 2.");
536  static_assert(VecType2::rank == 1, "GEMV: VecType2 must have rank 1.");
537 
538  using A_value_type = typename std::decay<decltype(A(0, 0))>::type;
539  using x_value_type = typename std::decay<decltype(x(0))>::type;
540  using y_value_type = typename std::decay<decltype(y(0))>::type;
541 
542  const IndexType numRows = static_cast<IndexType>(A.extent(0));
543  const IndexType numCols = static_cast<IndexType>(A.extent(1));
544 
545  const A_value_type* __restrict A_ptr(A.data());
546  const IndexType as0(A.stride(0));
547  const x_value_type* __restrict x_ptr(x.data());
548  y_value_type* __restrict y_ptr(y.data());
549 
550 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
551 #pragma unroll
552 #endif
553  for (IndexType i = 0; i < numRows; ++i) {
554  y_value_type y_at_i(0);
555  const auto A_at_i = A_ptr + i * as0;
556 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
557 #pragma unroll
558 #endif
559  for (IndexType j = 0; j < numCols; ++j)
560  y_at_i += A_at_i[j] * x_ptr[j];
561  y_ptr[i] += alpha * y_at_i;
562  }
563  }
564 };
565 
566 } // namespace Impl
567 
570 template <class ViewType,
571  class CoefficientType,
572  class IndexType = int,
573  const int rank = ViewType::rank>
574 KOKKOS_INLINE_FUNCTION void
575 SCAL(const CoefficientType& alpha, const ViewType& x) {
576  using LayoutType = typename ViewType::array_layout;
577  constexpr bool is_contiguous = (std::is_same<LayoutType, Kokkos::LayoutLeft>::value ||
578  std::is_same<LayoutType, Kokkos::LayoutRight>::value);
580 }
581 
583 template <class ViewType,
584  class InputType,
585  class IndexType = int,
586  const int rank = ViewType::rank>
587 KOKKOS_INLINE_FUNCTION void
588 FILL(const ViewType& x, const InputType& val) {
589  using LayoutType = typename ViewType::array_layout;
590  constexpr bool is_contiguous = (std::is_same<LayoutType, Kokkos::LayoutLeft>::value ||
591  std::is_same<LayoutType, Kokkos::LayoutRight>::value);
593 }
594 
600 template <class CoefficientType,
601  class ViewType1,
602  class ViewType2,
603  class IndexType = int,
604  const int rank = ViewType1::rank>
605 KOKKOS_INLINE_FUNCTION void
606 AXPY(const CoefficientType& alpha,
607  const ViewType1& x,
608  const ViewType2& y) {
609  static_assert(static_cast<int>(ViewType1::rank) ==
610  static_cast<int>(ViewType2::rank),
611  "AXPY: x and y must have the same rank.");
612  using LayoutType1 = typename ViewType1::array_layout;
613  using LayoutType2 = typename ViewType2::array_layout;
614  constexpr bool is_layout_same = std::is_same<LayoutType1, LayoutType2>::value;
615  constexpr bool is_x_contiguous = (std::is_same<LayoutType1, Kokkos::LayoutLeft>::value ||
616  std::is_same<LayoutType1, Kokkos::LayoutRight>::value);
617  constexpr bool is_contiguous = is_layout_same && is_x_contiguous;
619 }
620 
629 template <class ViewType1,
630  class ViewType2,
631  class IndexType = int,
632  const int rank = ViewType1::rank>
633 KOKKOS_INLINE_FUNCTION void
634 COPY(const ViewType1& x, const ViewType2& y) {
635  static_assert(static_cast<int>(ViewType1::rank) ==
636  static_cast<int>(ViewType2::rank),
637  "COPY: x and y must have the same rank.");
638  using LayoutType1 = typename ViewType1::array_layout;
639  using LayoutType2 = typename ViewType2::array_layout;
640  constexpr bool is_layout_same = std::is_same<LayoutType1, LayoutType2>::value;
641  constexpr bool is_x_contiguous = (std::is_same<LayoutType1, Kokkos::LayoutLeft>::value ||
642  std::is_same<LayoutType1, Kokkos::LayoutRight>::value);
643  constexpr bool is_contiguous = is_layout_same && is_x_contiguous;
645 }
646 
658 template <class VecType1,
659  class BlkType,
660  class VecType2,
661  class CoeffType,
662  class IndexType = int>
663 KOKKOS_INLINE_FUNCTION void
664 GEMV(const CoeffType& alpha,
665  const BlkType& A,
666  const VecType1& x,
667  const VecType2& y) {
668  constexpr bool is_A_contiguous = (std::is_same<typename BlkType::array_layout, Kokkos::LayoutLeft>::value ||
669  std::is_same<typename BlkType::array_layout, Kokkos::LayoutRight>::value);
670  constexpr bool is_x_contiguous = (std::is_same<typename VecType1::array_layout, Kokkos::LayoutLeft>::value ||
671  std::is_same<typename VecType1::array_layout, Kokkos::LayoutRight>::value);
672  constexpr bool is_y_contiguous = (std::is_same<typename VecType2::array_layout, Kokkos::LayoutLeft>::value ||
673  std::is_same<typename VecType2::array_layout, Kokkos::LayoutRight>::value);
674  constexpr bool is_contiguous = is_A_contiguous && is_x_contiguous && is_y_contiguous;
675  Impl::GEMV<VecType1, BlkType, VecType2, CoeffType, IndexType, is_contiguous>::run(alpha, A, x, y);
676 }
677 
685 template <class ViewType1,
686  class ViewType2,
687  class ViewType3,
688  class CoefficientType,
689  class IndexType = int>
690 KOKKOS_INLINE_FUNCTION void
691 GEMM(const char transA[],
692  const char transB[],
693  const CoefficientType& alpha,
694  const ViewType1& A,
695  const ViewType2& B,
696  const CoefficientType& beta,
697  const ViewType3& C) {
698  // Assert that A, B, and C are in fact matrices
699  static_assert(ViewType1::rank == 2, "GEMM: A must have rank 2 (be a matrix).");
700  static_assert(ViewType2::rank == 2, "GEMM: B must have rank 2 (be a matrix).");
701  static_assert(ViewType3::rank == 2, "GEMM: C must have rank 2 (be a matrix).");
702 
703  typedef typename std::remove_reference<decltype(A(0, 0))>::type Scalar;
704 #if KOKKOS_VERSION >= 40799
705  typedef KokkosKernels::ArithTraits<Scalar> STS;
706 #else
707  typedef Kokkos::ArithTraits<Scalar> STS;
708 #endif
709  const Scalar ZERO = STS::zero();
710  const Scalar ONE = STS::one();
711 
712  // Get the dimensions
713  IndexType m, n, k;
714  if (transA[0] == 'N' || transA[0] == 'n') {
715  m = static_cast<IndexType>(A.extent(0));
716  n = static_cast<IndexType>(A.extent(1));
717  } else {
718  m = static_cast<IndexType>(A.extent(1));
719  n = static_cast<IndexType>(A.extent(0));
720  }
721  k = static_cast<IndexType>(C.extent(1));
722 
723  // quick return if possible
724  if (alpha == ZERO && beta == ONE)
725  return;
726 
727  // And if alpha equals zero...
728  if (alpha == ZERO) {
729  if (beta == ZERO) {
730  for (IndexType i = 0; i < m; i++) {
731  for (IndexType j = 0; j < k; j++) {
732  C(i, j) = ZERO;
733  }
734  }
735  } else {
736  for (IndexType i = 0; i < m; i++) {
737  for (IndexType j = 0; j < k; j++) {
738  C(i, j) = beta * C(i, j);
739  }
740  }
741  }
742  }
743 
744  // Start the operations
745  if (transB[0] == 'n' || transB[0] == 'N') {
746  if (transA[0] == 'n' || transA[0] == 'N') {
747  // Form C = alpha*A*B + beta*C
748  for (IndexType j = 0; j < n; j++) {
749  if (beta == ZERO) {
750  for (IndexType i = 0; i < m; i++) {
751  C(i, j) = ZERO;
752  }
753  } else if (beta != ONE) {
754  for (IndexType i = 0; i < m; i++) {
755  C(i, j) = beta * C(i, j);
756  }
757  }
758  for (IndexType l = 0; l < k; l++) {
759  Scalar temp = alpha * B(l, j);
760  for (IndexType i = 0; i < m; i++) {
761  C(i, j) = C(i, j) + temp * A(i, l);
762  }
763  }
764  }
765  } else {
766  // Form C = alpha*A**T*B + beta*C
767  for (IndexType j = 0; j < n; j++) {
768  for (IndexType i = 0; i < m; i++) {
769  Scalar temp = ZERO;
770  for (IndexType l = 0; l < k; l++) {
771  temp = temp + A(l, i) * B(l, j);
772  }
773  if (beta == ZERO) {
774  C(i, j) = alpha * temp;
775  } else {
776  C(i, j) = alpha * temp + beta * C(i, j);
777  }
778  }
779  }
780  }
781  } else {
782  if (transA[0] == 'n' || transA[0] == 'N') {
783  // Form C = alpha*A*B**T + beta*C
784  for (IndexType j = 0; j < n; j++) {
785  if (beta == ZERO) {
786  for (IndexType i = 0; i < m; i++) {
787  C(i, j) = ZERO;
788  }
789  } else if (beta != ONE) {
790  for (IndexType i = 0; i < m; i++) {
791  C(i, j) = beta * C(i, j);
792  }
793  }
794  for (IndexType l = 0; l < k; l++) {
795  Scalar temp = alpha * B(j, l);
796  for (IndexType i = 0; i < m; i++) {
797  C(i, j) = C(i, j) + temp * A(i, l);
798  }
799  }
800  }
801  } else {
802  // Form C = alpha*A**T*B**T + beta*C
803  for (IndexType j = 0; j < n; j++) {
804  for (IndexType i = 0; i < m; i++) {
805  Scalar temp = ZERO;
806  for (IndexType l = 0; l < k; l++) {
807  temp = temp + A(l, i) * B(j, l);
808  }
809  if (beta == ZERO) {
810  C(i, j) = alpha * temp;
811  } else {
812  C(i, j) = alpha * temp + beta * C(i, j);
813  }
814  }
815  }
816  }
817  }
818 }
819 
821 template <class LittleBlockType,
822  class LittleVectorType>
823 KOKKOS_INLINE_FUNCTION void
824 GETF2(const LittleBlockType& A, const LittleVectorType& ipiv, int& info) {
825  // The type of an entry of ipiv is the index type.
826  typedef typename std::decay<decltype(ipiv(0))>::type IndexType;
827  static_assert(std::is_integral<IndexType>::value,
828  "GETF2: The type of each entry of ipiv must be an integer type.");
829  typedef typename std::remove_reference<decltype(A(0, 0))>::type Scalar;
830  static_assert(!std::is_const<Scalar>::value,
831  "GETF2: A must not be a const View (or LittleBlock).");
832  static_assert(!std::is_const<std::remove_reference<decltype(ipiv(0))>>::value,
833  "GETF2: ipiv must not be a const View (or LittleBlock).");
834  static_assert(LittleBlockType::rank == 2, "GETF2: A must have rank 2 (be a matrix).");
835 #if KOKKOS_VERSION >= 40799
836  typedef KokkosKernels::ArithTraits<Scalar> STS;
837 #else
838  typedef Kokkos::ArithTraits<Scalar> STS;
839 #endif
840  const Scalar ZERO = STS::zero();
841 
842  const IndexType numRows = static_cast<IndexType>(A.extent(0));
843  const IndexType numCols = static_cast<IndexType>(A.extent(1));
844  const IndexType pivDim = static_cast<IndexType>(ipiv.extent(0));
845 
846  // std::min is not a CUDA device function
847  const IndexType minPivDim = (numRows < numCols) ? numRows : numCols;
848  if (pivDim < minPivDim) {
849  info = -2;
850  return;
851  }
852 
853  // Initialize info
854  info = 0;
855 
856  for (IndexType j = 0; j < pivDim; j++) {
857  // Find pivot and test for singularity
858  IndexType jp = j;
859  for (IndexType i = j + 1; i < numRows; i++) {
860  if (STS::abs(A(i, j)) > STS::abs(A(jp, j))) {
861  jp = i;
862  }
863  }
864  ipiv(j) = jp + 1;
865 
866  if (A(jp, j) != ZERO) {
867  // Apply the interchange to columns 1:N
868  if (jp != j) {
869  for (IndexType i = 0; i < numCols; i++) {
870  Scalar temp = A(jp, i);
871  A(jp, i) = A(j, i);
872  A(j, i) = temp;
873  }
874  }
875 
876  // Compute elements J+1:M of J-th column
877  for (IndexType i = j + 1; i < numRows; i++) {
878  A(i, j) = A(i, j) / A(j, j);
879  }
880  } else if (info == 0) {
881  info = j;
882  }
883 
884  // Update trailing submatrix
885  for (IndexType r = j + 1; r < numRows; r++) {
886  for (IndexType c = j + 1; c < numCols; c++) {
887  A(r, c) = A(r, c) - A(r, j) * A(j, c);
888  }
889  }
890  }
891 }
892 
893 namespace Impl {
894 
898 template <class LittleBlockType,
899  class LittleIntVectorType,
900  class LittleScalarVectorType,
901  const int rank = LittleScalarVectorType::rank>
902 struct GETRS {
903  static KOKKOS_INLINE_FUNCTION void
904  run(const char mode[],
905  const LittleBlockType& A,
906  const LittleIntVectorType& ipiv,
907  const LittleScalarVectorType& B,
908  int& info);
909 };
910 
912 template <class LittleBlockType,
913  class LittleIntVectorType,
914  class LittleScalarVectorType>
915 struct GETRS<LittleBlockType, LittleIntVectorType, LittleScalarVectorType, 1> {
916  static KOKKOS_INLINE_FUNCTION void
917  run(const char mode[],
918  const LittleBlockType& A,
919  const LittleIntVectorType& ipiv,
920  const LittleScalarVectorType& B,
921  int& info) {
922  // The type of an entry of ipiv is the index type.
923  typedef typename std::decay<decltype(ipiv(0))>::type IndexType;
924  // IndexType must be signed, because this code does a countdown loop
925  // to zero. Unsigned integers are always >= 0, even on underflow.
926  static_assert(std::is_integral<IndexType>::value &&
927  std::is_signed<IndexType>::value,
928  "GETRS: The type of each entry of ipiv must be a signed integer.");
929  typedef typename std::decay<decltype(A(0, 0))>::type Scalar;
930  static_assert(!std::is_const<std::remove_reference<decltype(B(0))>>::value,
931  "GETRS: B must not be a const View (or LittleBlock).");
932  static_assert(LittleBlockType::rank == 2, "GETRS: A must have rank 2 (be a matrix).");
933  static_assert(LittleIntVectorType::rank == 1, "GETRS: ipiv must have rank 1.");
934  static_assert(LittleScalarVectorType::rank == 1, "GETRS: For this specialization, B must have rank 1.");
935 
936 #if KOKKOS_VERSION >= 40799
937  typedef KokkosKernels::ArithTraits<Scalar> STS;
938 #else
939  typedef Kokkos::ArithTraits<Scalar> STS;
940 #endif
941  const Scalar ZERO = STS::zero();
942  const IndexType numRows = static_cast<IndexType>(A.extent(0));
943  const IndexType numCols = static_cast<IndexType>(A.extent(1));
944  const IndexType pivDim = static_cast<IndexType>(ipiv.extent(0));
945 
946  info = 0;
947 
948  // Ensure that the matrix is square
949  if (numRows != numCols) {
950  info = -2;
951  return;
952  }
953 
954  // Ensure that the pivot array is sufficiently large
955  if (pivDim < numRows) {
956  info = -3;
957  return;
958  }
959 
960  // No transpose case
961  if (mode[0] == 'n' || mode[0] == 'N') {
962  // Apply row interchanges to the RHS
963  for (IndexType i = 0; i < numRows; i++) {
964  if (ipiv(i) != i + 1) {
965  Scalar temp = B(i);
966  B(i) = B(ipiv(i) - 1);
967  B(ipiv(i) - 1) = temp;
968  }
969  }
970 
971  // Solve Lx=b, overwriting b with x
972  for (IndexType r = 1; r < numRows; r++) {
973  for (IndexType c = 0; c < r; c++) {
974  B(r) = B(r) - A(r, c) * B(c);
975  }
976  }
977 
978  // Solve Ux=b, overwriting b with x
979  for (IndexType r = numRows - 1; r >= 0; r--) {
980  // Check whether U is singular
981  if (A(r, r) == ZERO) {
982  info = r + 1;
983  return;
984  }
985 
986  for (IndexType c = r + 1; c < numCols; c++) {
987  B(r) = B(r) - A(r, c) * B(c);
988  }
989  B(r) = B(r) / A(r, r);
990  }
991  }
992  // Transpose case
993  else if (mode[0] == 't' || mode[0] == 'T') {
994  info = -1; // NOT YET IMPLEMENTED
995  return;
996  }
997  // Conjugate transpose case
998  else if (mode[0] == 'c' || mode[0] == 'C') {
999  info = -1; // NOT YET IMPLEMENTED
1000  return;
1001  } else { // invalid mode
1002  info = -1;
1003  return;
1004  }
1005  }
1006 };
1007 
1009 template <class LittleBlockType,
1010  class LittleIntVectorType,
1011  class LittleScalarVectorType>
1012 struct GETRS<LittleBlockType, LittleIntVectorType, LittleScalarVectorType, 2> {
1013  static KOKKOS_INLINE_FUNCTION void
1014  run(const char mode[],
1015  const LittleBlockType& A,
1016  const LittleIntVectorType& ipiv,
1017  const LittleScalarVectorType& B,
1018  int& info) {
1019  // The type of an entry of ipiv is the index type.
1020  typedef typename std::decay<decltype(ipiv(0))>::type IndexType;
1021  static_assert(std::is_integral<IndexType>::value,
1022  "GETRS: The type of each entry of ipiv must be an integer type.");
1023  static_assert(!std::is_const<std::remove_reference<decltype(B(0))>>::value,
1024  "GETRS: B must not be a const View (or LittleBlock).");
1025  static_assert(LittleBlockType::rank == 2, "GETRS: A must have rank 2 (be a matrix).");
1026  static_assert(LittleIntVectorType::rank == 1, "GETRS: ipiv must have rank 1.");
1027  static_assert(LittleScalarVectorType::rank == 2, "GETRS: For this specialization, B must have rank 2.");
1028 
1029  // The current implementation iterates over one right-hand side at
1030  // a time. It might be faster to do this differently, but this
1031  // should work for now.
1032  const IndexType numRhs = B.extent(1);
1033  info = 0;
1034 
1035  for (IndexType rhs = 0; rhs < numRhs; ++rhs) {
1036  auto B_cur = Kokkos::subview(B, Kokkos::ALL(), rhs);
1038  if (info != 0) {
1039  return;
1040  }
1041  }
1042  }
1043 };
1044 
1045 } // namespace Impl
1046 
1050 template <class LittleBlockType,
1051  class LittleIntVectorType,
1052  class LittleScalarVectorType>
1053 KOKKOS_INLINE_FUNCTION void
1054 GETRS(const char mode[],
1055  const LittleBlockType& A,
1056  const LittleIntVectorType& ipiv,
1057  const LittleScalarVectorType& B,
1058  int& info) {
1059  Impl::GETRS<LittleBlockType, LittleIntVectorType, LittleScalarVectorType,
1060  LittleScalarVectorType::rank>::run(mode, A, ipiv, B, info);
1061 }
1062 
1077 template <class LittleBlockType,
1078  class LittleIntVectorType,
1079  class LittleScalarVectorType>
1080 KOKKOS_INLINE_FUNCTION void
1081 GETRI(const LittleBlockType& A,
1082  const LittleIntVectorType& ipiv,
1083  const LittleScalarVectorType& work,
1084  int& info) {
1085  // The type of an entry of ipiv is the index type.
1086  typedef typename std::decay<decltype(ipiv(0))>::type IndexType;
1087  // IndexType must be signed, because this code does a countdown loop
1088  // to zero. Unsigned integers are always >= 0, even on underflow.
1089  static_assert(std::is_integral<IndexType>::value &&
1090  std::is_signed<IndexType>::value,
1091  "GETRI: The type of each entry of ipiv must be a signed integer.");
1092  typedef typename std::remove_reference<decltype(A(0, 0))>::type Scalar;
1093  static_assert(!std::is_const<std::remove_reference<decltype(A(0, 0))>>::value,
1094  "GETRI: A must not be a const View (or LittleBlock).");
1095  static_assert(!std::is_const<std::remove_reference<decltype(work(0))>>::value,
1096  "GETRI: work must not be a const View (or LittleBlock).");
1097  static_assert(LittleBlockType::rank == 2, "GETRI: A must have rank 2 (be a matrix).");
1098 #if KOKKOS_VERSION >= 40799
1099  typedef KokkosKernels::ArithTraits<Scalar> STS;
1100 #else
1101  typedef Kokkos::ArithTraits<Scalar> STS;
1102 #endif
1103  const Scalar ZERO = STS::zero();
1104  const Scalar ONE = STS::one();
1105 
1106  const IndexType numRows = static_cast<IndexType>(A.extent(0));
1107  const IndexType numCols = static_cast<IndexType>(A.extent(1));
1108  const IndexType pivDim = static_cast<IndexType>(ipiv.extent(0));
1109  const IndexType workDim = static_cast<IndexType>(work.extent(0));
1110 
1111  info = 0;
1112 
1113  // Ensure that the matrix is square
1114  if (numRows != numCols) {
1115  info = -1;
1116  return;
1117  }
1118 
1119  // Ensure that the pivot array is sufficiently large
1120  if (pivDim < numRows) {
1121  info = -2;
1122  return;
1123  }
1124 
1125  // Ensure that the work array is sufficiently large
1126  if (workDim < numRows) {
1127  info = -3;
1128  return;
1129  }
1130 
1131  // Form Uinv in place
1132  for (IndexType j = 0; j < numRows; j++) {
1133  if (A(j, j) == ZERO) {
1134  info = j + 1;
1135  return;
1136  }
1137 
1138  A(j, j) = ONE / A(j, j);
1139 
1140  // Compute elements 1:j-1 of j-th column
1141  for (IndexType r = 0; r < j; r++) {
1142  A(r, j) = A(r, r) * A(r, j);
1143  for (IndexType c = r + 1; c < j; c++) {
1144  A(r, j) = A(r, j) + A(r, c) * A(c, j);
1145  }
1146  }
1147  for (IndexType r = 0; r < j; r++) {
1148  A(r, j) = -A(j, j) * A(r, j);
1149  }
1150  }
1151 
1152  // Compute Ainv by solving A\L = Uinv
1153  for (IndexType j = numCols - 2; j >= 0; j--) {
1154  // Copy lower triangular data to work array and replace with 0
1155  for (IndexType r = j + 1; r < numRows; r++) {
1156  work(r) = A(r, j);
1157  A(r, j) = 0;
1158  }
1159 
1160  for (IndexType r = 0; r < numRows; r++) {
1161  for (IndexType i = j + 1; i < numRows; i++) {
1162  A(r, j) = A(r, j) - work(i) * A(r, i);
1163  }
1164  }
1165  }
1166 
1167  // Apply column interchanges
1168  for (IndexType j = numRows - 1; j >= 0; j--) {
1169  IndexType jp = ipiv(j) - 1;
1170  if (j != jp) {
1171  for (IndexType r = 0; r < numRows; r++) {
1172  Scalar temp = A(r, j);
1173  A(r, j) = A(r, jp);
1174  A(r, jp) = temp;
1175  }
1176  }
1177  }
1178 }
1179 
1180 // mfh 08 Nov 2015: I haven't tested this overload yet. It also needs
1181 // an implementation for trans != 'N' (the transpose and conjugate
1182 // transpose cases).
1183 #if 0
1184 template<class LittleBlockType,
1185  class LittleVectorTypeX,
1186  class LittleVectorTypeY,
1187  class CoefficientType,
1188  class IndexType = int>
1189 KOKKOS_INLINE_FUNCTION void
1190 GEMV (const char trans,
1191  const CoefficientType& alpha,
1192  const LittleBlockType& A,
1193  const LittleVectorTypeX& x,
1194  const CoefficientType& beta,
1195  const LittleVectorTypeY& y)
1196 {
1197  // y(0) returns a reference to the 0-th entry of y. Remove that
1198  // reference to get the type of each entry of y. It's OK if y has
1199  // zero entries -- this doesn't actually do y(i), it just returns
1200  // the type of that expression.
1201  typedef typename std::remove_reference<decltype (y(0)) >::type y_value_type;
1202  const IndexType numRows = static_cast<IndexType> (A.extent (0));
1203  const IndexType numCols = static_cast<IndexType> (A.extent (1));
1204 
1205  if (beta == 0.0) {
1206  if (alpha == 0.0) {
1207  for (IndexType i = 0; i < numRows; ++i) {
1208  y(i) = 0.0;
1209  }
1210  }
1211  else {
1212  for (IndexType i = 0; i < numRows; ++i) {
1213  y_value_type y_i = 0.0;
1214  for (IndexType j = 0; j < numCols; ++j) {
1215  y_i += A(i,j) * x(j);
1216  }
1217  y(i) = y_i;
1218  }
1219  }
1220  }
1221  else { // beta != 0
1222  if (alpha == 0.0) {
1223  if (beta == 0.0) {
1224  for (IndexType i = 0; i < numRows; ++i) {
1225  y(i) = 0.0;
1226  }
1227  }
1228  else {
1229  for (IndexType i = 0; i < numRows; ++i) {
1230  y(i) *= beta;
1231  }
1232  }
1233  }
1234  else {
1235  for (IndexType i = 0; i < numRows; ++i) {
1236  y_value_type y_i = beta * y(i);
1237  for (IndexType j = 0; j < numCols; ++j) {
1238  y_i += alpha * A(i,j) * x(j);
1239  }
1240  y(i) = y_i;
1241  }
1242  }
1243  }
1244 }
1245 
1246 #endif // 0
1247 
1248 } // namespace Tpetra
1249 
1250 #endif // TPETRA_BLOCKVIEW_HPP
static KOKKOS_INLINE_FUNCTION void run(const CoefficientType &alpha, const ViewType &A)
A := alpha*A (rank-2 A, i.e., a matrix)
static KOKKOS_INLINE_FUNCTION void run(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha*x (rank-1 x and y, i.e., vectors)
Implementation of Tpetra::SCAL function.
static KOKKOS_INLINE_FUNCTION void run(const ViewType1 &x, const ViewType2 &y)
y := x (rank-1 x and y, i.e., vectors)
KOKKOS_INLINE_FUNCTION void GETRS(const char mode[], const LittleBlockType &A, const LittleIntVectorType &ipiv, const LittleScalarVectorType &B, int &info)
Solve the linear system(s) AX=B, using the result of GETRF or GETF2.
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
KOKKOS_INLINE_FUNCTION void FILL(const ViewType &x, const InputType &val)
Set every entry of x to val.
static KOKKOS_INLINE_FUNCTION void run(const CoefficientType &alpha, const ViewType &x)
x := alpha*x (rank-1 x, i.e., a vector)
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra&#39;s ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix, and the small dense vectors in BlockMultiVector and BlockVector.
KOKKOS_INLINE_FUNCTION void GEMM(const char transA[], const char transB[], const CoefficientType &alpha, const ViewType1 &A, const ViewType2 &B, const CoefficientType &beta, const ViewType3 &C)
Small dense matrix-matrix multiply: C := alpha*A*B + beta*C
static KOKKOS_INLINE_FUNCTION void run(const CoefficientType &alpha, const ViewType1 &X, const ViewType2 &Y)
Y := Y + alpha*X (rank-2 X and Y, i.e., matrices)
static KOKKOS_INLINE_FUNCTION void run(const ViewType1 &X, const ViewType2 &Y)
Y := X (rank-2 X and Y, i.e., matrices)
Implementation of Tpetra&#39;s ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix, or the small dense vectors in BlockMultiVector and BlockVector.
Replace old values with zero.
KOKKOS_INLINE_FUNCTION void GETRI(const LittleBlockType &A, const LittleIntVectorType &ipiv, const LittleScalarVectorType &work, int &info)
Compute inverse of A, using result of GETRF or GETF2.
Implementation of Tpetra::COPY function.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
static KOKKOS_INLINE_FUNCTION void run(const ViewType2 &Y, const ViewType1 &X)
(*this)(i,j) := max(abs((*this)(i,j)), abs(X(i,j))) for all (i,j).
Implementation of Tpetra::AXPY function.
KOKKOS_INLINE_FUNCTION void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
Computes the solution to Ax=b.
KOKKOS_INLINE_FUNCTION void GETF2(const LittleBlockType &A, const LittleVectorType &ipiv, int &info)
Computes A = P*L*U.
Implementation of Tpetra::FILL function.
static KOKKOS_INLINE_FUNCTION void run(const ViewType2 &Y, const ViewType1 &X)
(*this)(i) := max(abs((*this)(i)), abs(X(i))) for all i.