Teuchos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Teuchos_Details_Lapack128.cpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Teuchos: Common Tools Package
4 //
5 // Copyright 2004 NTESS and the Teuchos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
11 #ifdef HAVE_TEUCHOSCORE_QUADMATH
12 # include "Teuchos_BLAS.hpp"
13 #endif // HAVE_TEUCHOSCORE_QUADMATH
14 
15 
16 #ifdef HAVE_TEUCHOSCORE_QUADMATH
17 namespace Teuchos {
18 namespace Details {
19 
20 void
21 Lapack128::
22 GETRF (const int M, const int N, __float128 A[],
23  const int LDA, int IPIV[], int* INFO) const
24 {
25  //std::cerr << "GETRF: N = " << N << std::endl;
26 
28 
29  // NOTE (mfh 05 Sep 2015) This is a direct translation of LAPACK
30  // 3.5.0's DGETF2 routine. LAPACK is under a BSD license.
31 
32  *INFO = 0;
33  if (M < 0) {
34  *INFO = -1;
35  } else if (N < 0) {
36  *INFO = -2;
37  } else if (LDA < std::max (1, M)) {
38  *INFO = -4;
39  }
40  if (*INFO != 0) {
41  return;
42  }
43 
44  // Quick return if possible
45  if (M == 0 || N == 0) {
46  return;
47  }
48 
49  // Compute machine safe minimum sfmin (such that 1/sfmin does
50  // not overflow). LAPACK 3.1 just returns for this the smallest
51  // normalized number.
52  const __float128 sfmin = FLT128_MIN;
53  const __float128 zero = 0.0;
54  const __float128 one = 1.0;
55 
56  const int j_upperBound = std::min (M, N);
57  for (int j = 1; j <= j_upperBound; ++j) {
58  //std::cerr << " j = " << j << std::endl;
59 
60  // Find pivot and test for singularity.
61  __float128* const A_jj = A + (j-1)*LDA + (j-1);
62 
63  //std::cerr << " CALLING IAMAX" << std::endl;
64  const int jp = (j - 1) + blas.IAMAX (M - j + 1, A_jj, 1);
65  IPIV[j - 1] = jp;
66 
67  const __float128* A_jp_j = A + jp + LDA*j;
68  if (*A_jp_j != zero) {
69  // Apply the interchange to columns 1:N.
70  __float128* const A_j1 = A + (j - 1);
71  __float128* const A_jp_1 = A + (jp - 1);
72 
73  if (jp != j) {
74  blas.SWAP (N, A_j1, LDA, A_jp_1, LDA);
75  }
76 
77  // Compute elements J+1:M of J-th column.
78  if (j < M) {
79  __float128* const A_j1_j = A + j + (j-1)*LDA;
80 
81  if (fabsq (*A_jj) >= sfmin) {
82  blas.SCAL (M-j, one / *A_jj, A_j1_j, 1);
83  } else {
84  for (int i = 1; i <= M-j; ++i) {
85  __float128* const A_jpi_j = A + (j+i-1) + (j-1)*LDA;
86  *A_jpi_j /= *A_jj;
87  }
88  }
89  }
90  } else if (*INFO == 0) {
91  *INFO = j;
92  }
93 
94  if (j < std::min (M, N)) {
95  //std::cerr << " UPDATE TRAILING SUBMATRIX" << std::endl;
96 
97  // Update trailing submatrix.
98  const __float128* A_j1_j = A + j + (j-1)*LDA;
99  const __float128* A_j_j1 = A + (j-1) + j*LDA;
100  __float128* A_j1_j1 = A + j + j*LDA;
101  blas.GER (M-j, N-j, -one, A_j1_j, 1, A_j_j1, LDA, A_j1_j1, LDA);
102  }
103  }
104 }
105 
106 void
107 Lapack128::
108 LASWP (const int N, __float128 A[], const int LDA, const int K1,
109  const int K2, const int IPIV[], const int INCX) const
110 {
111  int i, i1, i2, inc, ip, ix, ix0, j, k, n32;
112  __float128 temp;
113 
114  // Interchange row I with row IPIV(I) for each of rows K1 through K2.
115 
116  if (INCX > 0) {
117  ix0 = K1;
118  i1 = K1;
119  i2 = K2;
120  inc = 1;
121  } else if (INCX < 0) {
122  ix0 = 1 + (1 - K2)*INCX;
123  i1 = K2;
124  i2 = K1;
125  inc = -1;
126  } else { // INCX == 0
127  return;
128  }
129 
130  // The LAPACK 3.5.0 source code does 32 entries at a time,
131  // presumably for better vectorization or cache line usage.
132  n32 = (N / 32) * 32;
133 
134  if (n32 != 0) {
135  for (j = 1; j <= n32; j += 32) {
136  ix = ix0;
137  // C and C++ lack Fortran's convenient range specifier,
138  // which can iterate over a range in either direction
139  // without particular fuss about the end condition.
140  for (i = i1; (inc > 0) ? (i <= i2) : (i >= i2); i += inc) {
141  ip = IPIV[ix-1];
142  if (ip != i) {
143  for (k = j; k <= j+31; ++k) {
144  temp = A[(i-1) + (k-1)*LDA]; // temp = a( i, k )
145  A[(i-1) + (k-1)*LDA] = A[(ip-1) + (k-1)*LDA]; // a( i, k ) = a( ip, k )
146  A[(ip-1) + (k-1)*LDA] = temp; // a( ip, k ) = temp
147  }
148  }
149  ix = ix + INCX;
150  }
151  }
152  }
153 
154  if (n32 != N) {
155  n32 = n32 + 1;
156  ix = ix0;
157  // C and C++ lack Fortran's convenient range specifier,
158  // which can iterate over a range in either direction
159  // without particular fuss about the end condition.
160  for (i = i1; (inc > 0) ? (i <= i2) : (i >= i2); i += inc) {
161  ip = IPIV[ix-1];
162  if (ip != i) {
163  for (k = n32; k <= N; ++k) {
164  temp = A[(i-1) + (k-1)*LDA]; // temp = a( i, k )
165  A[(i-1) + (k-1)*LDA] = A[(ip-1) + (k-1)*LDA]; // a( i, k ) = a( ip, k )
166  A[(ip-1) + (k-1)*LDA] = temp; // a( ip, k ) = temp
167  }
168  }
169  ix = ix + INCX;
170  }
171  }
172 }
173 
174 void
175 Lapack128::
176 GETRI (const int /* N */, __float128 /* A */ [], const int /* LDA */,
177  int /* IPIV */ [], __float128 /* WORK */ [], const int /* LWORK */,
178  int* /* INFO */) const
179 {
181  (true, std::logic_error, "Teuchos::LAPACK<int, __float128>::GETRI: Not implemented yet.");
182 }
183 
184 
185 void
186 Lapack128::
187 GETRS (const char TRANS, const int N, const int NRHS,
188  const __float128 A[], const int LDA, const int IPIV[],
189  __float128 B[], const int LDB, int* INFO) const
190 {
191  //std::cerr << "GETRS: N = " << N << std::endl;
192 
194 
195  // NOTE (mfh 05 Sep 2015) This is a direct translation of LAPACK
196  // 3.5.0's DGETRS routine. LAPACK is under a BSD license.
197 
198  *INFO = 0;
199  const bool notran = (TRANS == 'N' || TRANS == 'n');
200  if (! notran
201  && ! (TRANS == 'T' || TRANS == 't')
202  && ! (TRANS == 'C' || TRANS == 'c')) {
203  *INFO = -1; // invalid TRANS argument
204  }
205  else if (N < 0) {
206  *INFO = -2; // invalid N (negative)
207  }
208  else if (NRHS < 0) {
209  *INFO = -3; // invalid NRHS (negative)
210  }
211  else if (LDA < std::max (1, N)) {
212  *INFO = -5; // invalid LDA (too small)
213  }
214  else if (LDB < std::max (1, N)) {
215  *INFO = -8; // invalid LDB (too small)
216  }
217  if (*INFO != 0) {
218  return;
219  }
220 
221  const __float128 one = 1.0;
222 
223  using Teuchos::LEFT_SIDE;
224  using Teuchos::LOWER_TRI;
225  using Teuchos::UPPER_TRI;
226  using Teuchos::NO_TRANS;
227  using Teuchos::TRANS;
228  using Teuchos::CONJ_TRANS;
229  using Teuchos::UNIT_DIAG;
231 
232  if (notran) { // No transpose; solve AX=B
233  // Apply row interchanges to the right-hand sides.
234  //std::cerr << "CALLING LASWP" << std::endl;
235  LASWP (NRHS, B, LDB, 1, N, IPIV, 1);
236  // Solve L*X = B, overwriting B with X.
237  //std::cerr << "CALLING TRSM (1)" << std::endl;
238  blas.TRSM (LEFT_SIDE, LOWER_TRI, NO_TRANS, UNIT_DIAG, N, NRHS,
239  one, A, LDA, B, LDB);
240  // Solve U*X = B, overwriting B with X.
241  //std::cerr << "CALLING TRSM (2)" << std::endl;
242  blas.TRSM (LEFT_SIDE, UPPER_TRI, NO_TRANS, NON_UNIT_DIAG, N, NRHS,
243  one, A, LDA, B, LDB);
244  }
245  else { // Transpose or conjugate transpose: solve A^{T,H}X = B.
246  const Teuchos::ETransp transposeMode = (TRANS == 'T' || TRANS == 't') ?
247  TRANS : CONJ_TRANS;
248 
249  // Solve U^{T,H}*X = B, overwriting B with X.
250  //std::cerr << "CALLING TRSM (1)" << std::endl;
251  blas.TRSM (LEFT_SIDE, UPPER_TRI, transposeMode, NON_UNIT_DIAG, N, NRHS,
252  one, A, LDA, B, LDB);
253  // Solve L^{T,H}*X = B, overwriting B with X.
254  //std::cerr << "CALLING TRSM (2)" << std::endl;
255  blas.TRSM (LEFT_SIDE, LOWER_TRI, transposeMode, UNIT_DIAG, N, NRHS,
256  one, A, LDA, B, LDB);
257  //std::cerr << "CALLING LASWP" << std::endl;
258  // Apply row interchanges to the solution vectors.
259  LASWP (NRHS, B, LDB, 1, N, IPIV, -1);
260  }
261 
262  //std::cerr << "DONE WITH GETRS" << std::endl;
263 }
264 
265 __float128
266 Lapack128::
267 LAPY2 (const __float128& x, const __float128& y) const
268 {
269  const __float128 xabs = fabsq (x);
270  const __float128 yabs = fabsq (y);
271  const __float128 w = fmaxq (xabs, yabs);
272  const __float128 z = fminq (xabs, yabs);
273 
274  if (z == 0.0) {
275  return w;
276  } else {
277  const __float128 one = 1.0;
278  const __float128 z_div_w = z / w;
279  return w * sqrtq (one + z_div_w * z_div_w);
280  }
281 }
282 
283 void
284 Lapack128::
285 ORM2R (const char side, const char trans,
286  const int m, const int n, const int k,
287  const __float128 A[], const int lda,
288  const __float128* const tau,
289  __float128 C[], const int ldc,
290  __float128 work[], int* const info) const
291 {
292  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Not implemented");
293 }
294 
295 namespace { // (anonymous)
296 
297  int
298  ILADLC (const int m, const int n, const __float128 A[], const int lda)
299  {
300  const __float128 zero = 0.0;
301 
302  // Quick test for the common case where one corner is non-zero.
303  if (n == 0) {
304  return n;
305  } else if (A[0 + (n-1)*lda] != zero || A[(m-1) + (n-1)*lda] != zero) {
306  return n;
307  } else {
308  // Now scan each column from the end, returning with the first non-zero.
309  for (int j = n; j > 0; --j) {
310  for (int i = 1; i <= m; ++i) {
311  if (A[(i-1) + (j-1)*lda] != zero) {
312  return j;
313  }
314  }
315  }
316  return 0;
317  }
318  }
319 
320  int
321  ILADLR (const int m, const int n, const __float128 A[], const int lda)
322  {
323  const __float128 zero = 0.0;
324 
325  // Quick test for the common case where one corner is non-zero.
326  if (m == 0) {
327  return m;
328  } else if (A[(m-1) + 0*lda] != zero || A[(m-1) + (n-1)*lda] != zero) {
329  return m;
330  } else {
331  // Scan up each column tracking the last zero row seen.
332  int lastZeroRow = 0;
333  for (int j = 1; j <= n; ++j) {
334  int i = m;
335  while (A[(std::max (i, 1) - 1) + (j - 1)*lda] == zero && i >= 1) {
336  i--;
337  }
338  lastZeroRow = std::max (lastZeroRow, i);
339  }
340  return lastZeroRow;
341  }
342  }
343 } // namespace (anonymous)
344 
345 void
346 Lapack128::
347 LARF (const char side,
348  const int m,
349  const int n,
350  const __float128 v[],
351  const int incv,
352  const __float128 tau,
353  __float128 C[],
354  const int ldc,
355  __float128 work[]) const
356 {
357  const __float128 zero = 0.0;
358  const __float128 one = 1.0;
360  const bool applyLeft = (side == 'L');
361  int lastv = 0;
362  int lastc = 0;
363  int i = 0;
364 
365  if (tau != zero) {
366  // Set up variables for scanning V. LASTV begins pointing to the end of V.
367  if (applyLeft) {
368  lastv = m;
369  } else {
370  lastv = n;
371  }
372  if (incv > 0) {
373  i = 1 + (lastv - 1) * incv;
374  } else {
375  i = 1;
376  }
377  // Look for the last non-zero row in V.
378  while (lastv > 0 && v[i-1] == zero) {
379  lastv = lastv - 1;
380  i = i - incv;
381  }
382  if (applyLeft) {
383  // Scan for the last non-zero column in C(1:lastv,:).
384  lastc = ILADLC (lastv, n, C, ldc);
385  } else {
386  // Scan for the last non-zero row in C(:,1:lastv).
387  lastc = ILADLR (m, lastv, C, ldc);
388  }
389  }
390 
391  // Note that lastc == 0 renders the BLAS operations null; no special
392  // case is needed at this level.
393  if (applyLeft) {
394  // Form H * C
395  if (lastv > 0) {
396  // w(1:lastc,1) := C(1:lastv,1:lastc)**T * v(1:lastv,1)
397  blas.GEMV (Teuchos::TRANS, lastv, lastc, one, C, ldc, v, incv,
398  zero, work, 1);
399  // C(1:lastv,1:lastc) := C(...) - v(1:lastv,1) * w(1:lastc,1)**T
400  blas.GER (lastv, lastc, -tau, v, incv, work, 1, C, ldc);
401  }
402  }
403  else {
404  // Form C * H
405  if (lastv > 0) {
406  // w(1:lastc,1) := C(1:lastc,1:lastv) * v(1:lastv,1)
407  blas.GEMV (Teuchos::NO_TRANS, lastc, lastv, one, C, ldc,
408  v, incv, zero, work, 1);
409  // C(1:lastc,1:lastv) := C(...) - w(1:lastc,1) * v(1:lastv,1)**T
410  blas.GER (lastc, lastv, -tau, work, 1, v, incv, C, ldc);
411  }
412  }
413 }
414 
415 
416 void
417 Lapack128::
418 LARFG (const int N, __float128* const ALPHA,
419  __float128 X[], const int INCX, __float128* const TAU) const
420 {
421  // This is actually LARFGP.
422 
423  const __float128 zero = 0.0;
424  const __float128 one = 1.0;
425  const __float128 two = 2.0;
427 
428  if (N <= 0) {
429  *TAU = zero;
430  return;
431  }
432  __float128 xnorm = blas.NRM2 (N-1, X, INCX);
433 
434  if (xnorm == zero) {
435  // H = [+/-1, 0; I], sign chosen so *ALPHA >= 0
436  if (*ALPHA >= zero) {
437  // When TAU.eq.ZERO, the vector is special-cased to be all zeros
438  // in the application routines. We do not need to clear it.
439  *TAU = zero;
440  } else {
441  // However, the application routines rely on explicit
442  // zero checks when TAU.ne.ZERO, and we must clear X.
443  *TAU = two;
444  for (int j = 0; j < N; ++j) {
445  X[j * INCX] = 0.0;
446  }
447  *ALPHA = -*ALPHA;
448  }
449  } else { // general case (norm of x is nonzero)
450  // This implements Fortran's two-argument SIGN intrinsic.
451  __float128 beta = copysignq (LAPY2 (*ALPHA, xnorm), *ALPHA);
452  const __float128 smlnum = FLT128_MIN / FLT128_EPSILON;
453  int knt = 0;
454 
455  if (fabsq (beta) < smlnum) {
456  // XNORM, BETA may be inaccurate; scale X and recompute them
457 
458  __float128 bignum = one / smlnum;
459  do {
460  knt = knt + 1;
461  blas.SCAL (N-1, bignum, X, INCX);
462  beta = beta*bignum;
463  *ALPHA = *ALPHA*bignum;
464  } while (fabsq(beta) < smlnum);
465 
466  // New BETA is at most 1, at least SMLNUM
467  xnorm = blas.NRM2 (N-1, X, INCX);
468  beta = copysignq (LAPY2 (*ALPHA, xnorm), *ALPHA);
469  }
470 
471  __float128 savealpha = *ALPHA;
472  *ALPHA = *ALPHA + beta;
473  if (beta < zero) {
474  beta = -beta;
475  *TAU = -*ALPHA / beta;
476  } else {
477  *ALPHA = xnorm * (xnorm / *ALPHA);
478  *TAU = *ALPHA / beta;
479  *ALPHA = -*ALPHA;
480  }
481 
482  if (fabsq (*TAU) <= smlnum) {
483  // In the case where the computed TAU ends up being a
484  // denormalized number, it loses relative accuracy. This is a
485  // BIG problem. Solution: flush TAU to ZERO. This explains the
486  // next IF statement.
487  //
488  // (Bug report provided by Pat Quillen from MathWorks on Jul 29,
489  // 2009.) (Thanks Pat. Thanks MathWorks.)
490 
491  if (savealpha >= zero) {
492  *TAU = zero;
493  } else {
494  *TAU = two;
495  for (int j = 0; j < N; ++j) {
496  X[j*INCX] = 0.0;
497  }
498  beta = -savealpha;
499  }
500  }
501  else { // this is the general case
502  blas.SCAL (N-1, one / *ALPHA, X, INCX);
503  }
504  // If BETA is subnormal, it may lose relative accuracy
505  for (int j = 1; j <= knt; ++j) {
506  beta = beta*smlnum;
507  }
508  *ALPHA = beta;
509  }
510 }
511 
512 void
513 Lapack128::
514 GEQR2 (const int /* M */,
515  const int /* N */,
516  __float128 /* A */ [],
517  const int /* LDA */,
518  __float128 /* TAU */ [],
519  __float128 /* WORK */ [],
520  int* const /* INFO */ ) const
521 {
523  (true, std::logic_error, "Teuchos::LAPACK<int, __float128>::GEQR2: Not implemented yet.");
524 }
525 
526 void
527 Lapack128::
528 GEQRF (const int M,
529  const int N,
530  __float128 A[],
531  const int LDA,
532  __float128 TAU[],
533  __float128 WORK[],
534  const int LWORK,
535  int* const INFO) const
536 {
537  // mfh 17 Sep 2015: We don't implement a BLAS 3 QR factorization for
538  // __float128. Instead, we call the BLAS 2 QR factorization GEQR2,
539  // which has a fixed minimum WORK array length of N. Thus, we have
540  // to roll our own LWORK query here.
541 
542  if (LWORK == -1) {
543  WORK[0] = static_cast<__float128> (N);
544  }
545  else {
546  GEQR2 (M, N, A, LDA, TAU, WORK, INFO);
547  }
548 }
549 
550 void
551 Lapack128::
552 ORGQR (const int /* M */,
553  const int /* N */,
554  const int /* K */,
555  __float128 /* A */ [],
556  const int /* LDA */,
557  const __float128 /* TAU */ [],
558  __float128 /* WORK */ [],
559  const int /* LWORK */,
560  int* const /* INFO */) const
561 {
563  (true, std::logic_error, "Teuchos::LAPACK<int, __float128>::GEQR2: Not implemented yet.");
564 }
565 
566 void
567 Lapack128::
568 UNGQR (const int /* M */,
569  const int /* N */,
570  const int /* K */,
571  __float128 /* A */ [],
572  const int /* LDA */,
573  const __float128 /* TAU */ [],
574  __float128 /* WORK */ [],
575  const int /* LWORK */,
576  int* const /* INFO */) const
577 {
579  (true, std::logic_error, "Teuchos::LAPACK<int, __float128>::GEQR2: Not implemented yet.");
580 }
581 
582 void
583 Lapack128::
584 LASCL (const char TYPE,
585  const int kl,
586  const int ku,
587  const __float128 cfrom,
588  const __float128 cto,
589  const int m,
590  const int n,
591  __float128* A,
592  const int lda,
593  int* info) const
594 {
596  (true, std::logic_error, "Teuchos::LAPACK<int, __float128>::LASCL: Not implemented yet.");
597 }
598 
599 void
600 Lapack128::
601 GBTRF (const int m,
602  const int n,
603  const int kl,
604  const int ku,
605  __float128* A,
606  const int lda,
607  int* IPIV,
608  int* info) const
609 {
611  (true, std::logic_error, "Teuchos::LAPACK<int, __float128>::GBTRF: Not implemented yet.");
612 }
613 
614 void
615 Lapack128::
616 GBTRS (const char TRANS,
617  const int n,
618  const int kl,
619  const int ku,
620  const int nrhs,
621  const __float128* A,
622  const int lda,
623  const int* IPIV,
624  __float128* B,
625  const int ldb,
626  int* info) const
627 {
629  (true, std::logic_error, "Teuchos::LAPACK<int, __float128>::GBTRS: Not implemented yet.");
630 }
631 
632 } // namespace Details
633 } // namespace Teuchos
634 #endif // HAVE_TEUCHOSCORE_QUADMATH
void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType &m, const OrdinalType &n, const alpha_type alpha, const A_type *A, const OrdinalType &lda, ScalarType *B, const OrdinalType &ldb) const
Solves the matrix equations: op(A)*X=alpha*B or X*op(A)=alpha*B where X and B are m by n matrices...
void GER(const OrdinalType &m, const OrdinalType &n, const alpha_type alpha, const x_type *x, const OrdinalType &incx, const y_type *y, const OrdinalType &incy, ScalarType *A, const OrdinalType &lda) const
Performs the rank 1 operation: A &lt;- alpha*x*y&#39;+A.
Declaration and definition of Teuchos::Details::Lapack128, a partial implementation of Teuchos::LAPAC...
Templated interface class to BLAS routines.
void GEMV(ETransp trans, const OrdinalType &m, const OrdinalType &n, const alpha_type alpha, const A_type *A, const OrdinalType &lda, const x_type *x, const OrdinalType &incx, const beta_type beta, ScalarType *y, const OrdinalType &incy) const
Performs the matrix-vector operation: y &lt;- alpha*A*x+beta*y or y &lt;- alpha*A&#39;*x+beta*y where A is a gene...
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
Macro for throwing an exception with breakpointing to ease debugging.
Templated BLAS wrapper.
ScalarTraits< ScalarType >::magnitudeType NRM2(const OrdinalType &n, const ScalarType *x, const OrdinalType &incx) const
Compute the 2-norm of the vector x.
OrdinalType IAMAX(const OrdinalType &n, const ScalarType *x, const OrdinalType &incx) const
Return the index of the element of x with the maximum magnitude.
void SCAL(const OrdinalType &n, const ScalarType &alpha, ScalarType *x, const OrdinalType &incx) const
Scale the vector x by the constant alpha.
void SWAP(const OrdinalType &n, ScalarType *const x, const OrdinalType &incx, ScalarType *const y, const OrdinalType &incy) const
Swap the entries of x and y.