Anasazi  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
LOBPCG_light.cpp
1 // @HEADER
2 // *****************************************************************************
3 // Anasazi: Block Eigensolvers Package
4 //
5 // Copyright 2004 NTESS and the Anasazi contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 //**************************************************************************
11 //
12 // NOTICE
13 //
14 // This software is a result of the research described in the cout
15 //
16 // " A comparison of algorithms for modal analysis in the absence
17 // of a sparse direct method", P. Arbenz, R. Lehoucq, and U. Hetmaniuk,
18 // Sandia National Laboratories, Technical cout SAND2003-1028J.
19 //
20 // It is based on the Epetra, AztecOO, and ML packages defined in the Trilinos
21 // framework ( http://software.sandia.gov/trilinos/ ).
22 //
23 // The distribution of this software follows also the rules defined in Trilinos.
24 // This notice shall be marked on any reproduction of this software, in whole or
25 // in part.
26 //
27 // This program is distributed in the hope that it will be useful, but
28 // WITHOUT ANY WARRANTY; without even the implied warranty of
29 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
30 //
31 // Code Authors: U. Hetmaniuk (ulhetma@sandia.gov), R. Lehoucq (rblehou@sandia.gov)
32 //
33 //**************************************************************************
34 
35 #include "LOBPCG_light.h"
36 
37 
38 LOBPCG_light::LOBPCG_light(const Epetra_Comm &_Comm, const Epetra_Operator *KK,
39  const Epetra_Operator *PP, int _blk,
40  double _tol, int _maxIter, int _verb) :
41  myVerify(_Comm),
42  callBLAS(),
43  callFortran(),
44  modalTool(_Comm),
45  mySort(),
46  MyComm(_Comm),
47  K(KK),
48  M(0),
49  Prec(PP),
50  MyWatch(_Comm),
51  tolEigenSolve(_tol),
52  maxIterEigenSolve(_maxIter),
53  blockSize(_blk),
54  normWeight(0),
55  verbose(_verb),
56  historyCount(0),
57  resHistory(0),
58  memRequested(0.0),
59  highMem(0.0),
60  massOp(0),
61  numRestart(0),
62  outerIter(0),
63  precOp(0),
64  residual(0),
65  stifOp(0),
66  timeLocalProj(0.0),
67  timeLocalSolve(0.0),
68  timeLocalUpdate(0.0),
69  timeMassOp(0.0),
70  timeNorm(0.0),
71  timeOrtho(0.0),
72  timeOuterLoop(0.0),
73  timePostProce(0.0),
74  timePrecOp(0.0),
75  timeResidual(0.0),
76  timeRestart(0.0),
77  timeStifOp(0.0)
78  {
79 
80 }
81 
82 
83 LOBPCG_light::LOBPCG_light(const Epetra_Comm &_Comm, const Epetra_Operator *KK,
84  const Epetra_Operator *MM, const Epetra_Operator *PP, int _blk,
85  double _tol, int _maxIter, int _verb,
86  double *_weight) :
87  myVerify(_Comm),
88  callBLAS(),
89  callFortran(),
90  modalTool(_Comm),
91  mySort(),
92  MyComm(_Comm),
93  K(KK),
94  M(MM),
95  Prec(PP),
96  MyWatch(_Comm),
97  tolEigenSolve(_tol),
98  maxIterEigenSolve(_maxIter),
99  blockSize(_blk),
100  normWeight(_weight),
101  verbose(_verb),
102  historyCount(0),
103  resHistory(0),
104  memRequested(0.0),
105  highMem(0.0),
106  massOp(0),
107  numRestart(0),
108  outerIter(0),
109  precOp(0),
110  residual(0),
111  stifOp(0),
112  timeLocalProj(0.0),
113  timeLocalSolve(0.0),
114  timeLocalUpdate(0.0),
115  timeMassOp(0.0),
116  timeNorm(0.0),
117  timeOrtho(0.0),
118  timeOuterLoop(0.0),
119  timePostProce(0.0),
120  timePrecOp(0.0),
121  timeResidual(0.0),
122  timeRestart(0.0),
123  timeStifOp(0.0)
124  {
125 
126 }
127 
128 
129 LOBPCG_light::~LOBPCG_light() {
130 
131  if (resHistory) {
132  delete[] resHistory;
133  resHistory = 0;
134  }
135 
136 }
137 
138 
139 int LOBPCG_light::reSolve(int numEigen, Epetra_MultiVector &Q, double *lambda, int startingEV) {
140 
141  // Computes the smallest eigenvalues and the corresponding eigenvectors
142  // of the generalized eigenvalue problem
143  //
144  // K X = M X Lambda
145  //
146  // using a modified Locally Optimal Block Preconditioned Conjugate
147  // Gradient method.
148  //
149  // Note that if M is not specified, then K X = X Lambda is solved.
150  //
151  // Ref: A. Knyazev, "Toward the optimal preconditioned eigensolver:
152  // Locally optimal block preconditioner conjugate gradient method",
153  // SIAM J. Sci. Comput., vol 23, n 2, pp. 517-541
154  //
155  // Input variables:
156  //
157  // numEigen (integer) = Number of eigenmodes requested
158  //
159  // Q (Epetra_MultiVector) = Converged eigenvectors
160  // The number of columns of Q must be equal to numEigen + blockSize.
161  // The rows of Q are distributed across processors.
162  // At exit, the first numEigen columns contain the eigenvectors requested.
163  //
164  // lambda (array of doubles) = Converged eigenvalues
165  // At input, it must be of size numEigen + blockSize.
166  // At exit, the first numEigen locations contain the eigenvalues requested.
167  //
168  // startingEV (integer) = Number of existing converged eigenmodes
169  //
170  // Return information on status of computation
171  //
172  // info >= 0 >> Number of converged eigenpairs at the end of computation
173  //
174  // // Failure due to input arguments
175  //
176  // info = - 1 >> The stiffness matrix K has not been specified.
177  // info = - 2 >> The maps for the matrix K and the matrix M differ.
178  // info = - 3 >> The maps for the matrix K and the preconditioner P differ.
179  // info = - 4 >> The maps for the vectors and the matrix K differ.
180  // info = - 5 >> Q is too small for the number of eigenvalues requested.
181  // info = - 6 >> Q is too small for the computation parameters.
182  //
183  // info = - 10 >> Failure during the mass orthonormalization
184  //
185  // info = - 20 >> Error in LAPACK during the local eigensolve
186  //
187  // info = - 30 >> MEMORY
188  //
189 
190  // Check the input parameters
191 
192  if (numEigen <= startingEV) {
193  return numEigen;
194  }
195 
196  int info = myVerify.inputArguments(numEigen, K, M, Prec, Q, minimumSpaceDimension(numEigen));
197  if (info < 0)
198  return info;
199 
200  int myPid = MyComm.MyPID();
201 
202  // Get the weight for approximating the M-inverse norm
203  Epetra_Vector *vectWeight = 0;
204  if (normWeight) {
205  vectWeight = new Epetra_Vector(View, Q.Map(), normWeight);
206  }
207 
208  int knownEV = startingEV;
209  int localVerbose = verbose*(myPid==0);
210 
211  // Define local block vectors
212  //
213  // R = Residuals
214  //
215  // H = Preconditioned search space
216  //
217  // P = Search directions
218 
219  int xr = Q.MyLength();
220  Epetra_MultiVector X(View, Q, numEigen, blockSize);
221  X.Random();
222 
223  int tmp;
224  tmp = 3*blockSize*xr;
225 
226  double *work1 = new (nothrow) double[tmp];
227  if (work1 == 0) {
228  if (vectWeight)
229  delete vectWeight;
230  info = -30;
231  return info;
232  }
233  memRequested += sizeof(double)*tmp/(1024.0*1024.0);
234 
235  highMem = (highMem > currentSize()) ? highMem : currentSize();
236 
237  double *tmpD = work1;
238 
239  Epetra_MultiVector R(View, Q.Map(), tmpD, xr, blockSize);
240  tmpD = tmpD + xr*blockSize;
241 
242  Epetra_MultiVector H(View, Q.Map(), tmpD, xr, blockSize);
243  tmpD = tmpD + xr*blockSize;
244 
245  Epetra_MultiVector P(View, Q.Map(), tmpD, xr, blockSize);
246  tmpD = tmpD + xr*blockSize;
247 
248  // Define arrays
249  //
250  // theta = Store the local eigenvalues (size: 3*blockSize)
251  // normR = Store the norm of residuals (size: blockSize)
252  //
253  // MM = Local mass matrix (size: 3*blockSize x 3*blockSize)
254  // KK = Local stiffness matrix (size: 3*blockSize x 3*blockSize)
255  //
256  // S = Local eigenvectors (size: 3*blockSize x 3*blockSize)
257 
258  int lwork2;
259  lwork2 = 4*blockSize + 27*blockSize*blockSize;
260  double *work2 = new (nothrow) double[lwork2];
261  if (work2 == 0) {
262  if (vectWeight)
263  delete vectWeight;
264  delete[] work1;
265  info = -30;
266  return info;
267  }
268 
269  highMem = (highMem > currentSize()) ? highMem : currentSize();
270 
271  tmpD = work2;
272 
273  double *theta = tmpD;
274  tmpD = tmpD + 3*blockSize;
275 
276  double *normR = tmpD;
277  tmpD = tmpD + blockSize;
278 
279  double *MM = tmpD;
280  tmpD = tmpD + 9*blockSize*blockSize;
281 
282  double *KK = tmpD;
283  tmpD = tmpD + 9*blockSize*blockSize;
284 
285  double *S = tmpD;
286 
287  memRequested += sizeof(double)*lwork2/(1024.0*1024.0);
288 
289  // Define an array to store the residuals history
290  if (localVerbose > 2) {
291  resHistory = new (nothrow) double[maxIterEigenSolve*blockSize];
292  if (resHistory == 0) {
293  if (vectWeight)
294  delete vectWeight;
295  delete[] work1;
296  delete[] work2;
297  info = -30;
298  return info;
299  }
300  historyCount = 0;
301  }
302 
303  // Miscellaneous definitions
304 
305  bool reStart = false;
306  numRestart = 0;
307 
308  int localSize;
309  int twoBlocks = 2*blockSize;
310  int threeBlocks = 3*blockSize;
311  int nFound = blockSize;
312  int i, j;
313 
314  double minNorm = 1000*tolEigenSolve;
315  bool hasNextVectors = false;
316 
317  if (localVerbose > 0) {
318  cout << endl;
319  cout << " *|* Problem: ";
320  if (M)
321  cout << "K*Q = M*Q D ";
322  else
323  cout << "K*Q = Q D ";
324  if (Prec)
325  cout << " with preconditioner";
326  cout << endl;
327  cout << " *|* Algorithm = LOBPCG (Small memory requirements)" << endl;
328  cout << " *|* Size of blocks = " << blockSize << endl;
329  cout << " *|* Number of requested eigenvalues = " << numEigen << endl;
330  cout.precision(2);
331  cout.setf(ios::scientific, ios::floatfield);
332  cout << " *|* Tolerance for convergence = " << tolEigenSolve << endl;
333  cout << " *|* Norm used for convergence: ";
334  if (normWeight){
335  cout << "weighted L2-norm with user-provided weights" << endl;
336  }
337  else {
338  cout << "L^2-norm" << endl;
339  }
340  if (startingEV > 0)
341  cout << " *|* Input converged eigenvectors = " << startingEV << endl;
342  cout << "\n -- Start iterations -- \n";
343  }
344 
345  timeOuterLoop -= MyWatch.WallTime();
346  for (outerIter = 1; outerIter <= maxIterEigenSolve; ++outerIter) {
347 
348  highMem = (highMem > currentSize()) ? highMem : currentSize();
349 
350  if ((outerIter == 1) || (reStart == true)) {
351 
352  reStart = false;
353  localSize = blockSize;
354 
355  hasNextVectors = false;
356  minNorm = 1000*tolEigenSolve;
357 
358  // Apply the mass matrix to X
359  // Note: Use P as work space
360  timeMassOp -= MyWatch.WallTime();
361  if (M)
362  M->Apply(X, P);
363  else
364  memcpy(P.Values(), X.Values(), xr*blockSize*sizeof(double));
365  timeMassOp += MyWatch.WallTime();
366  massOp += blockSize;
367 
368  if ((knownEV > 0) && (nFound > 0)) {
369  // Orthonormalize X against the known eigenvectors with Gram-Schmidt
370  // Note: Use R as a temporary work space
371  Epetra_MultiVector copyQ(View, Q, 0, knownEV);
372  timeOrtho -= MyWatch.WallTime();
373  info = modalTool.massOrthonormalize(X, P, M, copyQ, nFound, 1, R.Values());
374  timeOrtho += MyWatch.WallTime();
375  // Exit the code if the orthogonalization did not succeed
376  if (info < 0) {
377  info = -10;
378  if (vectWeight)
379  delete vectWeight;
380  delete[] work1;
381  delete[] work2;
382  return info;
383  }
384  } // if ((knownEV > 0) && (nFound > 0))
385 
386  } // if ((outerIter == 1) || (reStart == true))
387  else {
388 
389  // Apply the preconditioner on the residuals
390  if (Prec) {
391  timePrecOp -= MyWatch.WallTime();
392  Prec->ApplyInverse(R, H);
393  timePrecOp += MyWatch.WallTime();
394  precOp += blockSize;
395  }
396  else {
397  memcpy(H.Values(), R.Values(), xr*blockSize*sizeof(double));
398  }
399 
400  // Apply the mass matrix on H
401  timeMassOp -= MyWatch.WallTime();
402  if (M)
403  M->Apply(H, R);
404  else
405  memcpy(R.Values(), H.Values(), xr*blockSize*sizeof(double));
406  timeMassOp += MyWatch.WallTime();
407  massOp += blockSize;
408 
409  if (knownEV > 0) {
410  // Orthogonalize H against the known eigenvectors
411  Epetra_MultiVector copyQ(View, Q, 0, knownEV);
412  timeOrtho -= MyWatch.WallTime();
413  modalTool.massOrthonormalize(H, R, M, copyQ, blockSize, 1);
414  timeOrtho += MyWatch.WallTime();
415  }
416 
417  if (localSize == blockSize)
418  localSize += blockSize;
419 
420  } // if ((outerIter == 1) || (reStart==true))
421 
422  // Form "local" mass and stiffness matrices
423  // Note: Use S as a temporary workspace
424  if (localSize == blockSize) {
425  // P stores M*X
426  timeLocalProj -= MyWatch.WallTime();
427  modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, P.Values(), xr,
428  MM, localSize, S);
429  timeLocalProj += MyWatch.WallTime();
430  // Put K*X in H
431  timeStifOp -= MyWatch.WallTime();
432  K->Apply(X, H);
433  timeStifOp += MyWatch.WallTime();
434  stifOp += blockSize;
435  timeLocalProj -= MyWatch.WallTime();
436  modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, H.Values(), xr,
437  KK, localSize, S);
438  timeLocalProj += MyWatch.WallTime();
439  }
440  else {
441  // Put diagonal matrices in first block
442  timeLocalProj -= MyWatch.WallTime();
443  for (i = 0; i < blockSize; ++i) {
444  memset(KK + i*localSize, 0, i*sizeof(double));
445  KK[i + i*localSize] = theta[i];
446  memset(MM + i*localSize, 0, i*sizeof(double));
447  MM[i + i*localSize] = 1.0;
448  }
449  timeLocalProj += MyWatch.WallTime();
450  // R stores M*H
451  timeLocalProj -= MyWatch.WallTime();
452  modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, R.Values(), xr,
453  MM + blockSize*localSize, localSize, S);
454  modalTool.localProjection(blockSize, blockSize, xr, H.Values(), xr, R.Values(), xr,
455  MM + blockSize*localSize + blockSize, localSize, S);
456  timeLocalProj += MyWatch.WallTime();
457  // Put K*H in R
458  timeStifOp -= MyWatch.WallTime();
459  K->Apply(H, R);
460  timeStifOp += MyWatch.WallTime();
461  stifOp += blockSize;
462  timeLocalProj -= MyWatch.WallTime();
463  modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, R.Values(), xr,
464  KK + blockSize*localSize, localSize, S);
465  modalTool.localProjection(blockSize, blockSize, xr, H.Values(), xr, R.Values(), xr,
466  KK + blockSize*localSize + blockSize, localSize, S);
467  timeLocalProj += MyWatch.WallTime();
468  if (localSize > twoBlocks) {
469  // Put M*P in R
470  timeMassOp -= MyWatch.WallTime();
471  if (M)
472  M->Apply(P, R);
473  else
474  memcpy(R.Values(), P.Values(), xr*blockSize*sizeof(double));
475  timeMassOp += MyWatch.WallTime();
476  massOp += blockSize;
477  timeLocalProj -= MyWatch.WallTime();
478  modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, R.Values(), xr,
479  MM + twoBlocks*localSize, localSize, S);
480  modalTool.localProjection(blockSize, blockSize, xr, H.Values(), xr, R.Values(), xr,
481  MM + twoBlocks*localSize + blockSize, localSize, S);
482  modalTool.localProjection(blockSize, blockSize, xr, P.Values(), xr, R.Values(), xr,
483  MM + twoBlocks*localSize + twoBlocks, localSize, S);
484  timeLocalProj += MyWatch.WallTime();
485  // Put K*P in R
486  timeStifOp -= MyWatch.WallTime();
487  K->Apply(P, R);
488  timeStifOp += MyWatch.WallTime();
489  stifOp += blockSize;
490  timeLocalProj -= MyWatch.WallTime();
491  modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, R.Values(), xr,
492  KK + twoBlocks*localSize, localSize, S);
493  modalTool.localProjection(blockSize, blockSize, xr, H.Values(), xr, R.Values(), xr,
494  KK + twoBlocks*localSize + blockSize, localSize, S);
495  modalTool.localProjection(blockSize, blockSize, xr, P.Values(), xr, R.Values(), xr,
496  KK + twoBlocks*localSize + twoBlocks, localSize, S);
497  timeLocalProj += MyWatch.WallTime();
498  } // if (localSize > twoBlocks)
499  } // if (localSize == blockSize)
500 
501  // Perform a spectral decomposition
502  timeLocalSolve -= MyWatch.WallTime();
503  int nevLocal = localSize;
504  info = modalTool.directSolver(localSize, KK, localSize, MM, localSize, nevLocal,
505  S, localSize, theta, localVerbose,
506  (blockSize == 1) ? 1 : 0);
507  timeLocalSolve += MyWatch.WallTime();
508 
509  if (info < 0) {
510  // Stop when spectral decomposition has a critical failure
511  break;
512  } // if (info < 0)
513 
514  // Check for restarting
515  if ((theta[0] < 0.0) || (nevLocal < blockSize)) {
516  if (localVerbose > 0) {
517  cout << " Iteration " << outerIter;
518  cout << "- Failure for spectral decomposition - RESTART with new random search\n";
519  }
520  if (blockSize == 1) {
521  X.Random();
522  nFound = blockSize;
523  }
524  else {
525  Epetra_MultiVector Xinit(View, X, 1, blockSize-1);
526  Xinit.Random();
527  nFound = blockSize - 1;
528  } // if (blockSize == 1)
529  reStart = true;
530  numRestart += 1;
531  info = 0;
532  continue;
533  } // if ((theta[0] < 0.0) || (nevLocal < blockSize))
534 
535  if ((localSize == twoBlocks) && (nevLocal == blockSize)) {
536  for (j = 0; j < nevLocal; ++j)
537  memcpy(S + j*blockSize, S + j*twoBlocks, blockSize*sizeof(double));
538  localSize = blockSize;
539  }
540 
541  if ((localSize == threeBlocks) && (nevLocal <= twoBlocks)) {
542  for (j = 0; j < nevLocal; ++j)
543  memcpy(S + j*twoBlocks, S + j*threeBlocks, twoBlocks*sizeof(double));
544  localSize = twoBlocks;
545  }
546 
547  // Update the iterate and, if needed, define the next Ritz vectors
548  if (localSize == twoBlocks) {
549  if (minNorm < 10*tolEigenSolve) {
550  timeRestart -= MyWatch.WallTime();
551  int numVec = (numEigen > knownEV + blockSize) ? blockSize : numEigen - knownEV - 1;
552  double *pointerS = S + blockSize*localSize;
553  double *pointerQ = Q.Values() + knownEV*xr;
554  callBLAS.GEMM('N', 'N', xr, numVec, blockSize, 1.0, X.Values(), xr,
555  pointerS, localSize, 0.0, pointerQ, xr);
556  callBLAS.GEMM('N', 'N', xr, numVec, blockSize, 1.0, H.Values(), xr,
557  pointerS + blockSize, localSize, 1.0, pointerQ, xr);
558  hasNextVectors = true;
559  timeRestart += MyWatch.WallTime();
560  }
561  timeLocalUpdate -= MyWatch.WallTime();
562  callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, H.Values(), xr,
563  S + blockSize, localSize, 0.0, P.Values(), xr);
564  memcpy(R.Values(), X.Values(), xr*blockSize*sizeof(double));
565  callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, R.Values(), xr,
566  S, localSize, 0.0, X.Values(), xr);
567  X.Update(1.0, P, 1.0);
568  timeLocalUpdate += MyWatch.WallTime();
569  }
570  if (localSize == threeBlocks) {
571  if (minNorm < 10*tolEigenSolve) {
572  timeRestart -= MyWatch.WallTime();
573  int numVec = (numEigen > knownEV + blockSize) ? blockSize : numEigen - knownEV - 1;
574  double *pointerS = S + blockSize*localSize;
575  double *pointerQ = Q.Values() + knownEV*xr;
576  callBLAS.GEMM('N', 'N', xr, numVec, blockSize, 1.0, X.Values(), xr,
577  pointerS, localSize, 0.0, pointerQ, xr);
578  // Note: We use the contiguity of [H, P] in the memory
579  callBLAS.GEMM('N', 'N', xr, numVec, twoBlocks, 1.0, H.Values(), xr,
580  pointerS + blockSize, localSize, 1.0, pointerQ, xr);
581  hasNextVectors = true;
582  timeRestart += MyWatch.WallTime();
583  }
584  timeLocalUpdate -= MyWatch.WallTime();
585  memcpy(R.Values(), P.Values(), xr*blockSize*sizeof(double));
586  callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, H.Values(), xr,
587  S + blockSize, localSize, 0.0, P.Values(), xr);
588  callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, R.Values(), xr,
589  S + twoBlocks, localSize, 1.0, P.Values(), xr);
590  memcpy(R.Values(), X.Values(), xr*blockSize*sizeof(double));
591  callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, R.Values(), xr,
592  S, localSize, 0.0, X.Values(), xr);
593  X.Update(1.0, P, 1.0);
594  timeLocalUpdate += MyWatch.WallTime();
595  }
596 
597  // Compute the residuals
598  if (localSize == blockSize) {
599  timeResidual -= MyWatch.WallTime();
600  callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, H.Values(), xr, S, blockSize,
601  0.0, R.Values(), xr);
602  timeResidual += MyWatch.WallTime();
603  timeLocalUpdate -= MyWatch.WallTime();
604  memcpy(H.Values(), X.Values(), xr*blockSize*sizeof(double));
605  callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, H.Values(), xr, S, blockSize,
606  0.0, X.Values(), xr);
607  timeLocalUpdate += MyWatch.WallTime();
608  // Note: We scale S
609  timeResidual -= MyWatch.WallTime();
610  for (j = 0; j < blockSize; ++j)
611  callBLAS.SCAL(localSize, theta[j], S + j*localSize);
612  callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, -1.0, P.Values(), xr, S, blockSize,
613  1.0, R.Values(), xr);
614  timeResidual += MyWatch.WallTime();
615  // Note: S is not scaled back to its original value
616  }
617  else {
618  timeStifOp -= MyWatch.WallTime();
619  K->Apply(X, R);
620  timeStifOp += MyWatch.WallTime();
621  stifOp += blockSize;
622  timeMassOp -= MyWatch.WallTime();
623  if (M)
624  M->Apply(X, H);
625  else
626  memcpy(H.Values(), X.Values(), xr*blockSize*sizeof(double));
627  timeMassOp += MyWatch.WallTime();
628  massOp += blockSize;
629  timeResidual -= MyWatch.WallTime();
630  for (j = 0; j < blockSize; ++j)
631  callBLAS.AXPY(xr, -theta[j], H.Values() + j*xr, R.Values() + j*xr);
632  timeResidual += MyWatch.WallTime();
633  }
634 
635  // Compute the norms of the residuals
636  timeNorm -= MyWatch.WallTime();
637  if (vectWeight)
638  R.NormWeighted(*vectWeight, normR);
639  else
640  R.Norm2(normR);
641 
642  // Scale the norms of residuals with the eigenvalues
643  // Count the converged eigenvectors
644  nFound = 0;
645  for (j = 0; j < blockSize; ++j) {
646  normR[j] = (theta[j] == 0.0) ? normR[j] : normR[j]/theta[j];
647  if (normR[j] < tolEigenSolve)
648  nFound += 1;
649  minNorm = (normR[j] < minNorm) ? normR[j] : minNorm;
650  }
651  timeNorm += MyWatch.WallTime();
652 
653  // Store the residual history
654  if (localVerbose > 2) {
655  memcpy(resHistory + historyCount*blockSize, normR, blockSize*sizeof(double));
656  historyCount += 1;
657  }
658 
659  // Print information on current iteration
660  if (localVerbose > 0) {
661  cout << " Iteration " << outerIter << " - Number of converged eigenvectors ";
662  cout << knownEV + nFound << endl;
663  }
664 
665  if (localVerbose > 1) {
666  cout << endl;
667  cout.precision(2);
668  cout.setf(ios::scientific, ios::floatfield);
669  for (i=0; i<blockSize; ++i) {
670  cout << " Iteration " << outerIter << " - Scaled Norm of Residual " << i;
671  cout << " = " << normR[i] << endl;
672  }
673  cout << endl;
674  cout.precision(2);
675  for (i=0; i<blockSize; ++i) {
676  cout << " Iteration " << outerIter << " - Ritz eigenvalue " << i;
677  cout.setf((fabs(theta[i]) < 0.01) ? ios::scientific : ios::fixed, ios::floatfield);
678  cout << " = " << theta[i] << endl;
679  }
680  cout << endl;
681  }
682 
683  if (nFound == 0) {
684  // When required, monitor some orthogonalities
685  if (verbose > 2) {
686  if (knownEV == 0) {
687  accuracyCheck(&X, 0, &R, 0, 0, 0);
688  }
689  else {
690  Epetra_MultiVector copyQ(View, Q, 0, knownEV);
691  accuracyCheck(&X, 0, &R, &copyQ, (localSize>blockSize) ? &H : 0,
692  (localSize>twoBlocks) ? &P : 0);
693  }
694  } // if (verbose > 2)
695  if (localSize < threeBlocks)
696  localSize += blockSize;
697  continue;
698  } // if (nFound == 0)
699 
700  // Exit the loop if enough vectors have converged
701  if (knownEV + nFound >= numEigen) {
702  for (j = 0; j < blockSize; ++j) {
703  if (normR[j] < tolEigenSolve) {
704  lambda[knownEV] = theta[j];
705  memcpy(Q.Values() + knownEV*xr, X.Values() + j*xr, xr*sizeof(double));
706  knownEV += 1;
707  }
708  }
709  if (localVerbose == 1) {
710  cout << endl;
711  cout.precision(2);
712  cout.setf(ios::scientific, ios::floatfield);
713  for (i=0; i<blockSize; ++i) {
714  cout << " Iteration " << outerIter << " - Scaled Norm of Residual " << i;
715  cout << " = " << normR[i] << endl;
716  }
717  cout << endl;
718  }
719  break;
720  } // if (knownEV >= numEigen)
721 
722  // Store the converged eigenpairs and define the new X iterate
723  if (hasNextVectors == true) {
724  // Q stores the next Ritz vectors
725  for (j = 0; j < blockSize; ++j) {
726  if (normR[j] < tolEigenSolve) {
727  lambda[knownEV] = theta[j];
728  callFortran.SWAP(xr, X.Values() + j*xr, 1, Q.Values() + knownEV*xr, 1);
729  knownEV += 1;
730  }
731  }
732  nFound = 0;
733  }
734  else {
735  for (j = 0; j < blockSize; ++j) {
736  if (normR[j] < tolEigenSolve) {
737  lambda[knownEV] = theta[j];
738  memcpy(Q.Values() + knownEV*xr, X.Values() + j*xr, xr*sizeof(double));
739  knownEV += 1;
740  }
741  }
742  // Replace converged vectors with random vectors
743  timeRestart -= MyWatch.WallTime();
744  nFound = 0;
745  for (j = 0; j < blockSize; ++j) {
746  if (normR[j] >= tolEigenSolve) {
747  if (nFound > 0)
748  memcpy(X.Values() + (j-nFound)*xr, X.Values() + j*xr, xr*sizeof(double));
749  }
750  else
751  nFound += 1;
752  }
753  if (nFound > 0) {
754  // Put new random vectors at the end of the block
755  Epetra_MultiVector Xtmp(View, X, blockSize - nFound, nFound);
756  Xtmp.Random();
757  }
758  timeRestart += MyWatch.WallTime();
759  }
760  reStart = true;
761 
762  } // for (outerIter = 1; outerIter <= maxIterEigenSolve; ++outerIter)
763  timeOuterLoop += MyWatch.WallTime();
764  highMem = (highMem > currentSize()) ? highMem : currentSize();
765 
766  // Clean memory
767  delete[] work1;
768  delete[] work2;
769  if (vectWeight)
770  delete vectWeight;
771 
772  // Sort the eigenpairs
773  timePostProce -= MyWatch.WallTime();
774  if ((info == 0) && (knownEV > 0)) {
775  mySort.sortScalars_Vectors(knownEV, lambda, Q.Values(), Q.MyLength());
776  }
777  timePostProce += MyWatch.WallTime();
778 
779  return (info == 0) ? knownEV : info;
780 
781 }
782 
783 
784 void LOBPCG_light::accuracyCheck(const Epetra_MultiVector *X, const Epetra_MultiVector *MX,
785  const Epetra_MultiVector *R, const Epetra_MultiVector *Q,
786  const Epetra_MultiVector *H, const Epetra_MultiVector *P) const {
787 
788  cout.precision(2);
789  cout.setf(ios::scientific, ios::floatfield);
790  double tmp;
791 
792  int myPid = MyComm.MyPID();
793 
794  if (X) {
795  if (M) {
796  if (MX) {
797  tmp = myVerify.errorEquality(X, MX, M);
798  if (myPid == 0)
799  cout << " >> Difference between MX and M*X = " << tmp << endl;
800  }
801  tmp = myVerify.errorOrthonormality(X, M);
802  if (myPid == 0)
803  cout << " >> Error in X^T M X - I = " << tmp << endl;
804  }
805  else {
806  tmp = myVerify.errorOrthonormality(X, 0);
807  if (myPid == 0)
808  cout << " >> Error in X^T X - I = " << tmp << endl;
809  }
810  }
811 
812  if ((R) && (X)) {
813  tmp = myVerify.errorOrthogonality(X, R);
814  if (myPid == 0)
815  cout << " >> Orthogonality X^T R up to " << tmp << endl;
816  }
817 
818  if (Q == 0)
819  return;
820 
821  if (M) {
822  tmp = myVerify.errorOrthonormality(Q, M);
823  if (myPid == 0)
824  cout << " >> Error in Q^T M Q - I = " << tmp << endl;
825  if (X) {
826  tmp = myVerify.errorOrthogonality(Q, X, M);
827  if (myPid == 0)
828  cout << " >> Orthogonality Q^T M X up to " << tmp << endl;
829  }
830  if (H) {
831  tmp = myVerify.errorOrthogonality(Q, H, M);
832  if (myPid == 0)
833  cout << " >> Orthogonality Q^T M H up to " << tmp << endl;
834  }
835  if (P) {
836  tmp = myVerify.errorOrthogonality(Q, P, M);
837  if (myPid == 0)
838  cout << " >> Orthogonality Q^T M P up to " << tmp << endl;
839  }
840  }
841  else {
842  tmp = myVerify.errorOrthonormality(Q, 0);
843  if (myPid == 0)
844  cout << " >> Error in Q^T Q - I = " << tmp << endl;
845  if (X) {
846  tmp = myVerify.errorOrthogonality(Q, X, 0);
847  if (myPid == 0)
848  cout << " >> Orthogonality Q^T X up to " << tmp << endl;
849  }
850  if (H) {
851  tmp = myVerify.errorOrthogonality(Q, H, 0);
852  if (myPid == 0)
853  cout << " >> Orthogonality Q^T H up to " << tmp << endl;
854  }
855  if (P) {
856  tmp = myVerify.errorOrthogonality(Q, P, 0);
857  if (myPid == 0)
858  cout << " >> Orthogonality Q^T P up to " << tmp << endl;
859  }
860  }
861 
862 }
863 
864 
865 void LOBPCG_light::initializeCounters() {
866 
867  historyCount = 0;
868  if (resHistory) {
869  delete[] resHistory;
870  resHistory = 0;
871  }
872 
873  memRequested = 0.0;
874  highMem = 0.0;
875 
876  massOp = 0;
877  numRestart = 0;
878  outerIter = 0;
879  precOp = 0;
880  residual = 0;
881  stifOp = 0;
882 
883  timeLocalProj = 0.0;
884  timeLocalSolve = 0.0;
885  timeLocalUpdate = 0.0;
886  timeMassOp = 0.0;
887  timeNorm = 0.0;
888  timeOrtho = 0.0;
889  timeOuterLoop = 0.0;
890  timePostProce = 0.0;
891  timePrecOp = 0.0;
892  timeResidual = 0.0;
893  timeRestart = 0.0;
894  timeStifOp = 0.0;
895 
896 
897 }
898 
899 
900 void LOBPCG_light::algorithmInfo() const {
901 
902  int myPid = MyComm.MyPID();
903 
904  if (myPid ==0) {
905  cout << " Algorithm: LOBPCG (block version) with storage for 4 blocks of vectors\n";
906  cout << " Block Size: " << blockSize << endl;
907  }
908 
909 }
910 
911 
912 void LOBPCG_light::historyInfo() const {
913 
914  if (resHistory) {
915  int j;
916  cout << " Block Size: " << blockSize << endl;
917  cout << endl;
918  cout << " Residuals\n";
919  cout << endl;
920  cout.precision(4);
921  cout.setf(ios::scientific, ios::floatfield);
922  for (j = 0; j < historyCount; ++j) {
923  int ii;
924  for (ii = 0; ii < blockSize; ++ii)
925  cout << resHistory[blockSize*j + ii] << endl;
926  }
927  cout << endl;
928  }
929 
930 }
931 
932 
933 void LOBPCG_light::memoryInfo() const {
934 
935  int myPid = MyComm.MyPID();
936 
937  double maxHighMem = 0.0;
938  double tmp = highMem;
939  MyComm.MaxAll(&tmp, &maxHighMem, 1);
940 
941  double maxMemRequested = 0.0;
942  tmp = memRequested;
943  MyComm.MaxAll(&tmp, &maxMemRequested, 1);
944 
945  if (myPid == 0) {
946  cout.precision(2);
947  cout.setf(ios::fixed, ios::floatfield);
948  cout << " Memory requested per processor by the eigensolver = (EST) ";
949  cout.width(6);
950  cout << maxMemRequested << " MB " << endl;
951  cout << endl;
952  cout << " High water mark in eigensolver = (EST) ";
953  cout.width(6);
954  cout << maxHighMem << " MB " << endl;
955  cout << endl;
956  }
957 
958 }
959 
960 
961 void LOBPCG_light::operationInfo() const {
962 
963  int myPid = MyComm.MyPID();
964 
965  if (myPid == 0) {
966  cout << " --- Operations ---\n\n";
967  cout << " Total number of mass matrix multiplications = ";
968  cout.width(9);
969  cout << massOp + modalTool.getNumProj_MassMult() + modalTool.getNumNorm_MassMult() << endl;
970  cout << " Mass multiplications in solver loop = ";
971  cout.width(9);
972  cout << massOp << endl;
973  cout << " Mass multiplications for orthogonalization = ";
974  cout.width(9);
975  cout << modalTool.getNumProj_MassMult() << endl;
976  cout << " Mass multiplications for normalization = ";
977  cout.width(9);
978  cout << modalTool.getNumNorm_MassMult() << endl;
979  cout << " Total number of stiffness matrix operations = ";
980  cout.width(9);
981  cout << stifOp << endl;
982  cout << " Total number of preconditioner applications = ";
983  cout.width(9);
984  cout << precOp << endl;
985  cout << " Total number of computed eigen-residuals = ";
986  cout.width(9);
987  cout << residual << endl;
988  cout << "\n";
989  cout << " Total number of outer iterations = ";
990  cout.width(9);
991  cout << outerIter << endl;
992  cout << " Number of restarts = ";
993  cout.width(9);
994  cout << numRestart << endl;
995  cout << "\n";
996  }
997 
998 }
999 
1000 
1001 void LOBPCG_light::timeInfo() const {
1002 
1003  int myPid = MyComm.MyPID();
1004 
1005  if (myPid == 0) {
1006  cout << " --- Timings ---\n\n";
1007  cout.setf(ios::fixed, ios::floatfield);
1008  cout.precision(2);
1009  cout << " Total time for outer loop = ";
1010  cout.width(9);
1011  cout << timeOuterLoop << " s" << endl;
1012  cout << " Time for mass matrix operations = ";
1013  cout.width(9);
1014  cout << timeMassOp << " s ";
1015  cout.width(5);
1016  cout << 100*timeMassOp/timeOuterLoop << " %\n";
1017  cout << " Time for stiffness matrix operations = ";
1018  cout.width(9);
1019  cout << timeStifOp << " s ";
1020  cout.width(5);
1021  cout << 100*timeStifOp/timeOuterLoop << " %\n";
1022  cout << " Time for preconditioner applications = ";
1023  cout.width(9);
1024  cout << timePrecOp << " s ";
1025  cout.width(5);
1026  cout << 100*timePrecOp/timeOuterLoop << " %\n";
1027  cout << " Time for orthogonalizations = ";
1028  cout.width(9);
1029  cout << timeOrtho << " s ";
1030  cout.width(5);
1031  cout << 100*timeOrtho/timeOuterLoop << " %\n";
1032  cout << " Projection step : ";
1033  cout.width(9);
1034  cout << modalTool.getTimeProj() << " s\n";
1035  cout << " Q^T mult. :: ";
1036  cout.width(9);
1037  cout << modalTool.getTimeProj_QtMult() << " s\n";
1038  cout << " Q mult. :: ";
1039  cout.width(9);
1040  cout << modalTool.getTimeProj_QMult() << " s\n";
1041  cout << " Mass mult. :: ";
1042  cout.width(9);
1043  cout << modalTool.getTimeProj_MassMult() << " s\n";
1044  cout << " Normalization step : ";
1045  cout.width(9);
1046  cout << modalTool.getTimeNorm() << " s\n";
1047  cout << " Mass mult. :: ";
1048  cout.width(9);
1049  cout << modalTool.getTimeNorm_MassMult() << " s\n";
1050  cout << " Time for local projection = ";
1051  cout.width(9);
1052  cout << timeLocalProj << " s ";
1053  cout.width(5);
1054  cout << 100*timeLocalProj/timeOuterLoop << " %\n";
1055  cout << " Time for local eigensolve = ";
1056  cout.width(9);
1057  cout << timeLocalSolve << " s ";
1058  cout.width(5);
1059  cout << 100*timeLocalSolve/timeOuterLoop << " %\n";
1060  cout << " Time for local update = ";
1061  cout.width(9);
1062  cout << timeLocalUpdate << " s ";
1063  cout.width(5);
1064  cout << 100*timeLocalUpdate/timeOuterLoop << " %\n";
1065  cout << " Time for residual computations = ";
1066  cout.width(9);
1067  cout << timeResidual << " s ";
1068  cout.width(5);
1069  cout << 100*timeResidual/timeOuterLoop << " %\n";
1070  cout << " Time for residuals norm computations = ";
1071  cout.width(9);
1072  cout << timeNorm << " s ";
1073  cout.width(5);
1074  cout << 100*timeNorm/timeOuterLoop << " %\n";
1075  cout << " Time for restarting space definition = ";
1076  cout.width(9);
1077  cout << timeRestart << " s ";
1078  cout.width(5);
1079  cout << 100*timeRestart/timeOuterLoop << " %\n";
1080  cout << "\n";
1081  cout << " Total time for post-processing = ";
1082  cout.width(9);
1083  cout << timePostProce << " s\n";
1084  cout << endl;
1085  } // if (myPid == 0)
1086 
1087 }
1088 
1089 
void GEMM(const char TRANSA, const char TRANSB, const int M, const int N, const int K, const float ALPHA, const float *A, const int LDA, const float *B, const int LDB, const float BETA, float *C, const int LDC) const