35 #include "KnyazevLOBPCG.h"
40 double _tol,
int _maxIter,
int _verb) :
52 maxIterEigenSolve(_maxIter),
83 double _tol,
int _maxIter,
int _verb,
96 maxIterEigenSolve(_maxIter),
112 timeLocalUpdate(0.0),
125 KnyazevLOBPCG::~KnyazevLOBPCG() {
137 return KnyazevLOBPCG::reSolve(numEigen, Q, lambda);
142 int KnyazevLOBPCG::reSolve(
int numEigen,
Epetra_MultiVector &Q,
double *lambda,
int startingEV) {
197 if (numEigen <= startingEV) {
201 int info = myVerify.inputArguments(numEigen, K, M, Prec, Q, numEigen);
205 int myPid = MyComm.MyPID();
213 int knownEV = startingEV;
214 int localVerbose = verbose*(myPid==0);
231 int xr = Q.MyLength();
234 blockSize = numEigen;
237 tmp = (M == 0) ? 6*numEigen*xr : 9*numEigen*xr;
239 double *work1 =
new (nothrow)
double[tmp];
246 memRequested +=
sizeof(double)*tmp/(1024.0*1024.0);
248 highMem = (highMem > currentSize()) ? highMem : currentSize();
250 double *tmpD = work1;
253 tmpD = tmpD + xr*numEigen;
256 tmpD = (M) ? tmpD + xr*numEigen : tmpD;
259 tmpD = tmpD + xr*numEigen;
262 tmpD = tmpD + xr*numEigen;
265 tmpD = tmpD + xr*numEigen;
268 tmpD = (M) ? tmpD + xr*numEigen : tmpD;
271 tmpD = tmpD + xr*numEigen;
274 tmpD = tmpD + xr*numEigen;
278 if (startingEV > 0) {
282 timeStifOp -= MyWatch.WallTime();
283 K->Apply(copyX, copyKX);
284 timeStifOp += MyWatch.WallTime();
285 stifOp += startingEV;
286 timeMassOp -= MyWatch.WallTime();
289 M->Apply(copyX, copyMX);
291 timeMassOp += MyWatch.WallTime();
292 massOp += startingEV;
306 lwork2 = 2*numEigen + 27*numEigen*numEigen;
307 double *work2 =
new (nothrow)
double[lwork2];
316 highMem = (highMem > currentSize()) ? highMem : currentSize();
320 double *theta = tmpD;
321 tmpD = tmpD + numEigen;
323 double *normR = tmpD;
324 tmpD = tmpD + numEigen;
327 tmpD = tmpD + 9*numEigen*numEigen;
330 tmpD = tmpD + 9*numEigen*numEigen;
334 memRequested +=
sizeof(double)*lwork2/(1024.0*1024.0);
337 if (localVerbose > 2) {
338 resHistory =
new (nothrow)
double[maxIterEigenSolve*numEigen];
339 if (resHistory == 0) {
352 bool reStart =
false;
356 int firstIndex = knownEV;
359 if (localVerbose > 0) {
361 cout <<
" *|* Problem: ";
363 cout <<
"K*Q = M*Q D ";
365 cout <<
"K*Q = Q D ";
367 cout <<
" with preconditioner";
369 cout <<
" *|* Algorithm = LOBPCG (Knyazev's version)" << endl;
370 cout <<
" *|* Size of blocks = " << blockSize << endl;
371 cout <<
" *|* Number of requested eigenvalues = " << numEigen << endl;
373 cout.setf(ios::scientific, ios::floatfield);
374 cout <<
" *|* Tolerance for convergence = " << tolEigenSolve << endl;
375 cout <<
" *|* Norm used for convergence: ";
377 cout <<
"weighted L2-norm with user-provided weights" << endl;
379 cout <<
"L^2-norm" << endl;
381 cout <<
" *|* Input converged eigenvectors = " << startingEV << endl;
382 cout <<
"\n -- Start iterations -- \n";
385 timeOuterLoop -= MyWatch.WallTime();
386 for (outerIter = 1; outerIter <= maxIterEigenSolve; ++outerIter) {
388 highMem = (highMem > currentSize()) ? highMem : currentSize();
390 int workingSize = numEigen - knownEV;
406 if ((outerIter == 1) || (reStart ==
true)) {
409 localSize = numEigen;
412 timeMassOp -= MyWatch.WallTime();
415 timeMassOp += MyWatch.WallTime();
416 massOp += workingSize;
419 timeStifOp -= MyWatch.WallTime();
421 timeStifOp += MyWatch.WallTime();
422 stifOp += workingSize;
429 timePrecOp -= MyWatch.WallTime();
430 Prec->ApplyInverse(RI, HI);
431 timePrecOp += MyWatch.WallTime();
432 precOp += workingSize;
435 memcpy(HI.Values(), RI.Values(), xr*workingSize*
sizeof(double));
439 timeMassOp -= MyWatch.WallTime();
442 timeMassOp += MyWatch.WallTime();
443 massOp += workingSize;
446 timeStifOp -= MyWatch.WallTime();
448 timeStifOp += MyWatch.WallTime();
449 stifOp += workingSize;
451 if (localSize == numEigen)
452 localSize += workingSize;
458 timeLocalProj -= MyWatch.WallTime();
459 modalTool.localProjection(numEigen, numEigen, xr, X.Values(), xr, KX.Values(), xr,
461 modalTool.localProjection(numEigen, numEigen, xr, X.Values(), xr, MX.Values(), xr,
463 if (localSize > numEigen) {
464 double *pointer = KK + numEigen*localSize;
465 modalTool.localProjection(numEigen, workingSize, xr, X.Values(), xr, KHI.Values(), xr,
466 pointer, localSize, S);
467 modalTool.localProjection(workingSize, workingSize, xr, HI.Values(), xr, KHI.Values(), xr,
468 pointer + numEigen, localSize, S);
469 pointer = MM + numEigen*localSize;
470 modalTool.localProjection(numEigen, workingSize, xr, X.Values(), xr, MHI.Values(), xr,
471 pointer, localSize, S);
472 modalTool.localProjection(workingSize, workingSize, xr, HI.Values(), xr, MHI.Values(), xr,
473 pointer + numEigen, localSize, S);
474 if (localSize > numEigen + workingSize) {
475 pointer = KK + (numEigen + workingSize)*localSize;
476 modalTool.localProjection(numEigen, workingSize, xr, X.Values(), xr, KPI.Values(), xr,
477 pointer, localSize, S);
478 modalTool.localProjection(workingSize, workingSize, xr, HI.Values(), xr, KPI.Values(), xr,
479 pointer + numEigen, localSize, S);
480 modalTool.localProjection(workingSize, workingSize, xr, PI.Values(), xr, KPI.Values(), xr,
481 pointer + numEigen + workingSize, localSize, S);
482 pointer = MM + (numEigen + workingSize)*localSize;
483 modalTool.localProjection(numEigen, workingSize, xr, X.Values(), xr, MPI.Values(), xr,
484 pointer, localSize, S);
485 modalTool.localProjection(workingSize, workingSize, xr, HI.Values(), xr, MPI.Values(), xr,
486 pointer + numEigen, localSize, S);
487 modalTool.localProjection(workingSize, workingSize, xr, PI.Values(), xr, MPI.Values(), xr,
488 pointer + numEigen + workingSize, localSize, S);
491 timeLocalProj += MyWatch.WallTime();
494 timeLocalSolve -= MyWatch.WallTime();
495 int nevLocal = localSize;
496 info = modalTool.directSolver(localSize, KK, localSize, MM, localSize, nevLocal,
497 S, localSize, theta, localVerbose,
498 (blockSize == 1) ? 1 : 0);
499 timeLocalSolve += MyWatch.WallTime();
507 if ((theta[0] < 0.0) || (nevLocal < numEigen)) {
508 if (localVerbose > 0) {
509 cout <<
" Iteration " << outerIter;
510 cout <<
" - Failure for spectral decomposition - RESTART with new random search\n";
512 if (workingSize == 1) {
525 if ((localSize == numEigen+workingSize) && (nevLocal == numEigen)) {
526 for (j = 0; j < nevLocal; ++j)
527 memcpy(S+j*numEigen, S+j*localSize, numEigen*
sizeof(
double));
528 localSize = numEigen;
531 if ((localSize == numEigen+2*workingSize) && (nevLocal <= numEigen+workingSize)) {
532 for (j = 0; j < nevLocal; ++j)
533 memcpy(S+j*(numEigen+workingSize), S+j*localSize, (numEigen+workingSize)*
sizeof(
double));
534 localSize = numEigen + workingSize;
539 timeLocalUpdate -= MyWatch.WallTime();
541 memcpy(R.Values(), X.Values(), xr*numEigen*
sizeof(double));
542 callBLAS.GEMM(
'N',
'N', xr, numEigen, numEigen, 1.0, R.Values(), xr, S, localSize,
543 0.0, X.Values(), xr);
545 memcpy(R.Values(), MX.Values(), xr*numEigen*
sizeof(double));
546 callBLAS.GEMM(
'N',
'N', xr, numEigen, numEigen, 1.0, R.Values(), xr, S, localSize,
547 0.0, MX.Values(), xr);
549 memcpy(R.Values(), KX.Values(), xr*numEigen*
sizeof(double));
550 callBLAS.GEMM(
'N',
'N', xr, numEigen, numEigen, 1.0, R.Values(), xr, S, localSize,
551 0.0, KX.Values(), xr);
553 if (localSize == numEigen + workingSize) {
554 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, HI.Values(), xr,
555 S + numEigen, localSize, 0.0, P.Values(), xr);
557 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, MHI.Values(), xr,
558 S + numEigen, localSize, 0.0, MP.Values(), xr);
560 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, KHI.Values(), xr,
561 S + numEigen, localSize, 0.0, KP.Values(), xr);
564 if (localSize > numEigen + workingSize) {
565 memcpy(RI.Values(), PI.Values(), xr*workingSize*
sizeof(double));
566 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, HI.Values(), xr,
567 S + numEigen, localSize, 0.0, P.Values(), xr);
568 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, RI.Values(), xr,
569 S + numEigen + workingSize, localSize, 1.0, P.Values(), xr);
571 memcpy(RI.Values(), MPI.Values(), xr*workingSize*
sizeof(double));
572 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, MHI.Values(), xr,
573 S + numEigen, localSize, 0.0, MP.Values(), xr);
574 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, RI.Values(), xr,
575 S + numEigen + workingSize, localSize, 1.0, MP.Values(), xr);
577 memcpy(RI.Values(), KPI.Values(), xr*workingSize*
sizeof(double));
578 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, KHI.Values(), xr,
579 S + numEigen, localSize, 0.0, KP.Values(), xr);
580 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, RI.Values(), xr,
581 S + numEigen + workingSize, localSize, 1.0, KP.Values(), xr);
584 if (localSize > numEigen) {
585 callBLAS.AXPY(xr*numEigen, 1.0, P.Values(), X.Values());
587 callBLAS.AXPY(xr*numEigen, 1.0, MP.Values(), MX.Values());
588 callBLAS.AXPY(xr*numEigen, 1.0, KP.Values(), KX.Values());
590 timeLocalUpdate += MyWatch.WallTime();
593 timeResidual -= MyWatch.WallTime();
594 memcpy(R.Values(), KX.Values(), xr*numEigen*
sizeof(double));
595 for (j = 0; j < numEigen; ++j) {
596 callBLAS.AXPY(xr, -theta[j], MX.Values() + j*xr, R.Values() + j*xr);
598 timeResidual += MyWatch.WallTime();
599 residual += numEigen;
602 timeNorm -= MyWatch.WallTime();
604 R.NormWeighted(*vectWeight, normR);
610 for (j = 0; j < numEigen; ++j) {
611 normR[j] = (theta[j] == 0.0) ? normR[j] : normR[j]/theta[j];
613 timeNorm += MyWatch.WallTime();
617 accuracyCheck(&X, &MX, &R);
620 if (localSize == numEigen + workingSize)
621 localSize += workingSize;
624 timeNorm -= MyWatch.WallTime();
626 for (i=0; i < numEigen; ++i) {
627 if (normR[i] < tolEigenSolve) {
628 lambda[i] = theta[i];
632 timeNorm += MyWatch.WallTime();
635 if (localVerbose > 2) {
636 memcpy(resHistory + historyCount, normR, numEigen*
sizeof(
double));
637 historyCount += numEigen;
641 if (localVerbose > 0) {
642 cout <<
" Iteration " << outerIter;
643 cout <<
" - Number of converged eigenvectors " << knownEV << endl;
646 if (localVerbose > 1) {
649 cout.setf(ios::scientific, ios::floatfield);
650 for (i=0; i<numEigen; ++i) {
651 cout <<
" Iteration " << outerIter <<
" - Scaled Norm of Residual " << i;
652 cout <<
" = " << normR[i] << endl;
656 for (i=0; i<numEigen; ++i) {
657 cout <<
" Iteration " << outerIter <<
" - Ritz eigenvalue " << i;
658 cout.setf((fabs(theta[i]) < 0.01) ? ios::scientific : ios::fixed, ios::floatfield);
659 cout <<
" = " << theta[i] << endl;
665 if (knownEV >= numEigen) {
666 if (localVerbose == 1) {
669 cout.setf(ios::scientific, ios::floatfield);
670 for (i=0; i<numEigen; ++i) {
671 cout <<
" Iteration " << outerIter <<
" - Scaled Norm of Residual " << i;
672 cout <<
" = " << normR[i] << endl;
680 if ((knownEV > 0) && (localSize > numEigen)) {
681 if (localSize == numEigen + workingSize)
682 localSize = 2*numEigen - knownEV;
683 if (localSize == numEigen + 2*workingSize)
684 localSize = 3*numEigen - 2*knownEV;
689 for (i=0; i<numEigen; ++i) {
690 if (normR[i] >= tolEigenSolve) {
697 if (firstIndex == numEigen-1)
700 while (firstIndex < knownEV) {
702 for (j = firstIndex; j < numEigen; ++j) {
703 if (normR[j] < tolEigenSolve) {
704 callFortran.SWAP(1, normR + j, 1, normR + firstIndex, 1);
705 callFortran.SWAP(xr, X.Values() + j*xr, 1, X.Values() + firstIndex*xr, 1);
706 callFortran.SWAP(xr, KX.Values() + j*xr, 1, KX.Values() + firstIndex*xr, 1);
707 callFortran.SWAP(xr, R.Values() + j*xr, 1, R.Values() + firstIndex*xr, 1);
708 callFortran.SWAP(xr, H.Values() + j*xr, 1, H.Values() + firstIndex*xr, 1);
709 callFortran.SWAP(xr, KH.Values() + j*xr, 1, KH.Values() + firstIndex*xr, 1);
710 callFortran.SWAP(xr, P.Values() + j*xr, 1, P.Values() + firstIndex*xr, 1);
711 callFortran.SWAP(xr, KP.Values() + j*xr, 1, KP.Values() + firstIndex*xr, 1);
713 callFortran.SWAP(xr, MX.Values() + j*xr, 1, MX.Values() + firstIndex*xr, 1);
714 callFortran.SWAP(xr, MH.Values() + j*xr, 1, MH.Values() + firstIndex*xr, 1);
715 callFortran.SWAP(xr, MP.Values() + j*xr, 1, MP.Values() + firstIndex*xr, 1);
721 for (i = firstIndex; i < numEigen; ++i) {
722 if (normR[i] >= tolEigenSolve) {
731 timeOuterLoop += MyWatch.WallTime();
732 highMem = (highMem > currentSize()) ? highMem : currentSize();
741 timePostProce -= MyWatch.WallTime();
742 if ((info == 0) && (knownEV > 0)) {
743 mySort.sortScalars_Vectors(knownEV, lambda, Q.Values(), Q.MyLength());
745 timePostProce += MyWatch.WallTime();
747 return (info == 0) ? knownEV : info;
756 cout.setf(ios::scientific, ios::floatfield);
759 int myPid = MyComm.MyPID();
764 tmp = myVerify.errorEquality(X, MX, M);
766 cout <<
" >> Difference between MX and M*X = " << tmp << endl;
768 tmp = myVerify.errorOrthonormality(X, M);
770 cout <<
" >> Error in X^T M X - I = " << tmp << endl;
773 tmp = myVerify.errorOrthonormality(X, 0);
775 cout <<
" >> Error in X^T X - I = " << tmp << endl;
780 tmp = myVerify.errorOrthogonality(X, R);
782 cout <<
" >> Orthogonality X^T R up to " << tmp << endl;
788 void KnyazevLOBPCG::initializeCounters() {
807 timeLocalSolve = 0.0;
808 timeLocalUpdate = 0.0;
821 void KnyazevLOBPCG::algorithmInfo()
const {
823 int myPid = MyComm.MyPID();
826 cout <<
" Algorithm: LOBPCG (Knyazev's version) with Cholesky-based local eigensolver\n";
827 cout <<
" Block Size: " << blockSize << endl;
833 void KnyazevLOBPCG::historyInfo()
const {
837 cout <<
" Block Size: " << blockSize << endl;
839 cout <<
" Residuals\n";
842 cout.setf(ios::scientific, ios::floatfield);
843 for (j = 0; j < historyCount; ++j) {
844 cout << resHistory[j] << endl;
852 void KnyazevLOBPCG::memoryInfo()
const {
854 int myPid = MyComm.MyPID();
856 double maxHighMem = 0.0;
857 double tmp = highMem;
858 MyComm.MaxAll(&tmp, &maxHighMem, 1);
860 double maxMemRequested = 0.0;
862 MyComm.MaxAll(&tmp, &maxMemRequested, 1);
866 cout.setf(ios::fixed, ios::floatfield);
867 cout <<
" Memory requested per processor by the eigensolver = (EST) ";
869 cout << maxMemRequested <<
" MB " << endl;
871 cout <<
" High water mark in eigensolver = (EST) ";
873 cout << maxHighMem <<
" MB " << endl;
880 void KnyazevLOBPCG::operationInfo()
const {
882 int myPid = MyComm.MyPID();
885 cout <<
" --- Operations ---\n\n";
886 cout <<
" Total number of mass matrix multiplications = ";
888 cout << massOp << endl;
889 cout <<
" Total number of stiffness matrix operations = ";
891 cout << stifOp << endl;
892 cout <<
" Total number of preconditioner applications = ";
894 cout << precOp << endl;
895 cout <<
" Total number of computed eigen-residuals = ";
897 cout << residual << endl;
899 cout <<
" Total number of outer iterations = ";
901 cout << outerIter << endl;
902 cout <<
" Number of restarts = ";
904 cout << numRestart << endl;
911 void KnyazevLOBPCG::timeInfo()
const {
913 int myPid = MyComm.MyPID();
916 cout <<
" --- Timings ---\n\n";
917 cout.setf(ios::fixed, ios::floatfield);
919 cout <<
" Total time for outer loop = ";
921 cout << timeOuterLoop <<
" s" << endl;
922 cout <<
" Time for mass matrix operations = ";
924 cout << timeMassOp <<
" s ";
926 cout << 100*timeMassOp/timeOuterLoop <<
" %\n";
927 cout <<
" Time for stiffness matrix operations = ";
929 cout << timeStifOp <<
" s ";
931 cout << 100*timeStifOp/timeOuterLoop <<
" %\n";
932 cout <<
" Time for preconditioner applications = ";
934 cout << timePrecOp <<
" s ";
936 cout << 100*timePrecOp/timeOuterLoop <<
" %\n";
937 cout <<
" Time for local projection = ";
939 cout << timeLocalProj <<
" s ";
941 cout << 100*timeLocalProj/timeOuterLoop <<
" %\n";
942 cout <<
" Time for local eigensolve = ";
944 cout << timeLocalSolve <<
" s ";
946 cout << 100*timeLocalSolve/timeOuterLoop <<
" %\n";
947 cout <<
" Time for local update = ";
949 cout << timeLocalUpdate <<
" s ";
951 cout << 100*timeLocalUpdate/timeOuterLoop <<
" %\n";
952 cout <<
" Time for residual computations = ";
954 cout << timeResidual <<
" s ";
956 cout << 100*timeResidual/timeOuterLoop <<
" %\n";
957 cout <<
" Time for residuals norm computations = ";
959 cout << timeNorm <<
" s ";
961 cout << 100*timeNorm/timeOuterLoop <<
" %\n";
963 cout <<
" Total time for post-processing = ";
965 cout << timePostProce <<
" s\n";