24 double _tol,
int _maxIter,
int _verb) :
37 maxIterEigenSolve(_maxIter),
71 double _tol,
int _maxIter,
int _verb,
85 maxIterEigenSolve(_maxIter),
101 timeLocalUpdate(0.0),
129 return BRQMIN::reSolve(numEigen, Q, lambda);
134 int BRQMIN::reSolve(
int numEigen,
Epetra_MultiVector &Q,
double *lambda,
int startingEV) {
187 if (numEigen <= startingEV) {
191 int info = myVerify.inputArguments(numEigen, K, M, Prec, Q, numEigen + blockSize);
195 int myPid = MyComm.MyPID();
203 int knownEV = startingEV;
204 int localVerbose = verbose*(myPid==0);
219 int xr = Q.MyLength();
224 tmp = (M == 0) ? 5*blockSize*xr : 7*blockSize*xr;
226 double *work1 =
new (nothrow)
double[tmp];
233 memRequested +=
sizeof(double)*tmp/(1024.0*1024.0);
235 highMem = (highMem > currentSize()) ? highMem : currentSize();
237 double *tmpD = work1;
240 tmpD = tmpD + xr*blockSize;
243 tmpD = (M) ? tmpD + xr*blockSize : tmpD;
246 tmpD = tmpD + xr*blockSize;
249 tmpD = tmpD + xr*blockSize;
252 tmpD = tmpD + xr*blockSize;
255 tmpD = tmpD + xr*blockSize;
270 lwork2 = 3*blockSize + 12*blockSize*blockSize;
271 double *work2 =
new (nothrow)
double[lwork2];
280 highMem = (highMem > currentSize()) ? highMem : currentSize();
284 double *theta = tmpD;
285 tmpD = tmpD + 2*blockSize;
287 double *normR = tmpD;
288 tmpD = tmpD + blockSize;
291 tmpD = tmpD + 4*blockSize*blockSize;
294 tmpD = tmpD + 4*blockSize*blockSize;
298 memRequested +=
sizeof(double)*lwork2/(1024.0*1024.0);
301 if (localVerbose > 2) {
302 resHistory =
new (nothrow)
double[maxIterEigenSolve*blockSize];
303 if (resHistory == 0) {
316 bool reStart =
false;
320 int twoBlocks = 2*blockSize;
321 int nFound = blockSize;
324 if (localVerbose > 0) {
326 cout <<
" *|* Problem: ";
328 cout <<
"K*Q = M*Q D ";
330 cout <<
"K*Q = Q D ";
332 cout <<
" with preconditioner";
334 cout <<
" *|* Algorithm = BRQMIN" << endl;
335 cout <<
" *|* Size of blocks = " << blockSize << endl;
336 cout <<
" *|* Number of requested eigenvalues = " << numEigen << endl;
338 cout.setf(ios::scientific, ios::floatfield);
339 cout <<
" *|* Tolerance for convergence = " << tolEigenSolve << endl;
340 cout <<
" *|* Norm used for convergence: ";
342 cout <<
"weighted L2-norm with user-provided weights" << endl;
344 cout <<
"L^2-norm" << endl;
346 cout <<
" *|* Input converged eigenvectors = " << startingEV << endl;
347 cout <<
"\n -- Start iterations -- \n";
350 timeOuterLoop -= MyWatch.WallTime();
351 for (outerIter = 1; outerIter <= maxIterEigenSolve; ++outerIter) {
353 highMem = (highMem > currentSize()) ? highMem : currentSize();
355 if ((outerIter == 1) || (reStart ==
true)) {
358 localSize = blockSize;
367 timeMassOp -= MyWatch.WallTime();
370 timeMassOp += MyWatch.WallTime();
377 timeOrtho -= MyWatch.WallTime();
378 info = modalTool.massOrthonormalize(X, MX, M, copyQ, nFound, 0, R.Values());
379 timeOrtho += MyWatch.WallTime();
392 timeStifOp -= MyWatch.WallTime();
394 timeStifOp += MyWatch.WallTime();
404 timePrecOp -= MyWatch.WallTime();
405 Prec->ApplyInverse(R, H);
406 timePrecOp += MyWatch.WallTime();
410 memcpy(H.Values(), R.Values(), xr*blockSize*
sizeof(double));
413 timeSearchP -= MyWatch.WallTime();
415 if (localSize == blockSize) {
417 localSize = twoBlocks;
423 modalTool.localProjection(blockSize, blockSize, xr, KP.Values(), xr, P.Values(), xr,
426 callLAPACK.POTRF(
'U', blockSize, KK, blockSize, &info);
429 if (localVerbose > 0) {
430 cerr <<
" Iteration " << outerIter;
431 cerr <<
" - DPOTRF has a critical failure (" << info <<
")" << endl;
437 if (localVerbose > 0) {
438 cout <<
" Iteration " << outerIter;
439 cout <<
" - Failure for DPOTRF";
440 cout <<
" - RESET search directions to residuals\n";
448 modalTool.localProjection(blockSize, blockSize, xr, KP.Values(), xr, H.Values(), xr,
450 callLAPACK.POTRS(
'U', blockSize, blockSize, KK, blockSize, MM, blockSize, &info);
452 if (localVerbose > 0) {
453 cerr <<
" Iteration " << outerIter;
454 cerr <<
" - DPOTRS has a critical failure (" << info <<
")" << endl;
461 memcpy(R.Values(), P.Values(), xr*blockSize*
sizeof(double));
462 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, R.Values(), xr, MM, blockSize,
463 0.0, P.Values(), xr);
464 callBLAS.AXPY(xr*blockSize, -1.0, H.Values(), P.Values());
469 callBLAS.GEMM(
'T',
'N', blockSize, blockSize, xr, 1.0, P.Values(), xr,
470 KP.Values(), xr, 0.0, MM, blockSize);
471 MyComm.SumAll(MM, KK, blockSize*blockSize);
472 if (localVerbose > 0) {
474 for (j = 0; j < blockSize*blockSize; ++j) {
475 dotMax = (fabs(KK[j]) > dotMax) ? fabs(KK[j]) : dotMax;
478 cout <<
" K-Orthogonality check for search directions = " << dotMax << endl;
486 timeSearchP += MyWatch.WallTime();
489 timeMassOp -= MyWatch.WallTime();
492 timeMassOp += MyWatch.WallTime();
499 timeOrtho -= MyWatch.WallTime();
500 modalTool.massOrthonormalize(P, MP, M, copyQ, blockSize, 1, R.Values());
501 timeOrtho += MyWatch.WallTime();
505 timeStifOp -= MyWatch.WallTime();
507 timeStifOp += MyWatch.WallTime();
514 timeLocalProj -= MyWatch.WallTime();
515 modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, KX.Values(), xr,
517 modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, MX.Values(), xr,
519 if (localSize > blockSize) {
520 modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, KP.Values(), xr,
521 KK + blockSize*localSize, localSize, S);
522 modalTool.localProjection(blockSize, blockSize, xr, P.Values(), xr, KP.Values(), xr,
523 KK + blockSize*localSize + blockSize, localSize, S);
524 modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, MP.Values(), xr,
525 MM + blockSize*localSize, localSize, S);
526 modalTool.localProjection(blockSize, blockSize, xr, P.Values(), xr, MP.Values(), xr,
527 MM + blockSize*localSize + blockSize, localSize, S);
529 timeLocalProj += MyWatch.WallTime();
532 int nevLocal = localSize;
533 timeLocalSolve -= MyWatch.WallTime();
534 info = modalTool.directSolver(localSize, KK, localSize, MM, localSize, nevLocal,
535 S, localSize, theta, localVerbose,
536 (blockSize == 1) ? 1 : 0);
537 timeLocalSolve += MyWatch.WallTime();
545 if ((theta[0] < 0.0) || (nevLocal < blockSize)) {
546 if (localVerbose > 0) {
547 cout <<
" Iteration " << outerIter;
548 cout <<
"- Failure for spectral decomposition - RESTART with new random search\n";
550 if (blockSize == 1) {
557 nFound = blockSize - 1;
565 if ((localSize == twoBlocks) && (nevLocal == blockSize)) {
566 for (j = 0; j < nevLocal; ++j)
567 memcpy(S + j*blockSize, S + j*twoBlocks, blockSize*
sizeof(
double));
568 localSize = blockSize;
572 for (j = 0; j < nevLocal; ++j) {
573 double coeff = S[j + j*localSize];
575 callBLAS.SCAL(localSize, -1.0, S + j*localSize);
579 timeResidual -= MyWatch.WallTime();
580 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, KX.Values(), xr,
581 S, localSize, 0.0, R.Values(), xr);
582 if (localSize == twoBlocks) {
583 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, KP.Values(), xr,
584 S + blockSize, localSize, 1.0, R.Values(), xr);
586 for (j = 0; j < blockSize; ++j)
587 callBLAS.SCAL(localSize, theta[j], S + j*localSize);
588 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, -1.0, MX.Values(), xr,
589 S, localSize, 1.0, R.Values(), xr);
590 if (localSize == twoBlocks) {
591 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, -1.0, MP.Values(), xr,
592 S + blockSize, localSize, 1.0, R.Values(), xr);
594 for (j = 0; j < blockSize; ++j)
595 callBLAS.SCAL(localSize, 1.0/theta[j], S + j*localSize);
596 timeResidual += MyWatch.WallTime();
599 timeNorm -= MyWatch.WallTime();
601 R.NormWeighted(*vectWeight, normR);
607 for (j = 0; j < blockSize; ++j) {
608 normR[j] = (theta[j] == 0.0) ? normR[j] : normR[j]/theta[j];
609 if (normR[j] < tolEigenSolve)
612 timeNorm += MyWatch.WallTime();
615 if (localVerbose > 2) {
616 memcpy(resHistory + historyCount*blockSize, normR, blockSize*
sizeof(
double));
621 if (localVerbose > 0) {
622 cout <<
" Iteration " << outerIter <<
" - Number of converged eigenvectors ";
623 cout << knownEV + nFound << endl;
626 if (localVerbose > 1) {
629 cout.setf(ios::scientific, ios::floatfield);
630 for (i=0; i<blockSize; ++i) {
631 cout <<
" Iteration " << outerIter <<
" - Scaled Norm of Residual " << i;
632 cout <<
" = " << normR[i] << endl;
636 for (i=0; i<blockSize; ++i) {
637 cout <<
" Iteration " << outerIter <<
" - Ritz eigenvalue " << i;
638 cout.setf((fabs(theta[i]) < 0.01) ? ios::scientific : ios::fixed, ios::floatfield);
639 cout <<
" = " << theta[i] << endl;
647 timeLocalUpdate -= MyWatch.WallTime();
648 memcpy(H.Values(), X.Values(), xr*blockSize*
sizeof(double));
649 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, H.Values(), xr, S, localSize,
650 0.0, X.Values(), xr);
651 memcpy(H.Values(), KX.Values(), xr*blockSize*
sizeof(double));
652 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, H.Values(), xr, S, localSize,
653 0.0, KX.Values(), xr);
655 memcpy(H.Values(), MX.Values(), xr*blockSize*
sizeof(double));
656 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, H.Values(), xr, S, localSize,
657 0.0, MX.Values(), xr);
659 if (localSize == twoBlocks) {
660 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, P.Values(), xr,
661 S + blockSize, localSize, 1.0, X.Values(), xr);
662 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, KP.Values(), xr,
663 S + blockSize, localSize, 1.0, KX.Values(), xr);
665 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, MP.Values(), xr,
666 S + blockSize, localSize, 1.0, MX.Values(), xr);
669 timeLocalUpdate += MyWatch.WallTime();
673 accuracyCheck(&X, &MX, &R, 0, (localSize>blockSize) ? &P : 0);
677 accuracyCheck(&X, &MX, &R, ©Q, (localSize>blockSize) ? &P : 0);
684 int firstIndex = blockSize;
685 for (j = 0; j < blockSize; ++j) {
686 if (normR[j] >= tolEigenSolve) {
691 while (firstIndex < nFound) {
692 for (j = firstIndex; j < blockSize; ++j) {
693 if (normR[j] < tolEigenSolve) {
695 callFortran.SWAP(localSize, S + j*localSize, 1, S + firstIndex*localSize, 1);
696 callFortran.SWAP(1, theta + j, 1, theta + firstIndex, 1);
697 callFortran.SWAP(1, normR + j, 1, normR + firstIndex, 1);
701 for (j = 0; j < blockSize; ++j) {
702 if (normR[j] >= tolEigenSolve) {
710 memcpy(lambda + knownEV, theta, nFound*
sizeof(
double));
713 if (knownEV + nFound >= numEigen) {
714 callBLAS.GEMM(
'N',
'N', xr, nFound, blockSize, 1.0, X.Values(), xr,
715 S, localSize, 0.0, R.Values(), xr);
716 if (localSize > blockSize) {
717 callBLAS.GEMM(
'N',
'N', xr, nFound, blockSize, 1.0, P.Values(), xr,
718 S + blockSize, localSize, 1.0, R.Values(), xr);
720 memcpy(Q.Values() + knownEV*xr, R.Values(), nFound*xr*
sizeof(double));
722 if (localVerbose == 1) {
725 cout.setf(ios::scientific, ios::floatfield);
726 for (i=0; i<blockSize; ++i) {
727 cout <<
" Iteration " << outerIter <<
" - Scaled Norm of Residual " << i;
728 cout <<
" = " << normR[i] << endl;
736 callBLAS.GEMM(
'N',
'N', xr, nFound, blockSize, 1.0, X.Values(), xr,
737 S, localSize, 0.0, Q.Values() + knownEV*xr, xr);
738 if (localSize == twoBlocks) {
739 callBLAS.GEMM(
'N',
'N', xr, nFound, blockSize, 1.0, P.Values(), xr,
740 S + blockSize, localSize, 1.0, Q.Values() + knownEV*xr, xr);
745 timeRestart -= MyWatch.WallTime();
746 int leftOver = (nevLocal < blockSize + nFound) ? nevLocal - nFound : blockSize;
747 double *Snew = S + nFound*localSize;
748 memcpy(H.Values(), X.Values(), blockSize*xr*
sizeof(double));
749 callBLAS.GEMM(
'N',
'N', xr, leftOver, blockSize, 1.0, H.Values(), xr,
750 Snew, localSize, 0.0, X.Values(), xr);
751 memcpy(H.Values(), KX.Values(), blockSize*xr*
sizeof(double));
752 callBLAS.GEMM(
'N',
'N', xr, leftOver, blockSize, 1.0, H.Values(), xr,
753 Snew, localSize, 0.0, KX.Values(), xr);
755 memcpy(H.Values(), MX.Values(), blockSize*xr*
sizeof(double));
756 callBLAS.GEMM(
'N',
'N', xr, leftOver, blockSize, 1.0, H.Values(), xr,
757 Snew, localSize, 0.0, MX.Values(), xr);
759 if (localSize == twoBlocks) {
760 callBLAS.GEMM(
'N',
'N', xr, leftOver, blockSize, 1.0, P.Values(), xr,
761 Snew+blockSize, localSize, 1.0, X.Values(), xr);
762 callBLAS.GEMM(
'N',
'N', xr, leftOver, blockSize, 1.0, KP.Values(), xr,
763 Snew+blockSize, localSize, 1.0, KX.Values(), xr);
765 callBLAS.GEMM(
'N',
'N', xr, leftOver, blockSize, 1.0, MP.Values(), xr,
766 Snew+blockSize, localSize, 1.0, MX.Values(), xr);
769 if (nevLocal < blockSize + nFound) {
778 timeRestart += MyWatch.WallTime();
781 timeOuterLoop += MyWatch.WallTime();
782 highMem = (highMem > currentSize()) ? highMem : currentSize();
791 timePostProce -= MyWatch.WallTime();
792 if ((info == 0) && (knownEV > 0)) {
793 mySort.sortScalars_Vectors(knownEV, lambda, Q.Values(), Q.MyLength());
795 timePostProce += MyWatch.WallTime();
797 return (info == 0) ? knownEV : info;
807 cout.setf(ios::scientific, ios::floatfield);
810 int myPid = MyComm.MyPID();
815 tmp = myVerify.errorEquality(X, MX, M);
817 cout <<
" >> Relative difference between MX and M*X = " << tmp << endl;
819 tmp = myVerify.errorOrthonormality(X, M);
821 cout <<
" >> Error in X^T M X - I = " << tmp << endl;
824 tmp = myVerify.errorOrthonormality(X, 0);
826 cout <<
" >> Error in X^T X - I = " << tmp << endl;
831 tmp = myVerify.errorOrthogonality(X, R);
833 cout <<
" >> Orthogonality X^T R up to " << tmp << endl;
840 tmp = myVerify.errorOrthonormality(Q, M);
842 cout <<
" >> Error in Q^T M Q - I = " << tmp << endl;
844 tmp = myVerify.errorOrthogonality(Q, X, M);
846 cout <<
" >> Orthogonality Q^T M X up to " << tmp << endl;
849 tmp = myVerify.errorOrthogonality(Q, P, M);
851 cout <<
" >> Orthogonality Q^T M P up to " << tmp << endl;
855 tmp = myVerify.errorOrthonormality(Q, 0);
857 cout <<
" >> Error in Q^T Q - I = " << tmp << endl;
859 tmp = myVerify.errorOrthogonality(Q, X, 0);
861 cout <<
" >> Orthogonality Q^T X up to " << tmp << endl;
864 tmp = myVerify.errorOrthogonality(Q, P, 0);
866 cout <<
" >> Orthogonality Q^T P up to " << tmp << endl;
873 void BRQMIN::initializeCounters() {
892 timeLocalSolve = 0.0;
893 timeLocalUpdate = 0.0;
909 void BRQMIN::algorithmInfo()
const {
911 int myPid = MyComm.MyPID();
914 cout <<
" Algorithm: BRQMIN with Cholesky-based local eigensolver\n";
915 cout <<
" Block Size: " << blockSize << endl;
921 void BRQMIN::historyInfo()
const {
925 cout <<
" Block Size: " << blockSize << endl;
927 cout <<
" Residuals\n";
930 cout.setf(ios::scientific, ios::floatfield);
931 for (j = 0; j < historyCount; ++j) {
933 for (ii = 0; ii < blockSize; ++ii)
934 cout << resHistory[blockSize*j + ii] << endl;
942 void BRQMIN::memoryInfo()
const {
944 int myPid = MyComm.MyPID();
946 double maxHighMem = 0.0;
947 double tmp = highMem;
948 MyComm.MaxAll(&tmp, &maxHighMem, 1);
950 double maxMemRequested = 0.0;
952 MyComm.MaxAll(&tmp, &maxMemRequested, 1);
956 cout.setf(ios::fixed, ios::floatfield);
957 cout <<
" Memory requested per processor by the eigensolver = (EST) ";
959 cout << maxMemRequested <<
" MB " << endl;
961 cout <<
" High water mark in eigensolver = (EST) ";
963 cout << maxHighMem <<
" MB " << endl;
970 void BRQMIN::operationInfo()
const {
972 int myPid = MyComm.MyPID();
975 cout <<
" --- Operations ---\n\n";
976 cout <<
" Total number of mass matrix multiplications = ";
978 cout << massOp + modalTool.getNumProj_MassMult() + modalTool.getNumNorm_MassMult() << endl;
979 cout <<
" Mass multiplications in eigensolver = ";
981 cout << massOp << endl;
982 cout <<
" Mass multiplications for orthogonalization = ";
984 cout << modalTool.getNumProj_MassMult() << endl;
985 cout <<
" Mass multiplications for normalization = ";
987 cout << modalTool.getNumNorm_MassMult() << endl;
988 cout <<
" Total number of stiffness matrix operations = ";
990 cout << stifOp << endl;
991 cout <<
" Total number of preconditioner applications = ";
993 cout << precOp << endl;
994 cout <<
" Total number of computed eigen-residuals = ";
996 cout << residual << endl;
998 cout <<
" Total number of outer iterations = ";
1000 cout << outerIter << endl;
1001 cout <<
" Number of restarts = ";
1003 cout << numRestart << endl;
1010 void BRQMIN::timeInfo()
const {
1012 int myPid = MyComm.MyPID();
1015 cout <<
" --- Timings ---\n\n";
1016 cout.setf(ios::fixed, ios::floatfield);
1018 cout <<
" Total time for outer loop = ";
1020 cout << timeOuterLoop <<
" s" << endl;
1021 cout <<
" Time for mass matrix operations = ";
1023 cout << timeMassOp <<
" s ";
1025 cout << 100*timeMassOp/timeOuterLoop <<
" %\n";
1026 cout <<
" Time for stiffness matrix operations = ";
1028 cout << timeStifOp <<
" s ";
1030 cout << 100*timeStifOp/timeOuterLoop <<
" %\n";
1031 cout <<
" Time for preconditioner applications = ";
1033 cout << timePrecOp <<
" s ";
1035 cout << 100*timePrecOp/timeOuterLoop <<
" %\n";
1036 cout <<
" Time for defining search directions = ";
1038 cout << timeSearchP <<
" s ";
1040 cout << 100*timeSearchP/timeOuterLoop <<
" %\n";
1041 cout <<
" Time for orthogonalizations = ";
1043 cout << timeOrtho <<
" s ";
1045 cout << 100*timeOrtho/timeOuterLoop <<
" %\n";
1046 cout <<
" Projection step : ";
1048 cout << modalTool.getTimeProj() <<
" s\n";
1049 cout <<
" Q^T mult. :: ";
1051 cout << modalTool.getTimeProj_QtMult() <<
" s\n";
1052 cout <<
" Q mult. :: ";
1054 cout << modalTool.getTimeProj_QMult() <<
" s\n";
1055 cout <<
" Mass mult. :: ";
1057 cout << modalTool.getTimeProj_MassMult() <<
" s\n";
1058 cout <<
" Normalization step : ";
1060 cout << modalTool.getTimeNorm() <<
" s\n";
1061 cout <<
" Mass mult. :: ";
1063 cout << modalTool.getTimeNorm_MassMult() <<
" s\n";
1064 cout <<
" Time for local projection = ";
1066 cout << timeLocalProj <<
" s ";
1068 cout << 100*timeLocalProj/timeOuterLoop <<
" %\n";
1069 cout <<
" Time for local eigensolve = ";
1071 cout << timeLocalSolve <<
" s ";
1073 cout << 100*timeLocalSolve/timeOuterLoop <<
" %\n";
1074 cout <<
" Time for local update = ";
1076 cout << timeLocalUpdate <<
" s ";
1078 cout << 100*timeLocalUpdate/timeOuterLoop <<
" %\n";
1079 cout <<
" Time for residual computations = ";
1081 cout << timeResidual <<
" s ";
1083 cout << 100*timeResidual/timeOuterLoop <<
" %\n";
1084 cout <<
" Time for residuals norm computations = ";
1086 cout << timeNorm <<
" s ";
1088 cout << 100*timeNorm/timeOuterLoop <<
" %\n";
1089 cout <<
" Time for restarting space definition = ";
1091 cout << timeRestart <<
" s ";
1093 cout << 100*timeRestart/timeOuterLoop <<
" %\n";
1095 cout <<
" Total time for post-processing = ";
1097 cout << timePostProce <<
" s\n";