19 #include "BlockDACG.h"
24 double _tol,
int _maxIter,
int _verb) :
36 maxIterEigenSolve(_maxIter),
70 double _tol,
int _maxIter,
int _verb,
83 maxIterEigenSolve(_maxIter),
115 BlockDACG::~BlockDACG() {
127 return BlockDACG::reSolve(numEigen, Q, lambda);
131 int BlockDACG::reSolve(
int numEigen,
Epetra_MultiVector &Q,
double *lambda,
int startingEV) {
184 if (numEigen <= startingEV) {
188 int info = myVerify.inputArguments(numEigen, K, M, Prec, Q, numEigen + blockSize);
192 int myPid = MyComm.MyPID();
200 int knownEV = startingEV;
201 int localVerbose = verbose*(myPid==0);
216 int xr = Q.MyLength();
221 tmp = (M == 0) ? 5*blockSize*xr : 7*blockSize*xr;
223 double *work1 =
new (nothrow)
double[tmp];
230 memRequested +=
sizeof(double)*tmp/(1024.0*1024.0);
232 highMem = (highMem > currentSize()) ? highMem : currentSize();
234 double *tmpD = work1;
237 tmpD = tmpD + xr*blockSize;
240 tmpD = (M) ? tmpD + xr*blockSize : tmpD;
243 tmpD = tmpD + xr*blockSize;
246 tmpD = tmpD + xr*blockSize;
249 tmpD = tmpD + xr*blockSize;
252 tmpD = tmpD + xr*blockSize;
270 lwork2 = 5*blockSize + 12*blockSize*blockSize;
271 double *work2 =
new (nothrow)
double[lwork2];
280 highMem = (highMem > currentSize()) ? highMem : currentSize();
284 double *theta = tmpD;
285 tmpD = tmpD + 2*blockSize;
287 double *normR = tmpD;
288 tmpD = tmpD + blockSize;
290 double *oldHtR = tmpD;
291 tmpD = tmpD + blockSize;
293 double *currentHtR = tmpD;
294 tmpD = tmpD + blockSize;
295 memset(currentHtR, 0, blockSize*
sizeof(
double));
298 tmpD = tmpD + 4*blockSize*blockSize;
301 tmpD = tmpD + 4*blockSize*blockSize;
305 memRequested +=
sizeof(double)*lwork2/(1024.0*1024.0);
308 if (localVerbose > 2) {
309 resHistory =
new (nothrow)
double[maxIterEigenSolve*blockSize];
310 if (resHistory == 0) {
323 bool reStart =
false;
327 int twoBlocks = 2*blockSize;
328 int nFound = blockSize;
331 if (localVerbose > 0) {
333 cout <<
" *|* Problem: ";
335 cout <<
"K*Q = M*Q D ";
337 cout <<
"K*Q = Q D ";
339 cout <<
" with preconditioner";
341 cout <<
" *|* Algorithm = DACG (block version)" << endl;
342 cout <<
" *|* Size of blocks = " << blockSize << endl;
343 cout <<
" *|* Number of requested eigenvalues = " << numEigen << endl;
345 cout.setf(ios::scientific, ios::floatfield);
346 cout <<
" *|* Tolerance for convergence = " << tolEigenSolve << endl;
347 cout <<
" *|* Norm used for convergence: ";
349 cout <<
"weighted L2-norm with user-provided weights" << endl;
351 cout <<
"L^2-norm" << endl;
353 cout <<
" *|* Input converged eigenvectors = " << startingEV << endl;
354 cout <<
"\n -- Start iterations -- \n";
357 timeOuterLoop -= MyWatch.WallTime();
358 for (outerIter = 1; outerIter <= maxIterEigenSolve; ++outerIter) {
360 highMem = (highMem > currentSize()) ? highMem : currentSize();
362 if ((outerIter == 1) || (reStart ==
true)) {
365 localSize = blockSize;
374 timeMassOp -= MyWatch.WallTime();
377 timeMassOp += MyWatch.WallTime();
384 timeOrtho -= MyWatch.WallTime();
385 info = modalTool.massOrthonormalize(X, MX, M, copyQ, nFound, 0, R.Values());
386 timeOrtho += MyWatch.WallTime();
399 timeStifOp -= MyWatch.WallTime();
401 timeStifOp += MyWatch.WallTime();
411 timePrecOp -= MyWatch.WallTime();
412 Prec->ApplyInverse(R, H);
413 timePrecOp += MyWatch.WallTime();
417 memcpy(H.Values(), R.Values(), xr*blockSize*
sizeof(double));
421 timeSearchP -= MyWatch.WallTime();
422 memcpy(oldHtR, currentHtR, blockSize*
sizeof(
double));
423 H.Dot(R, currentHtR);
425 if (localSize == blockSize) {
427 localSize = twoBlocks;
430 bool hasZeroDot =
false;
431 for (j = 0; j < blockSize; ++j) {
432 if (oldHtR[j] == 0.0) {
436 callBLAS.SCAL(xr, currentHtR[j]/oldHtR[j], P.Values() + j*xr);
438 if (hasZeroDot ==
true) {
440 if (localVerbose > 0) {
442 cout <<
" !! Null dot product -- Restart the search space !!\n";
445 if (blockSize == 1) {
452 nFound = blockSize - j;
459 callBLAS.AXPY(xr*blockSize, -1.0, H.Values(), P.Values());
461 timeSearchP += MyWatch.WallTime();
464 timeMassOp -= MyWatch.WallTime();
467 timeMassOp += MyWatch.WallTime();
474 timeOrtho -= MyWatch.WallTime();
475 modalTool.massOrthonormalize(P, MP, M, copyQ, blockSize, 1, R.Values());
476 timeOrtho += MyWatch.WallTime();
480 timeStifOp -= MyWatch.WallTime();
482 timeStifOp += MyWatch.WallTime();
489 timeLocalProj -= MyWatch.WallTime();
490 modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, KX.Values(), xr,
492 modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, MX.Values(), xr,
494 if (localSize > blockSize) {
495 modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, KP.Values(), xr,
496 KK + blockSize*localSize, localSize, S);
497 modalTool.localProjection(blockSize, blockSize, xr, P.Values(), xr, KP.Values(), xr,
498 KK + blockSize*localSize + blockSize, localSize, S);
499 modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, MP.Values(), xr,
500 MM + blockSize*localSize, localSize, S);
501 modalTool.localProjection(blockSize, blockSize, xr, P.Values(), xr, MP.Values(), xr,
502 MM + blockSize*localSize + blockSize, localSize, S);
504 timeLocalProj += MyWatch.WallTime();
507 timeLocalSolve -= MyWatch.WallTime();
508 int nevLocal = localSize;
509 info = modalTool.directSolver(localSize, KK, localSize, MM, localSize, nevLocal,
510 S, localSize, theta, localVerbose,
511 (blockSize == 1) ? 1: 0);
512 timeLocalSolve += MyWatch.WallTime();
520 if ((theta[0] < 0.0) || (nevLocal < blockSize)) {
521 if (localVerbose > 0) {
522 cout <<
" Iteration " << outerIter;
523 cout <<
"- Failure for spectral decomposition - RESTART with new random search\n";
525 if (blockSize == 1) {
532 nFound = blockSize - 1;
540 if ((localSize == twoBlocks) && (nevLocal == blockSize)) {
541 for (j = 0; j < nevLocal; ++j)
542 memcpy(S + j*blockSize, S + j*twoBlocks, blockSize*
sizeof(
double));
543 localSize = blockSize;
548 for (j = 0; j < nevLocal; ++j) {
549 double coeff = S[j + j*localSize];
551 callBLAS.SCAL(localSize, -1.0, S + j*localSize);
555 timeResidual -= MyWatch.WallTime();
556 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, KX.Values(), xr,
557 S, localSize, 0.0, R.Values(), xr);
558 if (localSize == twoBlocks) {
559 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, KP.Values(), xr,
560 S + blockSize, localSize, 1.0, R.Values(), xr);
562 for (j = 0; j < blockSize; ++j)
563 callBLAS.SCAL(localSize, theta[j], S + j*localSize);
564 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, -1.0, MX.Values(), xr,
565 S, localSize, 1.0, R.Values(), xr);
566 if (localSize == twoBlocks) {
567 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, -1.0, MP.Values(), xr,
568 S + blockSize, localSize, 1.0, R.Values(), xr);
570 for (j = 0; j < blockSize; ++j)
571 callBLAS.SCAL(localSize, 1.0/theta[j], S + j*localSize);
572 timeResidual += MyWatch.WallTime();
575 timeNorm -= MyWatch.WallTime();
577 R.NormWeighted(*vectWeight, normR);
583 for (j = 0; j < blockSize; ++j) {
584 normR[j] = (theta[j] == 0.0) ? normR[j] : normR[j]/theta[j];
585 if (normR[j] < tolEigenSolve)
588 timeNorm += MyWatch.WallTime();
591 if (localVerbose > 2) {
592 memcpy(resHistory + historyCount*blockSize, normR, blockSize*
sizeof(
double));
597 if (localVerbose > 0) {
598 cout <<
" Iteration " << outerIter <<
" - Number of converged eigenvectors ";
599 cout << knownEV + nFound << endl;
602 if (localVerbose > 1) {
605 cout.setf(ios::scientific, ios::floatfield);
606 for (i=0; i<blockSize; ++i) {
607 cout <<
" Iteration " << outerIter <<
" - Scaled Norm of Residual " << i;
608 cout <<
" = " << normR[i] << endl;
612 for (i=0; i<blockSize; ++i) {
613 cout <<
" Iteration " << outerIter <<
" - Ritz eigenvalue " << i;
614 cout.setf((fabs(theta[i]) < 0.01) ? ios::scientific : ios::fixed, ios::floatfield);
615 cout <<
" = " << theta[i] << endl;
623 timeLocalUpdate -= MyWatch.WallTime();
624 memcpy(H.Values(), X.Values(), xr*blockSize*
sizeof(double));
625 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, H.Values(), xr, S, localSize,
626 0.0, X.Values(), xr);
627 memcpy(H.Values(), KX.Values(), xr*blockSize*
sizeof(double));
628 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, H.Values(), xr, S, localSize,
629 0.0, KX.Values(), xr);
631 memcpy(H.Values(), MX.Values(), xr*blockSize*
sizeof(double));
632 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, H.Values(), xr, S, localSize,
633 0.0, MX.Values(), xr);
635 if (localSize == twoBlocks) {
636 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, P.Values(), xr,
637 S + blockSize, localSize, 1.0, X.Values(), xr);
638 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, KP.Values(), xr,
639 S + blockSize, localSize, 1.0, KX.Values(), xr);
641 callBLAS.GEMM(
'N',
'N', xr, blockSize, blockSize, 1.0, MP.Values(), xr,
642 S + blockSize, localSize, 1.0, MX.Values(), xr);
645 timeLocalUpdate += MyWatch.WallTime();
649 accuracyCheck(&X, &MX, &R, 0, (localSize>blockSize) ? &P : 0);
653 accuracyCheck(&X, &MX, &R, ©Q, (localSize>blockSize) ? &P : 0);
660 int firstIndex = blockSize;
661 for (j = 0; j < blockSize; ++j) {
662 if (normR[j] >= tolEigenSolve) {
667 while (firstIndex < nFound) {
668 for (j = firstIndex; j < blockSize; ++j) {
669 if (normR[j] < tolEigenSolve) {
671 callFortran.SWAP(localSize, S + j*localSize, 1, S + firstIndex*localSize, 1);
672 callFortran.SWAP(1, theta + j, 1, theta + firstIndex, 1);
673 callFortran.SWAP(1, normR + j, 1, normR + firstIndex, 1);
677 for (j = 0; j < blockSize; ++j) {
678 if (normR[j] >= tolEigenSolve) {
686 memcpy(lambda + knownEV, theta, nFound*
sizeof(
double));
689 if (knownEV + nFound >= numEigen) {
690 callBLAS.GEMM(
'N',
'N', xr, nFound, blockSize, 1.0, X.Values(), xr,
691 S, localSize, 0.0, R.Values(), xr);
692 if (localSize > blockSize) {
693 callBLAS.GEMM(
'N',
'N', xr, nFound, blockSize, 1.0, P.Values(), xr,
694 S + blockSize, localSize, 1.0, R.Values(), xr);
696 memcpy(Q.Values() + knownEV*xr, R.Values(), nFound*xr*
sizeof(double));
698 if (localVerbose == 1) {
701 cout.setf(ios::scientific, ios::floatfield);
702 for (i=0; i<blockSize; ++i) {
703 cout <<
" Iteration " << outerIter <<
" - Scaled Norm of Residual " << i;
704 cout <<
" = " << normR[i] << endl;
712 callBLAS.GEMM(
'N',
'N', xr, nFound, blockSize, 1.0, X.Values(), xr,
713 S, localSize, 0.0, Q.Values() + knownEV*xr, xr);
714 if (localSize == twoBlocks) {
715 callBLAS.GEMM(
'N',
'N', xr, nFound, blockSize, 1.0, P.Values(), xr,
716 S + blockSize, localSize, 1.0, Q.Values() + knownEV*xr, xr);
721 timeRestart -= MyWatch.WallTime();
722 int leftOver = (nevLocal < blockSize + nFound) ? nevLocal - nFound : blockSize;
723 double *Snew = S + nFound*localSize;
724 memcpy(H.Values(), X.Values(), blockSize*xr*
sizeof(double));
725 callBLAS.GEMM(
'N',
'N', xr, leftOver, blockSize, 1.0, H.Values(), xr,
726 Snew, localSize, 0.0, X.Values(), xr);
727 memcpy(H.Values(), KX.Values(), blockSize*xr*
sizeof(double));
728 callBLAS.GEMM(
'N',
'N', xr, leftOver, blockSize, 1.0, H.Values(), xr,
729 Snew, localSize, 0.0, KX.Values(), xr);
731 memcpy(H.Values(), MX.Values(), blockSize*xr*
sizeof(double));
732 callBLAS.GEMM(
'N',
'N', xr, leftOver, blockSize, 1.0, H.Values(), xr,
733 Snew, localSize, 0.0, MX.Values(), xr);
735 if (localSize == twoBlocks) {
736 callBLAS.GEMM(
'N',
'N', xr, leftOver, blockSize, 1.0, P.Values(), xr,
737 Snew+blockSize, localSize, 1.0, X.Values(), xr);
738 callBLAS.GEMM(
'N',
'N', xr, leftOver, blockSize, 1.0, KP.Values(), xr,
739 Snew+blockSize, localSize, 1.0, KX.Values(), xr);
741 callBLAS.GEMM(
'N',
'N', xr, leftOver, blockSize, 1.0, MP.Values(), xr,
742 Snew+blockSize, localSize, 1.0, MX.Values(), xr);
745 if (nevLocal < blockSize + nFound) {
754 timeRestart += MyWatch.WallTime();
757 timeOuterLoop += MyWatch.WallTime();
758 highMem = (highMem > currentSize()) ? highMem : currentSize();
767 timePostProce -= MyWatch.WallTime();
768 if ((info == 0) && (knownEV > 0)) {
769 mySort.sortScalars_Vectors(knownEV, lambda, Q.Values(), Q.MyLength());
771 timePostProce += MyWatch.WallTime();
773 return (info == 0) ? knownEV : info;
783 cout.setf(ios::scientific, ios::floatfield);
786 int myPid = MyComm.MyPID();
791 tmp = myVerify.errorEquality(X, MX, M);
793 cout <<
" >> Difference between MX and M*X = " << tmp << endl;
795 tmp = myVerify.errorOrthonormality(X, M);
797 cout <<
" >> Error in X^T M X - I = " << tmp << endl;
800 tmp = myVerify.errorOrthonormality(X, 0);
802 cout <<
" >> Error in X^T X - I = " << tmp << endl;
807 tmp = myVerify.errorOrthogonality(X, R);
809 cout <<
" >> Orthogonality X^T R up to " << tmp << endl;
816 tmp = myVerify.errorOrthonormality(Q, M);
818 cout <<
" >> Error in Q^T M Q - I = " << tmp << endl;
820 tmp = myVerify.errorOrthogonality(Q, X, M);
822 cout <<
" >> Orthogonality Q^T M X up to " << tmp << endl;
825 tmp = myVerify.errorOrthogonality(Q, P, M);
827 cout <<
" >> Orthogonality Q^T M P up to " << tmp << endl;
831 tmp = myVerify.errorOrthonormality(Q, 0);
833 cout <<
" >> Error in Q^T Q - I = " << tmp << endl;
835 tmp = myVerify.errorOrthogonality(Q, X, 0);
837 cout <<
" >> Orthogonality Q^T X up to " << tmp << endl;
840 tmp = myVerify.errorOrthogonality(Q, P, 0);
842 cout <<
" >> Orthogonality Q^T P up to " << tmp << endl;
849 void BlockDACG::initializeCounters() {
868 timeLocalSolve = 0.0;
869 timeLocalUpdate = 0.0;
885 void BlockDACG::algorithmInfo()
const {
887 int myPid = MyComm.MyPID();
890 cout <<
" Algorithm: DACG (block version) with Cholesky-based local eigensolver\n";
891 cout <<
" Block Size: " << blockSize << endl;
897 void BlockDACG::historyInfo()
const {
901 cout <<
" Block Size: " << blockSize << endl;
903 cout <<
" Residuals\n";
906 cout.setf(ios::scientific, ios::floatfield);
907 for (j = 0; j < historyCount; ++j) {
909 for (ii = 0; ii < blockSize; ++ii)
910 cout << resHistory[blockSize*j + ii] << endl;
918 void BlockDACG::memoryInfo()
const {
920 int myPid = MyComm.MyPID();
922 double maxHighMem = 0.0;
923 double tmp = highMem;
924 MyComm.MaxAll(&tmp, &maxHighMem, 1);
926 double maxMemRequested = 0.0;
928 MyComm.MaxAll(&tmp, &maxMemRequested, 1);
932 cout.setf(ios::fixed, ios::floatfield);
933 cout <<
" Memory requested per processor by the eigensolver = (EST) ";
935 cout << maxMemRequested <<
" MB " << endl;
937 cout <<
" High water mark in eigensolver = (EST) ";
939 cout << maxHighMem <<
" MB " << endl;
946 void BlockDACG::operationInfo()
const {
948 int myPid = MyComm.MyPID();
951 cout <<
" --- Operations ---\n\n";
952 cout <<
" Total number of mass matrix multiplications = ";
954 cout << massOp + modalTool.getNumProj_MassMult() + modalTool.getNumNorm_MassMult() << endl;
955 cout <<
" Mass multiplications in eigensolver = ";
957 cout << massOp << endl;
958 cout <<
" Mass multiplications for orthogonalization = ";
960 cout << modalTool.getNumProj_MassMult() << endl;
961 cout <<
" Mass multiplications for normalization = ";
963 cout << modalTool.getNumNorm_MassMult() << endl;
964 cout <<
" Total number of stiffness matrix operations = ";
966 cout << stifOp << endl;
967 cout <<
" Total number of preconditioner applications = ";
969 cout << precOp << endl;
970 cout <<
" Total number of computed eigen-residuals = ";
972 cout << residual << endl;
974 cout <<
" Total number of outer iterations = ";
976 cout << outerIter << endl;
977 cout <<
" Number of restarts = ";
979 cout << numRestart << endl;
986 void BlockDACG::timeInfo()
const {
988 int myPid = MyComm.MyPID();
991 cout <<
" --- Timings ---\n\n";
992 cout.setf(ios::fixed, ios::floatfield);
994 cout <<
" Total time for outer loop = ";
996 cout << timeOuterLoop <<
" s" << endl;
997 cout <<
" Time for mass matrix operations = ";
999 cout << timeMassOp <<
" s ";
1001 cout << 100*timeMassOp/timeOuterLoop <<
" %\n";
1002 cout <<
" Time for stiffness matrix operations = ";
1004 cout << timeStifOp <<
" s ";
1006 cout << 100*timeStifOp/timeOuterLoop <<
" %\n";
1007 cout <<
" Time for preconditioner applications = ";
1009 cout << timePrecOp <<
" s ";
1011 cout << 100*timePrecOp/timeOuterLoop <<
" %\n";
1012 cout <<
" Time for defining search directions = ";
1014 cout << timeSearchP <<
" s ";
1016 cout << 100*timeSearchP/timeOuterLoop <<
" %\n";
1017 cout <<
" Time for orthogonalizations = ";
1019 cout << timeOrtho <<
" s ";
1021 cout << 100*timeOrtho/timeOuterLoop <<
" %\n";
1022 cout <<
" Projection step : ";
1024 cout << modalTool.getTimeProj() <<
" s\n";
1025 cout <<
" Q^T mult. :: ";
1027 cout << modalTool.getTimeProj_QtMult() <<
" s\n";
1028 cout <<
" Q mult. :: ";
1030 cout << modalTool.getTimeProj_QMult() <<
" s\n";
1031 cout <<
" Mass mult. :: ";
1033 cout << modalTool.getTimeProj_MassMult() <<
" s\n";
1034 cout <<
" Normalization step : ";
1036 cout << modalTool.getTimeNorm() <<
" s\n";
1037 cout <<
" Mass mult. :: ";
1039 cout << modalTool.getTimeNorm_MassMult() <<
" s\n";
1040 cout <<
" Time for local projection = ";
1042 cout << timeLocalProj <<
" s ";
1044 cout << 100*timeLocalProj/timeOuterLoop <<
" %\n";
1045 cout <<
" Time for local eigensolve = ";
1047 cout << timeLocalSolve <<
" s ";
1049 cout << 100*timeLocalSolve/timeOuterLoop <<
" %\n";
1050 cout <<
" Time for local update = ";
1052 cout << timeLocalUpdate <<
" s ";
1054 cout << 100*timeLocalUpdate/timeOuterLoop <<
" %\n";
1055 cout <<
" Time for residual computations = ";
1057 cout << timeResidual <<
" s ";
1059 cout << 100*timeResidual/timeOuterLoop <<
" %\n";
1060 cout <<
" Time for residuals norm computations = ";
1062 cout << timeNorm <<
" s ";
1064 cout << 100*timeNorm/timeOuterLoop <<
" %\n";
1065 cout <<
" Time for restarting space definition = ";
1067 cout << timeRestart <<
" s ";
1069 cout << 100*timeRestart/timeOuterLoop <<
" %\n";
1071 cout <<
" Total time for post-processing = ";
1073 cout << timePostProce <<
" s\n";