37 #include <Teuchos_Assert.hpp>
42 double _tol,
int _maxIterES,
int _maxIterLS,
int _verb,
double *_weight) :
55 maxIterEigenSolve(_maxIterES),
56 maxIterLinearSolve(_maxIterLS),
73 numCorrectionSolve(0),
82 timeCorrectionPrec(0.0),
83 timeCorrectionSolve(0.0),
112 if (spaceSizeHistory) {
113 delete[] spaceSizeHistory;
114 spaceSizeHistory = 0;
117 if (iterPCGHistory) {
118 delete[] iterPCGHistory;
127 double *invQtMPMQ,
int ldQtMPMQ,
double *PMQ,
double *work,
double *WS) {
155 int bC = B.NumVectors();
156 int bR = B.MyLength();
159 timeCorrectionPrec -= MyWatch.WallTime();
161 Prec->ApplyInverse(B, PrecB);
164 memcpy(PrecB.Values(), B.Values(), bR*bC*
sizeof(double));
166 timeCorrectionPrec += MyWatch.WallTime();
168 int uC = U->NumVectors();
169 int vC = (V) ? V->NumVectors() : 0;
174 callBLAS.
GEMM(
'T',
'N', sizeQ, bC, bR, 1.0, PMQ, bR, B.Values(), bR,
175 0.0, work + sizeQ*bC, sizeQ);
176 MyComm.SumAll(work + sizeQ*bC, work, sizeQ*bC);
178 memcpy(work + sizeQ*bC, work, sizeQ*bC*
sizeof(
double));
181 callLAPACK.POTRS(
'U', sizeQ, bC, invQtMPMQ, ldQtMPMQ, work, sizeQ, &info);
184 callBLAS.GEMM(
'N',
'N', bR, bC, sizeQ, -1.0, PMQ, bR, work, sizeQ, 1.0, PrecB.Values(), bR);
187 M->Apply(PrecB, MPB);
190 callBLAS.GEMM(
'T',
'N', uC, bC, bR, 1.0, U->Values(), bR, MPB.Values(), bR,
191 0.0, work + vC + sizeQ*bC, sizeQ);
193 callBLAS.GEMM(
'T',
'N', vC, bC, bR, 1.0, V->Values(), bR, MPB.Values(), bR,
194 0.0, work + sizeQ*bC, sizeQ);
196 MyComm.SumAll(work + sizeQ*bC, work, sizeQ*bC);
199 double *Mnorm =
new double[bC];
200 MPB.Dot(PrecB, Mnorm);
202 bool doSecond =
false;
203 for (
int j = 0; j < bC; ++j) {
204 double tolOrtho = 1.0e-28*Mnorm[j];
205 for (
int i = 0; i < sizeQ; ++i) {
206 double tmp = work[i + j*sizeQ];
207 if (tmp*tmp > tolOrtho) {
212 if (doSecond ==
true) {
214 callLAPACK.POTRS(
'U', sizeQ, bC, invQtMPMQ, ldQtMPMQ, work, sizeQ, &info);
216 callBLAS.GEMM(
'N',
'N', bR, bC, sizeQ, -1.0, PMQ, bR, work, sizeQ, 1.0, PrecB.Values(), bR);
222 numCorrectionPrec += bC;
231 double eta,
double tolCG,
int iterMax,
232 double *invQtMPMQ,
int ldQtMPMQ,
double *PMQ,
233 double *workPrec,
double *workPCG,
275 int xrow = Y.MyLength();
276 int xcol = Y.NumVectors();
279 int localVerbose = verbose*(MyComm.MyPID() == 0);
281 double *pointer = workPCG;
286 double *PtKP = pointer;
287 pointer = pointer + xcol*xcol;
290 double *coeff = pointer;
291 pointer = pointer + xcol*xcol;
294 double *workD = pointer;
295 pointer = pointer + xcol*xcol;
298 double *da = pointer;
299 pointer = pointer + xcol;
302 double *initNorm = pointer;
303 pointer = pointer + xcol;
306 double *resNorm = pointer;
307 pointer = pointer + xcol;
310 double *valZ = pointer;
311 pointer = pointer + xrow*xcol;
315 double *valP = pointer;
316 pointer = pointer + xrow*xcol;
320 double *valKP = pointer;
321 pointer = pointer + xrow*xcol;
326 double *UtKU = pointer;
327 pointer = pointer + xcol*xcol;
330 double *theta = pointer;
331 pointer = pointer + xcol;
334 double *resEig = pointer;
335 pointer = pointer + xcol;
338 double *oldEig = pointer;
339 pointer = pointer + xcol;
342 double *valKU = pointer;
343 pointer = pointer + xrow*xcol;
359 sizeQ += (U) ? U->NumVectors() : 0;
360 sizeQ += (V) ? V->NumVectors() : 0;
362 bool isNegative =
false;
364 Rlin.Norm2(initNorm);
366 if (localVerbose > 3) {
369 cout.setf(ios::scientific, ios::floatfield);
370 for (ii = 0; ii < xcol; ++ii) {
371 cout <<
" ... Initial Residual Norm " << ii <<
" = " << initNorm[ii] << endl;
377 timePCGLoop -= MyWatch.WallTime();
378 for (iter = 1; iter <= iterMax; ++iter) {
381 timePCGPrec -= MyWatch.WallTime();
382 jacobiPreconditioner(Rlin, Z, U, V, invQtMPMQ, ldQtMPMQ, PMQ, workPrec, MU.Values());
383 timePCGPrec += MyWatch.WallTime();
389 modalTool.massOrthonormalize(Z, MU, M, *orthoVec, blockSize, 1);
398 callBLAS.GEMM(
'T',
'N', xcol, xcol, xrow, 1.0, KP.Values(), xrow, Z.Values(), xrow,
400 MyComm.SumAll(workD, coeff, xcol*xcol);
403 callBLAS.GEMM(
'T',
'N', xcol, xcol, xcol, 1.0, PtKP, xcol, coeff, xcol,
405 for (ii = 0; ii < xcol; ++ii)
406 callFortran.SCAL_INCX(xcol, da[ii], workD + ii, xcol);
407 callBLAS.GEMM(
'N',
'N', xcol, xcol, xcol, 1.0, PtKP, xcol, workD, xcol,
412 memcpy(KP.Values(), P.Values(), xrow*xcol*
sizeof(double));
413 callBLAS.GEMM(
'N',
'N', xrow, xcol, xcol, 1.0, KP.Values(), xrow, coeff, xcol,
414 0.0, P.Values(), xrow);
416 P.Update(1.0, Z, -1.0);
420 timePCGOpMult -= MyWatch.WallTime();
422 numPCGstifOp += xcol;
428 callBLAS.AXPY(xrow*xcol, -eta, Z.Values(), KP.Values());
431 callBLAS.AXPY(xrow*xcol, -eta, P.Values(), KP.Values());
433 numPCGmassOp += xcol;
435 timePCGOpMult += MyWatch.WallTime();
438 callBLAS.GEMM(
'T',
'N', xcol, xcol, xrow, 1.0, P.Values(), xrow, KP.Values(), xrow,
440 MyComm.SumAll(workD, PtKP, xcol*xcol);
444 info = modalTool.directSolver(xcol, PtKP, xcol, 0, 0, nev, PtKP, xcol, da, 0, 10);
448 info = - iterMax - 1;
450 maxIterPCG = (iter > maxIterPCG) ? iter : maxIterPCG;
455 for (ii = 0; ii < xcol; ++ii) {
461 da[ii] = (da[ii] == 0.0) ? 0.0 : 1.0/da[ii];
465 if (isNegative ==
true) {
466 if (localVerbose > 0) {
469 cout.setf(ios::scientific, ios::floatfield);
470 cout <<
" !! Negative eigenvalue in block PCG (" << da[ii] <<
") !!\n";
475 maxIterPCG = (iter > maxIterPCG) ? iter : maxIterPCG;
480 callBLAS.GEMM(
'T',
'N', xcol, xcol, xrow, 1.0, P.Values(), xrow, Rlin.Values(), xrow,
482 MyComm.SumAll(workD, coeff, xcol*xcol);
485 callBLAS.GEMM(
'T',
'N', xcol, xcol, xcol, 1.0, PtKP, xcol, coeff, xcol,
487 for (ii = 0; ii < xcol; ++ii)
488 callFortran.SCAL_INCX(xcol, da[ii], workD + ii, xcol);
489 callBLAS.GEMM(
'N',
'N', xcol, xcol, xcol, 1.0, PtKP, xcol, workD, xcol,
493 callBLAS.GEMM(
'N',
'N', xrow, xcol, xcol, 1.0, P.Values(), xrow, coeff, xcol,
494 1.0, Y.Values(), xrow);
497 callBLAS.GEMM(
'N',
'N', xrow, xcol, xcol, -1.0, KP.Values(), xrow, coeff, xcol,
498 1.0, Rlin.Values(), xrow);
503 for (ii = 0; ii < xcol; ++ii) {
504 if (resNorm[ii] <= tolCG*initNorm[ii])
508 if (localVerbose > 3) {
510 for (ii = 0; ii < xcol; ++ii) {
513 cout << ii <<
" ... Residual = ";
515 cout.setf(ios::scientific, ios::floatfield);
516 cout << resNorm[ii] <<
" ... Right Hand Side = " << initNorm[ii] << endl;
521 if (nFound == xcol) {
526 timePCGEigCheck -= MyWatch.WallTime();
530 Z.Update(1.0, *U, -1.0, Y, 0.0);
534 numPCGstifOp += xcol;
535 callBLAS.GEMM(
'T',
'N', xcol, xcol, xrow, 1.0, Z.Values(), xrow, KU.Values(), xrow,
537 MyComm.SumAll(workD, UtKU, xcol*xcol);
543 numPCGmassOp += xcol;
544 callBLAS.GEMM(
'T',
'N', xcol, xcol, xrow, 1.0, Z.Values(), xrow, MU.Values(), xrow,
546 MyComm.SumAll(workD, coeff, xcol*xcol);
549 callBLAS.GEMM(
'T',
'N', xcol, xcol, xrow, 1.0, Z.Values(), xrow, Z.Values(), xrow,
551 MyComm.SumAll(workD, coeff, xcol*xcol);
555 info = modalTool.directSolver(xcol, UtKU, xcol, coeff, xcol, nev, workD, xcol, theta, 0,
556 (blockSize == 1) ? 1 : 0);
558 if ((info < 0) || (nev < xcol)) {
560 info = - iterMax - 1;
562 maxIterPCG = (iter > maxIterPCG) ? iter : maxIterPCG;
569 callBLAS.GEMM(
'N',
'N', xrow, xcol, xcol, 1.0, MU.Values(), xrow, workD, xcol,
570 0.0, Z.Values(), xrow);
572 for (ii = 0; ii < xcol; ++ii)
573 callBLAS.SCAL(xrow, theta[ii], Z.Values() + ii*xrow);
574 callBLAS.GEMM(
'N',
'N', xrow, xcol, xcol, 1.0, KU.Values(), xrow, workD, xcol,
575 -1.0, Z.Values(), xrow);
578 Z.NormWeighted(*vectWeight, resEig);
582 timePCGEigCheck += MyWatch.WallTime();
589 for (ii = 0; ii < xcol; ++ii) {
590 nFound = (resEig[ii] < tolEigenSolve*theta[ii]) ? nFound + 1 : nFound;
591 nGrow = (resEig[ii] > oldEig[ii]) ? nGrow + 1 : nGrow;
593 if ((nFound > 0) || (nGrow > 0)) {
599 memcpy(oldEig, resEig, xcol*
sizeof(
double));
602 timePCGLoop += MyWatch.WallTime();
605 maxIterPCG = (iter > maxIterPCG) ? iter : maxIterPCG;
614 return JDPCG::reSolve(numEigen, Q, lambda);
619 int JDPCG::reSolve(
int numEigen,
Epetra_MultiVector &Q,
double *lambda,
int startingEV) {
671 if (numEigen <= startingEV) {
675 int info = myVerify.inputArguments(numEigen, K, M, Prec, Q, minimumSpaceDimension(numEigen));
679 int myPid = MyComm.MyPID();
684 cerr <<
" !!! The Epetra_Operator object for the mass matrix is not specified !!!" << endl;
690 if (numBlock*blockSize < numEigen) {
693 cerr <<
" !!! The space dimension (# of blocks x size of blocks) must be greater than ";
694 cerr <<
" the number of eigenvalues !!!\n";
695 cerr <<
" Number of blocks = " << numBlock << endl;
696 cerr <<
" Size of blocks = " << blockSize << endl;
697 cerr <<
" Number of eigenvalues = " << numEigen << endl;
709 int knownEV = startingEV;
710 int localVerbose = verbose*(myPid==0);
723 int xr = Q.MyLength();
724 int dimSearch = blockSize*numBlock;
736 lwork += (numEigen-startingEV + blockSize)*xr + 2*blockSize*xr;
737 lwork += (M) ? blockSize*xr : 0;
740 lwork += (maxIterLinearSolve > 0) ? 4*xr*blockSize + 6*blockSize + 4*blockSize*blockSize : 0;
743 lwork += blockSize + dimSearch + 2*dimSearch*dimSearch;
744 lwork += 2*(numEigen-startingEV+blockSize)*(numEigen-startingEV+blockSize);
745 lwork += 2*(numEigen-startingEV+blockSize)*blockSize;
747 double *work =
new (nothrow)
double[lwork];
754 memRequested +=
sizeof(double)*lwork/(1024.0*1024.0);
756 highMem = (highMem > currentSize()) ? highMem : currentSize();
761 tmpD = tmpD + (numEigen-startingEV+blockSize)*xr;
764 tmpD = tmpD + blockSize*xr;
767 tmpD = tmpD + blockSize*xr;
770 tmpD = tmpD + blockSize*xr;
774 if (maxIterLinearSolve > 0) {
775 workPCG = tmpD - blockSize*xr;
776 tmpD = tmpD + 4*xr*blockSize + 6*blockSize + 4*blockSize*blockSize;
793 double *theta = tmpD;
794 tmpD = tmpD + dimSearch;
796 double *normR = tmpD;
797 tmpD = tmpD + blockSize;
800 tmpD = tmpD + dimSearch*dimSearch;
801 memset(KK, 0, dimSearch*dimSearch*
sizeof(
double));
804 tmpD = tmpD + dimSearch*dimSearch;
806 double *QtMPMQ = tmpD;
807 tmpD = tmpD + (numEigen-startingEV+blockSize)*(numEigen-startingEV+blockSize);
808 int ldQtMPMQ = numEigen - startingEV + blockSize;
810 double *invQtMPMQ = tmpD;
811 tmpD = tmpD + (numEigen-startingEV+blockSize)*(numEigen-startingEV+blockSize);
813 double *tmpArray = tmpD;
816 if (localVerbose > 2) {
817 resHistory =
new (nothrow)
double[maxIterEigenSolve*blockSize];
818 spaceSizeHistory =
new (nothrow)
int[maxIterEigenSolve];
819 iterPCGHistory =
new (nothrow)
int[maxIterEigenSolve];
820 if ((resHistory == 0) || (spaceSizeHistory == 0) || (iterPCGHistory == 0)) {
831 bool reStart =
false;
832 bool criticalExit =
false;
835 int nFound = blockSize;
842 double sqrtTol = sqrt(tolEigenSolve);
843 double coefDecay = 0.5;
846 if (localVerbose > 0) {
848 cout <<
" *|* Problem: ";
850 cout <<
"K*Q = M*Q D ";
852 cout <<
"K*Q = Q D ";
854 cout <<
" with preconditioner";
856 cout <<
" *|* Algorithm = Jacobi-Davidson algorithm with PCG (block version)\n";
857 cout <<
" *|* Size of blocks = " << blockSize << endl;
858 cout <<
" *|* Largest size of search space = " << numBlock*blockSize << endl;
859 cout <<
" *|* Number of requested eigenvalues = " << numEigen << endl;
861 cout.setf(ios::scientific, ios::floatfield);
862 cout <<
" *|* Tolerance for convergence = " << tolEigenSolve << endl;
863 cout <<
" *|* Norm used for convergence: ";
865 cout <<
"weighted L2-norm with user-provided weights" << endl;
867 cout <<
"L^2-norm" << endl;
869 cout <<
" *|* Input converged eigenvectors = " << startingEV << endl;
870 cout <<
"\n -- Start iterations -- \n";
873 int maxBlock = (dimSearch/blockSize) - (knownEV/blockSize);
875 timeOuterLoop -= MyWatch.WallTime();
877 while (outerIter <= maxIterEigenSolve) {
879 highMem = (highMem > currentSize()) ? highMem : currentSize();
884 for (nb = bStart; nb < maxBlock; ++nb) {
887 if (outerIter > maxIterEigenSolve)
890 int localSize = nb*blockSize;
894 timeMassOp -= MyWatch.WallTime();
896 M->Apply(Xcurrent, MX);
897 timeMassOp += MyWatch.WallTime();
902 timeOrtho -= MyWatch.WallTime();
906 info = modalTool.massOrthonormalize(Xcurrent, MX, M, Q, nFound, 2, R.Values());
909 if (localSize == 0) {
911 info = modalTool.massOrthonormalize(Xcurrent, MX, M, copyQ, nFound, 0, R.Values());
915 info = modalTool.massOrthonormalize(Xcurrent, MX, M, copyQ, nFound, 0, R.Values());
923 info = modalTool.massOrthonormalize(Xcurrent, MX, M, copyQ, blockSize, 0, R.Values());
925 timeOrtho += MyWatch.WallTime();
938 timeStifOp -= MyWatch.WallTime();
939 K->Apply(Xcurrent, KX);
940 timeStifOp += MyWatch.WallTime();
944 if (knownEV + localSize == 0)
945 accuracyCheck(&Xcurrent, &MX, 0);
948 accuracyCheck(&Xcurrent, &MX, ©Q);
950 if (localVerbose > 0)
955 timeLocalProj -= MyWatch.WallTime();
956 for (j = 0; j <= nb; ++j) {
957 callBLAS.GEMM(
'T',
'N', blockSize, blockSize, xr,
958 1.0, X.Values()+(knownEV+j*blockSize)*xr, xr, KX.Values(), xr,
959 0.0, tmpArray + blockSize*blockSize, blockSize);
960 MyComm.SumAll(tmpArray + blockSize*blockSize, tmpArray, blockSize*blockSize);
962 for (iC = 0; iC < blockSize; ++iC) {
963 double *Kpointer = KK + localSize*dimSearch + j*blockSize + iC*dimSearch;
964 memcpy(Kpointer, tmpArray + iC*blockSize, blockSize*
sizeof(
double));
967 timeLocalProj += MyWatch.WallTime();
970 timeLocalSolve -= MyWatch.WallTime();
971 int nevLocal = localSize + blockSize;
972 info = modalTool.directSolver(localSize + blockSize, KK, dimSearch, 0, 0,
973 nevLocal, S, dimSearch, theta, localVerbose, 10);
974 timeLocalSolve += MyWatch.WallTime();
976 if ((info != 0) && (theta[0] < 0.0)) {
983 if (localVerbose > 0) {
984 cout <<
" Iteration " << outerIter;
985 cout <<
"- Failure for spectral decomposition - RESTART with new random search\n";
989 timeRestart -= MyWatch.WallTime();
992 timeRestart += MyWatch.WallTime();
1000 timeLocalUpdate -= MyWatch.WallTime();
1001 callBLAS.GEMM(
'N',
'N', xr, blockSize, localSize+blockSize, 1.0, X.Values()+knownEV*xr, xr,
1002 S, dimSearch, 0.0, KX.Values(), xr);
1003 timeLocalUpdate += MyWatch.WallTime();
1006 timeMassOp -= MyWatch.WallTime();
1009 timeMassOp += MyWatch.WallTime();
1010 massOp += blockSize;
1013 timeStifOp -= MyWatch.WallTime();
1015 timeStifOp += MyWatch.WallTime();
1016 stifOp += blockSize;
1019 timeResidual -= MyWatch.WallTime();
1020 for (j = 0; j < blockSize; ++j) {
1021 callBLAS.AXPY(xr, -theta[j], MX.Values()+j*xr, R.Values()+j*xr);
1023 timeResidual += MyWatch.WallTime();
1024 residual += blockSize;
1027 timeNorm -= MyWatch.WallTime();
1029 R.NormWeighted(*vectWeight, normR);
1035 for (j = 0; j < blockSize; ++j) {
1036 normR[j] = (theta[j] == 0.0) ? normR[j] : normR[j]/theta[j];
1037 if (normR[j] < tolEigenSolve) {
1041 timeNorm += MyWatch.WallTime();
1043 maxSpaceSize = (maxSpaceSize > localSize+blockSize) ? maxSpaceSize : localSize+blockSize;
1044 sumSpaceSize += localSize + blockSize;
1047 if (localVerbose > 0) {
1048 cout <<
" Iteration " << outerIter <<
" - Number of converged eigenvectors ";
1049 cout << knownEV + nFound << endl;
1052 if (localVerbose > 2) {
1053 memcpy(resHistory + blockSize*historyCount, normR, blockSize*
sizeof(
double));
1054 spaceSizeHistory[historyCount] = localSize + blockSize;
1057 if (localVerbose > 1) {
1060 cout.setf(ios::scientific, ios::floatfield);
1061 for (i=0; i<blockSize; ++i) {
1062 cout <<
" Iteration " << outerIter <<
" - Scaled Norm of Residual " << i;
1063 cout <<
" = " << normR[i] << endl;
1067 for (i=0; i<localSize; ++i) {
1068 cout <<
" Iteration " << outerIter <<
" - Ritz eigenvalue " << i;
1069 cout.setf((fabs(theta[i]) < 0.01) ? ios::scientific : ios::fixed, ios::floatfield);
1070 cout <<
" = " << theta[i] << endl;
1080 if (localVerbose > 2) {
1081 iterPCGHistory[historyCount] = 0;
1088 if ((maxBlock > 1) && (nb == maxBlock - 1)) {
1089 if (localVerbose > 2) {
1090 iterPCGHistory[historyCount] = 0;
1098 timePrecOp -= MyWatch.WallTime();
1099 Prec->ApplyInverse(MX, PMX);
1100 timePrecOp += MyWatch.WallTime();
1101 precOp += blockSize;
1104 memcpy(PMX.Values(), MX.Values(), xr*blockSize*
sizeof(double));
1109 timeBuildQtMPMQ -= MyWatch.WallTime();
1110 int qLength = knownEV - startingEV + blockSize;
1111 callBLAS.GEMM(
'T',
'N', qLength, blockSize, xr, 1.0, PMQ.Values(), xr, MX.Values(), xr,
1112 0.0, tmpArray + qLength*blockSize, qLength);
1113 MyComm.SumAll(tmpArray + qLength*blockSize, tmpArray, qLength*blockSize);
1114 for (j = 0; j < blockSize; ++j) {
1115 memcpy(QtMPMQ + (knownEV-startingEV+j)*ldQtMPMQ, tmpArray + j*qLength,
1116 qLength*
sizeof(
double));
1118 timeBuildQtMPMQ += MyWatch.WallTime();
1121 for (j = 0; j < qLength; ++j)
1122 memcpy(invQtMPMQ + j*ldQtMPMQ, QtMPMQ + j*ldQtMPMQ, (j+1)*
sizeof(
double));
1123 callLAPACK.POTRF(
'U', qLength, invQtMPMQ, ldQtMPMQ, &info);
1129 << -info <<
" of LAPACK's DPOTRF routine had "
1130 "an illegal value.");
1132 if (localVerbose > 0) {
1133 cout <<
" Iteration " << outerIter;
1134 cout <<
" - Failure for local factorization (" << info <<
"/" << qLength <<
")";
1135 cout <<
" - RESTART with new random search";
1140 timeRestart -= MyWatch.WallTime();
1143 timeRestart += MyWatch.WallTime();
1146 if (localVerbose > 2) {
1147 iterPCGHistory[historyCount] = 0;
1157 : knownEV+localSize+blockSize, blockSize);
1158 if (normR[0] < sqrtTol)
1164 double maxDotQ = 0.0;
1167 maxDotQ = myVerify.errorOrthogonality(©Q, &R, 0);
1169 double maxDotU = myVerify.errorOrthogonality(&KX, &R, 0);
1171 cout <<
" >> Orthogonality for the right hand side of the correction equation = ";
1172 double tmp = (maxDotU > maxDotQ) ? maxDotU : maxDotQ;
1174 cout.setf(ios::scientific, ios::floatfield);
1175 cout << tmp << endl << endl;
1179 timeCorrectionSolve -= MyWatch.WallTime();
1180 if (startingEV > 0) {
1182 if (knownEV > startingEV) {
1184 info = jacobiPCG(R, Xnext, &KX, ©Q, eta, tolPCG, maxIterLinearSolve, invQtMPMQ,
1185 ldQtMPMQ, PMQ.Values(), tmpArray, workPCG, vectWeight, &startQ);
1188 info = jacobiPCG(R, Xnext, &KX, 0, eta, tolPCG, maxIterLinearSolve, invQtMPMQ,
1189 ldQtMPMQ, PMQ.Values(), tmpArray, workPCG, vectWeight, &startQ);
1195 info = jacobiPCG(R, Xnext, &KX, ©Q, eta, tolPCG, maxIterLinearSolve, invQtMPMQ,
1196 ldQtMPMQ, PMQ.Values(), tmpArray, workPCG, vectWeight);
1199 info = jacobiPCG(R, Xnext, &KX, 0, eta, tolPCG, maxIterLinearSolve, invQtMPMQ,
1200 ldQtMPMQ, PMQ.Values(), tmpArray, workPCG, vectWeight);
1203 timeCorrectionSolve += MyWatch.WallTime();
1206 double maxDotQ = 0.0;
1209 maxDotQ = myVerify.errorOrthogonality(©Q, &Xnext, M);
1211 double maxDotU = myVerify.errorOrthogonality(&KX, &Xnext, M);
1213 double tmp = (maxDotU > maxDotQ) ? maxDotU : maxDotQ;
1214 cout <<
" >> Orthogonality for the solution of the correction equation = ";
1216 cout.setf(ios::scientific, ios::floatfield);
1217 cout << tmp << endl << endl;
1221 numCorrectionSolve += blockSize;
1223 if ((info == -1) || (info == -maxIterLinearSolve-1)) {
1224 if (localVerbose > 0) {
1225 cout <<
" Iteration " << outerIter;
1226 cout <<
" - Failure inside PCG";
1227 cout <<
" - RESTART with new random search";
1230 if (localVerbose > 2) {
1231 iterPCGHistory[historyCount] = -1;
1236 timeRestart -= MyWatch.WallTime();
1239 timeRestart += MyWatch.WallTime();
1246 if (localVerbose > 2) {
1247 iterPCGHistory[historyCount] = (info < 0) ? -info : info;
1253 if (localVerbose > 2) {
1254 iterPCGHistory[historyCount] = info;
1260 tolPCG *= coefDecay;
1262 if (maxBlock == 1) {
1263 Xcurrent.Update(1.0, KX, -1.0);
1268 if (outerIter > maxIterEigenSolve)
1271 if (reStart ==
true) {
1276 if (criticalExit ==
true)
1280 if (knownEV + nFound >= numEigen) {
1281 for (j = 0; j < blockSize; ++j) {
1282 if (normR[j] < tolEigenSolve) {
1283 memcpy(X.Values() + knownEV*xr, KX.Values() + j*xr, xr*
sizeof(double));
1284 lambda[knownEV] = theta[j];
1288 if (localVerbose == 1) {
1291 cout.setf(ios::scientific, ios::floatfield);
1292 for (i=0; i<blockSize; ++i) {
1293 cout <<
" Iteration " << outerIter <<
" - Scaled Norm of Residual " << i;
1294 cout <<
" = " << normR[i] << endl;
1302 if ((maxBlock == 1) && (nFound == 0)) {
1308 if (maxBlock == 1) {
1309 timeRestart -= MyWatch.WallTime();
1310 double *Xpointer = X.Values() + (knownEV+nFound)*xr;
1312 for (j = 0; j < blockSize; ++j) {
1313 if (normR[j] < tolEigenSolve) {
1314 memcpy(X.Values() + knownEV*xr, KX.Values() + j*xr, xr*
sizeof(double));
1315 lambda[knownEV] = theta[j];
1320 memcpy(Xpointer + (j-nFound)*xr, KX.Values() + j*xr, xr*
sizeof(double));
1326 timeRestart += MyWatch.WallTime();
1332 timeRestart -= MyWatch.WallTime();
1333 int firstIndex = blockSize;
1334 for (j = 0; j < blockSize; ++j) {
1335 if (normR[j] >= tolEigenSolve) {
1340 while (firstIndex < nFound) {
1341 for (j = firstIndex; j < blockSize; ++j) {
1342 if (normR[j] < tolEigenSolve) {
1344 callFortran.SWAP(nb*blockSize, S + j*dimSearch, 1, S + firstIndex*dimSearch, 1);
1345 callFortran.SWAP(1, theta + j, 1, theta + firstIndex, 1);
1346 callFortran.SWAP(1, normR + j, 1, normR + firstIndex, 1);
1350 for (j = 0; j < blockSize; ++j) {
1351 if (normR[j] >= tolEigenSolve) {
1359 memcpy(lambda + knownEV, theta, nFound*
sizeof(
double));
1362 bStart = ((nb - offSet) > 2) ? (nb - offSet)/2 : 0;
1365 memset(KK, 0, nb*blockSize*dimSearch*
sizeof(
double));
1366 for (j = 0; j < bStart*blockSize; ++j) {
1367 KK[j + j*dimSearch] = theta[j + nFound];
1370 oldCol = nb*blockSize;
1371 newCol = nFound + (bStart+1)*blockSize;
1372 newCol = (newCol > oldCol) ? oldCol : newCol;
1373 callFortran.GEQRF(oldCol, newCol, S, dimSearch, theta, R.Values(), xr*blockSize, &info);
1374 callFortran.ORMQR(
'R',
'N', xr, oldCol, newCol, S, dimSearch, theta,
1375 X.Values()+knownEV*xr, xr, R.Values(), xr*blockSize, &info);
1377 newCol = nFound + (bStart+1)*blockSize;
1378 if (newCol > oldCol) {
1382 timeRestart += MyWatch.WallTime();
1389 timeMassOp -= MyWatch.WallTime();
1391 M->Apply(Qconv, MQconv);
1392 timeMassOp += MyWatch.WallTime();
1397 timePrecOp -= MyWatch.WallTime();
1398 Prec->ApplyInverse(MQconv, PMQconv);
1399 timePrecOp += MyWatch.WallTime();
1403 memcpy(PMQconv.Values(), MQconv.Values(), xr*nFound*
sizeof(double));
1407 int newEV = knownEV - startingEV;
1408 timeBuildQtMPMQ -= MyWatch.WallTime();
1409 callBLAS.GEMM(
'T',
'N', newEV + nFound, nFound, xr, 1.0, PMQ.Values(), xr,
1410 MX.Values(), xr, 0.0, QtMPMQ + newEV*ldQtMPMQ, newEV + nFound);
1411 MyComm.SumAll(QtMPMQ + newEV*ldQtMPMQ, tmpArray, (newEV + nFound)*nFound);
1412 for (j = 0; j < nFound; ++j) {
1413 memcpy(QtMPMQ + (newEV+j)*ldQtMPMQ, tmpArray + j*(newEV+nFound),
1414 (newEV+nFound)*
sizeof(
double));
1416 timeBuildQtMPMQ += MyWatch.WallTime();
1423 maxBlock = (dimSearch/blockSize) - (knownEV/blockSize);
1426 if ((maxBlock > 1) && (newCol <= oldCol))
1430 timeOuterLoop += MyWatch.WallTime();
1431 highMem = (highMem > currentSize()) ? highMem : currentSize();
1439 timePostProce -= MyWatch.WallTime();
1440 if ((info == 0) && (knownEV > 0)) {
1441 mySort.sortScalars_Vectors(knownEV, lambda, Q.Values(), Q.MyLength());
1443 timePostProce += MyWatch.WallTime();
1445 return (info == 0) ? knownEV : info;
1454 cout.setf(ios::scientific, ios::floatfield);
1457 int myPid = MyComm.MyPID();
1462 tmp = myVerify.errorEquality(X, MX, M);
1464 cout <<
" >> Difference between MX and M*X = " << tmp << endl;
1466 tmp = myVerify.errorOrthonormality(X, M);
1468 cout <<
" >> Error in X^T M X - I = " << tmp << endl;
1471 tmp = myVerify.errorOrthonormality(X, 0);
1473 cout <<
" >> Error in X^T X - I = " << tmp << endl;
1481 tmp = myVerify.errorOrthonormality(Q, M);
1483 cout <<
" >> Error in Q^T M Q - I = " << tmp << endl;
1485 tmp = myVerify.errorOrthogonality(Q, X, M);
1487 cout <<
" >> Orthogonality Q^T M X up to " << tmp << endl;
1491 tmp = myVerify.errorOrthonormality(Q, 0);
1493 cout <<
" >> Error in Q^T Q - I = " << tmp << endl;
1495 tmp = myVerify.errorOrthogonality(Q, X, 0);
1497 cout <<
" >> Orthogonality Q^T X up to " << tmp << endl;
1504 int JDPCG::minimumSpaceDimension(
int nev)
const {
1506 int myPid = MyComm.MyPID();
1508 if ((myPid == 0) && (numBlock*blockSize < nev)) {
1510 cerr <<
" !!! The space dimension (# of blocks x size of blocks) must be greater than ";
1511 cerr <<
" the number of eigenvalues !!!\n";
1512 cerr <<
" Number of blocks = " << numBlock << endl;
1513 cerr <<
" Size of blocks = " << blockSize << endl;
1514 cerr <<
" Number of eigenvalues = " << nev << endl;
1519 return nev + blockSize;
1524 void JDPCG::initializeCounters() {
1528 delete[] resHistory;
1534 if (spaceSizeHistory) {
1535 delete[] spaceSizeHistory;
1536 spaceSizeHistory = 0;
1541 if (iterPCGHistory) {
1542 delete[] iterPCGHistory;
1550 numCorrectionPrec = 0;
1551 numCorrectionSolve = 0;
1560 timeBuildQtMPMQ = 0.0;
1561 timeCorrectionPrec = 0.0;
1562 timeCorrectionSolve = 0.0;
1563 timeLocalProj = 0.0;
1564 timeLocalSolve = 0.0;
1565 timeLocalUpdate = 0.0;
1569 timeOuterLoop = 0.0;
1570 timePCGEigCheck = 0.0;
1572 timePCGOpMult = 0.0;
1574 timePostProce = 0.0;
1583 void JDPCG::algorithmInfo()
const {
1585 int myPid = MyComm.MyPID();
1588 cout <<
" Algorithm: Jacobi-Davidson algorithm with PCG (block version)\n";
1589 cout <<
" Block Size: " << blockSize << endl;
1590 cout <<
" Number of Blocks kept: " << numBlock << endl;
1596 void JDPCG::historyInfo()
const {
1600 cout <<
" Block Size: " << blockSize << endl;
1602 cout <<
" Residuals Search Space Dim. Inner Iter. \n";
1605 cout.setf(ios::scientific, ios::floatfield);
1606 for (j = 0; j < historyCount; ++j) {
1608 for (ii = 0; ii < blockSize; ++ii) {
1609 cout << resHistory[blockSize*j + ii] <<
" ";
1611 cout << spaceSizeHistory[j] <<
" ";
1613 cout << iterPCGHistory[j] << endl;
1622 void JDPCG::memoryInfo()
const {
1624 int myPid = MyComm.MyPID();
1626 double maxHighMem = 0.0;
1627 double tmp = highMem;
1628 MyComm.MaxAll(&tmp, &maxHighMem, 1);
1630 double maxMemRequested = 0.0;
1632 MyComm.MaxAll(&tmp, &maxMemRequested, 1);
1636 cout.setf(ios::fixed, ios::floatfield);
1637 cout <<
" Memory requested per processor by the eigensolver = (EST) ";
1639 cout << maxMemRequested <<
" MB " << endl;
1641 cout <<
" High water mark in eigensolver = (EST) ";
1643 cout << maxHighMem <<
" MB " << endl;
1650 void JDPCG::operationInfo()
const {
1652 int myPid = MyComm.MyPID();
1655 cout <<
" --- Operations ---\n\n";
1656 cout <<
" Total number of mass matrix multiplications = ";
1658 int tmp = massOp + modalTool.getNumProj_MassMult() + modalTool.getNumNorm_MassMult();
1659 tmp += numPCGmassOp;
1660 cout << tmp << endl;
1661 cout <<
" Mass multiplications in solver loop = ";
1663 cout << massOp << endl;
1664 cout <<
" Mass multiplications for orthogonalization = ";
1666 cout << modalTool.getNumProj_MassMult() << endl;
1667 cout <<
" Mass multiplications for normalization = ";
1669 cout << modalTool.getNumNorm_MassMult() << endl;
1670 cout <<
" Mass multiplications in PCG loop = ";
1672 cout << numPCGmassOp << endl;
1674 cout <<
" Total number of stiffness matrix operations = ";
1676 cout << stifOp + numPCGstifOp << endl;
1677 cout <<
" Stiffness multiplications in solver loop = ";
1679 cout << stifOp << endl;
1680 cout <<
" Stiffness multiplications in PCG loop = ";
1682 cout << numPCGstifOp << endl;
1684 cout <<
" Total number of preconditioner applications = ";
1686 cout << precOp << endl;
1688 cout <<
" Total number of PCG solve = ";
1690 cout << numCorrectionSolve << endl;
1691 cout <<
" Number of projected precond. appl. : ";
1693 cout << numCorrectionPrec << endl;
1694 cout.setf(ios::fixed, ios::floatfield);
1697 cout <<
" Average number of iter. per solve : ";
1699 cout << ((double) sumIterPCG)*blockSize/((double) numCorrectionSolve) << endl;
1700 cout <<
" Maximum number of iter. per solve : ";
1702 cout << maxIterPCG << endl;
1704 cout <<
" Total number of computed eigen-residuals = ";
1706 cout << residual << endl;
1708 cout <<
" Total number of outer iterations = ";
1710 cout << outerIter << endl;
1711 cout <<
" Number of restarts = ";
1713 cout << numRestart << endl;
1715 cout <<
" Maximum size of search space = ";
1717 cout << maxSpaceSize << endl;
1718 cout <<
" Average size of search space = ";
1719 cout.setf(ios::fixed, ios::floatfield);
1722 cout << ((double) sumSpaceSize)/((double) outerIter) << endl;
1729 void JDPCG::timeInfo()
const {
1731 int myPid = MyComm.MyPID();
1734 cout <<
" --- Timings ---\n\n";
1735 cout.setf(ios::fixed, ios::floatfield);
1737 cout <<
" Total time for outer loop = ";
1739 cout << timeOuterLoop <<
" s" << endl;
1740 cout <<
" Time for mass matrix operations = ";
1742 cout << timeMassOp <<
" s ";
1744 cout << 100*timeMassOp/timeOuterLoop <<
" %\n";
1745 cout <<
" Time for stiffness matrix operations = ";
1747 cout << timeStifOp <<
" s ";
1749 cout << 100*timeStifOp/timeOuterLoop <<
" %\n";
1750 cout <<
" Time for preconditioner applications = ";
1752 cout << timePrecOp <<
" s ";
1754 cout << 100*timePrecOp/timeOuterLoop <<
" %\n";
1755 cout <<
" Time for orthogonalizations = ";
1757 cout << timeOrtho <<
" s ";
1759 cout << 100*timeOrtho/timeOuterLoop <<
" %\n";
1760 cout <<
" Projection step : ";
1762 cout << modalTool.getTimeProj() <<
" s\n";
1763 cout <<
" Normalization step : ";
1765 cout << modalTool.getTimeNorm() <<
" s\n";
1766 cout <<
" Time for local projection = ";
1768 cout << timeLocalProj <<
" s ";
1770 cout << 100*timeLocalProj/timeOuterLoop <<
" %\n";
1771 cout <<
" Time for local eigensolve = ";
1773 cout << timeLocalSolve <<
" s ";
1775 cout << 100*timeLocalSolve/timeOuterLoop <<
" %\n";
1776 cout <<
" Time for local update = ";
1778 cout << timeLocalUpdate <<
" s ";
1780 cout << 100*timeLocalUpdate/timeOuterLoop <<
" %\n";
1781 cout <<
" Time for building the matrix QtMP^{-1}MQ = ";
1783 cout << timeBuildQtMPMQ <<
" s ";
1785 cout << 100*timeBuildQtMPMQ/timeOuterLoop <<
" %\n";
1786 cout <<
" Time for solving the correction equation = ";
1788 cout << timeCorrectionSolve <<
" s ";
1790 cout << 100*timeCorrectionSolve/timeOuterLoop <<
" %\n";
1791 cout <<
" Mult. Preconditioner : ";
1793 cout << timeCorrectionPrec <<
" s" << endl;
1794 cout <<
" Correction of Prec. : ";
1796 cout << (timePCGPrec - timeCorrectionPrec) <<
" s" << endl;
1797 cout <<
" Shifted Matrix Mult. : ";
1799 cout << timePCGOpMult <<
" s" << endl;
1800 cout <<
" Eigen-residuals checks : ";
1802 cout << timePCGEigCheck <<
" s" << endl;
1803 cout <<
" Time for restarting space definition = ";
1805 cout << timeRestart <<
" s ";
1807 cout << 100*timeRestart/timeOuterLoop <<
" %\n";
1808 cout <<
" Time for residual computations = ";
1810 cout << timeResidual <<
" s ";
1812 cout << 100*timeResidual/timeOuterLoop <<
" %\n";
1814 cout <<
" Total time for post-processing = ";
1816 cout << timePostProce <<
" s\n";
void GEMM(const char TRANSA, const char TRANSB, const int M, const int N, const int K, const float ALPHA, const float *A, const int LDA, const float *B, const int LDB, const float BETA, float *C, const int LDC) const
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)