19 #include "BlockPCGSolver.h"
21 #include <Teuchos_Assert.hpp>
25 double _tol,
int _iMax,
int _verb)
49 double _tol,
int _iMax,
int _verb)
71 BlockPCGSolver::~BlockPCGSolver() {
96 return K->Apply(X, Y);
103 int xcol = X.NumVectors();
106 if (Y.NumVectors() < xcol)
181 info = (xcol == 1) ? Solve(X, Y) : Solve(X, Y, xcol);
191 int localVerbose = verbose*(MyComm.MyPID() == 0);
193 int xr = X.MyLength();
197 if (lWorkSpace < wSize) {
200 workSpace =
new (nothrow)
double[wSize];
201 if (workSpace == 0) {
208 double *pointer = workSpace;
211 pointer = pointer + xr;
214 pointer = pointer + xr;
221 double initNorm = 0.0, rNorm = 0.0, newRZ = 0.0, oldRZ = 0.0, alpha = 0.0;
222 double tolSquare = tolCG*tolCG;
224 memcpy(r.Values(), X.Values(), xr*
sizeof(double));
225 tmp = callBLAS.DOT(xr, r.Values(), r.Values());
226 MyComm.SumAll(&tmp, &initNorm, 1);
230 if (localVerbose > 1) {
232 cout <<
" --- PCG Iterations --- " << endl;
236 for (iter = 1; iter <= iterMax; ++iter) {
239 Prec->ApplyInverse(r, z);
242 memcpy(z.Values(), r.Values(), xr*
sizeof(double));
246 tmp = callBLAS.DOT(xr, r.Values(), z.Values());
247 MyComm.SumAll(&tmp, &newRZ, 1);
248 memcpy(p.Values(), z.Values(), xr*
sizeof(double));
252 tmp = callBLAS.DOT(xr, r.Values(), z.Values());
253 MyComm.SumAll(&tmp, &newRZ, 1);
254 p.Update(1.0, z, newRZ/oldRZ);
259 tmp = callBLAS.DOT(xr, p.Values(), Kp.Values());
260 MyComm.SumAll(&tmp, &alpha, 1);
264 if (MyComm.MyPID() == 0) {
265 cerr << endl << endl;
267 cerr.setf(ios::scientific, ios::floatfield);
268 cerr <<
" !!! Non-positive value for p^TKp (" << alpha <<
") !!!";
269 cerr << endl << endl;
274 callBLAS.AXPY(xr, alpha, p.Values(), Y.Values());
277 callBLAS.AXPY(xr, alpha, Kp.Values(), r.Values());
280 tmp = callBLAS.DOT(xr, r.Values(), r.Values());
281 MyComm.SumAll(&tmp, &rNorm, 1);
283 if (localVerbose > 1) {
284 cout <<
" Iter. " << iter;
286 cout.setf(ios::scientific, ios::floatfield);
287 cout <<
" Residual reduction " << sqrt(rNorm/initNorm) << endl;
290 if (rNorm <= tolSquare*initNorm)
295 if (localVerbose == 1) {
297 cout <<
" --- End of PCG solve ---" << endl;
298 cout <<
" Iter. " << iter;
300 cout.setf(ios::scientific, ios::floatfield);
301 cout <<
" Residual reduction " << sqrt(rNorm/initNorm) << endl;
305 if (localVerbose > 1) {
311 minIter = (iter < minIter) ? iter : minIter;
312 maxIter = (iter > maxIter) ? iter : maxIter;
322 int xrow = X.MyLength();
323 int xcol = X.NumVectors();
324 int ycol = Y.NumVectors();
327 int localVerbose = verbose*(MyComm.MyPID() == 0);
331 callLAPACK.LAMCH(
'E', eps);
333 double *valX = X.Values();
335 int NB = 3 + callFortran.LAENV(1,
"dsytrd",
"u", blkSize, -1, -1, -1, 6, 1);
336 int lworkD = (blkSize > NB) ? blkSize*blkSize : NB*blkSize;
338 int wSize = 4*blkSize*xrow + 3*blkSize + 2*blkSize*blkSize + lworkD;
341 if (ycol % blkSize != 0) {
343 wSize += blkSize*xrow;
347 if (lWorkSpace < wSize) {
349 workSpace =
new (nothrow)
double[wSize];
350 if (workSpace == 0) {
357 double *pointer = workSpace;
360 double *PtKP = pointer;
361 pointer = pointer + blkSize*blkSize;
364 double *coeff = pointer;
365 pointer = pointer + blkSize*blkSize;
368 double *workD = pointer;
369 pointer = pointer + lworkD;
372 double *da = pointer;
373 pointer = pointer + blkSize;
376 double *initNorm = pointer;
377 pointer = pointer + blkSize;
380 double *resNorm = pointer;
381 pointer = pointer + blkSize;
384 double *valR = pointer;
385 pointer = pointer + xrow*blkSize;
389 double *valZ = pointer;
390 pointer = pointer + xrow*blkSize;
394 double *valP = pointer;
395 pointer = pointer + xrow*blkSize;
399 double *valKP = pointer;
400 pointer = pointer + xrow*blkSize;
404 double *valSOL = (useY ==
true) ? Y.Values() : pointer;
407 for (iRHS = 0; iRHS < xcol; iRHS += blkSize) {
409 int numVec = (iRHS + blkSize < xcol) ? blkSize : xcol - iRHS;
412 if (numVec < blkSize) {
415 memcpy(valR, valX + iRHS*xrow, numVec*xrow*
sizeof(
double));
418 valSOL = (useY ==
true) ? Y.Values() + iRHS*xrow : valSOL;
428 if (localVerbose > 1) {
430 cout <<
" Vectors " << iRHS <<
" to " << iRHS + numVec - 1 << endl;
431 if (localVerbose > 2) {
432 fprintf(stderr,
"\n");
433 for (ii = 0; ii < numVec; ++ii) {
434 cout <<
" ... Initial Residual Norm " << ii <<
" = " << initNorm[ii] << endl;
441 for (iter = 1; iter <= iterMax; ++iter) {
445 Prec->ApplyInverse(R, Z);
455 callBLAS.GEMM(
'T',
'N', blkSize, blkSize, xrow, 1.0, KP.Values(), xrow, Z.Values(), xrow,
456 0.0, workD, blkSize);
457 MyComm.SumAll(workD, coeff, blkSize*blkSize);
460 callBLAS.GEMM(
'T',
'N', blkSize, blkSize, blkSize, 1.0, PtKP, blkSize, coeff, blkSize,
461 0.0, workD, blkSize);
462 for (ii = 0; ii < blkSize; ++ii)
463 callFortran.SCAL_INCX(blkSize, da[ii], workD + ii, blkSize);
464 callBLAS.GEMM(
'N',
'N', blkSize, blkSize, blkSize, 1.0, PtKP, blkSize, workD, blkSize,
465 0.0, coeff, blkSize);
469 memcpy(KP.Values(), P.Values(), xrow*blkSize*
sizeof(double));
470 callBLAS.GEMM(
'N',
'N', xrow, blkSize, blkSize, 1.0, KP.Values(), xrow, coeff, blkSize,
471 0.0, P.Values(), xrow);
473 P.Update(1.0, Z, -1.0);
480 callBLAS.GEMM(
'T',
'N', blkSize, blkSize, xrow, 1.0, P.Values(), xrow, KP.Values(), xrow,
481 0.0, workD, blkSize);
482 MyComm.SumAll(workD, PtKP, blkSize*blkSize);
485 callFortran.SYEV(
'V',
'U', blkSize, PtKP, blkSize, da, workD, lworkD, &info);
492 for (ii = 0; ii < blkSize; ++ii) {
497 "eigenvalue for P^T K P: da[" << ii <<
"] = "
499 da[ii] = (da[ii] == 0.0) ? 0.0 : 1.0/da[ii];
503 callBLAS.GEMM(
'T',
'N', blkSize, blkSize, xrow, 1.0, P.Values(), xrow, R.Values(), xrow,
504 0.0, workD, blkSize);
505 MyComm.SumAll(workD, coeff, blkSize*blkSize);
508 callBLAS.GEMM(
'T',
'N', blkSize, blkSize, blkSize, 1.0, PtKP, blkSize, coeff, blkSize,
509 0.0, workD, blkSize);
510 for (ii = 0; ii < blkSize; ++ii)
511 callFortran.SCAL_INCX(blkSize, da[ii], workD + ii, blkSize);
512 callBLAS.GEMM(
'N',
'N', blkSize, blkSize, blkSize, 1.0, PtKP, blkSize, workD, blkSize,
513 0.0, coeff, blkSize);
516 callBLAS.GEMM(
'N',
'N', xrow, blkSize, blkSize, 1.0, P.Values(), xrow, coeff, blkSize,
520 callBLAS.GEMM(
'N',
'N', xrow, blkSize, blkSize, -1.0, KP.Values(), xrow, coeff, blkSize,
521 1.0, R.Values(), xrow);
526 for (ii = 0; ii < numVec; ++ii) {
527 if (resNorm[ii] <= tolCG*initNorm[ii])
531 if (localVerbose > 1) {
532 cout <<
" Vectors " << iRHS <<
" to " << iRHS + numVec - 1;
533 cout <<
" -- Iteration " << iter <<
" -- " << nFound <<
" converged vectors\n";
534 if (localVerbose > 2) {
536 for (ii = 0; ii < numVec; ++ii) {
539 cout << ii <<
" ... Residual = ";
541 cout.setf(ios::scientific, ios::floatfield);
542 cout << resNorm[ii] <<
" ... Right Hand Side = " << initNorm[ii] << endl;
548 if (nFound == numVec) {
556 memcpy(Y.Values() + xrow*iRHS, valSOL, numVec*xrow*
sizeof(double));
561 if (nFound == numVec) {
562 minIter = (iter < minIter) ? iter : minIter;
563 maxIter = (iter > maxIter) ? iter : maxIter;
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)