20 #ifndef KLU2_TSOLVE_HPP
21 #define KLU2_TSOLVE_HPP
23 #include "klu2_internal.h"
26 template <
typename Entry,
typename Int>
30 KLU_symbolic<Entry, Int> *Symbolic,
31 KLU_numeric<Entry, Int> *Numeric,
44 KLU_common<Entry, Int> *Common
47 Entry x [4], offik, s ;
49 Entry *Offx, *X, *Bz, *Udiag ;
50 Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
52 Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
62 if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
65 Common->status = KLU_INVALID ;
68 Common->status = KLU_OK ;
76 nblocks = Symbolic->nblocks ;
84 ASSERT (nblocks == Numeric->nblocks) ;
85 Pnum = Numeric->Pnum ;
86 Offp = Numeric->Offp ;
87 Offi = Numeric->Offi ;
88 Offx = (Entry *) Numeric->Offx ;
91 Llen = Numeric->Llen ;
93 Ulen = Numeric->Ulen ;
94 LUbx = (Unit **) Numeric->LUbx ;
95 Udiag = (Entry *) Numeric->Udiag ;
98 X = (Entry *) Numeric->Xwork ;
99 ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
105 for (chunk = 0 ; chunk < nrhs ; chunk += 4)
112 nr = MIN (nrhs - chunk, 4) ;
123 for (k = 0 ; k < n ; k++)
131 for (k = 0 ; k < n ; k++)
135 X [2*k + 1] = Bz [i + d ] ;
141 for (k = 0 ; k < n ; k++)
145 X [3*k + 1] = Bz [i + d ] ;
146 X [3*k + 2] = Bz [i + d*2] ;
152 for (k = 0 ; k < n ; k++)
156 X [4*k + 1] = Bz [i + d ] ;
157 X [4*k + 2] = Bz [i + d*2] ;
158 X [4*k + 3] = Bz [i + d*3] ;
168 for (block = 0 ; block < nblocks ; block++)
178 PRINTF ((
"tsolve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
191 for (k = k1 ; k < k2 ; k++)
194 for (p = Offp [k] ; p < pend ; p++)
199 MULT_SUB_CONJ (X [k], X [Offi [p]],
205 MULT_SUB (X [k], Offx [p], X [Offi [p]]) ;
213 for (k = k1 ; k < k2 ; k++)
217 x [1] = X [2*k + 1] ;
218 for (p = Offp [k] ; p < pend ; p++)
224 CONJ (offik, Offx [p]) ;
231 MULT_SUB (x [0], offik, X [2*i]) ;
232 MULT_SUB (x [1], offik, X [2*i + 1]) ;
235 X [2*k + 1] = x [1] ;
241 for (k = k1 ; k < k2 ; k++)
245 x [1] = X [3*k + 1] ;
246 x [2] = X [3*k + 2] ;
247 for (p = Offp [k] ; p < pend ; p++)
253 CONJ (offik, Offx [p]) ;
260 MULT_SUB (x [0], offik, X [3*i]) ;
261 MULT_SUB (x [1], offik, X [3*i + 1]) ;
262 MULT_SUB (x [2], offik, X [3*i + 2]) ;
265 X [3*k + 1] = x [1] ;
266 X [3*k + 2] = x [2] ;
272 for (k = k1 ; k < k2 ; k++)
276 x [1] = X [4*k + 1] ;
277 x [2] = X [4*k + 2] ;
278 x [3] = X [4*k + 3] ;
279 for (p = Offp [k] ; p < pend ; p++)
285 CONJ(offik, Offx [p]) ;
292 MULT_SUB (x [0], offik, X [4*i]) ;
293 MULT_SUB (x [1], offik, X [4*i + 1]) ;
294 MULT_SUB (x [2], offik, X [4*i + 2]) ;
295 MULT_SUB (x [3], offik, X [4*i + 3]) ;
298 X [4*k + 1] = x [1] ;
299 X [4*k + 2] = x [2] ;
300 X [4*k + 3] = x [3] ;
315 CONJ (s, Udiag [k1]) ;
326 DIV (X [k1], X [k1], s) ;
330 DIV (X [2*k1], X [2*k1], s) ;
331 DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
335 DIV (X [3*k1], X [3*k1], s) ;
336 DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
337 DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
341 DIV (X [4*k1], X [4*k1], s) ;
342 DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
343 DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
344 DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
351 KLU_utsolve (nk, Uip + k1, Ulen + k1, LUbx [block],
357 KLU_ltsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
378 for (k = 0 ; k < n ; k++)
380 Bz [Pnum [k]] = X [k] ;
386 for (k = 0 ; k < n ; k++)
390 Bz [i + d ] = X [2*k + 1] ;
396 for (k = 0 ; k < n ; k++)
400 Bz [i + d ] = X [3*k + 1] ;
401 Bz [i + d*2] = X [3*k + 2] ;
407 for (k = 0 ; k < n ; k++)
411 Bz [i + d ] = X [4*k + 1] ;
412 Bz [i + d*2] = X [4*k + 2] ;
413 Bz [i + d*3] = X [4*k + 3] ;
427 for (k = 0 ; k < n ; k++)
429 SCALE_DIV_ASSIGN (Bz [Pnum [k]], X [k], Rs [k]) ;
435 for (k = 0 ; k < n ; k++)
439 SCALE_DIV_ASSIGN (Bz [i], X [2*k], rs) ;
440 SCALE_DIV_ASSIGN (Bz [i + d], X [2*k + 1], rs) ;
446 for (k = 0 ; k < n ; k++)
450 SCALE_DIV_ASSIGN (Bz [i], X [3*k], rs) ;
451 SCALE_DIV_ASSIGN (Bz [i + d], X [3*k + 1], rs) ;
452 SCALE_DIV_ASSIGN (Bz [i + d*2], X [3*k + 2], rs) ;
458 for (k = 0 ; k < n ; k++)
462 SCALE_DIV_ASSIGN (Bz [i], X [4*k], rs) ;
463 SCALE_DIV_ASSIGN (Bz [i + d], X [4*k + 1], rs) ;
464 SCALE_DIV_ASSIGN (Bz [i + d*2], X [4*k + 2], rs) ;
465 SCALE_DIV_ASSIGN (Bz [i + d*3], X [4*k + 3], rs) ;
482 template <
typename Entry,
typename Int>
486 KLU_symbolic<Entry, Int> *Symbolic,
487 KLU_numeric<Entry, Int> *Numeric,
502 KLU_common<Entry, Int> *Common
505 Entry x [4], offik, s ;
507 Entry *Offx, *X, *Bz, *Udiag ;
508 Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
510 Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
520 if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
521 B == NULL || Xout == NULL)
523 Common->status = KLU_INVALID ;
526 Common->status = KLU_OK ;
534 nblocks = Symbolic->nblocks ;
542 ASSERT (nblocks == Numeric->nblocks) ;
543 Pnum = Numeric->Pnum ;
544 Offp = Numeric->Offp ;
545 Offi = Numeric->Offi ;
546 Offx = (Entry *) Numeric->Offx ;
549 Llen = Numeric->Llen ;
551 Ulen = Numeric->Ulen ;
552 LUbx = (Unit **) Numeric->LUbx ;
553 Udiag = (Entry *) Numeric->Udiag ;
556 X = (Entry *) Numeric->Xwork ;
557 ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
563 for (chunk = 0 ; chunk < nrhs ; chunk += 4)
570 nr = MIN (nrhs - chunk, 4) ;
581 for (k = 0 ; k < n ; k++)
589 for (k = 0 ; k < n ; k++)
593 X [2*k + 1] = Bz [i + d ] ;
599 for (k = 0 ; k < n ; k++)
603 X [3*k + 1] = Bz [i + d ] ;
604 X [3*k + 2] = Bz [i + d*2] ;
610 for (k = 0 ; k < n ; k++)
614 X [4*k + 1] = Bz [i + d ] ;
615 X [4*k + 2] = Bz [i + d*2] ;
616 X [4*k + 3] = Bz [i + d*3] ;
626 for (block = 0 ; block < nblocks ; block++)
636 PRINTF ((
"tsolve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
649 for (k = k1 ; k < k2 ; k++)
652 for (p = Offp [k] ; p < pend ; p++)
657 MULT_SUB_CONJ (X [k], X [Offi [p]],
663 MULT_SUB (X [k], Offx [p], X [Offi [p]]) ;
671 for (k = k1 ; k < k2 ; k++)
675 x [1] = X [2*k + 1] ;
676 for (p = Offp [k] ; p < pend ; p++)
682 CONJ (offik, Offx [p]) ;
689 MULT_SUB (x [0], offik, X [2*i]) ;
690 MULT_SUB (x [1], offik, X [2*i + 1]) ;
693 X [2*k + 1] = x [1] ;
699 for (k = k1 ; k < k2 ; k++)
703 x [1] = X [3*k + 1] ;
704 x [2] = X [3*k + 2] ;
705 for (p = Offp [k] ; p < pend ; p++)
711 CONJ (offik, Offx [p]) ;
718 MULT_SUB (x [0], offik, X [3*i]) ;
719 MULT_SUB (x [1], offik, X [3*i + 1]) ;
720 MULT_SUB (x [2], offik, X [3*i + 2]) ;
723 X [3*k + 1] = x [1] ;
724 X [3*k + 2] = x [2] ;
730 for (k = k1 ; k < k2 ; k++)
734 x [1] = X [4*k + 1] ;
735 x [2] = X [4*k + 2] ;
736 x [3] = X [4*k + 3] ;
737 for (p = Offp [k] ; p < pend ; p++)
743 CONJ(offik, Offx [p]) ;
750 MULT_SUB (x [0], offik, X [4*i]) ;
751 MULT_SUB (x [1], offik, X [4*i + 1]) ;
752 MULT_SUB (x [2], offik, X [4*i + 2]) ;
753 MULT_SUB (x [3], offik, X [4*i + 3]) ;
756 X [4*k + 1] = x [1] ;
757 X [4*k + 2] = x [2] ;
758 X [4*k + 3] = x [3] ;
773 CONJ (s, Udiag [k1]) ;
784 DIV (X [k1], X [k1], s) ;
788 DIV (X [2*k1], X [2*k1], s) ;
789 DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
793 DIV (X [3*k1], X [3*k1], s) ;
794 DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
795 DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
799 DIV (X [4*k1], X [4*k1], s) ;
800 DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
801 DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
802 DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
809 KLU_utsolve (nk, Uip + k1, Ulen + k1, LUbx [block],
815 KLU_ltsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
837 for (k = 0 ; k < n ; k++)
839 Xout [Pnum [k]] = X [k] ;
845 for (k = 0 ; k < n ; k++)
848 Xout [i ] = X [2*k ] ;
849 Xout [i + d ] = X [2*k + 1] ;
855 for (k = 0 ; k < n ; k++)
858 Xout [i ] = X [3*k ] ;
859 Xout [i + d ] = X [3*k + 1] ;
860 Xout [i + d*2] = X [3*k + 2] ;
866 for (k = 0 ; k < n ; k++)
869 Xout [i ] = X [4*k ] ;
870 Xout [i + d ] = X [4*k + 1] ;
871 Xout [i + d*2] = X [4*k + 2] ;
872 Xout [i + d*3] = X [4*k + 3] ;
886 for (k = 0 ; k < n ; k++)
888 SCALE_DIV_ASSIGN (Xout [Pnum [k]], X [k], Rs [k]) ;
894 for (k = 0 ; k < n ; k++)
898 SCALE_DIV_ASSIGN (Xout [i], X [2*k], rs) ;
899 SCALE_DIV_ASSIGN (Xout [i + d], X [2*k + 1], rs) ;
905 for (k = 0 ; k < n ; k++)
909 SCALE_DIV_ASSIGN (Xout [i], X [3*k], rs) ;
910 SCALE_DIV_ASSIGN (Xout [i + d], X [3*k + 1], rs) ;
911 SCALE_DIV_ASSIGN (Xout [i + d*2], X [3*k + 2], rs) ;
917 for (k = 0 ; k < n ; k++)
921 SCALE_DIV_ASSIGN (Xout [i], X [4*k], rs) ;
922 SCALE_DIV_ASSIGN (Xout [i + d], X [4*k + 1], rs) ;
923 SCALE_DIV_ASSIGN (Xout [i + d*2], X [4*k + 2], rs) ;
924 SCALE_DIV_ASSIGN (Xout [i + d*3], X [4*k + 3], rs) ;