20 #ifndef KLU2_SOLVE_HPP 
   21 #define KLU2_SOLVE_HPP 
   23 #include "klu2_internal.h" 
   26 template <
typename Entry, 
typename Int>
 
   30     KLU_symbolic<Entry, Int> *Symbolic,
 
   31     KLU_numeric<Entry, Int> *Numeric,
 
   40     KLU_common<Entry, Int> *Common
 
   43     Entry x [4], offik, s ;
 
   45     Entry *Offx, *X, *Bz, *Udiag ;
 
   46     Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
 
   48     Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
 
   58     if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
 
   61         Common->status = KLU_INVALID ;
 
   64     Common->status = KLU_OK ;
 
   72     nblocks = Symbolic->nblocks ;
 
   80     ASSERT (nblocks == Numeric->nblocks) ;
 
   81     Pnum = Numeric->Pnum ;
 
   82     Offp = Numeric->Offp ;
 
   83     Offi = Numeric->Offi ;
 
   84     Offx = (Entry *) Numeric->Offx ;
 
   87     Llen = Numeric->Llen ;
 
   89     Ulen = Numeric->Ulen ;
 
   90     LUbx = (Unit **) Numeric->LUbx ;
 
   91     Udiag = (Entry *) Numeric->Udiag ;
 
   94     X = (Entry *) Numeric->Xwork ;
 
   96     ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
 
  102     for (chunk = 0 ; chunk < nrhs ; chunk += 4)
 
  109         nr = MIN (nrhs - chunk, 4) ;
 
  124                     for (k = 0 ; k < n ; k++)
 
  126                         X [k] = Bz [Pnum [k]] ;
 
  132                     for (k = 0 ; k < n ; k++)
 
  136                         X [2*k + 1] = Bz  [i + d  ] ;
 
  142                     for (k = 0 ; k < n ; k++)
 
  146                         X [3*k + 1] = Bz [i + d  ] ;
 
  147                         X [3*k + 2] = Bz [i + d*2] ;
 
  153                     for (k = 0 ; k < n ; k++)
 
  157                         X [4*k + 1] = Bz [i + d  ] ;
 
  158                         X [4*k + 2] = Bz [i + d*2] ;
 
  159                         X [4*k + 3] = Bz [i + d*3] ;
 
  173                     for (k = 0 ; k < n ; k++)
 
  175                         SCALE_DIV_ASSIGN (X [k], Bz  [Pnum [k]], Rs [k]) ;
 
  181                     for (k = 0 ; k < n ; k++)
 
  185                         SCALE_DIV_ASSIGN (X [2*k], Bz [i], rs) ;
 
  186                         SCALE_DIV_ASSIGN (X [2*k + 1], Bz [i + d], rs) ;
 
  192                     for (k = 0 ; k < n ; k++)
 
  196                         SCALE_DIV_ASSIGN (X [3*k], Bz [i], rs) ;
 
  197                         SCALE_DIV_ASSIGN (X [3*k + 1], Bz [i + d], rs) ;
 
  198                         SCALE_DIV_ASSIGN (X [3*k + 2], Bz [i + d*2], rs) ;
 
  204                     for (k = 0 ; k < n ; k++)
 
  208                         SCALE_DIV_ASSIGN (X [4*k], Bz [i], rs) ;
 
  209                         SCALE_DIV_ASSIGN (X [4*k + 1], Bz [i + d], rs) ;
 
  210                         SCALE_DIV_ASSIGN (X [4*k + 2], Bz [i + d*2], rs) ;
 
  211                         SCALE_DIV_ASSIGN (X [4*k + 3], Bz [i + d*3], rs) ;
 
  221         for (block = nblocks-1 ; block >= 0 ; block--)
 
  231             PRINTF ((
"solve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
 
  241                         DIV (X [k1], X [k1], s) ;
 
  245                         DIV (X [2*k1], X [2*k1], s) ;
 
  246                         DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
 
  250                         DIV (X [3*k1], X [3*k1], s) ;
 
  251                         DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
 
  252                         DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
 
  256                         DIV (X [4*k1], X [4*k1], s) ;
 
  257                         DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
 
  258                         DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
 
  259                         DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
 
  266                 KLU_lsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
 
  268                 KLU_usolve (nk, Uip + k1, Ulen + k1, LUbx [block],
 
  269                         Udiag + k1, nr, X + nr*k1) ;
 
  283                         for (k = k1 ; k < k2 ; k++)
 
  287                             for (p = Offp [k] ; p < pend ; p++)
 
  289                                 MULT_SUB (X [Offi [p]], Offx [p], x [0]) ;
 
  296                         for (k = k1 ; k < k2 ; k++)
 
  300                             x [1] = X [2*k + 1] ;
 
  301                             for (p = Offp [k] ; p < pend ; p++)
 
  305                                 MULT_SUB (X [2*i], offik, x [0]) ;
 
  306                                 MULT_SUB (X [2*i + 1], offik, x [1]) ;
 
  313                         for (k = k1 ; k < k2 ; k++)
 
  317                             x [1] = X [3*k + 1] ;
 
  318                             x [2] = X [3*k + 2] ;
 
  319                             for (p = Offp [k] ; p < pend ; p++)
 
  323                                 MULT_SUB (X [3*i], offik, x [0]) ;
 
  324                                 MULT_SUB (X [3*i + 1], offik, x [1]) ;
 
  325                                 MULT_SUB (X [3*i + 2], offik, x [2]) ;
 
  332                         for (k = k1 ; k < k2 ; k++)
 
  336                             x [1] = X [4*k + 1] ;
 
  337                             x [2] = X [4*k + 2] ;
 
  338                             x [3] = X [4*k + 3] ;
 
  339                             for (p = Offp [k] ; p < pend ; p++)
 
  343                                 MULT_SUB (X [4*i], offik, x [0]) ;
 
  344                                 MULT_SUB (X [4*i + 1], offik, x [1]) ;
 
  345                                 MULT_SUB (X [4*i + 2], offik, x [2]) ;
 
  346                                 MULT_SUB (X [4*i + 3], offik, x [3]) ;
 
  363                 for (k = 0 ; k < n ; k++)
 
  371                 for (k = 0 ; k < n ; k++)
 
  375                     Bz  [i + d  ] = X [2*k + 1] ;
 
  381                 for (k = 0 ; k < n ; k++)
 
  385                     Bz  [i + d  ] = X [3*k + 1] ;
 
  386                     Bz  [i + d*2] = X [3*k + 2] ;
 
  392                 for (k = 0 ; k < n ; k++)
 
  396                     Bz  [i + d  ] = X [4*k + 1] ;
 
  397                     Bz  [i + d*2] = X [4*k + 2] ;
 
  398                     Bz  [i + d*3] = X [4*k + 3] ;
 
  414 template <
typename Entry, 
typename Int>
 
  418     KLU_symbolic<Entry, Int> *Symbolic,
 
  419     KLU_numeric<Entry, Int> *Numeric,
 
  430     KLU_common<Entry, Int> *Common
 
  433     Entry x [4], offik, s ;
 
  435     Entry *Offx, *X, *Bz, *Udiag ;
 
  436     Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
 
  438     Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
 
  448     if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
 
  449         B == NULL || Xout == NULL)
 
  451         Common->status = KLU_INVALID ;
 
  454     Common->status = KLU_OK ;
 
  462     nblocks = Symbolic->nblocks ;
 
  470     ASSERT (nblocks == Numeric->nblocks) ;
 
  471     Pnum = Numeric->Pnum ;
 
  472     Offp = Numeric->Offp ;
 
  473     Offi = Numeric->Offi ;
 
  474     Offx = (Entry *) Numeric->Offx ;
 
  477     Llen = Numeric->Llen ;
 
  479     Ulen = Numeric->Ulen ;
 
  480     LUbx = (Unit **) Numeric->LUbx ;
 
  481     Udiag = (Entry *) Numeric->Udiag ;
 
  484     X = (Entry *) Numeric->Xwork ;
 
  486     ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
 
  492     for (chunk = 0 ; chunk < nrhs ; chunk += 4)
 
  499         nr = MIN (nrhs - chunk, 4) ;
 
  514                     for (k = 0 ; k < n ; k++)
 
  516                         X [k] = Bz [Pnum [k]] ;
 
  522                     for (k = 0 ; k < n ; k++)
 
  526                         X [2*k + 1] = Bz  [i + d  ] ;
 
  532                     for (k = 0 ; k < n ; k++)
 
  536                         X [3*k + 1] = Bz [i + d  ] ;
 
  537                         X [3*k + 2] = Bz [i + d*2] ;
 
  543                     for (k = 0 ; k < n ; k++)
 
  547                         X [4*k + 1] = Bz [i + d  ] ;
 
  548                         X [4*k + 2] = Bz [i + d*2] ;
 
  549                         X [4*k + 3] = Bz [i + d*3] ;
 
  563                     for (k = 0 ; k < n ; k++)
 
  565                         SCALE_DIV_ASSIGN (X [k], Bz  [Pnum [k]], Rs [k]) ;
 
  571                     for (k = 0 ; k < n ; k++)
 
  575                         SCALE_DIV_ASSIGN (X [2*k], Bz [i], rs) ;
 
  576                         SCALE_DIV_ASSIGN (X [2*k + 1], Bz [i + d], rs) ;
 
  582                     for (k = 0 ; k < n ; k++)
 
  586                         SCALE_DIV_ASSIGN (X [3*k], Bz [i], rs) ;
 
  587                         SCALE_DIV_ASSIGN (X [3*k + 1], Bz [i + d], rs) ;
 
  588                         SCALE_DIV_ASSIGN (X [3*k + 2], Bz [i + d*2], rs) ;
 
  594                     for (k = 0 ; k < n ; k++)
 
  598                         SCALE_DIV_ASSIGN (X [4*k], Bz [i], rs) ;
 
  599                         SCALE_DIV_ASSIGN (X [4*k + 1], Bz [i + d], rs) ;
 
  600                         SCALE_DIV_ASSIGN (X [4*k + 2], Bz [i + d*2], rs) ;
 
  601                         SCALE_DIV_ASSIGN (X [4*k + 3], Bz [i + d*3], rs) ;
 
  611         for (block = nblocks-1 ; block >= 0 ; block--)
 
  621             PRINTF ((
"solve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
 
  631                         DIV (X [k1], X [k1], s) ;
 
  635                         DIV (X [2*k1], X [2*k1], s) ;
 
  636                         DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
 
  640                         DIV (X [3*k1], X [3*k1], s) ;
 
  641                         DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
 
  642                         DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
 
  646                         DIV (X [4*k1], X [4*k1], s) ;
 
  647                         DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
 
  648                         DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
 
  649                         DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
 
  656                 KLU_lsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
 
  658                 KLU_usolve (nk, Uip + k1, Ulen + k1, LUbx [block],
 
  659                         Udiag + k1, nr, X + nr*k1) ;
 
  673                         for (k = k1 ; k < k2 ; k++)
 
  677                             for (p = Offp [k] ; p < pend ; p++)
 
  679                                 MULT_SUB (X [Offi [p]], Offx [p], x [0]) ;
 
  686                         for (k = k1 ; k < k2 ; k++)
 
  690                             x [1] = X [2*k + 1] ;
 
  691                             for (p = Offp [k] ; p < pend ; p++)
 
  695                                 MULT_SUB (X [2*i], offik, x [0]) ;
 
  696                                 MULT_SUB (X [2*i + 1], offik, x [1]) ;
 
  703                         for (k = k1 ; k < k2 ; k++)
 
  707                             x [1] = X [3*k + 1] ;
 
  708                             x [2] = X [3*k + 2] ;
 
  709                             for (p = Offp [k] ; p < pend ; p++)
 
  713                                 MULT_SUB (X [3*i], offik, x [0]) ;
 
  714                                 MULT_SUB (X [3*i + 1], offik, x [1]) ;
 
  715                                 MULT_SUB (X [3*i + 2], offik, x [2]) ;
 
  722                         for (k = k1 ; k < k2 ; k++)
 
  726                             x [1] = X [4*k + 1] ;
 
  727                             x [2] = X [4*k + 2] ;
 
  728                             x [3] = X [4*k + 3] ;
 
  729                             for (p = Offp [k] ; p < pend ; p++)
 
  733                                 MULT_SUB (X [4*i], offik, x [0]) ;
 
  734                                 MULT_SUB (X [4*i + 1], offik, x [1]) ;
 
  735                                 MULT_SUB (X [4*i + 2], offik, x [2]) ;
 
  736                                 MULT_SUB (X [4*i + 3], offik, x [3]) ;
 
  754                 for (k = 0 ; k < n ; k++)
 
  756                     Xout  [Q [k]] = X [k] ;
 
  762                 for (k = 0 ; k < n ; k++)
 
  765                     Xout  [i      ] = X [2*k    ] ;
 
  766                     Xout  [i + d  ] = X [2*k + 1] ;
 
  772                 for (k = 0 ; k < n ; k++)
 
  775                     Xout  [i      ] = X [3*k    ] ;
 
  776                     Xout  [i + d  ] = X [3*k + 1] ;
 
  777                     Xout  [i + d*2] = X [3*k + 2] ;
 
  783                 for (k = 0 ; k < n ; k++)
 
  786                     Xout  [i      ] = X [4*k    ] ;
 
  787                     Xout  [i + d  ] = X [4*k + 1] ;
 
  788                     Xout  [i + d*2] = X [4*k + 2] ;
 
  789                     Xout  [i + d*3] = X [4*k + 3] ;