Amesos2 - Direct Sparse Solver Interfaces  Version of the Day
klu2_solve.hpp
1 /* ========================================================================== */
2 /* === KLU_solve ============================================================ */
3 /* ========================================================================== */
4 // @HEADER
5 // ***********************************************************************
6 //
7 // KLU2: A Direct Linear Solver package
8 // Copyright 2011 Sandia Corporation
9 //
10 // Under terms of Contract DE-AC04-94AL85000, with Sandia Corporation, the
11 // U.S. Government retains certain rights in this software.
12 //
13 // This library is free software; you can redistribute it and/or modify
14 // it under the terms of the GNU Lesser General Public License as
15 // published by the Free Software Foundation; either version 2.1 of the
16 // License, or (at your option) any later version.
17 //
18 // This library is distributed in the hope that it will be useful, but
19 // WITHOUT ANY WARRANTY; without even the implied warranty of
20 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 // Lesser General Public License for more details.
22 //
23 // You should have received a copy of the GNU Lesser General Public
24 // License along with this library; if not, write to the Free Software
25 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
26 // USA
27 // Questions? Contact Mike A. Heroux (maherou@sandia.gov)
28 //
29 // KLU2 is derived work from KLU, licensed under LGPL, and copyrighted by
30 // University of Florida. The Authors of KLU are Timothy A. Davis and
31 // Eka Palamadai. See Doc/KLU_README.txt for the licensing and copyright
32 // information for KLU.
33 //
34 // ***********************************************************************
35 // @HEADER
36 
37 /* Solve Ax=b using the symbolic and numeric objects from KLU_analyze
38  * (or KLU_analyze_given) and KLU_factor. Note that no iterative refinement is
39  * performed. Uses Numeric->Xwork as workspace (undefined on input and output),
40  * of size 4n Entry's (note that columns 2 to 4 of Xwork overlap with
41  * Numeric->Iwork).
42  */
43 
44 #ifndef KLU2_SOLVE_HPP
45 #define KLU2_SOLVE_HPP
46 
47 #include "klu2_internal.h"
48 #include "klu2.hpp"
49 
50 template <typename Entry, typename Int>
51 Int KLU_solve
52 (
53  /* inputs, not modified */
54  KLU_symbolic<Entry, Int> *Symbolic,
55  KLU_numeric<Entry, Int> *Numeric,
56  Int d, /* leading dimension of B */
57  Int nrhs, /* number of right-hand-sides */
58 
59  /* right-hand-side on input, overwritten with solution to Ax=b on output */
60  /* TODO: Switched from double to Entry, check for all cases */
61  Entry B [ ], /* size n*nrhs, in column-oriented form, with
62  * leading dimension d. */
63  /* --------------- */
64  KLU_common<Entry, Int> *Common
65 )
66 {
67  Entry x [4], offik, s ;
68  double rs, *Rs ;
69  Entry *Offx, *X, *Bz, *Udiag ;
70  Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
71  Unit **LUbx ;
72  Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
73 
74  /* ---------------------------------------------------------------------- */
75  /* check inputs */
76  /* ---------------------------------------------------------------------- */
77 
78  if (Common == NULL)
79  {
80  return (FALSE) ;
81  }
82  if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
83  B == NULL)
84  {
85  Common->status = KLU_INVALID ;
86  return (FALSE) ;
87  }
88  Common->status = KLU_OK ;
89 
90  /* ---------------------------------------------------------------------- */
91  /* get the contents of the Symbolic object */
92  /* ---------------------------------------------------------------------- */
93 
94  Bz = (Entry *) B ;
95  n = Symbolic->n ;
96  nblocks = Symbolic->nblocks ;
97  Q = Symbolic->Q ;
98  R = Symbolic->R ;
99 
100  /* ---------------------------------------------------------------------- */
101  /* get the contents of the Numeric object */
102  /* ---------------------------------------------------------------------- */
103 
104  ASSERT (nblocks == Numeric->nblocks) ;
105  Pnum = Numeric->Pnum ;
106  Offp = Numeric->Offp ;
107  Offi = Numeric->Offi ;
108  Offx = (Entry *) Numeric->Offx ;
109 
110  Lip = Numeric->Lip ;
111  Llen = Numeric->Llen ;
112  Uip = Numeric->Uip ;
113  Ulen = Numeric->Ulen ;
114  LUbx = (Unit **) Numeric->LUbx ;
115  Udiag = (Entry *) Numeric->Udiag ;
116 
117  Rs = Numeric->Rs ;
118  X = (Entry *) Numeric->Xwork ;
119 
120  ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
121 
122  /* ---------------------------------------------------------------------- */
123  /* solve in chunks of 4 columns at a time */
124  /* ---------------------------------------------------------------------- */
125 
126  for (chunk = 0 ; chunk < nrhs ; chunk += 4)
127  {
128 
129  /* ------------------------------------------------------------------ */
130  /* get the size of the current chunk */
131  /* ------------------------------------------------------------------ */
132 
133  nr = MIN (nrhs - chunk, 4) ;
134 
135  /* ------------------------------------------------------------------ */
136  /* scale and permute the right hand side, X = P*(R\B) */
137  /* ------------------------------------------------------------------ */
138 
139  if (Rs == NULL)
140  {
141 
142  /* no scaling */
143  switch (nr)
144  {
145 
146  case 1:
147 
148  for (k = 0 ; k < n ; k++)
149  {
150  X [k] = Bz [Pnum [k]] ;
151  }
152  break ;
153 
154  case 2:
155 
156  for (k = 0 ; k < n ; k++)
157  {
158  i = Pnum [k] ;
159  X [2*k ] = Bz [i ] ;
160  X [2*k + 1] = Bz [i + d ] ;
161  }
162  break ;
163 
164  case 3:
165 
166  for (k = 0 ; k < n ; k++)
167  {
168  i = Pnum [k] ;
169  X [3*k ] = Bz [i ] ;
170  X [3*k + 1] = Bz [i + d ] ;
171  X [3*k + 2] = Bz [i + d*2] ;
172  }
173  break ;
174 
175  case 4:
176 
177  for (k = 0 ; k < n ; k++)
178  {
179  i = Pnum [k] ;
180  X [4*k ] = Bz [i ] ;
181  X [4*k + 1] = Bz [i + d ] ;
182  X [4*k + 2] = Bz [i + d*2] ;
183  X [4*k + 3] = Bz [i + d*3] ;
184  }
185  break ;
186  }
187 
188  }
189  else
190  {
191 
192  switch (nr)
193  {
194 
195  case 1:
196 
197  for (k = 0 ; k < n ; k++)
198  {
199  SCALE_DIV_ASSIGN (X [k], Bz [Pnum [k]], Rs [k]) ;
200  }
201  break ;
202 
203  case 2:
204 
205  for (k = 0 ; k < n ; k++)
206  {
207  i = Pnum [k] ;
208  rs = Rs [k] ;
209  SCALE_DIV_ASSIGN (X [2*k], Bz [i], rs) ;
210  SCALE_DIV_ASSIGN (X [2*k + 1], Bz [i + d], rs) ;
211  }
212  break ;
213 
214  case 3:
215 
216  for (k = 0 ; k < n ; k++)
217  {
218  i = Pnum [k] ;
219  rs = Rs [k] ;
220  SCALE_DIV_ASSIGN (X [3*k], Bz [i], rs) ;
221  SCALE_DIV_ASSIGN (X [3*k + 1], Bz [i + d], rs) ;
222  SCALE_DIV_ASSIGN (X [3*k + 2], Bz [i + d*2], rs) ;
223  }
224  break ;
225 
226  case 4:
227 
228  for (k = 0 ; k < n ; k++)
229  {
230  i = Pnum [k] ;
231  rs = Rs [k] ;
232  SCALE_DIV_ASSIGN (X [4*k], Bz [i], rs) ;
233  SCALE_DIV_ASSIGN (X [4*k + 1], Bz [i + d], rs) ;
234  SCALE_DIV_ASSIGN (X [4*k + 2], Bz [i + d*2], rs) ;
235  SCALE_DIV_ASSIGN (X [4*k + 3], Bz [i + d*3], rs) ;
236  }
237  break ;
238  }
239  }
240 
241  /* ------------------------------------------------------------------ */
242  /* solve X = (L*U + Off)\X */
243  /* ------------------------------------------------------------------ */
244 
245  for (block = nblocks-1 ; block >= 0 ; block--)
246  {
247 
248  /* -------------------------------------------------------------- */
249  /* the block of size nk is from rows/columns k1 to k2-1 */
250  /* -------------------------------------------------------------- */
251 
252  k1 = R [block] ;
253  k2 = R [block+1] ;
254  nk = k2 - k1 ;
255  PRINTF (("solve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
256 
257  /* solve the block system */
258  if (nk == 1)
259  {
260  s = Udiag [k1] ;
261  switch (nr)
262  {
263 
264  case 1:
265  DIV (X [k1], X [k1], s) ;
266  break ;
267 
268  case 2:
269  DIV (X [2*k1], X [2*k1], s) ;
270  DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
271  break ;
272 
273  case 3:
274  DIV (X [3*k1], X [3*k1], s) ;
275  DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
276  DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
277  break ;
278 
279  case 4:
280  DIV (X [4*k1], X [4*k1], s) ;
281  DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
282  DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
283  DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
284  break ;
285 
286  }
287  }
288  else
289  {
290  KLU_lsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
291  X + nr*k1) ;
292  KLU_usolve (nk, Uip + k1, Ulen + k1, LUbx [block],
293  Udiag + k1, nr, X + nr*k1) ;
294  }
295 
296  /* -------------------------------------------------------------- */
297  /* block back-substitution for the off-diagonal-block entries */
298  /* -------------------------------------------------------------- */
299 
300  if (block > 0)
301  {
302  switch (nr)
303  {
304 
305  case 1:
306 
307  for (k = k1 ; k < k2 ; k++)
308  {
309  pend = Offp [k+1] ;
310  x [0] = X [k] ;
311  for (p = Offp [k] ; p < pend ; p++)
312  {
313  MULT_SUB (X [Offi [p]], Offx [p], x [0]) ;
314  }
315  }
316  break ;
317 
318  case 2:
319 
320  for (k = k1 ; k < k2 ; k++)
321  {
322  pend = Offp [k+1] ;
323  x [0] = X [2*k ] ;
324  x [1] = X [2*k + 1] ;
325  for (p = Offp [k] ; p < pend ; p++)
326  {
327  i = Offi [p] ;
328  offik = Offx [p] ;
329  MULT_SUB (X [2*i], offik, x [0]) ;
330  MULT_SUB (X [2*i + 1], offik, x [1]) ;
331  }
332  }
333  break ;
334 
335  case 3:
336 
337  for (k = k1 ; k < k2 ; k++)
338  {
339  pend = Offp [k+1] ;
340  x [0] = X [3*k ] ;
341  x [1] = X [3*k + 1] ;
342  x [2] = X [3*k + 2] ;
343  for (p = Offp [k] ; p < pend ; p++)
344  {
345  i = Offi [p] ;
346  offik = Offx [p] ;
347  MULT_SUB (X [3*i], offik, x [0]) ;
348  MULT_SUB (X [3*i + 1], offik, x [1]) ;
349  MULT_SUB (X [3*i + 2], offik, x [2]) ;
350  }
351  }
352  break ;
353 
354  case 4:
355 
356  for (k = k1 ; k < k2 ; k++)
357  {
358  pend = Offp [k+1] ;
359  x [0] = X [4*k ] ;
360  x [1] = X [4*k + 1] ;
361  x [2] = X [4*k + 2] ;
362  x [3] = X [4*k + 3] ;
363  for (p = Offp [k] ; p < pend ; p++)
364  {
365  i = Offi [p] ;
366  offik = Offx [p] ;
367  MULT_SUB (X [4*i], offik, x [0]) ;
368  MULT_SUB (X [4*i + 1], offik, x [1]) ;
369  MULT_SUB (X [4*i + 2], offik, x [2]) ;
370  MULT_SUB (X [4*i + 3], offik, x [3]) ;
371  }
372  }
373  break ;
374  }
375  }
376  }
377 
378  /* ------------------------------------------------------------------ */
379  /* permute the result, Bz = Q*X */
380  /* ------------------------------------------------------------------ */
381 
382  switch (nr)
383  {
384 
385  case 1:
386 
387  for (k = 0 ; k < n ; k++)
388  {
389  Bz [Q [k]] = X [k] ;
390  }
391  break ;
392 
393  case 2:
394 
395  for (k = 0 ; k < n ; k++)
396  {
397  i = Q [k] ;
398  Bz [i ] = X [2*k ] ;
399  Bz [i + d ] = X [2*k + 1] ;
400  }
401  break ;
402 
403  case 3:
404 
405  for (k = 0 ; k < n ; k++)
406  {
407  i = Q [k] ;
408  Bz [i ] = X [3*k ] ;
409  Bz [i + d ] = X [3*k + 1] ;
410  Bz [i + d*2] = X [3*k + 2] ;
411  }
412  break ;
413 
414  case 4:
415 
416  for (k = 0 ; k < n ; k++)
417  {
418  i = Q [k] ;
419  Bz [i ] = X [4*k ] ;
420  Bz [i + d ] = X [4*k + 1] ;
421  Bz [i + d*2] = X [4*k + 2] ;
422  Bz [i + d*3] = X [4*k + 3] ;
423  }
424  break ;
425  }
426 
427  /* ------------------------------------------------------------------ */
428  /* go to the next chunk of B */
429  /* ------------------------------------------------------------------ */
430 
431  Bz += d*4 ;
432  }
433  return (TRUE) ;
434 }
435 
436 
437 // Require Xout and B to have equal number of entries, pre-allocated
438 template <typename Entry, typename Int>
439 Int KLU_solve2
440 (
441  /* inputs, not modified */
442  KLU_symbolic<Entry, Int> *Symbolic,
443  KLU_numeric<Entry, Int> *Numeric,
444  Int d, /* leading dimension of B */
445  Int nrhs, /* number of right-hand-sides */
446 
447  /* right-hand-side on input, overwritten with solution to Ax=b on output */
448  /* TODO: Switched from double to Entry, check for all cases */
449  Entry B [ ], /* size n*1, in column-oriented form, with
450  * leading dimension d. */
451  Entry Xout [ ], /* size n*1, in column-oriented form, with
452  * leading dimension d. */
453  /* --------------- */
454  KLU_common<Entry, Int> *Common
455 )
456 {
457  Entry x [4], offik, s ;
458  double rs, *Rs ;
459  Entry *Offx, *X, *Bz, *Udiag ;
460  Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
461  Unit **LUbx ;
462  Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
463 
464  /* ---------------------------------------------------------------------- */
465  /* check inputs */
466  /* ---------------------------------------------------------------------- */
467 
468  if (Common == NULL)
469  {
470  return (FALSE) ;
471  }
472  if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
473  B == NULL || Xout == NULL)
474  {
475  Common->status = KLU_INVALID ;
476  return (FALSE) ;
477  }
478  Common->status = KLU_OK ;
479 
480  /* ---------------------------------------------------------------------- */
481  /* get the contents of the Symbolic object */
482  /* ---------------------------------------------------------------------- */
483 
484  Bz = (Entry *) B ;
485  n = Symbolic->n ;
486  nblocks = Symbolic->nblocks ;
487  Q = Symbolic->Q ;
488  R = Symbolic->R ;
489 
490  /* ---------------------------------------------------------------------- */
491  /* get the contents of the Numeric object */
492  /* ---------------------------------------------------------------------- */
493 
494  ASSERT (nblocks == Numeric->nblocks) ;
495  Pnum = Numeric->Pnum ;
496  Offp = Numeric->Offp ;
497  Offi = Numeric->Offi ;
498  Offx = (Entry *) Numeric->Offx ;
499 
500  Lip = Numeric->Lip ;
501  Llen = Numeric->Llen ;
502  Uip = Numeric->Uip ;
503  Ulen = Numeric->Ulen ;
504  LUbx = (Unit **) Numeric->LUbx ;
505  Udiag = (Entry *) Numeric->Udiag ;
506 
507  Rs = Numeric->Rs ;
508  X = (Entry *) Numeric->Xwork ;
509 
510  ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
511 
512  /* ---------------------------------------------------------------------- */
513  /* solve in chunks of 4 columns at a time */
514  /* ---------------------------------------------------------------------- */
515 
516  for (chunk = 0 ; chunk < nrhs ; chunk += 4)
517  {
518 
519  /* ------------------------------------------------------------------ */
520  /* get the size of the current chunk */
521  /* ------------------------------------------------------------------ */
522 
523  nr = MIN (nrhs - chunk, 4) ;
524 
525  /* ------------------------------------------------------------------ */
526  /* scale and permute the right hand side, X = P*(R\B) */
527  /* ------------------------------------------------------------------ */
528 
529  if (Rs == NULL)
530  {
531 
532  /* no scaling */
533  switch (nr)
534  {
535 
536  case 1:
537 
538  for (k = 0 ; k < n ; k++)
539  {
540  X [k] = Bz [Pnum [k]] ;
541  }
542  break ;
543 
544  case 2:
545 
546  for (k = 0 ; k < n ; k++)
547  {
548  i = Pnum [k] ;
549  X [2*k ] = Bz [i ] ;
550  X [2*k + 1] = Bz [i + d ] ;
551  }
552  break ;
553 
554  case 3:
555 
556  for (k = 0 ; k < n ; k++)
557  {
558  i = Pnum [k] ;
559  X [3*k ] = Bz [i ] ;
560  X [3*k + 1] = Bz [i + d ] ;
561  X [3*k + 2] = Bz [i + d*2] ;
562  }
563  break ;
564 
565  case 4:
566 
567  for (k = 0 ; k < n ; k++)
568  {
569  i = Pnum [k] ;
570  X [4*k ] = Bz [i ] ;
571  X [4*k + 1] = Bz [i + d ] ;
572  X [4*k + 2] = Bz [i + d*2] ;
573  X [4*k + 3] = Bz [i + d*3] ;
574  }
575  break ;
576  }
577 
578  }
579  else
580  {
581 
582  switch (nr)
583  {
584 
585  case 1:
586 
587  for (k = 0 ; k < n ; k++)
588  {
589  SCALE_DIV_ASSIGN (X [k], Bz [Pnum [k]], Rs [k]) ;
590  }
591  break ;
592 
593  case 2:
594 
595  for (k = 0 ; k < n ; k++)
596  {
597  i = Pnum [k] ;
598  rs = Rs [k] ;
599  SCALE_DIV_ASSIGN (X [2*k], Bz [i], rs) ;
600  SCALE_DIV_ASSIGN (X [2*k + 1], Bz [i + d], rs) ;
601  }
602  break ;
603 
604  case 3:
605 
606  for (k = 0 ; k < n ; k++)
607  {
608  i = Pnum [k] ;
609  rs = Rs [k] ;
610  SCALE_DIV_ASSIGN (X [3*k], Bz [i], rs) ;
611  SCALE_DIV_ASSIGN (X [3*k + 1], Bz [i + d], rs) ;
612  SCALE_DIV_ASSIGN (X [3*k + 2], Bz [i + d*2], rs) ;
613  }
614  break ;
615 
616  case 4:
617 
618  for (k = 0 ; k < n ; k++)
619  {
620  i = Pnum [k] ;
621  rs = Rs [k] ;
622  SCALE_DIV_ASSIGN (X [4*k], Bz [i], rs) ;
623  SCALE_DIV_ASSIGN (X [4*k + 1], Bz [i + d], rs) ;
624  SCALE_DIV_ASSIGN (X [4*k + 2], Bz [i + d*2], rs) ;
625  SCALE_DIV_ASSIGN (X [4*k + 3], Bz [i + d*3], rs) ;
626  }
627  break ;
628  }
629  }
630 
631  /* ------------------------------------------------------------------ */
632  /* solve X = (L*U + Off)\X */
633  /* ------------------------------------------------------------------ */
634 
635  for (block = nblocks-1 ; block >= 0 ; block--)
636  {
637 
638  /* -------------------------------------------------------------- */
639  /* the block of size nk is from rows/columns k1 to k2-1 */
640  /* -------------------------------------------------------------- */
641 
642  k1 = R [block] ;
643  k2 = R [block+1] ;
644  nk = k2 - k1 ;
645  PRINTF (("solve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
646 
647  /* solve the block system */
648  if (nk == 1)
649  {
650  s = Udiag [k1] ;
651  switch (nr)
652  {
653 
654  case 1:
655  DIV (X [k1], X [k1], s) ;
656  break ;
657 
658  case 2:
659  DIV (X [2*k1], X [2*k1], s) ;
660  DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
661  break ;
662 
663  case 3:
664  DIV (X [3*k1], X [3*k1], s) ;
665  DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
666  DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
667  break ;
668 
669  case 4:
670  DIV (X [4*k1], X [4*k1], s) ;
671  DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
672  DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
673  DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
674  break ;
675 
676  }
677  }
678  else
679  {
680  KLU_lsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
681  X + nr*k1) ;
682  KLU_usolve (nk, Uip + k1, Ulen + k1, LUbx [block],
683  Udiag + k1, nr, X + nr*k1) ;
684  }
685 
686  /* -------------------------------------------------------------- */
687  /* block back-substitution for the off-diagonal-block entries */
688  /* -------------------------------------------------------------- */
689 
690  if (block > 0)
691  {
692  switch (nr)
693  {
694 
695  case 1:
696 
697  for (k = k1 ; k < k2 ; k++)
698  {
699  pend = Offp [k+1] ;
700  x [0] = X [k] ;
701  for (p = Offp [k] ; p < pend ; p++)
702  {
703  MULT_SUB (X [Offi [p]], Offx [p], x [0]) ;
704  }
705  }
706  break ;
707 
708  case 2:
709 
710  for (k = k1 ; k < k2 ; k++)
711  {
712  pend = Offp [k+1] ;
713  x [0] = X [2*k ] ;
714  x [1] = X [2*k + 1] ;
715  for (p = Offp [k] ; p < pend ; p++)
716  {
717  i = Offi [p] ;
718  offik = Offx [p] ;
719  MULT_SUB (X [2*i], offik, x [0]) ;
720  MULT_SUB (X [2*i + 1], offik, x [1]) ;
721  }
722  }
723  break ;
724 
725  case 3:
726 
727  for (k = k1 ; k < k2 ; k++)
728  {
729  pend = Offp [k+1] ;
730  x [0] = X [3*k ] ;
731  x [1] = X [3*k + 1] ;
732  x [2] = X [3*k + 2] ;
733  for (p = Offp [k] ; p < pend ; p++)
734  {
735  i = Offi [p] ;
736  offik = Offx [p] ;
737  MULT_SUB (X [3*i], offik, x [0]) ;
738  MULT_SUB (X [3*i + 1], offik, x [1]) ;
739  MULT_SUB (X [3*i + 2], offik, x [2]) ;
740  }
741  }
742  break ;
743 
744  case 4:
745 
746  for (k = k1 ; k < k2 ; k++)
747  {
748  pend = Offp [k+1] ;
749  x [0] = X [4*k ] ;
750  x [1] = X [4*k + 1] ;
751  x [2] = X [4*k + 2] ;
752  x [3] = X [4*k + 3] ;
753  for (p = Offp [k] ; p < pend ; p++)
754  {
755  i = Offi [p] ;
756  offik = Offx [p] ;
757  MULT_SUB (X [4*i], offik, x [0]) ;
758  MULT_SUB (X [4*i + 1], offik, x [1]) ;
759  MULT_SUB (X [4*i + 2], offik, x [2]) ;
760  MULT_SUB (X [4*i + 3], offik, x [3]) ;
761  }
762  }
763  break ;
764  }
765  }
766  }
767 
768  /* ------------------------------------------------------------------ */
769  /* permute the result, Bz = Q*X */
770  /* store result in Xout */
771  /* ------------------------------------------------------------------ */
772 
773  switch (nr)
774  {
775 
776  case 1:
777 
778  for (k = 0 ; k < n ; k++)
779  {
780  Xout [Q [k]] = X [k] ;
781  }
782  break ;
783 
784  case 2:
785 
786  for (k = 0 ; k < n ; k++)
787  {
788  i = Q [k] ;
789  Xout [i ] = X [2*k ] ;
790  Xout [i + d ] = X [2*k + 1] ;
791  }
792  break ;
793 
794  case 3:
795 
796  for (k = 0 ; k < n ; k++)
797  {
798  i = Q [k] ;
799  Xout [i ] = X [3*k ] ;
800  Xout [i + d ] = X [3*k + 1] ;
801  Xout [i + d*2] = X [3*k + 2] ;
802  }
803  break ;
804 
805  case 4:
806 
807  for (k = 0 ; k < n ; k++)
808  {
809  i = Q [k] ;
810  Xout [i ] = X [4*k ] ;
811  Xout [i + d ] = X [4*k + 1] ;
812  Xout [i + d*2] = X [4*k + 2] ;
813  Xout [i + d*3] = X [4*k + 3] ;
814  }
815  break ;
816  }
817 
818  /* ------------------------------------------------------------------ */
819  /* go to the next chunk of B */
820  /* ------------------------------------------------------------------ */
821 
822  Xout += d*4 ;
823  }
824  return (TRUE) ;
825 }
826 
827 
828 #endif