Amesos Package Browser (Single Doxygen Collection)  Development
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
amesos_klu_tsolve.c
Go to the documentation of this file.
1 /* ========================================================================== */
2 /* === KLU_tsolve =========================================================== */
3 /* ========================================================================== */
4 
5 /* Solve A'x=b using the symbolic and numeric objects from KLU_analyze
6  * (or KLU_analyze_given) and KLU_factor. Note that no iterative refinement is
7  * performed. Uses Numeric->Xwork as workspace (undefined on input and output),
8  * of size 4n Entry's (note that columns 2 to 4 of Xwork overlap with
9  * Numeric->Iwork).
10  */
11 
12 #include "amesos_klu_internal.h"
13 
15 (
16  /* inputs, not modified */
17  KLU_symbolic *Symbolic,
18  KLU_numeric *Numeric,
19  Int d, /* leading dimension of B */
20  Int nrhs, /* number of right-hand-sides */
21 
22  /* right-hand-side on input, overwritten with solution to Ax=b on output */
23  double B [ ], /* size n*nrhs, in column-oriented form, with
24  * leading dimension d. */
25 #ifdef COMPLEX
26  Int conj_solve, /* TRUE for conjugate transpose solve, FALSE for
27  * array transpose solve. Used for the complex
28  * case only. */
29 #endif
30  /* --------------- */
31  KLU_common *Common
32 )
33 {
34  Entry x [4], offik, s ;
35  double rs, *Rs ;
36  Entry *Offx, *X, *Bz, *Udiag ;
37  Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
38  Unit **LUbx ;
39  Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
40 #ifdef KLU_ENABLE_OPENMP
41  Entry *X1 ;
42  int tid ;
43 #endif
44 
45  /* ---------------------------------------------------------------------- */
46  /* check inputs */
47  /* ---------------------------------------------------------------------- */
48 
49  if (Common == NULL)
50  {
51  return (FALSE) ;
52  }
53  if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
54  B == NULL)
55  {
56  Common->status = KLU_INVALID ;
57  return (FALSE) ;
58  }
59  Common->status = KLU_OK ;
60 
61  /* ---------------------------------------------------------------------- */
62  /* get the contents of the Symbolic object */
63  /* ---------------------------------------------------------------------- */
64 
65  Bz = (Entry *) B ;
66  n = Symbolic->n ;
67  nblocks = Symbolic->nblocks ;
68  Q = Symbolic->Q ;
69  R = Symbolic->R ;
70 
71  /* ---------------------------------------------------------------------- */
72  /* get the contents of the Numeric object */
73  /* ---------------------------------------------------------------------- */
74 
75  ASSERT (nblocks == Numeric->nblocks) ;
76  Pnum = Numeric->Pnum ;
77  Offp = Numeric->Offp ;
78  Offi = Numeric->Offi ;
79  Offx = (Entry *) Numeric->Offx ;
80 
81  Lip = Numeric->Lip ;
82  Llen = Numeric->Llen ;
83  Uip = Numeric->Uip ;
84  Ulen = Numeric->Ulen ;
85  LUbx = (Unit **) Numeric->LUbx ;
86  Udiag = Numeric->Udiag ;
87 
88  Rs = Numeric->Rs ;
89 #ifdef KLU_ENABLE_OPENMP
90  X1 = (Entry *) Numeric->Xwork ;
91 #else
92  X = (Entry *) Numeric->Xwork ;
93 #endif
94  ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
95 
96  /* ---------------------------------------------------------------------- */
97  /* solve in chunks of 4 columns at a time */
98  /* ---------------------------------------------------------------------- */
99 
100 #ifdef KLU_ENABLE_OPENMP
101 #pragma omp parallel for schedule(guided) private(tid, nr, k, block, k1, k2, nk, pend, p, s, i, Bz, X, x, offik, rs)
102 #endif
103  for (chunk = 0 ; chunk < nrhs ; chunk += 4)
104  {
105 #ifdef KLU_ENABLE_OPENMP
106  Bz = ((Entry *) B) + d*chunk ;
107  tid = omp_get_thread_num();
108  X = X1 + (tid * n * 4);
109 #endif
110 
111 
112  /* ------------------------------------------------------------------ */
113  /* get the size of the current chunk */
114  /* ------------------------------------------------------------------ */
115 
116  nr = MIN (nrhs - chunk, 4) ;
117 
118  /* ------------------------------------------------------------------ */
119  /* permute the right hand side, X = Q'*B */
120  /* ------------------------------------------------------------------ */
121 
122  switch (nr)
123  {
124 
125  case 1:
126 
127  for (k = 0 ; k < n ; k++)
128  {
129  X [k] = Bz [Q [k]] ;
130  }
131  break ;
132 
133  case 2:
134 
135  for (k = 0 ; k < n ; k++)
136  {
137  i = Q [k] ;
138  X [2*k ] = Bz [i ] ;
139  X [2*k + 1] = Bz [i + d ] ;
140  }
141  break ;
142 
143  case 3:
144 
145  for (k = 0 ; k < n ; k++)
146  {
147  i = Q [k] ;
148  X [3*k ] = Bz [i ] ;
149  X [3*k + 1] = Bz [i + d ] ;
150  X [3*k + 2] = Bz [i + d*2] ;
151  }
152  break ;
153 
154  case 4:
155 
156  for (k = 0 ; k < n ; k++)
157  {
158  i = Q [k] ;
159  X [4*k ] = Bz [i ] ;
160  X [4*k + 1] = Bz [i + d ] ;
161  X [4*k + 2] = Bz [i + d*2] ;
162  X [4*k + 3] = Bz [i + d*3] ;
163  }
164  break ;
165 
166  }
167 
168  /* ------------------------------------------------------------------ */
169  /* solve X = (L*U + Off)'\X */
170  /* ------------------------------------------------------------------ */
171 
172  for (block = 0 ; block < nblocks ; block++)
173  {
174 
175  /* -------------------------------------------------------------- */
176  /* the block of size nk is from rows/columns k1 to k2-1 */
177  /* -------------------------------------------------------------- */
178 
179  k1 = R [block] ;
180  k2 = R [block+1] ;
181  nk = k2 - k1 ;
182  PRINTF (("tsolve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
183 
184  /* -------------------------------------------------------------- */
185  /* block back-substitution for the off-diagonal-block entries */
186  /* -------------------------------------------------------------- */
187 
188  if (block > 0)
189  {
190  switch (nr)
191  {
192 
193  case 1:
194 
195  for (k = k1 ; k < k2 ; k++)
196  {
197  pend = Offp [k+1] ;
198  for (p = Offp [k] ; p < pend ; p++)
199  {
200 #ifdef COMPLEX
201  if (conj_solve)
202  {
203  MULT_SUB_CONJ (X [k], X [Offi [p]],
204  Offx [p]) ;
205  }
206  else
207 #endif
208  {
209  MULT_SUB (X [k], Offx [p], X [Offi [p]]) ;
210  }
211  }
212  }
213  break ;
214 
215  case 2:
216 
217  for (k = k1 ; k < k2 ; k++)
218  {
219  pend = Offp [k+1] ;
220  x [0] = X [2*k ] ;
221  x [1] = X [2*k + 1] ;
222  for (p = Offp [k] ; p < pend ; p++)
223  {
224  i = Offi [p] ;
225 #ifdef COMPLEX
226  if (conj_solve)
227  {
228  CONJ (offik, Offx [p]) ;
229  }
230  else
231 #endif
232  {
233  offik = Offx [p] ;
234  }
235  MULT_SUB (x [0], offik, X [2*i]) ;
236  MULT_SUB (x [1], offik, X [2*i + 1]) ;
237  }
238  X [2*k ] = x [0] ;
239  X [2*k + 1] = x [1] ;
240  }
241  break ;
242 
243  case 3:
244 
245  for (k = k1 ; k < k2 ; k++)
246  {
247  pend = Offp [k+1] ;
248  x [0] = X [3*k ] ;
249  x [1] = X [3*k + 1] ;
250  x [2] = X [3*k + 2] ;
251  for (p = Offp [k] ; p < pend ; p++)
252  {
253  i = Offi [p] ;
254 #ifdef COMPLEX
255  if (conj_solve)
256  {
257  CONJ (offik, Offx [p]) ;
258  }
259  else
260 #endif
261  {
262  offik = Offx [p] ;
263  }
264  MULT_SUB (x [0], offik, X [3*i]) ;
265  MULT_SUB (x [1], offik, X [3*i + 1]) ;
266  MULT_SUB (x [2], offik, X [3*i + 2]) ;
267  }
268  X [3*k ] = x [0] ;
269  X [3*k + 1] = x [1] ;
270  X [3*k + 2] = x [2] ;
271  }
272  break ;
273 
274  case 4:
275 
276  for (k = k1 ; k < k2 ; k++)
277  {
278  pend = Offp [k+1] ;
279  x [0] = X [4*k ] ;
280  x [1] = X [4*k + 1] ;
281  x [2] = X [4*k + 2] ;
282  x [3] = X [4*k + 3] ;
283  for (p = Offp [k] ; p < pend ; p++)
284  {
285  i = Offi [p] ;
286 #ifdef COMPLEX
287  if (conj_solve)
288  {
289  CONJ(offik, Offx [p]) ;
290  }
291  else
292 #endif
293  {
294  offik = Offx [p] ;
295  }
296  MULT_SUB (x [0], offik, X [4*i]) ;
297  MULT_SUB (x [1], offik, X [4*i + 1]) ;
298  MULT_SUB (x [2], offik, X [4*i + 2]) ;
299  MULT_SUB (x [3], offik, X [4*i + 3]) ;
300  }
301  X [4*k ] = x [0] ;
302  X [4*k + 1] = x [1] ;
303  X [4*k + 2] = x [2] ;
304  X [4*k + 3] = x [3] ;
305  }
306  break ;
307  }
308  }
309 
310  /* -------------------------------------------------------------- */
311  /* solve the block system */
312  /* -------------------------------------------------------------- */
313 
314  if (nk == 1)
315  {
316 #ifdef COMPLEX
317  if (conj_solve)
318  {
319  CONJ (s, Udiag [k1]) ;
320  }
321  else
322 #endif
323  {
324  s = Udiag [k1] ;
325  }
326  switch (nr)
327  {
328 
329  case 1:
330  DIV (X [k1], X [k1], s) ;
331  break ;
332 
333  case 2:
334  DIV (X [2*k1], X [2*k1], s) ;
335  DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
336  break ;
337 
338  case 3:
339  DIV (X [3*k1], X [3*k1], s) ;
340  DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
341  DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
342  break ;
343 
344  case 4:
345  DIV (X [4*k1], X [4*k1], s) ;
346  DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
347  DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
348  DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
349  break ;
350 
351  }
352  }
353  else
354  {
355  KLU_utsolve (nk, Uip + k1, Ulen + k1, LUbx [block],
356  Udiag + k1, nr,
357 #ifdef COMPLEX
358  conj_solve,
359 #endif
360  X + nr*k1) ;
361  KLU_ltsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
362 #ifdef COMPLEX
363  conj_solve,
364 #endif
365  X + nr*k1) ;
366  }
367  }
368 
369  /* ------------------------------------------------------------------ */
370  /* scale and permute the result, Bz = P'(R\X) */
371  /* ------------------------------------------------------------------ */
372 
373  if (Rs == NULL)
374  {
375 
376  /* no scaling */
377  switch (nr)
378  {
379 
380  case 1:
381 
382  for (k = 0 ; k < n ; k++)
383  {
384  Bz [Pnum [k]] = X [k] ;
385  }
386  break ;
387 
388  case 2:
389 
390  for (k = 0 ; k < n ; k++)
391  {
392  i = Pnum [k] ;
393  Bz [i ] = X [2*k ] ;
394  Bz [i + d ] = X [2*k + 1] ;
395  }
396  break ;
397 
398  case 3:
399 
400  for (k = 0 ; k < n ; k++)
401  {
402  i = Pnum [k] ;
403  Bz [i ] = X [3*k ] ;
404  Bz [i + d ] = X [3*k + 1] ;
405  Bz [i + d*2] = X [3*k + 2] ;
406  }
407  break ;
408 
409  case 4:
410 
411  for (k = 0 ; k < n ; k++)
412  {
413  i = Pnum [k] ;
414  Bz [i ] = X [4*k ] ;
415  Bz [i + d ] = X [4*k + 1] ;
416  Bz [i + d*2] = X [4*k + 2] ;
417  Bz [i + d*3] = X [4*k + 3] ;
418  }
419  break ;
420  }
421 
422  }
423  else
424  {
425 
426  switch (nr)
427  {
428 
429  case 1:
430 
431  for (k = 0 ; k < n ; k++)
432  {
433  SCALE_DIV_ASSIGN (Bz [Pnum [k]], X [k], Rs [k]) ;
434  }
435  break ;
436 
437  case 2:
438 
439  for (k = 0 ; k < n ; k++)
440  {
441  i = Pnum [k] ;
442  rs = Rs [k] ;
443  SCALE_DIV_ASSIGN (Bz [i], X [2*k], rs) ;
444  SCALE_DIV_ASSIGN (Bz [i + d], X [2*k + 1], rs) ;
445  }
446  break ;
447 
448  case 3:
449 
450  for (k = 0 ; k < n ; k++)
451  {
452  i = Pnum [k] ;
453  rs = Rs [k] ;
454  SCALE_DIV_ASSIGN (Bz [i], X [3*k], rs) ;
455  SCALE_DIV_ASSIGN (Bz [i + d], X [3*k + 1], rs) ;
456  SCALE_DIV_ASSIGN (Bz [i + d*2], X [3*k + 2], rs) ;
457  }
458  break ;
459 
460  case 4:
461 
462  for (k = 0 ; k < n ; k++)
463  {
464  i = Pnum [k] ;
465  rs = Rs [k] ;
466  SCALE_DIV_ASSIGN (Bz [i], X [4*k], rs) ;
467  SCALE_DIV_ASSIGN (Bz [i + d], X [4*k + 1], rs) ;
468  SCALE_DIV_ASSIGN (Bz [i + d*2], X [4*k + 2], rs) ;
469  SCALE_DIV_ASSIGN (Bz [i + d*3], X [4*k + 3], rs) ;
470  }
471  break ;
472  }
473  }
474 
475  /* ------------------------------------------------------------------ */
476  /* go to the next chunk of B */
477  /* ------------------------------------------------------------------ */
478 
479 #ifndef KLU_ENABLE_OPENMP
480  Bz += d*4 ;
481 #endif
482  }
483  return (TRUE) ;
484 }
#define CONJ(a, x)
#define COMPLEX
#define KLU_INVALID
#define KLU_symbolic
#define Int
#define FALSE
double Unit
#define NULL
#define ASSERT(expression)
#define MULT_SUB(c, a, b)
#define KLU_numeric
#define MULT_SUB_CONJ(c, a, b)
#define SCALE_DIV_ASSIGN(a, c, s)
#define KLU_valid
Int KLU_tsolve(KLU_symbolic *Symbolic, KLU_numeric *Numeric, Int d, Int nrhs, double B[], KLU_common *Common)
#define DIV(c, a, b)
#define KLU_common
#define MIN(a, b)
#define KLU_ltsolve
#define Entry
#define PRINTF(params)
#define KLU_utsolve
int n
#define TRUE
#define KLU_OK