Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Sacado_Fad_BLASImp.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Sacado Package
5 // Copyright (2006) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // This library is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU Lesser General Public License as
12 // published by the Free Software Foundation; either version 2.1 of the
13 // License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 // USA
24 // Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
25 // (etphipp@sandia.gov).
26 //
27 // ***********************************************************************
28 // @HEADER
29 
30 #include "Teuchos_Assert.hpp"
31 
32 template <typename OrdinalType, typename FadType>
34 ArrayTraits(bool use_dynamic_,
35  OrdinalType workspace_size_) :
36  use_dynamic(use_dynamic_),
37  workspace_size(workspace_size_),
38  workspace(NULL),
39  workspace_pointer(NULL)
40 {
41  if (workspace_size > 0) {
44  }
45 }
46 
47 template <typename OrdinalType, typename FadType>
50  use_dynamic(a.use_dynamic),
51  workspace_size(a.workspace_size),
52  workspace(NULL),
53  workspace_pointer(NULL)
54 {
55  if (workspace_size > 0) {
58  }
59 }
60 
61 
62 template <typename OrdinalType, typename FadType>
65 {
66 // #ifdef SACADO_DEBUG
67 // TEUCHOS_TEST_FOR_EXCEPTION(workspace_pointer != workspace,
68 // std::logic_error,
69 // "ArrayTraits::~ArrayTraits(): " <<
70 // "Destructor called with non-zero used workspace. " <<
71 // "Currently used size is " << workspace_pointer-workspace <<
72 // ".");
73 
74 // #endif
75 
76  if (workspace_size > 0)
77  delete [] workspace;
78 }
79 
80 template <typename OrdinalType, typename FadType>
81 void
83 unpack(const FadType& a, OrdinalType& n_dot, ValueType& val,
84  const ValueType*& dot) const
85 {
86  n_dot = a.size();
87  val = a.val();
88  if (n_dot > 0)
89  dot = &a.fastAccessDx(0);
90  else
91  dot = NULL;
92 }
93 
94 template <typename OrdinalType, typename FadType>
95 void
97 unpack(const FadType* a, OrdinalType n, OrdinalType inc,
98  OrdinalType& n_dot, OrdinalType& inc_val, OrdinalType& inc_dot,
99  const ValueType*& cval, const ValueType*& cdot) const
100 {
101  if (n == 0) {
102  n_dot = 0;
103  inc_val = 0;
104  inc_dot = 0;
105  cval = NULL;
106  cdot = NULL;
107  return;
108  }
109 
110  n_dot = a[0].size();
111  bool is_contiguous = is_array_contiguous(a, n, n_dot);
112  if (is_contiguous) {
113  inc_val = inc;
114  inc_dot = inc;
115  cval = &a[0].val();
116  if (n_dot > 0)
117  cdot = &a[0].fastAccessDx(0);
118  }
119  else {
120  inc_val = 1;
121  inc_dot = 0;
122  ValueType *val = allocate_array(n);
123  ValueType *dot = NULL;
124  if (n_dot > 0) {
125  inc_dot = 1;
126  dot = allocate_array(n*n_dot);
127  }
128  for (OrdinalType i=0; i<n; i++) {
129  val[i] = a[i*inc].val();
130  for (OrdinalType j=0; j<n_dot; j++)
131  dot[j*n+i] = a[i*inc].fastAccessDx(j);
132  }
133 
134  cval = val;
135  cdot = dot;
136  }
137 }
138 
139 template <typename OrdinalType, typename FadType>
140 void
142 unpack(const FadType* A, OrdinalType m, OrdinalType n, OrdinalType lda,
143  OrdinalType& n_dot, OrdinalType& lda_val, OrdinalType& lda_dot,
144  const ValueType*& cval, const ValueType*& cdot) const
145 {
146  if (m*n == 0) {
147  n_dot = 0;
148  lda_val = 0;
149  lda_dot = 0;
150  cval = NULL;
151  cdot = NULL;
152  return;
153  }
154 
155  n_dot = A[0].size();
156  bool is_contiguous = is_array_contiguous(A, m*n, n_dot);
157  if (is_contiguous) {
158  lda_val = lda;
159  lda_dot = lda;
160  cval = &A[0].val();
161  if (n_dot > 0)
162  cdot = &A[0].fastAccessDx(0);
163  }
164  else {
165  lda_val = m;
166  lda_dot = 0;
167  ValueType *val = allocate_array(m*n);
168  ValueType *dot = NULL;
169  if (n_dot > 0) {
170  lda_dot = m;
171  dot = allocate_array(m*n*n_dot);
172  }
173  for (OrdinalType j=0; j<n; j++) {
174  for (OrdinalType i=0; i<m; i++) {
175  val[j*m+i] = A[j*lda+i].val();
176  for (OrdinalType k=0; k<n_dot; k++)
177  dot[(k*n+j)*m+i] = A[j*lda+i].fastAccessDx(k);
178  }
179  }
180 
181  cval = val;
182  cdot = dot;
183  }
184 }
185 
186 template <typename OrdinalType, typename FadType>
187 void
189 unpack(const ValueType& a, OrdinalType& n_dot, ValueType& val,
190  const ValueType*& dot) const
191 {
192  n_dot = 0;
193  val = a;
194  dot = NULL;
195 }
196 
197 template <typename OrdinalType, typename FadType>
198 void
200 unpack(const ValueType* a, OrdinalType n, OrdinalType inc,
201  OrdinalType& n_dot, OrdinalType& inc_val, OrdinalType& inc_dot,
202  const ValueType*& cval, const ValueType*& cdot) const
203 {
204  n_dot = 0;
205  inc_val = inc;
206  inc_dot = 0;
207  cval = a;
208  cdot = NULL;
209 }
210 
211 template <typename OrdinalType, typename FadType>
212 void
214 unpack(const ValueType* A, OrdinalType m, OrdinalType n, OrdinalType lda,
215  OrdinalType& n_dot, OrdinalType& lda_val, OrdinalType& lda_dot,
216  const ValueType*& cval, const ValueType*& cdot) const
217 {
218  n_dot = 0;
219  lda_val = lda;
220  lda_dot = 0;
221  cval = A;
222  cdot = NULL;
223 }
224 
225 template <typename OrdinalType, typename FadType>
226 void
228 unpack(const ScalarType& a, OrdinalType& n_dot, ScalarType& val,
229  const ScalarType*& dot) const
230 {
231  n_dot = 0;
232  val = a;
233  dot = NULL;
234 }
235 
236 template <typename OrdinalType, typename FadType>
237 void
239 unpack(const ScalarType* a, OrdinalType n, OrdinalType inc,
240  OrdinalType& n_dot, OrdinalType& inc_val, OrdinalType& inc_dot,
241  const ScalarType*& cval, const ScalarType*& cdot) const
242 {
243  n_dot = 0;
244  inc_val = inc;
245  inc_dot = 0;
246  cval = a;
247  cdot = NULL;
248 }
249 
250 template <typename OrdinalType, typename FadType>
251 void
253 unpack(const ScalarType* A, OrdinalType m, OrdinalType n, OrdinalType lda,
254  OrdinalType& n_dot, OrdinalType& lda_val, OrdinalType& lda_dot,
255  const ScalarType*& cval, const ScalarType*& cdot) const
256 {
257  n_dot = 0;
258  lda_val = lda;
259  lda_dot = 0;
260  cval = A;
261  cdot = NULL;
262 }
263 
264 template <typename OrdinalType, typename FadType>
265 void
267 unpack(FadType& a, OrdinalType& n_dot, OrdinalType& final_n_dot, ValueType& val,
268  ValueType*& dot) const
269 {
270  n_dot = a.size();
271  val = a.val();
272 #ifdef SACADO_DEBUG
273  TEUCHOS_TEST_FOR_EXCEPTION(n_dot > 0 && final_n_dot > 0 && final_n_dot != n_dot,
274  std::logic_error,
275  "ArrayTraits::unpack(): FadType has wrong number of " <<
276  "derivative components. Got " << n_dot <<
277  ", expected " << final_n_dot << ".");
278 #endif
279  if (n_dot > final_n_dot)
280  final_n_dot = n_dot;
281 
282  OrdinalType n_avail = a.availableSize();
283  if (n_avail < final_n_dot) {
284  dot = alloate(final_n_dot);
285  for (OrdinalType i=0; i<final_n_dot; i++)
286  dot[i] = 0.0;
287  }
288  else if (n_avail > 0)
289  dot = &a.fastAccessDx(0);
290  else
291  dot = NULL;
292 }
293 
294 template <typename OrdinalType, typename FadType>
295 void
297 unpack(FadType* a, OrdinalType n, OrdinalType inc, OrdinalType& n_dot,
298  OrdinalType& final_n_dot, OrdinalType& inc_val, OrdinalType& inc_dot,
299  ValueType*& val, ValueType*& dot) const
300 {
301  if (n == 0) {
302  inc_val = 0;
303  inc_dot = 0;
304  val = NULL;
305  dot = NULL;
306  return;
307  }
308 
309  n_dot = a[0].size();
310  bool is_contiguous = is_array_contiguous(a, n, n_dot);
311 #ifdef SACADO_DEBUG
312  TEUCHOS_TEST_FOR_EXCEPTION(n_dot > 0 && final_n_dot > 0 && final_n_dot != n_dot,
313  std::logic_error,
314  "ArrayTraits::unpack(): FadType has wrong number of " <<
315  "derivative components. Got " << n_dot <<
316  ", expected " << final_n_dot << ".");
317 #endif
318  if (n_dot > final_n_dot)
319  final_n_dot = n_dot;
320 
321  if (is_contiguous) {
322  inc_val = inc;
323  val = &a[0].val();
324  }
325  else {
326  inc_val = 1;
327  val = allocate_array(n);
328  for (OrdinalType i=0; i<n; i++)
329  val[i] = a[i*inc].val();
330  }
331 
332  OrdinalType n_avail = a[0].availableSize();
333  if (is_contiguous && n_avail >= final_n_dot && final_n_dot > 0) {
334  inc_dot = inc;
335  dot = &a[0].fastAccessDx(0);
336  }
337  else if (final_n_dot > 0) {
338  inc_dot = 1;
339  dot = allocate_array(n*final_n_dot);
340  for (OrdinalType i=0; i<n; i++) {
341  if (n_dot > 0)
342  for (OrdinalType j=0; j<n_dot; j++)
343  dot[j*n+i] = a[i*inc].fastAccessDx(j);
344  else
345  for (OrdinalType j=0; j<final_n_dot; j++)
346  dot[j*n+i] = 0.0;
347  }
348  }
349  else {
350  inc_dot = 0;
351  dot = NULL;
352  }
353 }
354 
355 template <typename OrdinalType, typename FadType>
356 void
358 unpack(FadType* A, OrdinalType m, OrdinalType n, OrdinalType lda,
359  OrdinalType& n_dot, OrdinalType& final_n_dot,
360  OrdinalType& lda_val, OrdinalType& lda_dot,
361  ValueType*& val, ValueType*& dot) const
362 {
363  if (m*n == 0) {
364  lda_val = 0;
365  lda_dot = 0;
366  val = NULL;
367  dot = NULL;
368  return;
369  }
370 
371  n_dot = A[0].size();
372  bool is_contiguous = is_array_contiguous(A, m*n, n_dot);
373 #ifdef SACADO_DEBUG
374  TEUCHOS_TEST_FOR_EXCEPTION(n_dot > 0 && final_n_dot > 0 && final_n_dot != n_dot,
375  std::logic_error,
376  "ArrayTraits::unpack(): FadType has wrong number of " <<
377  "derivative components. Got " << n_dot <<
378  ", expected " << final_n_dot << ".");
379 #endif
380  if (n_dot > final_n_dot)
381  final_n_dot = n_dot;
382 
383  if (is_contiguous) {
384  lda_val = lda;
385  val = &A[0].val();
386  }
387  else {
388  lda_val = m;
389  val = allocate_array(m*n);
390  for (OrdinalType j=0; j<n; j++)
391  for (OrdinalType i=0; i<m; i++)
392  val[j*m+i] = A[j*lda+i].val();
393  }
394 
395  OrdinalType n_avail = A[0].availableSize();
396  if (is_contiguous && n_avail >= final_n_dot && final_n_dot > 0) {
397  lda_dot = lda;
398  dot = &A[0].fastAccessDx(0);
399  }
400  else if (final_n_dot > 0) {
401  lda_dot = m;
402  dot = allocate_array(m*n*final_n_dot);
403  for (OrdinalType j=0; j<n; j++) {
404  for (OrdinalType i=0; i<m; i++) {
405  if (n_dot > 0)
406  for (OrdinalType k=0; k<n_dot; k++)
407  dot[(k*n+j)*m+i] = A[j*lda+i].fastAccessDx(k);
408  else
409  for (OrdinalType k=0; k<final_n_dot; k++)
410  dot[(k*n+j)*m+i] = 0.0;
411  }
412  }
413  }
414  else {
415  lda_dot = 0;
416  dot = NULL;
417  }
418 }
419 
420 template <typename OrdinalType, typename FadType>
421 void
423 pack(FadType& a, OrdinalType n_dot, const ValueType& val,
424  const ValueType* dot) const
425 {
426  a.val() = val;
427 
428  if (n_dot == 0)
429  return;
430 
431  if (a.size() != n_dot)
432  a.resize(n_dot);
433  if (a.dx() != dot)
434  for (OrdinalType i=0; i<n_dot; i++)
435  a.fastAccessDx(i) = dot[i];
436 }
437 
438 template <typename OrdinalType, typename FadType>
439 void
441 pack(FadType* a, OrdinalType n, OrdinalType inc,
442  OrdinalType n_dot, OrdinalType inc_val, OrdinalType inc_dot,
443  const ValueType* val, const ValueType* dot) const
444 {
445  if (n == 0)
446  return;
447 
448  // Copy values
449  if (&a[0].val() != val)
450  for (OrdinalType i=0; i<n; i++)
451  a[i*inc].val() = val[i*inc_val];
452 
453  if (n_dot == 0)
454  return;
455 
456  // Resize derivative arrays
457  if (a[0].size() != n_dot)
458  for (OrdinalType i=0; i<n; i++)
459  a[i*inc].resize(n_dot);
460 
461  // Copy derivatives
462  if (a[0].dx() != dot)
463  for (OrdinalType i=0; i<n; i++)
464  for (OrdinalType j=0; j<n_dot; j++)
465  a[i*inc].fastAccessDx(j) = dot[(i+j*n)*inc_dot];
466 }
467 
468 template <typename OrdinalType, typename FadType>
469 void
471 pack(FadType* A, OrdinalType m, OrdinalType n, OrdinalType lda,
472  OrdinalType n_dot, OrdinalType lda_val, OrdinalType lda_dot,
473  const ValueType* val, const ValueType* dot) const
474 {
475  if (m*n == 0)
476  return;
477 
478  // Copy values
479  if (&A[0].val() != val)
480  for (OrdinalType j=0; j<n; j++)
481  for (OrdinalType i=0; i<m; i++)
482  A[i+j*lda].val() = val[i+j*lda_val];
483 
484  if (n_dot == 0)
485  return;
486 
487  // Resize derivative arrays
488  if (A[0].size() != n_dot)
489  for (OrdinalType j=0; j<n; j++)
490  for (OrdinalType i=0; i<m; i++)
491  A[i+j*lda].resize(n_dot);
492 
493  // Copy derivatives
494  if (A[0].dx() != dot)
495  for (OrdinalType j=0; j<n; j++)
496  for (OrdinalType i=0; i<m; i++)
497  for (OrdinalType k=0; k<n_dot; k++)
498  A[i+j*lda].fastAccessDx(k) = dot[i+(j+k*n)*lda_dot];
499 }
500 
501 template <typename OrdinalType, typename FadType>
502 void
504 free(const FadType& a, OrdinalType n_dot, const ValueType* dot) const
505 {
506  if (n_dot > 0 && a.dx() != dot) {
507  free_array(dot, n_dot);
508  }
509 }
510 
511 template <typename OrdinalType, typename FadType>
512 void
514 free(const FadType* a, OrdinalType n, OrdinalType n_dot,
515  OrdinalType inc_val, OrdinalType inc_dot,
516  const ValueType* val, const ValueType* dot) const
517 {
518  if (n == 0)
519  return;
520 
521  if (val != &a[0].val())
522  free_array(val, n*inc_val);
523 
524  if (n_dot > 0 && a[0].dx() != dot)
525  free_array(dot, n*inc_dot*n_dot);
526 }
527 
528 template <typename OrdinalType, typename FadType>
529 void
531 free(const FadType* A, OrdinalType m, OrdinalType n, OrdinalType n_dot,
532  OrdinalType lda_val, OrdinalType lda_dot,
533  const ValueType* val, const ValueType* dot) const
534 {
535  if (m*n == 0)
536  return;
537 
538  if (val != &A[0].val())
539  free_array(val, lda_val*n);
540 
541  if (n_dot > 0 && A[0].dx() != dot)
542  free_array(dot, lda_dot*n*n_dot);
543 }
544 
545 template <typename OrdinalType, typename FadType>
548 allocate_array(OrdinalType size) const
549 {
550  if (use_dynamic)
551  return new ValueType[size];
552 
553 #ifdef SACADO_DEBUG
554  TEUCHOS_TEST_FOR_EXCEPTION(workspace_pointer + size - workspace > workspace_size,
555  std::logic_error,
556  "ArrayTraits::allocate_array(): " <<
557  "Requested workspace memory beyond size allocated. " <<
558  "Workspace size is " << workspace_size <<
559  ", currently used is " << workspace_pointer-workspace <<
560  ", requested size is " << size << ".");
561 
562 #endif
563 
564  ValueType *v = workspace_pointer;
565  workspace_pointer += size;
566  return v;
567 }
568 
569 template <typename OrdinalType, typename FadType>
570 void
572 free_array(const ValueType* ptr, OrdinalType size) const
573 {
574  if (use_dynamic && ptr != NULL)
575  delete [] ptr;
576  else
577  workspace_pointer -= size;
578 }
579 
580 template <typename OrdinalType, typename FadType>
581 bool
583 is_array_contiguous(const FadType* a, OrdinalType n, OrdinalType n_dot) const
584 {
585  return (n > 0) &&
586  (&(a[n-1].val())-&(a[0].val()) == n-1) &&
587  (a[n-1].dx()-a[0].dx() == n-1);
588 }
589 
590 template <typename OrdinalType, typename FadType>
592 BLAS(bool use_default_impl_,
593  bool use_dynamic_, OrdinalType static_workspace_size_) :
594  BLASType(),
595  arrayTraits(use_dynamic_, static_workspace_size_),
596  blas(),
597  use_default_impl(use_default_impl_)
598 {
599 }
600 
601 template <typename OrdinalType, typename FadType>
603 BLAS(const BLAS& x) :
604  BLASType(x),
605  arrayTraits(x.arrayTraits),
606  blas(x.blas),
607  use_default_impl(x.use_default_impl)
608 {
609 }
610 
611 template <typename OrdinalType, typename FadType>
614 {
615 }
616 
617 template <typename OrdinalType, typename FadType>
618 void
620 SCAL(const OrdinalType n, const FadType& alpha, FadType* x,
621  const OrdinalType incx) const
622 {
623  if (use_default_impl) {
624  BLASType::SCAL(n,alpha,x,incx);
625  return;
626  }
627 
628  // Unpack input values & derivatives
629  ValueType alpha_val;
630  const ValueType *alpha_dot;
631  ValueType *x_val, *x_dot;
632  OrdinalType n_alpha_dot = 0, n_x_dot = 0, n_dot = 0;
633  OrdinalType incx_val, incx_dot;
634  arrayTraits.unpack(alpha, n_alpha_dot, alpha_val, alpha_dot);
635  n_dot = n_alpha_dot;
636  arrayTraits.unpack(x, n, incx, n_x_dot, n_dot, incx_val, incx_dot,
637  x_val, x_dot);
638 
639 #ifdef SACADO_DEBUG
640  // Check sizes are consistent
641  TEUCHOS_TEST_FOR_EXCEPTION((n_alpha_dot != n_dot && n_alpha_dot != 0) ||
642  (n_x_dot != n_dot && n_x_dot != 0),
643  std::logic_error,
644  "BLAS::SCAL(): All arguments must have " <<
645  "the same number of derivative components, or none");
646 #endif
647 
648  // Call differentiated routine
649  if (n_x_dot > 0)
650  blas.SCAL(n*n_x_dot, alpha_val, x_dot, incx_dot);
651  for (OrdinalType i=0; i<n_alpha_dot; i++)
652  blas.AXPY(n, alpha_dot[i], x_val, incx_val, x_dot+i*n*incx_dot, incx_dot);
653  blas.SCAL(n, alpha_val, x_val, incx_val);
654 
655  // Pack values and derivatives for result
656  arrayTraits.pack(x, n, incx, n_dot, incx_val, incx_dot, x_val, x_dot);
657 
658  // Free temporary arrays
659  arrayTraits.free(alpha, n_alpha_dot, alpha_dot);
660  arrayTraits.free(x, n, n_dot, incx_val, incx_dot, x_val, x_dot);
661 }
662 
663 template <typename OrdinalType, typename FadType>
664 void
666 COPY(const OrdinalType n, const FadType* x, const OrdinalType incx,
667  FadType* y, const OrdinalType incy) const
668 {
669  if (use_default_impl) {
670  BLASType::COPY(n,x,incx,y,incy);
671  return;
672  }
673 
674  if (n == 0)
675  return;
676 
677  OrdinalType n_x_dot = x[0].size();
678  OrdinalType n_y_dot = y[0].size();
679  if (n_x_dot == 0 || n_y_dot == 0 || n_x_dot != n_y_dot ||
680  !arrayTraits.is_array_contiguous(x, n, n_x_dot) ||
681  !arrayTraits.is_array_contiguous(y, n, n_y_dot))
682  BLASType::COPY(n,x,incx,y,incy);
683  else {
684  blas.COPY(n, &x[0].val(), incx, &y[0].val(), incy);
685  blas.COPY(n*n_x_dot, &x[0].fastAccessDx(0), incx, &y[0].fastAccessDx(0),
686  incy);
687  }
688 }
689 
690 template <typename OrdinalType, typename FadType>
691 template <typename alpha_type, typename x_type>
692 void
694 AXPY(const OrdinalType n, const alpha_type& alpha, const x_type* x,
695  const OrdinalType incx, FadType* y, const OrdinalType incy) const
696 {
697  if (use_default_impl) {
698  BLASType::AXPY(n,alpha,x,incx,y,incy);
699  return;
700  }
701 
702  // Unpack input values & derivatives
703  typename ArrayValueType<alpha_type>::type alpha_val;
704  const typename ArrayValueType<alpha_type>::type *alpha_dot;
705  const typename ArrayValueType<x_type>::type *x_val, *x_dot;
706  ValueType *y_val, *y_dot;
707  OrdinalType n_alpha_dot, n_x_dot, n_y_dot, n_dot;
708  OrdinalType incx_val, incy_val, incx_dot, incy_dot;
709  arrayTraits.unpack(alpha, n_alpha_dot, alpha_val, alpha_dot);
710  arrayTraits.unpack(x, n, incx, n_x_dot, incx_val, incx_dot, x_val, x_dot);
711 
712  // Compute size
713  n_dot = 0;
714  if (n_alpha_dot > 0)
715  n_dot = n_alpha_dot;
716  else if (n_x_dot > 0)
717  n_dot = n_x_dot;
718 
719  // Unpack and allocate y
720  arrayTraits.unpack(y, n, incy, n_y_dot, n_dot, incy_val, incy_dot, y_val,
721  y_dot);
722 
723 #ifdef SACADO_DEBUG
724  // Check sizes are consistent
725  TEUCHOS_TEST_FOR_EXCEPTION((n_alpha_dot != n_dot && n_alpha_dot != 0) ||
726  (n_x_dot != n_dot && n_x_dot != 0) ||
727  (n_y_dot != n_dot && n_y_dot != 0),
728  std::logic_error,
729  "BLAS::AXPY(): All arguments must have " <<
730  "the same number of derivative components, or none");
731 #endif
732 
733  // Call differentiated routine
734  if (n_x_dot > 0)
735  blas.AXPY(n*n_x_dot, alpha_val, x_dot, incx_dot, y_dot, incy_dot);
736  for (OrdinalType i=0; i<n_alpha_dot; i++)
737  blas.AXPY(n, alpha_dot[i], x_val, incx_val, y_dot+i*n*incy_dot, incy_dot);
738  blas.AXPY(n, alpha_val, x_val, incx_val, y_val, incy_val);
739 
740  // Pack values and derivatives for result
741  arrayTraits.pack(y, n, incy, n_dot, incy_val, incy_dot, y_val, y_dot);
742 
743  // Free temporary arrays
744  arrayTraits.free(alpha, n_alpha_dot, alpha_dot);
745  arrayTraits.free(x, n, n_x_dot, incx_val, incx_dot, x_val, x_dot);
746  arrayTraits.free(y, n, n_dot, incy_val, incy_dot, y_val, y_dot);
747 }
748 
749 template <typename OrdinalType, typename FadType>
750 template <typename x_type, typename y_type>
751 FadType
753 DOT(const OrdinalType n, const x_type* x, const OrdinalType incx,
754  const y_type* y, const OrdinalType incy) const
755 {
756  if (use_default_impl)
757  return BLASType::DOT(n,x,incx,y,incy);
758 
759  // Unpack input values & derivatives
760  const typename ArrayValueType<x_type>::type *x_val, *x_dot;
761  const typename ArrayValueType<y_type>::type *y_val, *y_dot;
762  OrdinalType n_x_dot, n_y_dot;
763  OrdinalType incx_val, incy_val, incx_dot, incy_dot;
764  arrayTraits.unpack(x, n, incx, n_x_dot, incx_val, incx_dot, x_val, x_dot);
765  arrayTraits.unpack(y, n, incy, n_y_dot, incy_val, incy_dot, y_val, y_dot);
766 
767  // Compute size
768  OrdinalType n_z_dot = 0;
769  if (n_x_dot > 0)
770  n_z_dot = n_x_dot;
771  else if (n_y_dot > 0)
772  n_z_dot = n_y_dot;
773 
774  // Unpack and allocate z
775  FadType z(n_z_dot, 0.0);
776  ValueType& z_val = z.val();
777  ValueType *z_dot = &z.fastAccessDx(0);
778 
779  // Call differentiated routine
780  Fad_DOT(n, x_val, incx_val, n_x_dot, x_dot, incx_dot,
781  y_val, incy_val, n_y_dot, y_dot, incy_dot,
782  z_val, n_z_dot, z_dot);
783 
784  // Free temporary arrays
785  arrayTraits.free(x, n, n_x_dot, incx_val, incx_dot, x_val, x_dot);
786  arrayTraits.free(y, n, n_y_dot, incy_val, incy_dot, y_val, y_dot);
787 
788  return z;
789 }
790 
791 template <typename OrdinalType, typename FadType>
794 NRM2(const OrdinalType n, const FadType* x, const OrdinalType incx) const
795 {
796  if (use_default_impl)
797  return BLASType::NRM2(n,x,incx);
798 
799  // Unpack input values & derivatives
800  const ValueType *x_val, *x_dot;
801  OrdinalType n_x_dot, incx_val, incx_dot;
802  arrayTraits.unpack(x, n, incx, n_x_dot, incx_val, incx_dot, x_val, x_dot);
803 
804  // Unpack and allocate z
805  MagnitudeType z(n_x_dot, 0.0);
806 
807  // Call differentiated routine
808  z.val() = blas.NRM2(n, x_val, incx_val);
809  // if (!Teuchos::ScalarTraits<FadType>::isComplex && incx_dot == 1)
810  // blas.GEMV(Teuchos::TRANS, n, n_x_dot, 1.0/z.val(), x_dot, n, x_val,
811  // incx_val, 1.0, &z.fastAccessDx(0), OrdinalType(1));
812  // else
813  for (OrdinalType i=0; i<n_x_dot; i++)
814  z.fastAccessDx(i) =
815  Teuchos::ScalarTraits<ValueType>::magnitude(blas.DOT(n, x_dot+i*n*incx_dot, incx_dot, x_val, incx_val)) / z.val();
816 
817  // Free temporary arrays
818  arrayTraits.free(x, n, n_x_dot, incx_val, incx_dot, x_val, x_dot);
819 
820  return z;
821 }
822 
823 template <typename OrdinalType, typename FadType>
824 template <typename alpha_type, typename A_type, typename x_type,
825  typename beta_type>
826 void
828 GEMV(Teuchos::ETransp trans, const OrdinalType m, const OrdinalType n,
829  const alpha_type& alpha, const A_type* A,
830  const OrdinalType lda, const x_type* x,
831  const OrdinalType incx, const beta_type& beta,
832  FadType* y, const OrdinalType incy) const
833 {
834  if (use_default_impl) {
835  BLASType::GEMV(trans,m,n,alpha,A,lda,x,incx,beta,y,incy);
836  return;
837  }
838 
839  OrdinalType n_x_rows = n;
840  OrdinalType n_y_rows = m;
841  if (trans != Teuchos::NO_TRANS) {
842  n_x_rows = m;
843  n_y_rows = n;
844  }
845 
846  // Unpack input values & derivatives
847  typename ArrayValueType<alpha_type>::type alpha_val;
848  const typename ArrayValueType<alpha_type>::type *alpha_dot;
849  typename ArrayValueType<beta_type>::type beta_val;
850  const typename ArrayValueType<beta_type>::type *beta_dot;
851  const typename ArrayValueType<A_type>::type *A_val = 0, *A_dot = 0;
852  const typename ArrayValueType<x_type>::type *x_val = 0, *x_dot = 0;
853  ValueType *y_val, *y_dot;
854  OrdinalType n_alpha_dot, n_A_dot, n_x_dot, n_beta_dot, n_y_dot = 0, n_dot;
855  OrdinalType lda_val, incx_val, incy_val, lda_dot, incx_dot, incy_dot;
856  arrayTraits.unpack(alpha, n_alpha_dot, alpha_val, alpha_dot);
857  arrayTraits.unpack(A, m, n, lda, n_A_dot, lda_val, lda_dot, A_val, A_dot);
858  arrayTraits.unpack(x, n_x_rows, incx, n_x_dot, incx_val, incx_dot, x_val,
859  x_dot);
860  arrayTraits.unpack(beta, n_beta_dot, beta_val, beta_dot);
861 
862  // Compute size
863  n_dot = 0;
864  if (n_alpha_dot > 0)
865  n_dot = n_alpha_dot;
866  else if (n_A_dot > 0)
867  n_dot = n_A_dot;
868  else if (n_x_dot > 0)
869  n_dot = n_x_dot;
870  else if (n_beta_dot > 0)
871  n_dot = n_beta_dot;
872 
873  // Unpack and allocate y
874  arrayTraits.unpack(y, n_y_rows, incy, n_y_dot, n_dot, incy_val, incy_dot,
875  y_val, y_dot);
876 
877  // Call differentiated routine
878  Fad_GEMV(trans, m, n, alpha_val, n_alpha_dot, alpha_dot, A_val, lda_val,
879  n_A_dot, A_dot, lda_dot, x_val, incx_val, n_x_dot, x_dot, incx_dot,
880  beta_val, n_beta_dot, beta_dot, y_val, incy_val, n_y_dot, y_dot,
881  incy_dot, n_dot);
882 
883  // Pack values and derivatives for result
884  arrayTraits.pack(y, n_y_rows, incy, n_dot, incy_val, incy_dot, y_val, y_dot);
885 
886  // Free temporary arrays
887  arrayTraits.free(alpha, n_alpha_dot, alpha_dot);
888  arrayTraits.free(A, m, n, n_A_dot, lda_val, lda_dot, A_val, A_dot);
889  arrayTraits.free(x, n_x_rows, n_x_dot, incx_val, incx_dot, x_val, x_dot);
890  arrayTraits.free(beta, n_beta_dot, beta_dot);
891  arrayTraits.free(y, n_y_rows, n_dot, incy_val, incy_dot, y_val, y_dot);
892 }
893 
894 template <typename OrdinalType, typename FadType>
895 template <typename A_type>
896 void
899  const OrdinalType n, const A_type* A, const OrdinalType lda,
900  FadType* x, const OrdinalType incx) const
901 {
902  if (use_default_impl) {
903  BLASType::TRMV(uplo,trans,diag,n,A,lda,x,incx);
904  return;
905  }
906 
907  // Unpack input values & derivatives
908  const typename ArrayValueType<A_type>::type *A_val, *A_dot;
909  ValueType *x_val, *x_dot;
910  OrdinalType n_A_dot = 0, n_x_dot = 0, n_dot = 0;
911  OrdinalType lda_val, incx_val, lda_dot, incx_dot;
912  arrayTraits.unpack(A, n, n, lda, n_A_dot, lda_val, lda_dot, A_val, A_dot);
913  n_dot = n_A_dot;
914  arrayTraits.unpack(x, n, incx, n_x_dot, n_dot, incx_val, incx_dot, x_val,
915  x_dot);
916 
917 #ifdef SACADO_DEBUG
918  // Check sizes are consistent
919  TEUCHOS_TEST_FOR_EXCEPTION((n_A_dot != n_dot && n_A_dot != 0) ||
920  (n_x_dot != n_dot && n_x_dot != 0),
921  std::logic_error,
922  "BLAS::TRMV(): All arguments must have " <<
923  "the same number of derivative components, or none");
924 #endif
925 
926  // Compute [xd_1 .. xd_n] = A*[xd_1 .. xd_n]
927  if (n_x_dot > 0) {
928  if (incx_dot == 1)
929  blas.TRMM(Teuchos::LEFT_SIDE, uplo, trans, diag, n, n_x_dot, 1.0, A_val,
930  lda_val, x_dot, n);
931  else
932  for (OrdinalType i=0; i<n_x_dot; i++)
933  blas.TRMV(uplo, trans, diag, n, A_val, lda_val, x_dot+i*incx_dot*n,
934  incx_dot);
935  }
936 
937  // Compute [xd_1 .. xd_n] = [Ad_1*x .. Ad_n*x]
938  if (gemv_Ax.size() != std::size_t(n))
939  gemv_Ax.resize(n);
940  for (OrdinalType i=0; i<n_A_dot; i++) {
941  blas.COPY(n, x_val, incx_val, &gemv_Ax[0], OrdinalType(1));
942  blas.TRMV(uplo, trans, Teuchos::NON_UNIT_DIAG, n, A_dot+i*lda_dot*n,
943  lda_dot, &gemv_Ax[0], OrdinalType(1));
944  blas.AXPY(n, 1.0, &gemv_Ax[0], OrdinalType(1), x_dot+i*incx_dot*n,
945  incx_dot);
946  }
947 
948  // Compute x = A*x
949  blas.TRMV(uplo, trans, diag, n, A_val, lda_val, x_val, incx_val);
950 
951  // Pack values and derivatives for result
952  arrayTraits.pack(x, n, incx, n_dot, incx_val, incx_dot, x_val, x_dot);
953 
954  // Free temporary arrays
955  arrayTraits.free(A, n, n, n_A_dot, lda_val, lda_dot, A_val, A_dot);
956  arrayTraits.free(x, n, n_dot, incx_val, incx_dot, x_val, x_dot);
957 }
958 
959 template <typename OrdinalType, typename FadType>
960 template <typename alpha_type, typename x_type, typename y_type>
961 void
963 GER(const OrdinalType m, const OrdinalType n, const alpha_type& alpha,
964  const x_type* x, const OrdinalType incx,
965  const y_type* y, const OrdinalType incy,
966  FadType* A, const OrdinalType lda) const
967 {
968  if (use_default_impl) {
969  BLASType::GER(m,n,alpha,x,incx,y,incy,A,lda);
970  return;
971  }
972 
973  // Unpack input values & derivatives
974  typename ArrayValueType<alpha_type>::type alpha_val;
975  const typename ArrayValueType<alpha_type>::type *alpha_dot;
976  const typename ArrayValueType<x_type>::type *x_val = 0, *x_dot = 0;
977  const typename ArrayValueType<y_type>::type *y_val = 0, *y_dot = 0;
978  ValueType *A_val, *A_dot;
979  OrdinalType n_alpha_dot, n_x_dot, n_y_dot, n_A_dot, n_dot;
980  OrdinalType lda_val, incx_val, incy_val, lda_dot, incx_dot, incy_dot;
981  arrayTraits.unpack(alpha, n_alpha_dot, alpha_val, alpha_dot);
982  arrayTraits.unpack(x, m, incx, n_x_dot, incx_val, incx_dot, x_val, x_dot);
983  arrayTraits.unpack(y, n, incy, n_y_dot, incy_val, incy_dot, y_val, y_dot);
984 
985  // Compute size
986  n_dot = 0;
987  if (n_alpha_dot > 0)
988  n_dot = n_alpha_dot;
989  else if (n_x_dot > 0)
990  n_dot = n_x_dot;
991  else if (n_y_dot > 0)
992  n_dot = n_y_dot;
993 
994  // Unpack and allocate A
995  arrayTraits.unpack(A, m, n, lda, n_A_dot, n_dot, lda_val, lda_dot, A_val,
996  A_dot);
997 
998  // Call differentiated routine
999  Fad_GER(m, n, alpha_val, n_alpha_dot, alpha_dot, x_val, incx_val,
1000  n_x_dot, x_dot, incx_dot, y_val, incy_val, n_y_dot, y_dot,
1001  incy_dot, A_val, lda_val, n_A_dot, A_dot, lda_dot, n_dot);
1002 
1003  // Pack values and derivatives for result
1004  arrayTraits.pack(A, m, n, lda, n_dot, lda_val, lda_dot, A_val, A_dot);
1005 
1006  // Free temporary arrays
1007  arrayTraits.free(alpha, n_alpha_dot, alpha_dot);
1008  arrayTraits.free(x, m, n_x_dot, incx_val, incx_dot, x_val, x_dot);
1009  arrayTraits.free(y, n, n_y_dot, incy_val, incy_dot, y_val, y_dot);
1010  arrayTraits.free(A, m, n, n_dot, lda_val, lda_dot, A_val, A_dot);
1011 }
1012 
1013 template <typename OrdinalType, typename FadType>
1014 template <typename alpha_type, typename A_type, typename B_type,
1015  typename beta_type>
1016 void
1019  const OrdinalType m, const OrdinalType n, const OrdinalType k,
1020  const alpha_type& alpha, const A_type* A, const OrdinalType lda,
1021  const B_type* B, const OrdinalType ldb, const beta_type& beta,
1022  FadType* C, const OrdinalType ldc) const
1023 {
1024  if (use_default_impl) {
1025  BLASType::GEMM(transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
1026  return;
1027  }
1028 
1029  OrdinalType n_A_rows = m;
1030  OrdinalType n_A_cols = k;
1031  if (transa != Teuchos::NO_TRANS) {
1032  n_A_rows = k;
1033  n_A_cols = m;
1034  }
1035 
1036  OrdinalType n_B_rows = k;
1037  OrdinalType n_B_cols = n;
1038  if (transb != Teuchos::NO_TRANS) {
1039  n_B_rows = n;
1040  n_B_cols = k;
1041  }
1042 
1043  // Unpack input values & derivatives
1044  typename ArrayValueType<alpha_type>::type alpha_val;
1045  const typename ArrayValueType<alpha_type>::type *alpha_dot;
1046  typename ArrayValueType<beta_type>::type beta_val;
1047  const typename ArrayValueType<beta_type>::type *beta_dot;
1048  const typename ArrayValueType<A_type>::type *A_val, *A_dot;
1049  const typename ArrayValueType<B_type>::type *B_val, *B_dot;
1050  ValueType *C_val, *C_dot;
1051  OrdinalType n_alpha_dot, n_A_dot, n_B_dot, n_beta_dot, n_C_dot = 0, n_dot;
1052  OrdinalType lda_val, ldb_val, ldc_val, lda_dot, ldb_dot, ldc_dot;
1053  arrayTraits.unpack(alpha, n_alpha_dot, alpha_val, alpha_dot);
1054  arrayTraits.unpack(A, n_A_rows, n_A_cols, lda, n_A_dot, lda_val, lda_dot,
1055  A_val, A_dot);
1056  arrayTraits.unpack(B, n_B_rows, n_B_cols, ldb, n_B_dot, ldb_val, ldb_dot,
1057  B_val, B_dot);
1058  arrayTraits.unpack(beta, n_beta_dot, beta_val, beta_dot);
1059 
1060  // Compute size
1061  n_dot = 0;
1062  if (n_alpha_dot > 0)
1063  n_dot = n_alpha_dot;
1064  else if (n_A_dot > 0)
1065  n_dot = n_A_dot;
1066  else if (n_B_dot > 0)
1067  n_dot = n_B_dot;
1068  else if (n_beta_dot > 0)
1069  n_dot = n_beta_dot;
1070 
1071  // Unpack and allocate C
1072  arrayTraits.unpack(C, m, n, ldc, n_C_dot, n_dot, ldc_val, ldc_dot, C_val,
1073  C_dot);
1074 
1075  // Call differentiated routine
1076  Fad_GEMM(transa, transb, m, n, k,
1077  alpha_val, n_alpha_dot, alpha_dot,
1078  A_val, lda_val, n_A_dot, A_dot, lda_dot,
1079  B_val, ldb_val, n_B_dot, B_dot, ldb_dot,
1080  beta_val, n_beta_dot, beta_dot,
1081  C_val, ldc_val, n_C_dot, C_dot, ldc_dot, n_dot);
1082 
1083  // Pack values and derivatives for result
1084  arrayTraits.pack(C, m, n, ldc, n_dot, ldc_val, ldc_dot, C_val, C_dot);
1085 
1086  // Free temporary arrays
1087  arrayTraits.free(alpha, n_alpha_dot, alpha_dot);
1088  arrayTraits.free(A, n_A_rows, n_A_cols, n_A_dot, lda_val, lda_dot, A_val,
1089  A_dot);
1090  arrayTraits.free(B, n_B_rows, n_B_cols, n_B_dot, ldb_val, ldb_dot, B_val,
1091  B_dot);
1092  arrayTraits.free(beta, n_beta_dot, beta_dot);
1093  arrayTraits.free(C, m, n, n_dot, ldc_val, ldc_dot, C_val, C_dot);
1094 }
1095 
1096 template <typename OrdinalType, typename FadType>
1097 template <typename alpha_type, typename A_type, typename B_type,
1098  typename beta_type>
1099 void
1102  const OrdinalType m, const OrdinalType n,
1103  const alpha_type& alpha, const A_type* A, const OrdinalType lda,
1104  const B_type* B, const OrdinalType ldb, const beta_type& beta,
1105  FadType* C, const OrdinalType ldc) const
1106 {
1107  if (use_default_impl) {
1108  BLASType::SYMM(side,uplo,m,n,alpha,A,lda,B,ldb,beta,C,ldc);
1109  return;
1110  }
1111 
1112  OrdinalType n_A_rows = m;
1113  OrdinalType n_A_cols = m;
1114  if (side == Teuchos::RIGHT_SIDE) {
1115  n_A_rows = n;
1116  n_A_cols = n;
1117  }
1118 
1119  // Unpack input values & derivatives
1120  typename ArrayValueType<alpha_type>::type alpha_val;
1121  const typename ArrayValueType<alpha_type>::type *alpha_dot;
1122  typename ArrayValueType<beta_type>::type beta_val;
1123  const typename ArrayValueType<beta_type>::type *beta_dot;
1124  const typename ArrayValueType<A_type>::type *A_val, *A_dot;
1125  const typename ArrayValueType<B_type>::type *B_val, *B_dot;
1126  ValueType *C_val, *C_dot;
1127  OrdinalType n_alpha_dot, n_A_dot, n_B_dot, n_beta_dot, n_C_dot, n_dot;
1128  OrdinalType lda_val, ldb_val, ldc_val, lda_dot, ldb_dot, ldc_dot;
1129  arrayTraits.unpack(alpha, n_alpha_dot, alpha_val, alpha_dot);
1130  arrayTraits.unpack(A, n_A_rows, n_A_cols, lda, n_A_dot, lda_val, lda_dot,
1131  A_val, A_dot);
1132  arrayTraits.unpack(B, m, n, ldb, n_B_dot, ldb_val, ldb_dot, B_val, B_dot);
1133  arrayTraits.unpack(beta, n_beta_dot, beta_val, beta_dot);
1134 
1135  // Compute size
1136  n_dot = 0;
1137  if (n_alpha_dot > 0)
1138  n_dot = n_alpha_dot;
1139  else if (n_A_dot > 0)
1140  n_dot = n_A_dot;
1141  else if (n_B_dot > 0)
1142  n_dot = n_B_dot;
1143  else if (n_beta_dot > 0)
1144  n_dot = n_beta_dot;
1145 
1146  // Unpack and allocate C
1147  arrayTraits.unpack(C, m, n, ldc, n_C_dot, n_dot, ldc_val, ldc_dot, C_val,
1148  C_dot);
1149 
1150  // Call differentiated routine
1151  Fad_SYMM(side, uplo, m, n,
1152  alpha_val, n_alpha_dot, alpha_dot,
1153  A_val, lda_val, n_A_dot, A_dot, lda_dot,
1154  B_val, ldb_val, n_B_dot, B_dot, ldb_dot,
1155  beta_val, n_beta_dot, beta_dot,
1156  C_val, ldc_val, n_C_dot, C_dot, ldc_dot, n_dot);
1157 
1158  // Pack values and derivatives for result
1159  arrayTraits.pack(C, m, n, ldc, n_dot, ldc_val, ldc_dot, C_val, C_dot);
1160 
1161  // Free temporary arrays
1162  arrayTraits.free(alpha, n_alpha_dot, alpha_dot);
1163  arrayTraits.free(A, n_A_rows, n_A_cols, n_A_dot, lda_val, lda_dot, A_val,
1164  A_dot);
1165  arrayTraits.free(B, m, n, n_B_dot, ldb_val, ldb_dot, B_val, B_dot);
1166  arrayTraits.free(beta, n_beta_dot, beta_dot);
1167  arrayTraits.free(C, m, n, n_dot, ldc_val, ldc_dot, C_val, C_dot);
1168 }
1169 
1170 template <typename OrdinalType, typename FadType>
1171 template <typename alpha_type, typename A_type>
1172 void
1175  Teuchos::ETransp transa, Teuchos::EDiag diag,
1176  const OrdinalType m, const OrdinalType n,
1177  const alpha_type& alpha, const A_type* A, const OrdinalType lda,
1178  FadType* B, const OrdinalType ldb) const
1179 {
1180  if (use_default_impl) {
1181  BLASType::TRMM(side,uplo,transa,diag,m,n,alpha,A,lda,B,ldb);
1182  return;
1183  }
1184 
1185  OrdinalType n_A_rows = m;
1186  OrdinalType n_A_cols = m;
1187  if (side == Teuchos::RIGHT_SIDE) {
1188  n_A_rows = n;
1189  n_A_cols = n;
1190  }
1191 
1192  // Unpack input values & derivatives
1193  typename ArrayValueType<alpha_type>::type alpha_val;
1194  const typename ArrayValueType<alpha_type>::type *alpha_dot;
1195  const typename ArrayValueType<A_type>::type *A_val, *A_dot;
1196  ValueType *B_val, *B_dot;
1197  OrdinalType n_alpha_dot, n_A_dot, n_B_dot, n_dot;
1198  OrdinalType lda_val, ldb_val, lda_dot, ldb_dot;
1199  arrayTraits.unpack(alpha, n_alpha_dot, alpha_val, alpha_dot);
1200  arrayTraits.unpack(A, n_A_rows, n_A_cols, lda, n_A_dot, lda_val, lda_dot,
1201  A_val, A_dot);
1202 
1203  // Compute size
1204  n_dot = 0;
1205  if (n_alpha_dot > 0)
1206  n_dot = n_alpha_dot;
1207  else if (n_A_dot > 0)
1208  n_dot = n_A_dot;
1209 
1210  // Unpack and allocate B
1211  arrayTraits.unpack(B, m, n, ldb, n_B_dot, n_dot, ldb_val, ldb_dot, B_val,
1212  B_dot);
1213 
1214  // Call differentiated routine
1215  Fad_TRMM(side, uplo, transa, diag, m, n,
1216  alpha_val, n_alpha_dot, alpha_dot,
1217  A_val, lda_val, n_A_dot, A_dot, lda_dot,
1218  B_val, ldb_val, n_B_dot, B_dot, ldb_dot, n_dot);
1219 
1220  // Pack values and derivatives for result
1221  arrayTraits.pack(B, m, n, ldb, n_dot, ldb_val, ldb_dot, B_val, B_dot);
1222 
1223  // Free temporary arrays
1224  arrayTraits.free(alpha, n_alpha_dot, alpha_dot);
1225  arrayTraits.free(A, n_A_rows, n_A_cols, n_A_dot, lda_val, lda_dot, A_val,
1226  A_dot);
1227  arrayTraits.free(B, m, n, n_dot, ldb_val, ldb_dot, B_val, B_dot);
1228 }
1229 
1230 template <typename OrdinalType, typename FadType>
1231 template <typename alpha_type, typename A_type>
1232 void
1235  Teuchos::ETransp transa, Teuchos::EDiag diag,
1236  const OrdinalType m, const OrdinalType n,
1237  const alpha_type& alpha, const A_type* A, const OrdinalType lda,
1238  FadType* B, const OrdinalType ldb) const
1239 {
1240  if (use_default_impl) {
1241  BLASType::TRSM(side,uplo,transa,diag,m,n,alpha,A,lda,B,ldb);
1242  return;
1243  }
1244 
1245  OrdinalType n_A_rows = m;
1246  OrdinalType n_A_cols = m;
1247  if (side == Teuchos::RIGHT_SIDE) {
1248  n_A_rows = n;
1249  n_A_cols = n;
1250  }
1251 
1252  // Unpack input values & derivatives
1253  typename ArrayValueType<alpha_type>::type alpha_val;
1254  const typename ArrayValueType<alpha_type>::type *alpha_dot;
1255  const typename ArrayValueType<A_type>::type *A_val, *A_dot;
1256  ValueType *B_val, *B_dot;
1257  OrdinalType n_alpha_dot, n_A_dot, n_B_dot, n_dot;
1258  OrdinalType lda_val, ldb_val, lda_dot, ldb_dot;
1259  arrayTraits.unpack(alpha, n_alpha_dot, alpha_val, alpha_dot);
1260  arrayTraits.unpack(A, n_A_rows, n_A_cols, lda, n_A_dot, lda_val, lda_dot,
1261  A_val, A_dot);
1262 
1263  // Compute size
1264  n_dot = 0;
1265  if (n_alpha_dot > 0)
1266  n_dot = n_alpha_dot;
1267  else if (n_A_dot > 0)
1268  n_dot = n_A_dot;
1269 
1270  // Unpack and allocate B
1271  arrayTraits.unpack(B, m, n, ldb, n_B_dot, n_dot, ldb_val, ldb_dot, B_val,
1272  B_dot);
1273 
1274  // Call differentiated routine
1275  Fad_TRSM(side, uplo, transa, diag, m, n,
1276  alpha_val, n_alpha_dot, alpha_dot,
1277  A_val, lda_val, n_A_dot, A_dot, lda_dot,
1278  B_val, ldb_val, n_B_dot, B_dot, ldb_dot, n_dot);
1279 
1280  // Pack values and derivatives for result
1281  arrayTraits.pack(B, m, n, ldb, n_dot, ldb_val, ldb_dot, B_val, B_dot);
1282 
1283  // Free temporary arrays
1284  arrayTraits.free(alpha, n_alpha_dot, alpha_dot);
1285  arrayTraits.free(A, n_A_rows, n_A_cols, n_A_dot, lda_val, lda_dot, A_val,
1286  A_dot);
1287  arrayTraits.free(B, m, n, n_dot, ldb_val, ldb_dot, B_val, B_dot);
1288 }
1289 
1290 template <typename OrdinalType, typename FadType>
1291 template <typename x_type, typename y_type>
1292 void
1294 Fad_DOT(const OrdinalType n,
1295  const x_type* x,
1296  const OrdinalType incx,
1297  const OrdinalType n_x_dot,
1298  const x_type* x_dot,
1299  const OrdinalType incx_dot,
1300  const y_type* y,
1301  const OrdinalType incy,
1302  const OrdinalType n_y_dot,
1303  const y_type* y_dot,
1304  const OrdinalType incy_dot,
1305  ValueType& z,
1306  const OrdinalType n_z_dot,
1307  ValueType* z_dot) const
1308 {
1309 #ifdef SACADO_DEBUG
1310  // Check sizes are consistent
1311  TEUCHOS_TEST_FOR_EXCEPTION((n_x_dot != n_z_dot && n_x_dot != 0) ||
1312  (n_y_dot != n_z_dot && n_y_dot != 0),
1313  std::logic_error,
1314  "BLAS::Fad_DOT(): All arguments must have " <<
1315  "the same number of derivative components, or none");
1316 #endif
1317 
1318  // Compute [zd_1 .. zd_n] = [xd_1 .. xd_n]^T*y
1319  if (n_x_dot > 0) {
1320  if (incx_dot == OrdinalType(1))
1321  blas.GEMV(Teuchos::TRANS, n, n_x_dot, 1.0, x_dot, n, y, incy, 0.0, z_dot,
1322  OrdinalType(1));
1323  else
1324  for (OrdinalType i=0; i<n_z_dot; i++)
1325  z_dot[i] = blas.DOT(n, x_dot+i*incx_dot*n, incx_dot, y, incy);
1326  }
1327 
1328  // Compute [zd_1 .. zd_n] += [yd_1 .. yd_n]^T*x
1329  if (n_y_dot > 0) {
1330  if (incy_dot == OrdinalType(1) &&
1332  blas.GEMV(Teuchos::TRANS, n, n_y_dot, 1.0, y_dot, n, x, incx, 1.0, z_dot,
1333  OrdinalType(1));
1334  else
1335  for (OrdinalType i=0; i<n_z_dot; i++)
1336  z_dot[i] += blas.DOT(n, x, incx, y_dot+i*incy_dot*n, incy_dot);
1337  }
1338 
1339  // Compute z = x^T*y
1340  z = blas.DOT(n, x, incx, y, incy);
1341 }
1342 
1343 template <typename OrdinalType, typename FadType>
1344 template <typename alpha_type, typename A_type, typename x_type,
1345  typename beta_type>
1346 void
1349  const OrdinalType m,
1350  const OrdinalType n,
1351  const alpha_type& alpha,
1352  const OrdinalType n_alpha_dot,
1353  const alpha_type* alpha_dot,
1354  const A_type* A,
1355  const OrdinalType lda,
1356  const OrdinalType n_A_dot,
1357  const A_type* A_dot,
1358  const OrdinalType lda_dot,
1359  const x_type* x,
1360  const OrdinalType incx,
1361  const OrdinalType n_x_dot,
1362  const x_type* x_dot,
1363  const OrdinalType incx_dot,
1364  const beta_type& beta,
1365  const OrdinalType n_beta_dot,
1366  const beta_type* beta_dot,
1367  ValueType* y,
1368  const OrdinalType incy,
1369  const OrdinalType n_y_dot,
1370  ValueType* y_dot,
1371  const OrdinalType incy_dot,
1372  const OrdinalType n_dot) const
1373 {
1374 #ifdef SACADO_DEBUG
1375  // Check sizes are consistent
1376  TEUCHOS_TEST_FOR_EXCEPTION((n_alpha_dot != n_dot && n_alpha_dot != 0) ||
1377  (n_A_dot != n_dot && n_A_dot != 0) ||
1378  (n_x_dot != n_dot && n_x_dot != 0) ||
1379  (n_beta_dot != n_dot && n_beta_dot != 0) ||
1380  (n_y_dot != n_dot && n_y_dot != 0),
1381  std::logic_error,
1382  "BLAS::Fad_GEMV(): All arguments must have " <<
1383  "the same number of derivative components, or none");
1384 #endif
1385  OrdinalType n_A_rows = m;
1386  OrdinalType n_A_cols = n;
1387  OrdinalType n_x_rows = n;
1388  OrdinalType n_y_rows = m;
1389  if (trans == Teuchos::TRANS) {
1390  n_A_rows = n;
1391  n_A_cols = m;
1392  n_x_rows = m;
1393  n_y_rows = n;
1394  }
1395 
1396  // Compute [yd_1 .. yd_n] = beta*[yd_1 .. yd_n]
1397  if (n_y_dot > 0)
1398  blas.SCAL(n_y_rows*n_y_dot, beta, y_dot, incy_dot);
1399 
1400  // Compute [yd_1 .. yd_n] = alpha*A*[xd_1 .. xd_n]
1401  if (n_x_dot > 0) {
1402  if (incx_dot == 1)
1403  blas.GEMM(trans, Teuchos::NO_TRANS, n_A_rows, n_dot, n_A_cols,
1404  alpha, A, lda, x_dot, n_x_rows, 1.0, y_dot, n_y_rows);
1405  else
1406  for (OrdinalType i=0; i<n_x_dot; i++)
1407  blas.GEMV(trans, m, n, alpha, A, lda, x_dot+i*incx_dot*n_x_rows,
1408  incx_dot, 1.0, y_dot+i*incy_dot*n_y_rows, incy_dot);
1409  }
1410 
1411  // Compute [yd_1 .. yd_n] += diag([alphad_1 .. alphad_n])*A*x
1412  if (n_alpha_dot > 0) {
1413  if (gemv_Ax.size() != std::size_t(n))
1414  gemv_Ax.resize(n);
1415  blas.GEMV(trans, m, n, 1.0, A, lda, x, incx, 0.0, &gemv_Ax[0],
1416  OrdinalType(1));
1417  for (OrdinalType i=0; i<n_alpha_dot; i++)
1418  blas.AXPY(n_y_rows, alpha_dot[i], &gemv_Ax[0], OrdinalType(1),
1419  y_dot+i*incy_dot*n_y_rows, incy_dot);
1420  }
1421 
1422  // Compute [yd_1 .. yd_n] += alpha*[Ad_1*x .. Ad_n*x]
1423  for (OrdinalType i=0; i<n_A_dot; i++)
1424  blas.GEMV(trans, m, n, alpha, A_dot+i*lda_dot*n, lda_dot, x, incx, 1.0,
1425  y_dot+i*incy_dot*n_y_rows, incy_dot);
1426 
1427  // Compute [yd_1 .. yd_n] += diag([betad_1 .. betad_n])*y
1428  for (OrdinalType i=0; i<n_beta_dot; i++)
1429  blas.AXPY(n_y_rows, beta_dot[i], y, incy, y_dot+i*incy_dot*n_y_rows,
1430  incy_dot);
1431 
1432  // Compute y = alpha*A*x + beta*y
1433  if (n_alpha_dot > 0) {
1434  blas.SCAL(n_y_rows, beta, y, incy);
1435  blas.AXPY(n_y_rows, alpha, &gemv_Ax[0], OrdinalType(1), y, incy);
1436  }
1437  else
1438  blas.GEMV(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
1439 }
1440 
1441 template <typename OrdinalType, typename FadType>
1442 template <typename alpha_type, typename x_type, typename y_type>
1443 void
1445 Fad_GER(const OrdinalType m,
1446  const OrdinalType n,
1447  const alpha_type& alpha,
1448  const OrdinalType n_alpha_dot,
1449  const alpha_type* alpha_dot,
1450  const x_type* x,
1451  const OrdinalType incx,
1452  const OrdinalType n_x_dot,
1453  const x_type* x_dot,
1454  const OrdinalType incx_dot,
1455  const y_type* y,
1456  const OrdinalType incy,
1457  const OrdinalType n_y_dot,
1458  const y_type* y_dot,
1459  const OrdinalType incy_dot,
1460  ValueType* A,
1461  const OrdinalType lda,
1462  const OrdinalType n_A_dot,
1463  ValueType* A_dot,
1464  const OrdinalType lda_dot,
1465  const OrdinalType n_dot) const
1466 {
1467 #ifdef SACADO_DEBUG
1468  // Check sizes are consistent
1469  TEUCHOS_TEST_FOR_EXCEPTION((n_alpha_dot != n_dot && n_alpha_dot != 0) ||
1470  (n_A_dot != n_dot && n_A_dot != 0) ||
1471  (n_x_dot != n_dot && n_x_dot != 0) ||
1472  (n_y_dot != n_dot && n_y_dot != 0),
1473  std::logic_error,
1474  "BLAS::Fad_GER(): All arguments must have " <<
1475  "the same number of derivative components, or none");
1476 #endif
1477 
1478  // Compute [Ad_1 .. Ad_n] += [alphad_1*x*y^T .. alphad_n*x*y^T]
1479  for (OrdinalType i=0; i<n_alpha_dot; i++)
1480  blas.GER(m, n, alpha_dot[i], x, incx, y, incy, A_dot+i*lda_dot*n, lda_dot);
1481 
1482  // Compute [Ad_1 .. Ad_n] += alpha*[xd_1*y^T .. xd_n*y^T]
1483  for (OrdinalType i=0; i<n_x_dot; i++)
1484  blas.GER(m, n, alpha, x_dot+i*incx_dot*m, incx_dot, y, incy,
1485  A_dot+i*lda_dot*n, lda_dot);
1486 
1487  // Compute [Ad_1 .. Ad_n] += alpha*x*[yd_1 .. yd_n]
1488  if (n_y_dot > 0)
1489  blas.GER(m, n*n_y_dot, alpha, x, incx, y_dot, incy_dot, A_dot, lda_dot);
1490 
1491  // Compute A = alpha*x*y^T + A
1492  blas.GER(m, n, alpha, x, incx, y, incy, A, lda);
1493 }
1494 
1495 template <typename OrdinalType, typename FadType>
1496 template <typename alpha_type, typename A_type, typename B_type,
1497  typename beta_type>
1498 void
1501  Teuchos::ETransp transb,
1502  const OrdinalType m,
1503  const OrdinalType n,
1504  const OrdinalType k,
1505  const alpha_type& alpha,
1506  const OrdinalType n_alpha_dot,
1507  const alpha_type* alpha_dot,
1508  const A_type* A,
1509  const OrdinalType lda,
1510  const OrdinalType n_A_dot,
1511  const A_type* A_dot,
1512  const OrdinalType lda_dot,
1513  const B_type* B,
1514  const OrdinalType ldb,
1515  const OrdinalType n_B_dot,
1516  const B_type* B_dot,
1517  const OrdinalType ldb_dot,
1518  const beta_type& beta,
1519  const OrdinalType n_beta_dot,
1520  const beta_type* beta_dot,
1521  ValueType* C,
1522  const OrdinalType ldc,
1523  const OrdinalType n_C_dot,
1524  ValueType* C_dot,
1525  const OrdinalType ldc_dot,
1526  const OrdinalType n_dot) const
1527 {
1528 #ifdef SACADO_DEBUG
1529  // Check sizes are consistent
1530  TEUCHOS_TEST_FOR_EXCEPTION((n_alpha_dot != n_dot && n_alpha_dot != 0) ||
1531  (n_A_dot != n_dot && n_A_dot != 0) ||
1532  (n_B_dot != n_dot && n_B_dot != 0) ||
1533  (n_beta_dot != n_dot && n_beta_dot != 0) ||
1534  (n_C_dot != n_dot && n_C_dot != 0),
1535  std::logic_error,
1536  "BLAS::Fad_GEMM(): All arguments must have " <<
1537  "the same number of derivative components, or none");
1538 #endif
1539  OrdinalType n_A_cols = k;
1540  if (transa != Teuchos::NO_TRANS) {
1541  n_A_cols = m;
1542  }
1543 
1544  OrdinalType n_B_cols = n;
1545  if (transb != Teuchos::NO_TRANS) {
1546  n_B_cols = k;
1547  }
1548 
1549  // Compute [Cd_1 .. Cd_n] = beta*[Cd_1 .. Cd_n]
1550  if (n_C_dot > 0) {
1551  if (ldc_dot == m)
1552  blas.SCAL(m*n*n_C_dot, beta, C_dot, OrdinalType(1));
1553  else
1554  for (OrdinalType i=0; i<n_C_dot; i++)
1555  for (OrdinalType j=0; j<n; j++)
1556  blas.SCAL(m, beta, C_dot+i*ldc_dot*n+j*ldc_dot, OrdinalType(1));
1557  }
1558 
1559  // Compute [Cd_1 .. Cd_n] += alpha*A*[Bd_1 .. Bd_n]
1560  for (OrdinalType i=0; i<n_B_dot; i++)
1561  blas.GEMM(transa, transb, m, n, k, alpha, A, lda, B_dot+i*ldb_dot*n_B_cols,
1562  ldb_dot, 1.0, C_dot+i*ldc_dot*n, ldc_dot);
1563 
1564  // Compute [Cd_1 .. Cd_n] += [alphad_1*A*B .. alphad_n*A*B]
1565  if (n_alpha_dot > 0) {
1566  if (gemm_AB.size() != std::size_t(m*n))
1567  gemm_AB.resize(m*n);
1568  blas.GEMM(transa, transb, m, n, k, 1.0, A, lda, B, ldb, 0.0, &gemm_AB[0],
1569  OrdinalType(m));
1570  if (ldc_dot == m)
1571  for (OrdinalType i=0; i<n_alpha_dot; i++)
1572  blas.AXPY(m*n, alpha_dot[i], &gemm_AB[0], OrdinalType(1),
1573  C_dot+i*ldc_dot*n, OrdinalType(1));
1574  else
1575  for (OrdinalType i=0; i<n_alpha_dot; i++)
1576  for (OrdinalType j=0; j<n; j++)
1577  blas.AXPY(m, alpha_dot[i], &gemm_AB[j*m], OrdinalType(1),
1578  C_dot+i*ldc_dot*n+j*ldc_dot, OrdinalType(1));
1579  }
1580 
1581  // Compute [Cd_1 .. Cd_n] += alpha*[Ad_1*B .. Ad_n*B]
1582  for (OrdinalType i=0; i<n_A_dot; i++)
1583  blas.GEMM(transa, transb, m, n, k, alpha, A_dot+i*lda_dot*n_A_cols,
1584  lda_dot, B, ldb, 1.0, C_dot+i*ldc_dot*n, ldc_dot);
1585 
1586  // Compute [Cd_1 .. Cd_n] += [betad_1*C .. betad_n*C]
1587  if (ldc == m && ldc_dot == m)
1588  for (OrdinalType i=0; i<n_beta_dot; i++)
1589  blas.AXPY(m*n, beta_dot[i], C, OrdinalType(1), C_dot+i*ldc_dot*n,
1590  OrdinalType(1));
1591  else
1592  for (OrdinalType i=0; i<n_beta_dot; i++)
1593  for (OrdinalType j=0; j<n; j++)
1594  blas.AXPY(m, beta_dot[i], C+j*ldc, OrdinalType(1),
1595  C_dot+i*ldc_dot*n+j*ldc_dot, OrdinalType(1));
1596 
1597  // Compute C = alpha*A*B + beta*C
1598  if (n_alpha_dot > 0) {
1599  if (ldc == m) {
1600  blas.SCAL(m*n, beta, C, OrdinalType(1));
1601  blas.AXPY(m*n, alpha, &gemm_AB[0], OrdinalType(1), C, OrdinalType(1));
1602  }
1603  else
1604  for (OrdinalType j=0; j<n; j++) {
1605  blas.SCAL(m, beta, C+j*ldc, OrdinalType(1));
1606  blas.AXPY(m, alpha, &gemm_AB[j*m], OrdinalType(1), C+j*ldc,
1607  OrdinalType(1));
1608  }
1609  }
1610  else
1611  blas.GEMM(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
1612 }
1613 
1614 template <typename OrdinalType, typename FadType>
1615 template <typename alpha_type, typename A_type, typename B_type,
1616  typename beta_type>
1617 void
1620  const OrdinalType m,
1621  const OrdinalType n,
1622  const alpha_type& alpha,
1623  const OrdinalType n_alpha_dot,
1624  const alpha_type* alpha_dot,
1625  const A_type* A,
1626  const OrdinalType lda,
1627  const OrdinalType n_A_dot,
1628  const A_type* A_dot,
1629  const OrdinalType lda_dot,
1630  const B_type* B,
1631  const OrdinalType ldb,
1632  const OrdinalType n_B_dot,
1633  const B_type* B_dot,
1634  const OrdinalType ldb_dot,
1635  const beta_type& beta,
1636  const OrdinalType n_beta_dot,
1637  const beta_type* beta_dot,
1638  ValueType* C,
1639  const OrdinalType ldc,
1640  const OrdinalType n_C_dot,
1641  ValueType* C_dot,
1642  const OrdinalType ldc_dot,
1643  const OrdinalType n_dot) const
1644 {
1645 #ifdef SACADO_DEBUG
1646  // Check sizes are consistent
1647  TEUCHOS_TEST_FOR_EXCEPTION((n_alpha_dot != n_dot && n_alpha_dot != 0) ||
1648  (n_A_dot != n_dot && n_A_dot != 0) ||
1649  (n_B_dot != n_dot && n_B_dot != 0) ||
1650  (n_beta_dot != n_dot && n_beta_dot != 0) ||
1651  (n_C_dot != n_dot && n_C_dot != 0),
1652  std::logic_error,
1653  "BLAS::Fad_SYMM(): All arguments must have " <<
1654  "the same number of derivative components, or none");
1655 #endif
1656  OrdinalType n_A_cols = m;
1657  if (side == Teuchos::RIGHT_SIDE) {
1658  n_A_cols = n;
1659  }
1660 
1661  // Compute [Cd_1 .. Cd_n] = beta*[Cd_1 .. Cd_n]
1662  if (n_C_dot > 0) {
1663  if (ldc_dot == m)
1664  blas.SCAL(m*n*n_C_dot, beta, C_dot, OrdinalType(1));
1665  else
1666  for (OrdinalType i=0; i<n_C_dot; i++)
1667  for (OrdinalType j=0; j<n; j++)
1668  blas.SCAL(m, beta, C_dot+i*ldc_dot*n+j*ldc_dot, OrdinalType(1));
1669  }
1670 
1671  // Compute [Cd_1 .. Cd_n] += alpha*A*[Bd_1 .. Bd_n]
1672  for (OrdinalType i=0; i<n_B_dot; i++)
1673  blas.SYMM(side, uplo, m, n, alpha, A, lda, B_dot+i*ldb_dot*n,
1674  ldb_dot, 1.0, C_dot+i*ldc_dot*n, ldc_dot);
1675 
1676  // Compute [Cd_1 .. Cd_n] += [alphad_1*A*B .. alphad_n*A*B]
1677  if (n_alpha_dot > 0) {
1678  if (gemm_AB.size() != std::size_t(m*n))
1679  gemm_AB.resize(m*n);
1680  blas.SYMM(side, uplo, m, n, 1.0, A, lda, B, ldb, 0.0, &gemm_AB[0],
1681  OrdinalType(m));
1682  if (ldc_dot == m)
1683  for (OrdinalType i=0; i<n_alpha_dot; i++)
1684  blas.AXPY(m*n, alpha_dot[i], &gemm_AB[0], OrdinalType(1),
1685  C_dot+i*ldc_dot*n, OrdinalType(1));
1686  else
1687  for (OrdinalType i=0; i<n_alpha_dot; i++)
1688  for (OrdinalType j=0; j<n; j++)
1689  blas.AXPY(m, alpha_dot[i], &gemm_AB[j*m], OrdinalType(1),
1690  C_dot+i*ldc_dot*n+j*ldc_dot, OrdinalType(1));
1691  }
1692 
1693  // Compute [Cd_1 .. Cd_n] += alpha*[Ad_1*B .. Ad_n*B]
1694  for (OrdinalType i=0; i<n_A_dot; i++)
1695  blas.SYMM(side, uplo, m, n, alpha, A_dot+i*lda_dot*n_A_cols, lda_dot, B,
1696  ldb, 1.0, C_dot+i*ldc_dot*n, ldc_dot);
1697 
1698  // Compute [Cd_1 .. Cd_n] += [betad_1*C .. betad_n*C]
1699  if (ldc == m && ldc_dot == m)
1700  for (OrdinalType i=0; i<n_beta_dot; i++)
1701  blas.AXPY(m*n, beta_dot[i], C, OrdinalType(1), C_dot+i*ldc_dot*n,
1702  OrdinalType(1));
1703  else
1704  for (OrdinalType i=0; i<n_beta_dot; i++)
1705  for (OrdinalType j=0; j<n; j++)
1706  blas.AXPY(m, beta_dot[i], C+j*ldc, OrdinalType(1),
1707  C_dot+i*ldc_dot*n+j*ldc_dot, OrdinalType(1));
1708 
1709  // Compute C = alpha*A*B + beta*C
1710  if (n_alpha_dot > 0) {
1711  if (ldc == m) {
1712  blas.SCAL(m*n, beta, C, OrdinalType(1));
1713  blas.AXPY(m*n, alpha, &gemm_AB[0], OrdinalType(1), C, OrdinalType(1));
1714  }
1715  else
1716  for (OrdinalType j=0; j<n; j++) {
1717  blas.SCAL(m, beta, C+j*ldc, OrdinalType(1));
1718  blas.AXPY(m, alpha, &gemm_AB[j*m], OrdinalType(1), C+j*ldc,
1719  OrdinalType(1));
1720  }
1721  }
1722  else
1723  blas.SYMM(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
1724 }
1725 
1726 template <typename OrdinalType, typename FadType>
1727 template <typename alpha_type, typename A_type>
1728 void
1731  Teuchos::EUplo uplo,
1732  Teuchos::ETransp transa,
1733  Teuchos::EDiag diag,
1734  const OrdinalType m,
1735  const OrdinalType n,
1736  const alpha_type& alpha,
1737  const OrdinalType n_alpha_dot,
1738  const alpha_type* alpha_dot,
1739  const A_type* A,
1740  const OrdinalType lda,
1741  const OrdinalType n_A_dot,
1742  const A_type* A_dot,
1743  const OrdinalType lda_dot,
1744  ValueType* B,
1745  const OrdinalType ldb,
1746  const OrdinalType n_B_dot,
1747  ValueType* B_dot,
1748  const OrdinalType ldb_dot,
1749  const OrdinalType n_dot) const
1750 {
1751 #ifdef SACADO_DEBUG
1752  // Check sizes are consistent
1753  TEUCHOS_TEST_FOR_EXCEPTION((n_alpha_dot != n_dot && n_alpha_dot != 0) ||
1754  (n_A_dot != n_dot && n_A_dot != 0) ||
1755  (n_B_dot != n_dot && n_B_dot != 0),
1756  std::logic_error,
1757  "BLAS::Fad_TRMM(): All arguments must have " <<
1758  "the same number of derivative components, or none");
1759 #endif
1760  OrdinalType n_A_cols = m;
1761  if (side == Teuchos::RIGHT_SIDE) {
1762  n_A_cols = n;
1763  }
1764 
1765  // Compute [Bd_1 .. Bd_n] = alpha*A*[Bd_1 .. Bd_n]
1766  for (OrdinalType i=0; i<n_B_dot; i++)
1767  blas.TRMM(side, uplo, transa, diag, m, n, alpha, A, lda, B_dot+i*ldb_dot*n,
1768  ldb_dot);
1769 
1770  // Compute [Bd_1 .. Bd_n] += [alphad_1*A*B .. alphad_n*A*B]
1771  if (n_alpha_dot > 0) {
1772  if (gemm_AB.size() != std::size_t(m*n))
1773  gemm_AB.resize(m*n);
1774  if (ldb == m)
1775  blas.COPY(m*n, B, OrdinalType(1), &gemm_AB[0], OrdinalType(1));
1776  else
1777  for (OrdinalType j=0; j<n; j++)
1778  blas.COPY(m, B+j*ldb, OrdinalType(1), &gemm_AB[j*m], OrdinalType(1));
1779  blas.TRMM(side, uplo, transa, diag, m, n, 1.0, A, lda, &gemm_AB[0],
1780  OrdinalType(m));
1781  if (ldb_dot == m)
1782  for (OrdinalType i=0; i<n_alpha_dot; i++)
1783  blas.AXPY(m*n, alpha_dot[i], &gemm_AB[0], OrdinalType(1),
1784  B_dot+i*ldb_dot*n, OrdinalType(1));
1785  else
1786  for (OrdinalType i=0; i<n_alpha_dot; i++)
1787  for (OrdinalType j=0; j<n; j++)
1788  blas.AXPY(m, alpha_dot[i], &gemm_AB[j*m], OrdinalType(1),
1789  B_dot+i*ldb_dot*n+j*ldb_dot, OrdinalType(1));
1790  }
1791 
1792  // Compute [Bd_1 .. Bd_n] += alpha*[Ad_1*B .. Ad_n*B]
1793  if (n_A_dot > 0) {
1794  if (gemm_AB.size() != std::size_t(m*n))
1795  gemm_AB.resize(m*n);
1796  for (OrdinalType i=0; i<n_A_dot; i++) {
1797  if (ldb == m)
1798  blas.COPY(m*n, B, OrdinalType(1), &gemm_AB[0], OrdinalType(1));
1799  else
1800  for (OrdinalType j=0; j<n; j++)
1801  blas.COPY(m, B+j*ldb, OrdinalType(1), &gemm_AB[j*m], OrdinalType(1));
1802  blas.TRMM(side, uplo, transa, Teuchos::NON_UNIT_DIAG, m, n, alpha,
1803  A_dot+i*lda_dot*n_A_cols, lda_dot, &gemm_AB[0],
1804  OrdinalType(m));
1805  if (ldb_dot == m)
1806  blas.AXPY(m*n, 1.0, &gemm_AB[0], OrdinalType(1),
1807  B_dot+i*ldb_dot*n, OrdinalType(1));
1808  else
1809  for (OrdinalType j=0; j<n; j++)
1810  blas.AXPY(m, 1.0, &gemm_AB[j*m], OrdinalType(1),
1811  B_dot+i*ldb_dot*n+j*ldb_dot, OrdinalType(1));
1812  }
1813  }
1814 
1815  // Compute B = alpha*A*B
1816  if (n_alpha_dot > 0 && n_A_dot == 0) {
1817  if (ldb == m) {
1818  blas.SCAL(m*n, 0.0, B, OrdinalType(1));
1819  blas.AXPY(m*n, alpha, &gemm_AB[0], OrdinalType(1), B, OrdinalType(1));
1820  }
1821  else
1822  for (OrdinalType j=0; j<n; j++) {
1823  blas.SCAL(m, 0.0, B+j*ldb, OrdinalType(1));
1824  blas.AXPY(m, alpha, &gemm_AB[j*m], OrdinalType(1), B+j*ldb,
1825  OrdinalType(1));
1826  }
1827  }
1828  else
1829  blas.TRMM(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
1830 }
1831 
1832 template <typename OrdinalType, typename FadType>
1833 template <typename alpha_type, typename A_type>
1834 void
1837  Teuchos::EUplo uplo,
1838  Teuchos::ETransp transa,
1839  Teuchos::EDiag diag,
1840  const OrdinalType m,
1841  const OrdinalType n,
1842  const alpha_type& alpha,
1843  const OrdinalType n_alpha_dot,
1844  const alpha_type* alpha_dot,
1845  const A_type* A,
1846  const OrdinalType lda,
1847  const OrdinalType n_A_dot,
1848  const A_type* A_dot,
1849  const OrdinalType lda_dot,
1850  ValueType* B,
1851  const OrdinalType ldb,
1852  const OrdinalType n_B_dot,
1853  ValueType* B_dot,
1854  const OrdinalType ldb_dot,
1855  const OrdinalType n_dot) const
1856 {
1857 #ifdef SACADO_DEBUG
1858  // Check sizes are consistent
1859  TEUCHOS_TEST_FOR_EXCEPTION((n_alpha_dot != n_dot && n_alpha_dot != 0) ||
1860  (n_A_dot != n_dot && n_A_dot != 0) ||
1861  (n_B_dot != n_dot && n_B_dot != 0),
1862  std::logic_error,
1863  "BLAS::Fad_TRSM(): All arguments must have " <<
1864  "the same number of derivative components, or none");
1865 #endif
1866  OrdinalType n_A_cols = m;
1867  if (side == Teuchos::RIGHT_SIDE) {
1868  n_A_cols = n;
1869  }
1870 
1871  // Compute [Bd_1 .. Bd_n] = alpha*[Bd_1 .. Bd_n]
1872  if (n_B_dot > 0) {
1873  if (ldb_dot == m)
1874  blas.SCAL(m*n*n_B_dot, alpha, B_dot, OrdinalType(1));
1875  else
1876  for (OrdinalType i=0; i<n_B_dot; i++)
1877  for (OrdinalType j=0; j<n; j++)
1878  blas.SCAL(m, alpha, B_dot+i*ldb_dot*n+j*ldb_dot, OrdinalType(1));
1879  }
1880 
1881  // Compute [Bd_1 .. Bd_n] += [alphad_1*B .. alphad_n*B]
1882  if (n_alpha_dot > 0) {
1883  if (ldb == m && ldb_dot == m)
1884  for (OrdinalType i=0; i<n_alpha_dot; i++)
1885  blas.AXPY(m*n, alpha_dot[i], B, OrdinalType(1),
1886  B_dot+i*ldb_dot*n, OrdinalType(1));
1887  else
1888  for (OrdinalType i=0; i<n_alpha_dot; i++)
1889  for (OrdinalType j=0; j<n; j++)
1890  blas.AXPY(m, alpha_dot[i], B+j*ldb, OrdinalType(1),
1891  B_dot+i*ldb_dot*n+j*ldb_dot, OrdinalType(1));
1892  }
1893 
1894  // Solve A*X = alpha*B
1895  blas.TRSM(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
1896 
1897  // Compute [Bd_1 .. Bd_n] -= [Ad_1*X .. Ad_n*X]
1898  if (n_A_dot > 0) {
1899  if (gemm_AB.size() != std::size_t(m*n))
1900  gemm_AB.resize(m*n);
1901  for (OrdinalType i=0; i<n_A_dot; i++) {
1902  if (ldb == m)
1903  blas.COPY(m*n, B, OrdinalType(1), &gemm_AB[0], OrdinalType(1));
1904  else
1905  for (OrdinalType j=0; j<n; j++)
1906  blas.COPY(m, B+j*ldb, OrdinalType(1), &gemm_AB[j*m], OrdinalType(1));
1907  blas.TRMM(side, uplo, transa, Teuchos::NON_UNIT_DIAG, m, n, 1.0,
1908  A_dot+i*lda_dot*n_A_cols, lda_dot, &gemm_AB[0],
1909  OrdinalType(m));
1910  if (ldb_dot == m)
1911  blas.AXPY(m*n, -1.0, &gemm_AB[0], OrdinalType(1),
1912  B_dot+i*ldb_dot*n, OrdinalType(1));
1913  else
1914  for (OrdinalType j=0; j<n; j++)
1915  blas.AXPY(m, -1.0, &gemm_AB[j*m], OrdinalType(1),
1916  B_dot+i*ldb_dot*n+j*ldb_dot, OrdinalType(1));
1917  }
1918  }
1919 
1920  // Solve A*[Xd_1 .. Xd_n] = [Bd_1 .. Bd_n]
1921  if (side == Teuchos::LEFT_SIDE)
1922  blas.TRSM(side, uplo, transa, diag, m, n*n_dot, 1.0, A, lda, B_dot,
1923  ldb_dot);
1924  else
1925  for (OrdinalType i=0; i<n_dot; i++)
1926  blas.TRSM(side, uplo, transa, diag, m, n, 1.0, A, lda, B_dot+i*ldb_dot*n,
1927  ldb_dot);
1928 }
void TRSM(Teuchos::ESide side, Teuchos::EUplo uplo, Teuchos::ETransp transa, Teuchos::EDiag diag, const OrdinalType m, const OrdinalType n, const alpha_type &alpha, const A_type *A, const OrdinalType lda, FadType *B, const OrdinalType ldb) const
Solves the matrix equations: op(A)*X=alpha*B or X*op(A)=alpha*B where X and B are m by n matrices...
void TRMV(Teuchos::EUplo uplo, Teuchos::ETransp trans, Teuchos::EDiag diag, const OrdinalType n, const A_type *A, const OrdinalType lda, FadType *x, const OrdinalType incx) const
Performs the matrix-std::vector operation: x &lt;- A*x or x &lt;- A&#39;*x where A is a unit/non-unit n by n uppe...
void Fad_GEMM(Teuchos::ETransp transa, Teuchos::ETransp transb, const OrdinalType m, const OrdinalType n, const OrdinalType k, const alpha_type &alpha, const OrdinalType n_alpha_dot, const alpha_type *alpha_dot, const A_type *A, const OrdinalType lda, const OrdinalType n_A_dot, const A_type *A_dot, const OrdinalType lda_dot, const B_type *B, const OrdinalType ldb, const OrdinalType n_B_dot, const B_type *B_dot, const OrdinalType ldb_dot, const beta_type &beta, const OrdinalType n_beta_dot, const beta_type *beta_dot, ValueType *C, const OrdinalType ldc, const OrdinalType n_C_dot, ValueType *C_dot, const OrdinalType ldc_dot, const OrdinalType n_dot) const
Implementation of GEMM.
void GEMV(Teuchos::ETransp trans, const OrdinalType m, const OrdinalType n, const alpha_type &alpha, const A_type *A, const OrdinalType lda, const x_type *x, const OrdinalType incx, const beta_type &beta, FadType *y, const OrdinalType incy) const
Performs the matrix-std::vector operation: y &lt;- alpha*A*x+beta*y or y &lt;- alpha*A&#39;*x+beta*y where A is a...
void GER(const OrdinalType m, const OrdinalType n, const alpha_type &alpha, const x_type *x, const OrdinalType incx, const y_type *y, const OrdinalType incy, FadType *A, const OrdinalType lda) const
Performs the rank 1 operation: A &lt;- alpha*x*y&#39;+A.
expr expr dx(i)
void Fad_TRSM(Teuchos::ESide side, Teuchos::EUplo uplo, Teuchos::ETransp transa, Teuchos::EDiag diag, const OrdinalType m, const OrdinalType n, const alpha_type &alpha, const OrdinalType n_alpha_dot, const alpha_type *alpha_dot, const A_type *A, const OrdinalType lda, const OrdinalType n_A_dot, const A_type *A_dot, const OrdinalType lda_dot, ValueType *B, const OrdinalType ldb, const OrdinalType n_B_dot, ValueType *B_dot, const OrdinalType ldb_dot, const OrdinalType n_dot) const
Implementation of TRMM.
bool is_array_contiguous(const FadType *a, OrdinalType n, OrdinalType n_dot) const
FadType DOT(const OrdinalType n, const x_type *x, const OrdinalType incx, const y_type *y, const OrdinalType incy) const
Form the dot product of the vectors x and y.
virtual ~BLAS()
Destructor.
Fad specializations for Teuchos::BLAS wrappers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
void GEMM(Teuchos::ETransp transa, Teuchos::ETransp transb, const OrdinalType m, const OrdinalType n, const OrdinalType k, const alpha_type &alpha, const A_type *A, const OrdinalType lda, const B_type *B, const OrdinalType ldb, const beta_type &beta, FadType *C, const OrdinalType ldc) const
Performs the matrix-matrix operation: C &lt;- alpha*op(A)*op(B)+beta*C where op(A) is either A or A&#39;...
ValueType * allocate_array(OrdinalType size) const
void TRMM(Teuchos::ESide side, Teuchos::EUplo uplo, Teuchos::ETransp transa, Teuchos::EDiag diag, const OrdinalType m, const OrdinalType n, const alpha_type &alpha, const A_type *A, const OrdinalType lda, FadType *B, const OrdinalType ldb) const
Performs the matrix-matrix operation: C &lt;- alpha*op(A)*B+beta*C or C &lt;- alpha*B*op(A)+beta*C where op...
ValueType * workspace_pointer
Pointer to current free entry in workspace.
expr val()
OrdinalType workspace_size
Size of static workspace.
#define A
Definition: Sacado_rad.hpp:572
void Fad_SYMM(Teuchos::ESide side, Teuchos::EUplo uplo, const OrdinalType m, const OrdinalType n, const alpha_type &alpha, const OrdinalType n_alpha_dot, const alpha_type *alpha_dot, const A_type *A, const OrdinalType lda, const OrdinalType n_A_dot, const A_type *A_dot, const OrdinalType lda_dot, const B_type *B, const OrdinalType ldb, const OrdinalType n_B_dot, const B_type *B_dot, const OrdinalType ldb_dot, const beta_type &beta, const OrdinalType n_beta_dot, const beta_type *beta_dot, ValueType *C, const OrdinalType ldc, const OrdinalType n_C_dot, ValueType *C_dot, const OrdinalType ldc_dot, const OrdinalType n_dot) const
Implementation of SYMM.
Sacado::dummy< ValueType, scalar_type >::type ScalarType
ArrayTraits(bool use_dynamic=true, OrdinalType workspace_size=0)
void AXPY(const OrdinalType n, const alpha_type &alpha, const x_type *x, const OrdinalType incx, FadType *y, const OrdinalType incy) const
Perform the operation: y &lt;- y+alpha*x.
void COPY(const OrdinalType n, const FadType *x, const OrdinalType incx, FadType *y, const OrdinalType incy) const
Copy the std::vector x to the std::vector y.
void SYMM(Teuchos::ESide side, Teuchos::EUplo uplo, const OrdinalType m, const OrdinalType n, const alpha_type &alpha, const A_type *A, const OrdinalType lda, const B_type *B, const OrdinalType ldb, const beta_type &beta, FadType *C, const OrdinalType ldc) const
Performs the matrix-matrix operation: C &lt;- alpha*A*B+beta*C or C &lt;- alpha*B*A+beta*C where A is an m ...
void Fad_TRMM(Teuchos::ESide side, Teuchos::EUplo uplo, Teuchos::ETransp transa, Teuchos::EDiag diag, const OrdinalType m, const OrdinalType n, const alpha_type &alpha, const OrdinalType n_alpha_dot, const alpha_type *alpha_dot, const A_type *A, const OrdinalType lda, const OrdinalType n_A_dot, const A_type *A_dot, const OrdinalType lda_dot, ValueType *B, const OrdinalType ldb, const OrdinalType n_B_dot, ValueType *B_dot, const OrdinalType ldb_dot, const OrdinalType n_dot) const
Implementation of TRMM.
static magnitudeType magnitude(T a)
void free_array(const ValueType *ptr, OrdinalType size) const
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
MagnitudeType NRM2(const OrdinalType n, const FadType *x, const OrdinalType incx) const
Compute the 2-norm of the std::vector x.
ValueType * workspace
Workspace for holding contiguous values/derivatives.
void Fad_GEMV(Teuchos::ETransp trans, const OrdinalType m, const OrdinalType n, const alpha_type &alpha, const OrdinalType n_alpha_dot, const alpha_type *alpha_dot, const A_type *A, const OrdinalType lda, const OrdinalType n_A_dot, const A_type *A_dot, const OrdinalType lda_dot, const x_type *x, const OrdinalType incx, const OrdinalType n_x_dot, const x_type *x_dot, const OrdinalType incx_dot, const beta_type &beta, const OrdinalType n_beta_dot, const beta_type *beta_dot, ValueType *y, const OrdinalType incy, const OrdinalType n_y_dot, ValueType *y_dot, const OrdinalType incy_dot, const OrdinalType n_dot) const
Implementation of GEMV.
void SCAL(const OrdinalType n, const FadType &alpha, FadType *x, const OrdinalType incx) const
Scale the std::vector x by the constant alpha.
void Fad_DOT(const OrdinalType n, const x_type *x, const OrdinalType incx, const OrdinalType n_x_dot, const x_type *x_dot, const OrdinalType incx_dot, const y_type *y, const OrdinalType incy, const OrdinalType n_y_dot, const y_type *y_dot, const OrdinalType incy_dot, ValueType &z, const OrdinalType n_z_dot, ValueType *zdot) const
Implementation of DOT.
Base template specification for ValueType.
void Fad_GER(const OrdinalType m, const OrdinalType n, const alpha_type &alpha, const OrdinalType n_alpha_dot, const alpha_type *alpha_dot, const x_type *x, const OrdinalType incx, const OrdinalType n_x_dot, const x_type *x_dot, const OrdinalType incx_dot, const y_type *y, const OrdinalType incy, const OrdinalType n_y_dot, const y_type *y_dot, const OrdinalType incy_dot, ValueType *A, const OrdinalType lda, const OrdinalType n_A_dot, ValueType *A_dot, const OrdinalType lda_dot, const OrdinalType n_dot) const
Implementation of GER.
BLAS(bool use_default_impl=true, bool use_dynamic=true, OrdinalType static_workspace_size=0)
Default constructor.