Intrepid2_IntegrationToolsDef.hpp
// @HEADER
// *****************************************************************************
// Intrepid2 Package
//
// Copyright 2007 NTESS and the Intrepid2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

/** \file   Intrepid2_IntegrationToolsDef.hpp
    \brief  Definition file for the Intrepid2::IntegrationTools class.
*/

#ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
#define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__

#include "Intrepid2_DataTools.hpp"

#include "Teuchos_TimeMonitor.hpp"

namespace Intrepid2 {

  namespace Impl
  {
    template<class Scalar, class DeviceType, int integralViewRank>
    class F_Integrate
    {
      using ExecutionSpace = typename DeviceType::execution_space;
      using TeamPolicy     = Kokkos::TeamPolicy<ExecutionSpace>;
      using TeamMember     = typename TeamPolicy::member_type;

      using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
      IntegralViewType integralView_;
      TensorData<Scalar,DeviceType> leftComponent_;
      Data<Scalar,DeviceType> composedTransform_;
      TensorData<Scalar,DeviceType> rightComponent_;
      TensorData<Scalar,DeviceType> cellMeasures_;
      int a_offset_;
      int b_offset_;
      int leftComponentSpan_;  // leftComponentSpan tracks the dimensions spanned by the left component
      int rightComponentSpan_; // rightComponentSpan tracks the dimensions spanned by the right component
      int numTensorComponents_;
      int leftFieldOrdinalOffset_;
      int rightFieldOrdinalOffset_;
      bool forceNonSpecialized_; // if true, don't use the specialized (more manual) implementation(s) for particular component counts. Primary use case is for testing.

      size_t fad_size_output_ = 0; // 0 if not a Fad type

      Kokkos::Array<int, 7> offsetsForComponentOrdinal_;

      // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
      // (this also makes it easier to reorder loops, etc., for further optimizations)
      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;

      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_; // total number of enumeration indices for components startingComponent..R-1 (arguments prior to startingComponent held fixed)
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;

      int maxFieldsLeft_;
      int maxFieldsRight_;
      int maxPointCount_;
    public:
      F_Integrate(Data<Scalar,DeviceType> integralData,
                  TensorData<Scalar,DeviceType> leftComponent,
                  Data<Scalar,DeviceType> composedTransform,
                  TensorData<Scalar,DeviceType> rightComponent,
                  TensorData<Scalar,DeviceType> cellMeasures,
                  int a_offset,
                  int b_offset,
                  int leftFieldOrdinalOffset,
                  int rightFieldOrdinalOffset,
                  bool forceNonSpecialized)
      :
      integralView_(integralData.template getUnderlyingView<integralViewRank>()),
      leftComponent_(leftComponent),
      composedTransform_(composedTransform),
      rightComponent_(rightComponent),
      cellMeasures_(cellMeasures),
      a_offset_(a_offset),
      b_offset_(b_offset),
      leftComponentSpan_(leftComponent.extent_int(2)),
      rightComponentSpan_(rightComponent.extent_int(2)),
      numTensorComponents_(leftComponent.numTensorComponents()),
      leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
      rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
      forceNonSpecialized_(forceNonSpecialized)
      {
        INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");

        // set up bounds containers
        const int FIELD_DIM = 0;
        const int POINT_DIM = 1;
        maxFieldsLeft_  = 0;
        maxFieldsRight_ = 0;
        maxPointCount_  = 0;
        for (int r=0; r<numTensorComponents_; r++)
        {
          leftFieldBounds_[r]  = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsLeft_       = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
          rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsRight_      = std::max(maxFieldsRight_, rightFieldBounds_[r]);
          pointBounds_[r]      = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
          maxPointCount_       = std::max(maxPointCount_, pointBounds_[r]);
        }

        // set up the relative enumeration spans: the total number of enumeration indices spanned by components startingComponent..R-1.
        // These are for *truncated* iterators (they never vary the final component R); hence the -2 rather than -1 for the first startingComponent value.
        int leftRelativeEnumerationSpan  = 1;
        int rightRelativeEnumerationSpan = 1;
        for (int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
        {
          leftRelativeEnumerationSpan  *= leftFieldBounds_[startingComponent];
          rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
          leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
          rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
        }
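        // Illustrative example (comment only): with numTensorComponents_ = 3 and
        // leftFieldBounds_ = {2, 3, 4}, the loop above yields
        //   leftFieldRelativeEnumerationSpans_[1] = 3       (component 1 varies; component 2 is the fixed final component)
        //   leftFieldRelativeEnumerationSpans_[0] = 2*3 = 6
        // i.e., spans_[s] counts the argument combinations for components s..R-1, as enumerated
        // by the truncated iterators that never vary the final component R.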

        // prepare for allocation of temporary storage
        // note: tempStorage goes "backward", starting from the final component, which needs just one entry

        const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
        if (allocateFadStorage)
        {
          fad_size_output_ = dimension_scalar(integralView_);
        }

        const int R = numTensorComponents_ - 1;

        int num_ij = 1; // this counts how many entries there are corresponding to components from r to R-1.
        int allocationSoFar = 0;
        offsetsForComponentOrdinal_[R] = allocationSoFar;
        allocationSoFar++; // we store one entry corresponding to R, the final component

        for (int r=R-1; r>0; r--) // last component is innermost in the for loops (requires least storage)
        {
          const int leftFields  = leftComponent.getTensorComponent(r).extent_int(0);
          const int rightFields = rightComponent.getTensorComponent(r).extent_int(0);

          num_ij *= leftFields * rightFields;

          offsetsForComponentOrdinal_[r] = allocationSoFar;
          allocationSoFar += num_ij;
        }
        offsetsForComponentOrdinal_[0] = allocationSoFar; // first component stores directly to final integralView.
      }
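      // Illustrative example (comment only): for numTensorComponents_ = 3 with 3 left fields and
      // 3 right fields in component 1, the loop above produces
      //   offsetsForComponentOrdinal_[2] = 0    (one G_z accumulator for the final component)
      //   offsetsForComponentOrdinal_[1] = 1    (3*3 = 9 G_y accumulators, one per (i_1,j_1) pair)
      //   offsetsForComponentOrdinal_[0] = 10   (component 0 writes directly to integralView_, so this
      //                                          entry doubles as the per-thread scratch size)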

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,maxComponents> &arguments,
                            const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0)
        {
          return -1;
        }
        else
        {
          int r = static_cast<int>(numComponents - 1);
          while (arguments[r] + 1 >= bounds[r])
          {
            arguments[r] = 0; // reset
            r--;
            if (r < 0) break;
          }
          if (r >= 0) ++arguments[r];
          return r;
        }
      }
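      // Illustrative example (comment only): with bounds = {2,3} and arguments = {0,2}, the
      // odometer-style increment above sets arguments to {1,0} and returns 0 (the lowest
      // component touched); with arguments = {1,2} it resets to {0,0} and returns -1,
      // signaling that the iteration is complete.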

      //! runtime-sized variant of incrementArgument; numComponents is supplied as an argument.
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                            const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                            const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
                              const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0)
        {
          return -1;
        }
        else
        {
          int r = static_cast<int>(numComponents - 1);
          while (arguments[r] + 1 >= bounds[r])
          {
            r--;
            if (r < 0) break;
          }
          return r;
        }
      }
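      // Illustrative note: nextIncrementResult() is the non-mutating peek that corresponds to
      // incrementArgument(): with bounds = {2,3} and arguments = {0,2} it returns 0, the
      // component that an increment would touch, while leaving arguments unchanged.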

      //! runtime-sized variant of nextIncrementResult; numComponents is supplied as an argument.
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                              const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                              const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = numComponents - 1;
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
                                   const Kokkos::Array<int,maxComponents> &bounds,
                                   const int startIndex) const
      {
        // the following mirrors what is done in TensorData
        if (numComponents == 0)
        {
          return 0;
        }
        else
        {
          int enumerationIndex = 0;
          for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
          {
            enumerationIndex += arguments[r];
            enumerationIndex *= bounds[r-1];
          }
          enumerationIndex += arguments[startIndex];
          return enumerationIndex;
        }
      }
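      // Illustrative example (comment only): with bounds = {2,3,4}, arguments = {1,2,3}, and
      // startIndex = 0, the loop above computes
      //   enumerationIndex = 1 + 2 * (2 + 3 * 3) = 23
      // i.e., the first component is the fastest-moving index, matching TensorData's enumeration.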

      // nvcc refuses to compile the below, with the error "explicit specialization is not allowed in the current scope". Clang is OK with it. We just do a non-templated version below.
//      template<size_t numTensorComponents>
//      KOKKOS_INLINE_FUNCTION
//      void runSpecialized( const TeamMember & teamMember ) const;

//      template<>
//      KOKKOS_INLINE_FUNCTION
//      void runSpecialized<3>( const TeamMember & teamMember ) const
      KOKKOS_INLINE_FUNCTION
      void runSpecialized3( const TeamMember & teamMember ) const
      {
        constexpr int numTensorComponents = 3;

        Kokkos::Array<int,numTensorComponents> pointBounds;
        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          pointBounds[r]      = pointBounds_[r];
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const int cellDataOrdinal = teamMember.league_rank();
        const int numThreads      = teamMember.team_size();
        const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread

        Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged> scratchView;  // for caching partial integration values
        Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
        Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z; // cache the field values for faster access
        if (fad_size_output_ > 0) {
          scratchView   = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
          pointWeights  = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0],  pointBounds[0], fad_size_output_);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1],  pointBounds[1], fad_size_output_);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2],  pointBounds[2], fad_size_output_);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
        }
        else {
          scratchView   = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
          pointWeights  = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));
          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0],  pointBounds[0]);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1],  pointBounds[1]);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2],  pointBounds[2]);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

        constexpr int R = numTensorComponents - 1;

        const int composedTransformRank = composedTransform_.rank();

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> &  leftFinalComponent =  leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  =  leftFinalComponent.extent_int(0); // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            const Data<Scalar,DeviceType> &  leftTensorComponent_x =  leftComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> &  leftTensorComponent_y =  leftComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> &  leftTensorComponent_z =  leftComponent_.getTensorComponent(2);
            const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);

            const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (const int& fieldOrdinalPointOrdinal) {
              const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
              const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
              if ((fieldOrdinal < leftTensorComponent_x.extent_int(0)) && (pointOrdinal < leftTensorComponent_x.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_x.rank();
                leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < leftTensorComponent_y.extent_int(0)) && (pointOrdinal < leftTensorComponent_y.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_y.rank();
                leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < leftTensorComponent_z.extent_int(0)) && (pointOrdinal < leftTensorComponent_z.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_z.rank();
                leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < rightTensorComponent_x.extent_int(0)) && (pointOrdinal < rightTensorComponent_x.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_x.rank();
                rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
              }
              if ((fieldOrdinal < rightTensorComponent_y.extent_int(0)) && (pointOrdinal < rightTensorComponent_y.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_y.rank();
                rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
              }
              if ((fieldOrdinal < rightTensorComponent_z.extent_int(0)) && (pointOrdinal < rightTensorComponent_z.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_z.rank();
                rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
              }
            });

            if (composedTransform_.underlyingMatchesLogical())
            {
              if (composedTransformRank == 4) // (C,P,D,D)
              {
                const auto & composedTransformView = composedTransform_.getUnderlyingView4();
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
              else // rank 2, then: (C,P)
              {
                const auto & composedTransformView = composedTransform_.getUnderlyingView2();
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
            }
            else
            {
              if (composedTransformRank == 4) // (C,P,D,D)
              {
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
              else // rank 2, then: (C,P)
              {
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
            }

//            approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

            // synchronize threads
            teamMember.team_barrier();

            // Setting scratchView to 0 here is not necessary from an algorithmic point of view, but *might* help with performance (due to a first-touch policy)
            const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
            for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
            {
              scratchView(i) = 0.0;
            }

            // TODO: consider adding an innerLoopRange that is sized to be the maximum of the size of the inner loops we'd like to parallelize over. (note that this means we do the work in the outer loop redundantly that many times…)
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
              const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
              const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;

              // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
              // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
              Kokkos::Array<int,numTensorComponents> leftFieldArguments;
              Kokkos::Array<int,numTensorComponents> rightFieldArguments;
              rightFieldArguments[R] = jz;
              leftFieldArguments[R]  = iz;

              Kokkos::Array<int,numTensorComponents> pointArguments;
              for (int i=0; i<numTensorComponents; i++)
              {
                pointArguments[i] = 0;
              }

              for (int lx=0; lx<pointBounds[0]; lx++)
              {
                pointArguments[0] = lx;

                // clear Gy scratch:
                // in scratch, Gz (1 entry) comes first, then Gy entries.
                const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
                const int Gy_end_index   = scratchOffsetForThread + offsetsForComponentOrdinal_[0];

                for (int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
                {
                  scratchView(Gy_index) = 0;
                }

                for (int ly=0; ly<pointBounds[1]; ly++)
                {
                  pointArguments[1] = ly;

                  Scalar * Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
                  *Gz = 0;

                  for (int lz=0; lz < pointBounds[2]; lz++)
                  {
                    const Scalar &  leftValue =  leftFields_z(iz,lz);
                    const Scalar & rightValue = rightFields_z(jz,lz);

                    pointArguments[2] = lz;
                    int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);

                    *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;

//                    approximateFlopCount += 3; // 2 multiplies, 1 sum
                  } // lz

                  for (int iy=0; iy<leftFieldBounds_[1]; iy++)
                  {
                    leftFieldArguments[1] = iy;
                    const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);

                    const Scalar & leftValue = leftFields_y(iy,ly);

                    for (int jy=0; jy<rightFieldBounds_[1]; jy++)
                    {
                      rightFieldArguments[1] = jy;

                      const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
                      const Scalar & rightValue = rightFields_y(jy,ly);

                      const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
                      const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;

                      const int Gz_index = 0;
                      const Scalar & aGz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);

                      Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);

                      Gy += leftValue * aGz * rightValue;
//                      approximateFlopCount += 3; // 2 multiplies, 1 sum
                    }
                  }
                } // ly
                for (int ix=0; ix<leftFieldBounds_[0]; ix++)
                {
                  leftFieldArguments[0] = ix;
                  const Scalar & leftValue = leftFields_x(ix,lx);

                  for (int iy=0; iy<leftFieldBounds_[1]; iy++)
                  {
                    leftFieldArguments[1] = iy;

                    const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);

                    for (int jx=0; jx<rightFieldBounds_[0]; jx++)
                    {
                      rightFieldArguments[0] = jx;
                      const Scalar & rightValue = rightFields_x(jx,lx);

                      for (int jy=0; jy<rightFieldBounds_[1]; jy++)
                      {
                        rightFieldArguments[1] = jy;
                        const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);

                        const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];

                        const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
                        Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);

                        // compute enumeration indices to get field indices into output view
                        int leftEnumerationIndex  = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments,  leftFieldBounds, 0);
                        int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
                        const int leftFieldIndex  = leftEnumerationIndex  + leftFieldOrdinalOffset_;
                        const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;

                        if (integralViewRank == 3)
                        {
//                          if ((leftFieldIndex==0) && (rightFieldIndex==2))
//                          {
//                            using std::cout;
//                            using std::endl;
//                            cout << "***** Contribution to (0,0,2) *****\n";
//                            cout << "lx = " << lx << endl;
//                            cout << "ix = " << ix << endl;
//                            cout << "iy = " << iy << endl;
//                            cout << "jx = " << jx << endl;
//                            cout << "jy = " << jy << endl;
//                            cout << "iz = " << iz << endl;
//                            cout << "jz = " << jz << endl;
//                            cout << " leftValue = " <<  leftValue << endl;
//                            cout << "rightValue = " << rightValue << endl;
//                            cout << "Gy = " << Gy << endl;
//
//                            cout << endl;
//                          }

                          // shape (C,F1,F2)
                          integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
                        }
                        else
                        {
                          // shape (F1,F2)
                          integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
                        }
//                        approximateFlopCount += 3; // 2 multiplies, 1 sum
                      } // jy
                    } // jx
                  } // iy
                } // ix
              } // lx
            }); // TeamThreadRange parallel_for - (iz,jz) loop
          }
        }
//        std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
      }
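      // The loop nest above implements a sum-factored contraction; schematically, for the
      // scalar-valued three-component case (notation illustrative, not identifiers from this file):
      //
      //   integral(i,j) = sum_{lx,ly,lz} Lx(ix,lx) Ly(iy,ly) Lz(iz,lz) * w(lx,ly,lz)
      //                                * Rx(jx,lx) Ry(jy,ly) Rz(jz,lz)
      //                 = sum_{lx} Lx Rx ( sum_{ly} Ly Ry ( sum_{lz} Lz w Rz ) )
      //                                     Gy accumulator   Gz accumulator
      //
      // Each (iz,jz) pair computed by a thread reuses its Gz accumulator across all (iy,jy),
      // and its Gy accumulators across all (ix,jx); this reuse is what reduces the operation
      // count relative to the naive nested sum.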

      template<size_t numTensorComponents>
      KOKKOS_INLINE_FUNCTION
      void run( const TeamMember & teamMember ) const
      {
        Kokkos::Array<int,numTensorComponents> pointBounds;
        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          pointBounds[r]      = pointBounds_[r];
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const int cellDataOrdinal = teamMember.league_rank();
        const int numThreads      = teamMember.team_size();
        const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread
        Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged> scratchView;  // for caching partial integration values
        Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
        Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values for faster access
        if (fad_size_output_ > 0) {
          scratchView  = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
          pointWeights = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
          leftFields   = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_,  maxPointCount_, fad_size_output_);
          rightFields  = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
        }
        else {
          scratchView  = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
          pointWeights = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));
          leftFields   = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_,  maxPointCount_);
          rightFields  = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

        constexpr int R = numTensorComponents - 1;

        const int composedTransformRank = composedTransform_.rank();

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> &  leftFinalComponent =  leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  =  leftFinalComponent.extent_int(0); // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (const int& r) {
              const Data<Scalar,DeviceType> &  leftTensorComponent =  leftComponent_.getTensorComponent(r);
              const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.getTensorComponent(r);
              const int leftFieldCount  = leftTensorComponent.extent_int(0);
              const int pointCount      = leftTensorComponent.extent_int(1);
              const int leftRank        = leftTensorComponent.rank();
              const int rightFieldCount = rightTensorComponent.extent_int(0);
              const int rightRank       = rightTensorComponent.rank();
              for (int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
              {
                // slightly weird logic here in effort to avoid branch divergence
                const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
                for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
                {
                  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
                  leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
                }
              }
              for (int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
              {
                // slightly weird logic here in effort to avoid branch divergence
                const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
                for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
                {
                  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
                  rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
                }
              }
            });

            if (composedTransformRank == 4)
            {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
              });
            }
            else
            {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
              });
            }
//            approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

            // synchronize threads
            teamMember.team_barrier();

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
              const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
              const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
              const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;

              // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
              // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
              Kokkos::Array<int,numTensorComponents> leftFieldArguments;
              Kokkos::Array<int,numTensorComponents> rightFieldArguments;
              rightFieldArguments[R] = j_R;
              leftFieldArguments[R]  = i_R;

              // TODO: I believe that this can be moved outside the thread parallel_for
              for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
              {
                scratchView(i) = 0.0;
              }
              Kokkos::Array<int,numTensorComponents> pointArguments;
              for (unsigned i=0; i<numTensorComponents; i++)
              {
                pointArguments[i] = 0;
              }

              int r = R;
              while (r >= 0)
              {
                // integrate in final component dimension; this is where we need the M weight, as well as the weighted measure
                const int pointBounds_R = pointBounds[R];
                int & pointArgument_R = pointArguments[R];
                for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
                {
                  Scalar * G_R;
                  if (R != 0)
                  {
                    G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
                  }
                  else
                  {
                    const int leftFieldIndex  = i_R + leftFieldOrdinalOffset_;
                    const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;

                    if (integralViewRank == 3)
                    {
                      // shape (C,F1,F2)
                      G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
                    }
                    else
                    {
                      // shape (F1,F2)
                      G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
                    }
                  }

                  const int & pointIndex_R = pointArguments[R];

                  const Scalar &  leftValue =  leftFields(R,i_R,pointIndex_R);
                  const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);

                  int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);

                  *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;

//                  approximateFlopCount += 3; // 2 multiplies, 1 sum
                } // pointArgument_R

                const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents-1 means that we won't touch the [R] argument, which is treated in the for loop above
                const int r_min  = (r_next >= 0) ? r_next : 0;

                for (int s=R-1; s>=r_min; s--)
                {
                  const int & pointIndex_s = pointArguments[s];

                  // want to cover all the multi-indices from s to R-1
                  for (int s1=s; s1<R; s1++)
                  {
                    leftFieldArguments[s1] = 0;
                  }

                  // i_s, j_s are the indices into the "current" tensor component; these are references, so they actually vary as the arguments are incremented.
                  const int & i_s = leftFieldArguments[s];
                  const int & j_s = rightFieldArguments[s];

                  int sLeft = s; // hereafter, sLeft is the return from the left field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indexes from rank s through R-1 (inclusive).
                  const int & rightEnumerationSpan_s  = rightFieldRelativeEnumerationSpans_[s];
                  const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];

                  while (sLeft >= s)
                  {
                    const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);

                    // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
                    const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);

                    const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);

                    for (int s1=s; s1<R; s1++)
                    {
                      rightFieldArguments[s1] = 0;
                    }
                    int sRight = s; // hereafter, sRight is the return from the right field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indexes from rank s through R-1 (inclusive).
                    while (sRight >= s)
                    {
                      const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);

                      // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
                      const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);

                      const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);

                      const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;

                      Scalar* G_s;

                      // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage
                      // (above, we have set the leftEnumerationIndex_sp, rightEnumerationIndex_sp to be 0 in this case, so G_sp_index will then also be 0)
                      const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;

                      const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);

                      if (s != 0)
                      {
                        G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
                      }
                      else
                      {
                        // compute enumeration indices
                        int leftEnumerationIndex  = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments,  leftFieldBounds, 0);
                        int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
                        const int leftFieldIndex  = leftEnumerationIndex  + leftFieldOrdinalOffset_;
                        const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;

                        if (integralViewRank == 3)
                        {
                          // shape (C,F1,F2)
                          G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
                        }
                        else
                        {
                          // shape (F1,F2)
                          G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
                        }
                      }

                      *G_s += leftValue * G_sp * rightValue;

//                      approximateFlopCount += 3; // 2 multiplies, 1 sum

                      // increment rightField
                      sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
                    }

                    // increment leftField
                    sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
                  }
                }

                // clear tempStorage for r_next+1 to R
                if (r_min+1 <= R)
                {
                  const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
                  for (int i=scratchOffsetForThread; i<endIndex; i++)
                  {
                    scratchView(i) = 0.0;
                  }
//                  auto tempStorageSubview = Kokkos::subview(scratchView, Kokkos::pair<int,int>{0,offsetsForComponentOrdinal_[r_min]});
//                  Kokkos::deep_copy(tempStorageSubview, 0.0);
                }

                // proceed to next point
                r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents-1 means that we won't touch the [R] argument, which is treated in the G_R integration for loop above
              }
            }); // TeamThreadRange parallel_for
          }
        }
//        std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
      }

      KOKKOS_INLINE_FUNCTION
      void operator()( const TeamMember & teamMember ) const
      {
        switch (numTensorComponents_)
        {
          case 1: run<1>(teamMember); break;
          case 2: run<2>(teamMember); break;
          case 3:
            if (forceNonSpecialized_)
              run<3>(teamMember);
            else
              runSpecialized3(teamMember);
            break;
          case 4: run<4>(teamMember); break;
          case 5: run<5>(teamMember); break;
          case 6: run<6>(teamMember); break;
          case 7: run<7>(teamMember); break;
#ifdef INTREPID2_HAVE_DEBUG
          default:
            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
#endif
        }
      }

      //! returns the approximate number of floating point operations performed per cell (counting each sum and multiply as one flop).
      long approximateFlopCountPerCell() const
      {
        // compute flop count on a single cell, then multiply by the number of cells
        int flopCount = 0;

        const int R = numTensorComponents_ - 1;

        // we cache the value of M_ab * cellMeasure at each point.
        // access to cellMeasures involves cellMeasures.numTensorComponents() - 1 multiplies, so total is the below:
        const int flopsPerPoint_ab = cellMeasures_.numTensorComponents(); // the access involves multiplying all the components together

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const Data<Scalar,DeviceType> &  leftFinalComponent =  leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  =  leftFinalComponent.extent_int(0); // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            flopCount += flopsPerPoint_ab * cellMeasures_.extent_int(1);

            int flopsPer_i_R_j_R = 0;

            // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
            // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
            Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
            leftFieldArguments[R] = 0;

            Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
            for (int i=0; i<numTensorComponents_; i++)
            {
              pointArguments[i] = 0;
            }

            // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
            // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
            Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
            rightFieldArguments[R] = 0;

            int r = R;
            while (r >= 0)
            {
              // integrate in final component dimension
              for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
              {
                flopsPer_i_R_j_R += 4;
              }
              // TODO: figure out why the below is not the same thing as the above -- the below overcounts, somehow
//              if (0 < pointBounds_[R])
//              {
//                flopsPer_i_R_j_R += pointBounds_[R] * 4;
//              }

              const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
              const int r_min  = (r_next >= 0) ? r_next : 0;

              for (int s=R-1; s>=r_min; s--)
              {
                // want to cover all the multi-indices from s to R-1: for each we have 2 multiplies and one add (3 flops)
                int numLeftIterations  = leftFieldRelativeEnumerationSpans_[s];
                int numRightIterations = rightFieldRelativeEnumerationSpans_[s];

                flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
              }

              // proceed to next point
              r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
            }

            flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
          }
        }
//        std::cout << "flop count per cell: " << flopCount << std::endl;
        return flopCount;
      }

      //! returns the team size to use, given the maximum team size that Kokkos allows.
      int teamSize(const int &maxTeamSizeFromKokkos) const
      {
        const int R = numTensorComponents_ - 1;
        const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
        return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
      }

      //! Provide the shared memory capacity (bytes per team); Kokkos queries this when launching the functor.
      size_t team_shmem_size (int team_size) const
      {
        // we use shared memory to create a fast buffer for intermediate values, as well as fast access to the current cell's field values
        size_t shmem_size = 0;

        if (fad_size_output_ > 0)
        {
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1), fad_size_output_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_,  maxPointCount_, fad_size_output_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
        }
        else
        {
          shmem_size += Kokkos::View<Scalar*,   DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1));
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_,  maxPointCount_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
        }

        return shmem_size;
      }
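      // A minimal host-side launch sketch (comment only; "numCells" and the constructor arguments
      // are assumed to be in scope on the host). Kokkos queries team_shmem_size() automatically
      // when the functor defines it:
      //
      //   using FunctorType = F_Integrate<Scalar, DeviceType, 3>;
      //   FunctorType functor(integralData, leftComponent, composedTransform, rightComponent,
      //                       cellMeasures, a_offset, b_offset, 0, 0, false);
      //   Kokkos::TeamPolicy<typename DeviceType::execution_space> policy(numCells, Kokkos::AUTO());
      //   const int maxTeamSize = policy.team_size_max(functor, Kokkos::ParallelForTag());
      //   policy = Kokkos::TeamPolicy<typename DeviceType::execution_space>(numCells, functor.teamSize(maxTeamSize));
      //   Kokkos::parallel_for("sum-factorized integrate", policy, functor);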
    };

    template<class Scalar, class DeviceType, int integralViewRank>
    class F_IntegratePointValueCache
    {
      using ExecutionSpace = typename DeviceType::execution_space;
      using TeamPolicy     = Kokkos::TeamPolicy<DeviceType>;
      using TeamMember     = typename TeamPolicy::member_type;

      using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
      IntegralViewType integralView_;
      TensorData<Scalar,DeviceType> leftComponent_;
      Data<Scalar,DeviceType> composedTransform_;
      TensorData<Scalar,DeviceType> rightComponent_;
      TensorData<Scalar,DeviceType> cellMeasures_;
      int a_offset_;
      int b_offset_;
      int leftComponentSpan_;  // leftComponentSpan tracks the dimensions spanned by the left component
      int rightComponentSpan_; // rightComponentSpan tracks the dimensions spanned by the right component
      int numTensorComponents_;
      int leftFieldOrdinalOffset_;
      int rightFieldOrdinalOffset_;

      size_t fad_size_output_ = 0; // 0 if not a Fad type

      // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
      // (this also makes it easier to reorder loops, etc., for further optimizations)
      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;

      int maxFieldsLeft_;
      int maxFieldsRight_;
      int maxPointCount_;
    public:
      F_IntegratePointValueCache(Data<Scalar,DeviceType> integralData,
                                 TensorData<Scalar,DeviceType> leftComponent,
                                 Data<Scalar,DeviceType> composedTransform,
                                 TensorData<Scalar,DeviceType> rightComponent,
                                 TensorData<Scalar,DeviceType> cellMeasures,
                                 int a_offset,
                                 int b_offset,
                                 int leftFieldOrdinalOffset,
                                 int rightFieldOrdinalOffset)
      :
      integralView_(integralData.template getUnderlyingView<integralViewRank>()),
      leftComponent_(leftComponent),
      composedTransform_(composedTransform),
      rightComponent_(rightComponent),
      cellMeasures_(cellMeasures),
      a_offset_(a_offset),
      b_offset_(b_offset),
      leftComponentSpan_(leftComponent.extent_int(2)),
      rightComponentSpan_(rightComponent.extent_int(2)),
      numTensorComponents_(leftComponent.numTensorComponents()),
      leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
      rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
      {
        INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");

        const int FIELD_DIM = 0;
        const int POINT_DIM = 1;
        maxFieldsLeft_  = 0;
        maxFieldsRight_ = 0;
        maxPointCount_  = 0;
        for (int r=0; r<numTensorComponents_; r++)
        {
          leftFieldBounds_[r]  = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsLeft_       = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
          rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsRight_      = std::max(maxFieldsRight_, rightFieldBounds_[r]);
          pointBounds_[r]      = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
          maxPointCount_       = std::max(maxPointCount_, pointBounds_[r]);
        }

        // prepare for allocation of temporary storage
        // note: tempStorage goes "backward", starting from the final component, which needs just one entry

        const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
        if (allocateFadStorage)
        {
          fad_size_output_ = dimension_scalar(integralView_);
        }
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,maxComponents> &arguments,
                            const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      //! runtime-sized variant of incrementArgument; numComponents is supplied as an argument.
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                            const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                            const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
                              const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      //! runtime-sized variant of nextIncrementResult; numComponents is supplied as an argument.
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                              const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                              const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = numComponents - 1;
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
                                   const Kokkos::Array<int,maxComponents> &bounds,
                                   const int startIndex) const
      {
        // the following mirrors what is done in TensorData
        if (numComponents == 0)
        {
          return 0;
        }
        int enumerationIndex = 0;
        for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
        {
          enumerationIndex += arguments[r];
          enumerationIndex *= bounds[r-1];
        }
        enumerationIndex += arguments[startIndex];
        return enumerationIndex;
      }

      template<int rank>
      KOKKOS_INLINE_FUNCTION
      enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
      integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
      {
        return integralView(cellDataOrdinal,i,j);
      }

      template<int rank>
      KOKKOS_INLINE_FUNCTION
      enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
      integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
      {
        return integralView(i,j);
      }
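      // Illustrative note: the enable_if_t overloads above dispatch on the compile-time view rank,
      // so integralViewEntry<integralViewRank>(integralView, c, i, j) resolves to
      // integralView(c,i,j) for rank-3 (C,F1,F2) storage and to integralView(i,j) for rank-2 (F1,F2).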

      //! runs the specialized three-component implementation.
      KOKKOS_INLINE_FUNCTION
      void runSpecialized3( const TeamMember & teamMember ) const
      {
1181  constexpr int numTensorComponents = 3;
1182 
1183  const int pointBounds_x = pointBounds_[0];
1184  const int pointBounds_y = pointBounds_[1];
1185  const int pointBounds_z = pointBounds_[2];
1186  const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;
1187 
1188  const int leftFieldBounds_x = leftFieldBounds_[0];
1189  const int rightFieldBounds_x = rightFieldBounds_[0];
1190  const int leftFieldBounds_y = leftFieldBounds_[1];
1191  const int rightFieldBounds_y = rightFieldBounds_[1];
1192  const int leftFieldBounds_z = leftFieldBounds_[2];
1193  const int rightFieldBounds_z = rightFieldBounds_[2];
1194 
1195  Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1196  Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1197  for (unsigned r=0; r<numTensorComponents; r++)
1198  {
1199  leftFieldBounds[r] = leftFieldBounds_[r];
1200  rightFieldBounds[r] = rightFieldBounds_[r];
1201  }
1202 
1203  const auto integralView = integralView_;
1204  const auto leftFieldOrdinalOffset = leftFieldOrdinalOffset_;
1205  const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;
1206 
1207  const int cellDataOrdinal = teamMember.league_rank();
1208  const int threadNumber = teamMember.team_rank();
1209 
1210  const int numThreads = teamMember.team_size(); // num threads
1211  const int GyEntryCount = pointBounds_z; // for each thread: store one Gy value per z coordinate
1212  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals; // for caching Gx values: we integrate out the first component dimension for each coordinate in the remaining dimensios
1213  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals; // for caching Gy values (each thread gets a stack, of the same height as tensorComponents - 1)
1214  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1215 
1216  Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
1217  Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
1218  Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
1219  if (fad_size_output_ > 0) {
1220  GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1221  GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
1222  pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1223 
1224  leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x, fad_size_output_);
1225  rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
1226  leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y, fad_size_output_);
1227  rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
1228  leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z, fad_size_output_);
1229  rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
1230  }
1231  else {
1232  GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1233  GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
1234  pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1235 
1236  leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x);
1237  rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
1238  leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y);
1239  rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
1240  leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z);
1241  rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
1242  }
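 // A sketch of the sum factorization implemented below (illustrative only): for tensor-product
 // fields L_i = Lx_{i0} * Ly_{i1} * Lz_{i2} and R_j = Rx_{j0} * Ry_{j1} * Rz_{j2}, the weighted sum
 // over quadrature points factors as
 //
 //   sum_{lz} Lz_{i2}(lz) Rz_{j2}(lz) * [ sum_{ly} Ly_{i1}(ly) Ry_{j1}(ly) * [ sum_{lx} Lx_{i0}(lx) Rx_{j0}(lx) * w(lx,ly,lz) ] ]
 //
 // where w is the pointWeights entry (composed transform times cell measure). The innermost bracket
 // is cached in GxIntegrals (one entry per (ly,lz)); the middle bracket is cached in GyIntegrals
 // (one entry per lz, per thread), so each partial sum is reused across many field pairs.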
1243 
1244 // int approximateFlopCount = 0;
1245 // int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;
1246 
1247  // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1248 
1249  const int composedTransformRank = composedTransform_.rank();
1250 
1251  // synchronize threads
1252  teamMember.team_barrier();
1253 
1254  for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1255  {
1256  const int a = a_offset_ + a_component;
1257  for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1258  {
1259  const int b = b_offset_ + b_component;
1260 
1261  const Data<Scalar,DeviceType> & leftTensorComponent_x = leftComponent_.getTensorComponent(0);
1262  const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
1263  const Data<Scalar,DeviceType> & leftTensorComponent_y = leftComponent_.getTensorComponent(1);
1264  const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
1265  const Data<Scalar,DeviceType> & leftTensorComponent_z = leftComponent_.getTensorComponent(2);
1266  const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);
1267 
1268  const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
1269  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (const int& fieldOrdinal) {
1270  if (fieldOrdinal < leftTensorComponent_x.extent_int(0))
1271  {
1272  const int pointCount = leftTensorComponent_x.extent_int(1);
1273  const int leftRank = leftTensorComponent_x.rank();
1274  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1275  {
1276  leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1277  }
1278  }
1279  if (fieldOrdinal < leftTensorComponent_y.extent_int(0))
1280  {
1281  const int pointCount = leftTensorComponent_y.extent_int(1);
1282  const int leftRank = leftTensorComponent_y.rank();
1283  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1284  {
1285  leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1286  }
1287  }
1288  if (fieldOrdinal < leftTensorComponent_z.extent_int(0))
1289  {
1290  const int pointCount = leftTensorComponent_z.extent_int(1);
1291  const int leftRank = leftTensorComponent_z.rank();
1292  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1293  {
1294  leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1295  }
1296  }
1297  if (fieldOrdinal < rightTensorComponent_x.extent_int(0))
1298  {
1299  const int pointCount = rightTensorComponent_x.extent_int(1);
1300  const int rightRank = rightTensorComponent_x.rank();
1301  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1302  {
1303  rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
1304  }
1305  }
1306  if (fieldOrdinal < rightTensorComponent_y.extent_int(0))
1307  {
1308  const int pointCount = rightTensorComponent_y.extent_int(1);
1309  const int rightRank = rightTensorComponent_y.rank();
1310  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1311  {
1312  rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
1313  }
1314  }
1315  if (fieldOrdinal < rightTensorComponent_z.extent_int(0))
1316  {
1317  const int pointCount = rightTensorComponent_z.extent_int(1);
1318  const int rightRank = rightTensorComponent_z.rank();
1319  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1320  {
1321  rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
1322  }
1323  }
1324  });
1325 
1326  if (composedTransformRank == 4) // (C,P,D,D)
1327  {
1328  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1329  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1330  });
1331  }
1332  else // (C,P)
1333  {
1334  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1335  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1336  });
1337  }
1338 
1339  // synchronize threads
1340  teamMember.team_barrier();
1341 
1342  for (int i0=0; i0<leftFieldBounds_x; i0++)
1343  {
1344  for (int j0=0; j0<rightFieldBounds_x; j0++)
1345  {
1346  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1347  // first component is fastest-moving; we can get a tensorPointEnumerationOffset just by multiplying by the pointBounds in x
1348  const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions; // compute offset for pointWeights container, for which x is the fastest-moving
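 // e.g., with pointBounds (px,py,pz), the expanded point (lx,ly,lz) has enumeration index
 // lx + px * (ly + py * lz); fixing (ly,lz) therefore gives the offset px * pointEnumerationIndexLaterDimensions.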
1349 
1350  Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1351 
1352  Gx = 0;
1353  if (fad_size_output_ == 0)
1354  {
1355  // not a Fad type; we're allowed to use a vector range
1356  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1357  {
1358  integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1359  }, Gx);
1360  }
1361  else
1362  {
1363  for (int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1364  {
1365  Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1366  }
1367  }
1368  });
1369 
1370  // synchronize threads
1371  teamMember.team_barrier();
1372 
1373  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (const int& i1j1) {
1374  const int i1 = i1j1 % leftFieldBounds_y;
1375  const int j1 = i1j1 / leftFieldBounds_y;
1376 
1377  int Gy_index_offset = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1378 
1379  for (int lz=0; lz<pointBounds_z; lz++)
1380  {
1381  int pointEnumerationIndex = lz * pointBounds_y;
1382  if (fad_size_output_ == 0)
1383  {
1384  Scalar Gy_local = 0;
1385 
1386  // not a Fad type; we're allowed to use a vector range
1387  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_y), [&] (const int &ly, Scalar &integralThusFar)
1388  {
1389  const Scalar & leftValue = leftFields_y(i1,ly);
1390  const Scalar & rightValue = rightFields_y(j1,ly);
1391 
1392  integralThusFar += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
1393  }, Gy_local);
1394 
1395  GyIntegrals(Gy_index_offset + lz) = Gy_local;
1396  }
1397  else
1398  {
1399  Scalar & Gy = GyIntegrals(Gy_index_offset + lz); Gy = 0; // reset: GyIntegrals is reused across field-pair iterations, and shared memory is uninitialized on first use
1400  for (int ly=0; ly<pointBounds_y; ly++)
1401  {
1402  const Scalar & leftValue = leftFields_y(i1,ly);
1403  const Scalar & rightValue = rightFields_y(j1,ly);
1404 
1405  Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
1406  }
1407  }
1408  }
1409 
1410  for (int i2=0; i2<leftFieldBounds_z; i2++)
1411  {
1412  for (int j2=0; j2<rightFieldBounds_z; j2++)
1413  {
1414  Scalar Gz = 0.0;
1415 
1416  if (fad_size_output_ == 0)
1417  {
1418  // not a Fad type; we're allowed to use a vector range
1419  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_z), [&] (const int &lz, Scalar &integralThusFar)
1420  {
1421  const Scalar & leftValue = leftFields_z(i2,lz);
1422  const Scalar & rightValue = rightFields_z(j2,lz);
1423 
1424  integralThusFar += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
1425  }, Gz);
1426  }
1427  else
1428  {
1429  for (int lz=0; lz<pointBounds_z; lz++)
1430  {
1431  const Scalar & leftValue = leftFields_z(i2,lz);
1432  const Scalar & rightValue = rightFields_z(j2,lz);
1433 
1434  Gz += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
1435  }
1436  }
1437 
1438  const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1439  const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1440  // the lines above are an optimized equivalent of the commented-out lines below; the rewrite works around an Intel compiler segfault (possibly a compiler bug).
1441 // const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset;
1442 // const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset;
1443 
1444  Kokkos::single (Kokkos::PerThread(teamMember), [&] () {
1445  integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1446  });
1447  }
1448  }
1449  });
1450  // synchronize threads
1451  teamMember.team_barrier();
1452  }
1453  }
1454  }
1455  }
1456  }
1457 
1458  template<size_t numTensorComponents>
1459  KOKKOS_INLINE_FUNCTION
1460  void run( const TeamMember & teamMember ) const
1461  {
1462  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::logic_error, "implementation incomplete");
1463 // Kokkos::Array<int,numTensorComponents> pointBounds;
1464 // Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1465 // Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1466 //
1467 // int pointsInNonzeroComponentDimensions = 1;
1468 // for (unsigned r=0; r<numTensorComponents; r++)
1469 // {
1470 // pointBounds[r] = pointBounds_[r];
1471 // if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1472 // leftFieldBounds[r] = leftFieldBounds_[r];
1473 // rightFieldBounds[r] = rightFieldBounds_[r];
1474 // }
1475 //
1476 // const int cellDataOrdinal = teamMember.league_rank();
1477 // const int numThreads = teamMember.team_size(); // num threads
1478 // const int G_k_StackHeight = numTensorComponents - 1; // per thread
1479 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_0_IntegralsView; // for caching G0 values: we integrate out the first component dimension for each coordinate in the remaining dimensions
1480 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_k_StackView; // for caching G_k values (each thread gets a stack, of the same height as tensorComponents - 1)
1481 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1482 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values at each level for faster access
1483 // if (fad_size_output_ > 0) {
1484 // G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads, fad_size_output_);
1485 // G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1486 // pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1487 // leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1488 // rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1489 // }
1490 // else {
1491 // G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads);
1492 // G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1493 // pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1494 // leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1495 // rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1496 // }
1497 //
1500 //
1501 // constexpr int R = numTensorComponents - 1;
1502 //
1503 // // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1504 //
1505 // // synchronize threads
1506 // teamMember.team_barrier();
1507 //
1508 // for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1509 // {
1510 // const int a = a_offset_ + a_component;
1511 // for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1512 // {
1513 // const int b = b_offset_ + b_component;
1514 //
1515 // const Data<Scalar,DeviceType> & leftFirstComponent = leftComponent_.getTensorComponent(0);
1516 // const Data<Scalar,DeviceType> & rightFirstComponent = rightComponent_.getTensorComponent(0);
1517 //
1518 // const int numLeftFieldsFirst = leftFirstComponent.extent_int(0); // shape (F,P[,D])
1519 // const int numRightFieldsFirst = rightFirstComponent.extent_int(0); // shape (F,P[,D])
1520 //
1521 // const int numPointsFirst = leftFirstComponent.extent_int(1);
1522 //
1523 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1524 // pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1525 // });
1526 //
1527 // // synchronize threads
1528 // teamMember.team_barrier();
1529 //
1530 // for (int i0=0; i0<numLeftFieldsFirst; i0++)
1531 // {
1532 // for (int j0=0; j0<numRightFieldsFirst; j0++)
1533 // {
1534 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1535 // const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1536 // const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1537 // leftFields(pointOrdinal) = leftFirstComponent(fieldOrdinal,pointOrdinal);
1538 // });
1539 //
1540 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numRightFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1541 // const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1542 // const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1543 // rightFields(pointOrdinal) = rightFirstComponent(fieldOrdinal,pointOrdinal);
1544 // });
1545 //
1546 // // synchronize threads
1547 // teamMember.team_barrier();
1548 //
1549 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1550 // Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1551 // int remainingIndex = pointEnumerationIndexLaterDimensions;
1552 //
1553 // for (int d=R-1; d>0; d--) // last component (z in 3D hypercube) is fastest-moving // TODO: consider doing first component as fastest-moving. That would make indexing into pointWeights simpler
1554 // {
1555 // pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1];
1556 // remainingIndex /= pointBounds[d+1];
1557 // }
1558 // pointArgumentsInLaterDimensions[0] = remainingIndex;
1559 //
1560 // int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1561 // for (int d=R; d>0; d--)
1562 // {
1563 // tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1564 // tensorPointEnumerationOffset *= pointBounds[d-1];
1565 // }
1566 //
1567 // Scalar integralValue = 0;
1568 // if (fad_size_output_ == 0)
1569 // {
1570 // // not a Fad type; we're allowed to use a vector range
1571 // Kokkos::parallel_reduce("first component integral", Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1572 // {
1573 // integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1574 // }, integralValue);
1575 // }
1576 // else
1577 // {
1578 // for (int pointOrdinal=0; pointOrdinal<numPointsFirst; pointOrdinal++)
1579 // {
1580 // integralValue += leftFields(pointOrdinal) * rightFields(pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1581 // }
1582 // }
1583 //
1584 // G_0_IntegralsView(pointEnumerationIndexLaterDimensions) = integralValue;
1585 // });
1586 //
1587 // // synchronize threads
1588 // teamMember.team_barrier();
1589 //
1590 // // TODO: finish this, probably after having written up the algorithm for arbitrary component count. (I have it written down for 3D.)
1591 // }
1592 // }
1593 // }
1594 // }
1596  }
1597 
1598  KOKKOS_INLINE_FUNCTION
1599  void operator()( const TeamMember & teamMember ) const
1600  {
1601  switch (numTensorComponents_)
1602  {
1603  case 1: run<1>(teamMember); break;
1604  case 2: run<2>(teamMember); break;
1605  case 3: runSpecialized3(teamMember); break;
1606  case 4: run<4>(teamMember); break;
1607  case 5: run<5>(teamMember); break;
1608  case 6: run<6>(teamMember); break;
1609  case 7: run<7>(teamMember); break;
1610 #ifdef HAVE_INTREPID2_DEBUG
1611  default:
1612  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
1613 #endif
1614  }
1615  }
1616 
1618  int approximateFlopCountPerCell() const
1619  {
1620  // compute flop count on a single cell
1621  int flopCount = 0;
1622 
1623  constexpr int numTensorComponents = 3;
1624  Kokkos::Array<int,numTensorComponents> pointBounds;
1625  Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1626  Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1627 
1628  int pointsInNonzeroComponentDimensions = 1;
1629  for (unsigned r=0; r<numTensorComponents; r++)
1630  {
1631  pointBounds[r] = pointBounds_[r];
1632  if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1633  leftFieldBounds[r] = leftFieldBounds_[r];
1634  rightFieldBounds[r] = rightFieldBounds_[r];
1635  }
1636 
1637  for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1638  {
1639  for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1640  {
1641 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1642 // pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1643 // });
1644  flopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1645 
1646  for (int i0=0; i0<leftFieldBounds[0]; i0++)
1647  {
1648  for (int j0=0; j0<rightFieldBounds[0]; j0++)
1649  {
1650  flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3; // 3 flops per integration point in the loop commented out below
1651 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1652 // Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1653 // int remainingIndex = pointEnumerationIndexLaterDimensions;
1654 //
1655 // for (int d=0; d<R-1; d++) // first component is fastest-moving; this is determined by order of access in the lz/ly loop to compute Gy (integrals in y dimension)
1656 // {
1657 // pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1]; // d+1 because x dimension is being integrated away
1658 // remainingIndex /= pointBounds[d+1];
1659 // }
1660 // pointArgumentsInLaterDimensions[R-1] = remainingIndex;
1661 //
1662 // int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1663 // for (int d=R; d>0; d--)
1664 // {
1665 // tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1666 // tensorPointEnumerationOffset *= pointBounds[d-1];
1667 // }
1668 //
1669 // Scalar integralValue = 0;
1670 // if (fad_size_output_ == 0)
1671 // {
1672 // // not a Fad type; we're allowed to use a vector range
1673 // Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1674 // {
1675 // integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1676 // }, integralValue);
1677 // }
1678 // else
1679 // {
1680 // for (int x_pointOrdinal=0; x_pointOrdinal<numPointsFirst; x_pointOrdinal++)
1681 // {
1682 // integralValue += leftFields_x(x_pointOrdinal) * rightFields_x(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1683 // }
1684 // }
1685 //
1686 // GxIntegrals(pointEnumerationIndexLaterDimensions) = integralValue;
1687 // });
1688 
1689 
1690  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3; // 3 flops for each Gy += line in the below
1691  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3; // 3 flops for each Gz += line in the below
1692  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1; // 1 flops for the integralView += line below
1693 
1694 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds[1] * rightFieldBounds[1]), [&] (const int& i1j1) {
1695 // const int i1 = i1j1 % leftFieldBounds[1];
1696 // const int j1 = i1j1 / leftFieldBounds[1];
1697 //
1699 //
1700 // int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
1701 // for (int lz=0; lz<pointBounds[2]; lz++)
1702 // {
1703 // Scalar & Gy = GyIntegrals(Gy_index);
1704 // Gy = 0.0;
1705 //
1706 // const bool leftRankIs3 = ( leftFields_y.rank() == 3);
1707 // const bool rightRankIs3 = (rightFields_y.rank() == 3);
1708 // for (int ly=0; ly<pointBounds[1]; ly++)
1709 // {
1710 // const Scalar & leftValue = leftRankIs3 ? leftFields_y(i1,ly,a_component) : leftFields_y(i1,ly);
1711 // const Scalar & rightValue = rightRankIs3 ? rightFields_y(j1,ly,b_component) : rightFields_y(j1,ly);
1712 //
1713 // Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1714 //
1715 // pointEnumerationIndex++;
1716 // }
1717 // Gy_index++;
1718 // }
1719 //
1720 // for (int i2=0; i2<leftFieldBounds[2]; i2++)
1721 // {
1722 // for (int j2=0; j2<rightFieldBounds[2]; j2++)
1723 // {
1724 // Scalar Gz = 0.0;
1725 //
1726 // int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1727 //
1728 // const bool leftRankIs3 = ( leftFields_z.rank() == 3);
1729 // const bool rightRankIs3 = (rightFields_z.rank() == 3);
1730 // for (int lz=0; lz<pointBounds[2]; lz++)
1731 // {
1732 // const Scalar & leftValue = leftRankIs3 ? leftFields_z(i2,lz,a_component) : leftFields_z(i2,lz);
1733 // const Scalar & rightValue = rightRankIs3 ? rightFields_z(j2,lz,b_component) : rightFields_z(j2,lz);
1734 //
1735 // Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1736 //
1737 // Gy_index++;
1738 // }
1739 //
1740 // Kokkos::Array<int,3> leftArguments {i0,i1,i2};
1741 // Kokkos::Array<int,3> rightArguments {j0,j1,j2};
1742 //
1743 // const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset_;
1744 // const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset_;
1745 //
1746 // if (integralViewRank == 2)
1747 // {
1748 // integralView_.access(i,j,0) += Gz;
1749 // }
1750 // else
1751 // {
1752 // integralView_.access(cellDataOrdinal,i,j) += Gz;
1753 // }
1754 // }
1755 // }
1756 // });
1757  }
1758  }
1759  }
1760  }
1761  return flopCount;
1762  }
1763 
1765  int teamSize(const int &maxTeamSizeFromKokkos) const
1766  {
1767  // TODO: fix this to match the actual parallelism expressed
1768  const int R = numTensorComponents_ - 1;
1769  const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1770  return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1771  }
1772 
1774  size_t team_shmem_size (int numThreads) const
1775  {
1776  // we use shared memory to create a fast buffer for intermediate values, as well as fast access to the current-cell's field values
1777  size_t shmem_size = 0;
1778 
1779  const int GyEntryCount = pointBounds_[2]; // for each thread: store one Gy value per z coordinate
1780 
1781  int pointsInNonzeroComponentDimensions = 1;
1782  for (int d=1; d<numTensorComponents_; d++)
1783  {
1784  pointsInNonzeroComponentDimensions *= pointBounds_[d];
1785  }
1786 
1787  if (fad_size_output_ > 0)
1788  {
1789  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_); // GxIntegrals: entries with x integrated away
1790  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_); // GyIntegrals: entries with x,y integrated away
1791  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1), fad_size_output_); // pointWeights
1792 
1793  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_); // leftFields_x
1794  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_); // rightFields_x
1795  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_); // leftFields_y
1796  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_); // rightFields_y
1797  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_); // leftFields_z
1798  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_); // rightFields_z
1799  }
1800  else
1801  {
1802  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions); // GxIntegrals: entries with x integrated away
1803  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads); // GyIntegrals: entries with x,y integrated away
1804  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1)); // pointWeights
1805 
1806  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]); // leftFields_x
1807  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]); // rightFields_x
1808  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]); // leftFields_y
1809  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]); // rightFields_y
1810  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]); // leftFields_z
1811  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]); // rightFields_z
1812  }
1813 
1814  return shmem_size;
1815  }
1816  };
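 // Usage sketch (illustrative; the policy parameters are placeholders): Kokkos launches this
 // functor with one team per cell, and because the functor defines team_shmem_size(), Kokkos
 // sizes each team's scratch memory by querying that method, e.g.:
 //
 //   Kokkos::TeamPolicy<ExecutionSpace> policy(cellDataExtent, teamSize, vectorSize);
 //   Kokkos::parallel_for(policy, functor); // functor is an instance of the integration functor above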
1817 
1818 template<class Scalar, class DeviceType>
1819  class F_RefSpaceIntegral
1820  {
1821  ScalarView<Scalar,DeviceType> integral_;
1822  Data<Scalar,DeviceType> left_;
1823  Data<Scalar,DeviceType> right_;
1824  Data<Scalar,DeviceType> weights_;
1825  ordinal_type dimSpan_;
1826  ordinal_type leftRank_;
1827  ordinal_type rightRank_;
1828  ordinal_type numPoints_;
1829  public:
1830  F_RefSpaceIntegral(ScalarView<Scalar,DeviceType> integralView,
1831  Data<Scalar,DeviceType> left, Data<Scalar,DeviceType> right, Data<Scalar,DeviceType> weights,
1832  ordinal_type dimSpan)
1833  :
1834  integral_(integralView),
1835  left_(left),
1836  right_(right),
1837  weights_(weights),
1838  dimSpan_(dimSpan)
1839  {
1840  leftRank_ = left.rank();
1841  rightRank_ = right.rank();
1842  numPoints_ = weights.extent_int(0);
1843  }
1844 
1845  KOKKOS_INLINE_FUNCTION
1846  void operator()( const ordinal_type & i, const ordinal_type & j ) const
1847  {
1848  Scalar refSpaceIntegral = 0.0;
1849  for (int ptOrdinal=0; ptOrdinal<numPoints_; ptOrdinal++)
1850  {
1851  const Scalar & weight = weights_(ptOrdinal);
1852  for (int a=0; a<dimSpan_; a++)
1853  {
1854  const Scalar & leftValue = ( leftRank_ == 2) ? left_(i,ptOrdinal) : left_(i,ptOrdinal,a);
1855  const Scalar & rightValue = (rightRank_ == 2) ? right_(j,ptOrdinal) : right_(j,ptOrdinal,a);
1856  refSpaceIntegral += leftValue * rightValue * weight;
1857  }
1858  }
1859  integral_(i,j) = refSpaceIntegral;
1860  }
1861  };
1862  }
1863 
1864 template<typename DeviceType>
1865 template<class Scalar>
1866 Data<Scalar,DeviceType> IntegrationTools<DeviceType>::allocateIntegralData(const TransformedBasisValues<Scalar,DeviceType> vectorDataLeft,
1867  const TensorData<Scalar,DeviceType> cellMeasures,
1868  const TransformedBasisValues<Scalar,DeviceType> vectorDataRight)
1869 {
1870  // allocates a (C,F,F) container for storing integral data
1871 
1872  // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1873  // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1874  const bool leftHasOrdinalFilter = vectorDataLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1875  const bool rightHasOrdinalFilter = vectorDataRight.basisValues().ordinalFilter().extent_int(0) > 0;
1876  TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1877 
1878  // determine cellDataExtent and variation type. We currently support CONSTANT, MODULAR, and GENERAL as possible output variation types, depending on the inputs.
1879  // If cellMeasures has non-trivial tensor structure, the rank-1 cell Data object is the first component.
1880  // If cellMeasures has trivial tensor structure, then the first and only component has the cell index in its first dimension.
1881  // I.e., either way the relevant Data object is cellMeasures.getTensorComponent(0)
1882  const int CELL_DIM = 0;
1883  const auto cellMeasureData = cellMeasures.getTensorComponent(0);
1885 
1886  DimensionInfo combinedCellDimInfo = cellMeasureData.getDimensionInfo(CELL_DIM);
1887  // transforms may be invalid, indicating an identity transform. If so, it will not constrain the output at all.
1888  if (vectorDataLeft.transform().isValid())
1889  {
1890  combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataLeft.transform().getDimensionInfo(CELL_DIM));
1891  }
1892  if (vectorDataRight.transform().isValid())
1893  {
1894  combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataRight.transform().getDimensionInfo(CELL_DIM));
1895  }
1896 
1897  DataVariationType cellVariationType = combinedCellDimInfo.variationType;
1898  int cellDataExtent = combinedCellDimInfo.dataExtent;
1899 
1900  const int numCells = vectorDataLeft.numCells();
1901  const int numFieldsLeft = vectorDataLeft.numFields();
1902  const int numFieldsRight = vectorDataRight.numFields();
1903 
1904  Kokkos::Array<int,3> extents {numCells, numFieldsLeft, numFieldsRight};
1905  Kokkos::Array<DataVariationType,3> variationTypes {cellVariationType,GENERAL,GENERAL};
1906 
1907  if (cellVariationType != CONSTANT)
1908  {
1909  Kokkos::View<Scalar***,DeviceType> data("Intrepid2 integral data",cellDataExtent,numFieldsLeft,numFieldsRight);
1910  return Data<Scalar,DeviceType>(data, extents, variationTypes);
1911  }
1912  else
1913  {
1914  Kokkos::View<Scalar**,DeviceType> data("Intrepid2 integral data",numFieldsLeft,numFieldsRight);
1915  return Data<Scalar,DeviceType>(data, extents, variationTypes);
1916  }
1917 }
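// Usage sketch (illustrative): a typical caller allocates the integral container above, then fills
// it via integrate() below (sumInto and approximateFlops are assumed defaulted in the declaration):
//
//   auto integrals = IntegrationTools<DeviceType>::allocateIntegralData(vectorDataLeft, cellMeasures, vectorDataRight);
//   IntegrationTools<DeviceType>::integrate(integrals, vectorDataLeft, cellMeasures, vectorDataRight);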
1918 
1922 template<typename DeviceType>
1923 template<class Scalar>
1924 void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals, const TransformedBasisValues<Scalar,DeviceType> & basisValuesLeft,
1925  const TensorData<Scalar,DeviceType> & cellMeasures,
1926  const TransformedBasisValues<Scalar,DeviceType> & basisValuesRight, const bool sumInto,
1927  double* approximateFlops)
1928 {
1929  using ExecutionSpace = typename DeviceType::execution_space;
1930 
1931  // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1932  // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1933  const bool leftHasOrdinalFilter = basisValuesLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1934  const bool rightHasOrdinalFilter = basisValuesRight.basisValues().ordinalFilter().extent_int(0) > 0;
1935  TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1936 
1937  if (approximateFlops != NULL)
1938  {
1939  *approximateFlops = 0;
1940  }
1941 
1942  // integral data may have shape (C,F1,F2) or (if the variation type is CONSTANT in the cell dimension) shape (F1,F2)
1943  const int integralViewRank = integrals.getUnderlyingViewRank();
1944 
1945  if (!sumInto)
1946  {
1947  integrals.clear();
1948  }
1949 
1950  const int cellDataExtent = integrals.getDataExtent(0);
1951  const int numFieldsLeft = basisValuesLeft.numFields();
1952  const int numFieldsRight = basisValuesRight.numFields();
1953  const int spaceDim = basisValuesLeft.spaceDim();
1954 
1955  INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.spaceDim() != basisValuesRight.spaceDim(), std::invalid_argument, "basisValuesLeft and basisValuesRight must agree on the space dimension");
1956 
1957  const int leftFamilyCount = basisValuesLeft.basisValues().numFamilies();
1958  const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
1959 
1960  // we require that the number of tensor components in the vectors are the same for each vector entry
1961  // this is not strictly necessary, but it makes implementation easier, and we don't at present anticipate other use cases
1962  int numTensorComponentsLeft = -1;
1963  const bool leftIsVectorValued = basisValuesLeft.vectorData().isValid();
1964 
1965  if (leftIsVectorValued)
1966  {
1967  const auto &refVectorLeft = basisValuesLeft.vectorData();
1968  int numFamiliesLeft = refVectorLeft.numFamilies();
1969  int numVectorComponentsLeft = refVectorLeft.numComponents();
1970  Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
1971  for (int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
1972  {
1973  for (int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
1974  {
1975  const TensorData<Scalar,DeviceType> &tensorData = refVectorLeft.getComponent(familyOrdinal,vectorComponent);
1976  if (tensorData.numTensorComponents() > 0)
1977  {
1978  if (numTensorComponentsLeft == -1)
1979  {
1980  numTensorComponentsLeft = tensorData.numTensorComponents();
1981  }
1982  INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
1983  for (int r=0; r<numTensorComponentsLeft; r++)
1984  {
1985  maxFieldsForComponentLeft[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentLeft[r]);
1986  }
1987  }
1988  }
1989  }
1990  }
1991  else
1992  {
1993  numTensorComponentsLeft = basisValuesLeft.basisValues().tensorData(0).numTensorComponents(); // family ordinal 0
1994  for (int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
1995  {
1996  INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "All families must match in the number of tensor components");
1997  }
1998  }
1999  int numTensorComponentsRight = -1;
2000  const bool rightIsVectorValued = basisValuesRight.vectorData().isValid();
2001 
2002  if (rightIsVectorValued)
2003  {
2004  const auto &refVectorRight = basisValuesRight.vectorData();
2005  int numFamiliesRight = refVectorRight.numFamilies();
2006  int numVectorComponentsRight = refVectorRight.numComponents();
2007  Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
2008  for (int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
2009  {
2010  for (int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
2011  {
2012  const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
2013  if (tensorData.numTensorComponents() > 0)
2014  {
2015  if (numTensorComponentsRight == -1)
2016  {
2017  numTensorComponentsRight = tensorData.numTensorComponents();
2018  }
2019  INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataRight must have the same number of tensor components as every other");
2020  for (int r=0; r<numTensorComponentsRight; r++)
2021  {
2022  maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
2023  }
2024  }
2025  }
2026  }
2027  INTREPID2_TEST_FOR_EXCEPTION(numTensorComponentsRight != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
2028  }
2029  else
2030  {
2031  // check that right tensor component count agrees with left
2032  for (int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
2033  {
2034  INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
2035  }
2036  }
2037  const int numPointTensorComponents = cellMeasures.numTensorComponents() - 1;
2038 
2039  if ((numPointTensorComponents == numTensorComponentsLeft) && basisValuesLeft.axisAligned() && basisValuesRight.axisAligned())
2040  {
2041  // cellMeasures is a non-trivial tensor product, and the pullbacks are all diagonals.
2042 
2043  // in this case, the integrals in each tensorial direction are entirely separable
2044  // allocate some temporary storage for the integrals in each tensorial direction
2045 
2046  // if cellMeasures is a nontrivial tensor product, that means that all cells have the same shape, up to scaling.
2047 
2048  Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
2049  for (int r=0; r<numPointTensorComponents; r++)
2050  {
2051  // first tensorial component of cell measures is the cell dimension; after that we have (P1,P2,…)
2052  pointDimensions[r] = cellMeasures.getTensorComponent(r+1).extent_int(0);
2053  }
2054 
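 // Illustrative sketch of the separability used here: with diagonal transforms T_L, T_R and
 // separable measures w_cell * w_1(p_1) * ... * w_R(p_R), each dimension d contributes
 //
 //   T_L(c,d,d) * T_R(c,d,d) * prod_r [ sum_{p_r} u_{i_r}(p_r) * v_{j_r}(p_r) * w_r(p_r) ]
 //
 // The bracketed 1D integrals are computed once per (r,d) in componentIntegrals below and then
 // reused for every field pair (i,j).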
2055  // only one of these will be a valid container:
2056  Kokkos::View<Scalar**, DeviceType> integralView2;
2057  Kokkos::View<Scalar***, DeviceType> integralView3;
2058  if (integralViewRank == 3)
2059  {
2060  integralView3 = integrals.getUnderlyingView3();
2061  }
2062  else
2063  {
2064  integralView2 = integrals.getUnderlyingView2();
2065  }
2066  for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2067  {
2068  int a_offset = 0; // left vector component offset
2069  int leftFieldOffset = basisValuesLeft.basisValues().familyFieldOrdinalOffset(leftFamilyOrdinal);
2070 
2071  const int leftVectorComponentCount = leftIsVectorValued ? basisValuesLeft.vectorData().numComponents() : 1;
2072  for (int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
2073  {
2074  TensorData<Scalar,DeviceType> leftComponent = leftIsVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal)
2075  : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2076  if (!leftComponent.isValid())
2077  {
2078  a_offset++; // empty components are understood to take up one dimension
2079  continue;
2080  }
2081  const int leftDimSpan = leftComponent.extent_int(2);
2082 
2083  const int leftComponentFieldCount = leftComponent.extent_int(0);
2084 
2085  for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2086  {
2087  int b_offset = 0; // right vector component offset
2088  int rightFieldOffset = basisValuesRight.basisValues().familyFieldOrdinalOffset(rightFamilyOrdinal);
2089 
2090  const int rightVectorComponentCount = rightIsVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2091  for (int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2092  {
2093  TensorData<Scalar,DeviceType> rightComponent = rightIsVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal)
2094  : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2095  if (!rightComponent.isValid())
2096  {
2097  b_offset++; // empty components are understood to take up one dimension
2098  continue;
2099  }
2100  const int rightDimSpan = rightComponent.extent_int(2);
2101 
2102  const int rightComponentFieldCount = rightComponent.extent_int(0);
2103 
2104  // we only accumulate for a == b (since this is the axis-aligned case). Do the a, b spans intersect?
2105  if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset)) // no intersection
2106  {
2107  b_offset += rightDimSpan;
2108 
2109  continue;
2110  }
2111 
2112  // if the a, b spans intersect, by assumption they should align exactly
2113  INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error, "left and right dimension offsets should match.");
2114  INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2115 
2116  const int d_start = a_offset;
2117  const int d_end = d_start + leftDimSpan;
2118 
2119  using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
2120  ComponentIntegralsArray componentIntegrals;
2121  for (int r=0; r<numPointTensorComponents; r++)
2122  {
2123  /*
2124  Four vector data cases to consider:
2125  1. Both vector data containers are filled with axial components - first component in 3D has form (f,0,0), second (0,f,0), third (0,0,f).
2126  2. Both vector data containers have arbitrary components - in 3D: (f1,f2,f3) where f1 is given by the first component, f2 by the second, f3 by the third.
2127  3. First container is axial, second arbitrary.
2128  4. First is arbitrary, second axial.
2129 
2130  But note that in all four cases, the structure of the integral is the same: you have three vector component integrals that get summed. The actual difference between
2131  the cases does not show up in the reference-space integrals here, but in the accumulation in physical space below, where the tensor field numbering comes into play.
2132 
2133  The choice between axial and arbitrary affects the way the fields are numbered; the arbitrary components' indices refer to the same vector function, so they correspond,
2134  while the axial components refer to distinct scalar functions, so their numbering in the data container is cumulative.
2135  */
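 // Concretely (restating the cases above): axial components describe distinct scalar functions, so
 // their field ordinals accumulate -- in 3D, component 0 contributes fields 0..F0-1, component 1
 // contributes F0..F0+F1-1, and so on -- whereas arbitrary components all describe the same vector
 // function, so a given field ordinal refers to the same function in every component.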
2136 
2137  Data<Scalar,DeviceType> quadratureWeights = cellMeasures.getTensorComponent(r+1);
2138  const int numPoints = pointDimensions[r];
2139 
2140  // It may be worth considering the possibility that some of these components point to the same data -- if so, we could possibly get better data locality by making the corresponding componentIntegral entries point to the same location as well. (And we could avoid some computations here.)
2141 
2142  Data<Scalar,DeviceType> leftTensorComponent = leftComponent.getTensorComponent(r);
2143  Data<Scalar,DeviceType> rightTensorComponent = rightComponent.getTensorComponent(r);
2144 
2145  const int leftTensorComponentDimSpan = leftTensorComponent.extent_int(2);
2146  const int leftTensorComponentFields = leftTensorComponent.extent_int(0);
2147  const int rightTensorComponentDimSpan = rightTensorComponent.extent_int(2);
2148  const int rightTensorComponentFields = rightTensorComponent.extent_int(0);
2149 
2150  INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2151 
2152  for (int d=d_start; d<d_end; d++)
2153  {
2154  ScalarView<Scalar,DeviceType> componentIntegralView;
2155 
2156  const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
2157  if (allocateFadStorage)
2158  {
2159  auto fad_size_output = dimension_scalar(integrals.getUnderlyingView());
2160  componentIntegralView = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output);
2161  }
2162  else
2163  {
2164  componentIntegralView = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields);
2165  }
2166 
2167  auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftTensorComponentFields,rightTensorComponentFields});
2168 
2169  Impl::F_RefSpaceIntegral<Scalar, DeviceType> refSpaceIntegralFunctor(componentIntegralView, leftTensorComponent, rightTensorComponent, quadratureWeights,
2170  leftTensorComponentDimSpan);
2171  Kokkos::parallel_for("compute componentIntegrals", policy, refSpaceIntegralFunctor);
2172 
2173  componentIntegrals[r][d] = componentIntegralView;
2174 
2175  if (approximateFlops != NULL)
2176  {
2177  *approximateFlops += leftTensorComponentFields*rightTensorComponentFields*numPoints*(3); // two multiplies, one add in innermost loop
2178  }
2179  } // d
2180  } // r
2181 
2182  ExecutionSpace().fence();
2183 
2184  Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount}; // separately declared in an effort to get around Intel 17.0.1 compiler weirdness.
2185  Kokkos::Array<int,3> lowerBounds {0,0,0};
2186  auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2187  // TODO: note that for best performance, especially with Fad types, we should replace this parallel for with a Functor and use hierarchical parallelism
2188  Kokkos::parallel_for("compute field integrals", policy,
2189  KOKKOS_LAMBDA (const int &cellDataOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
2190  const Scalar & cellMeasureWeight = cellMeasures.getTensorComponent(0)(cellDataOrdinal);
2191 
2192  TensorArgumentIterator leftTensorIterator(leftComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2193  leftTensorIterator.setEnumerationIndex(leftFieldOrdinal);
2194 
2195  TensorArgumentIterator rightTensorIterator(rightComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2196  rightTensorIterator.setEnumerationIndex(rightFieldOrdinal);
2197 
2198  Scalar integralSum = 0.0;
2199  for (int d=d_start; d<d_end; d++)
2200  {
2201  const Scalar & transformLeft_d = basisValuesLeft.transformWeight(cellDataOrdinal,0,d,d);
2202  const Scalar & transformRight_d = basisValuesRight.transformWeight(cellDataOrdinal,0,d,d);
2203 
2204  const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
2205  // approximateFlopCount++;
2206 
2207  Scalar integral_d = 1.0;
2208 
2209  for (int r=0; r<numPointTensorComponents; r++)
2210  {
2211  integral_d *= componentIntegrals[r][d](leftTensorIterator.argument(r),rightTensorIterator.argument(r));
2212  // approximateFlopCount++; // product
2213  }
2214  integralSum += leftRightTransform_d * integral_d;
2215  // approximateFlopCount += 2; // multiply and sum
2216  }
2217 
2218  const int i = leftFieldOrdinal + leftFieldOffset;
2219  const int j = rightFieldOrdinal + rightFieldOffset;
2220 
2221  if (integralViewRank == 3)
2222  {
2223  integralView3(cellDataOrdinal,i,j) += cellMeasureWeight * integralSum;
2224  }
2225  else
2226  {
2227  integralView2(i,j) += cellMeasureWeight * integralSum;
2228  }
2229  });
2230  b_offset += rightDimSpan;
2231  } // rightVectorComponentOrdinal
2232  } // rightFamilyOrdinal
2233  a_offset += leftDimSpan;
2234  } // leftVectorComponentOrdinal
2235  } // leftFamilyOrdinal
2236 
2237  if (approximateFlops != NULL)
2238  {
2239  // TODO: check the accuracy of this
2240  *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2241  }
2242  }
2243  else // general case (not axis-aligned + affine tensor-product structure)
2244  {
2245  // prepare composed transformation matrices
2246  const Data<Scalar,DeviceType> & leftTransform = basisValuesLeft.transform();
2247  const Data<Scalar,DeviceType> & rightTransform = basisValuesRight.transform();
2248  const bool transposeLeft = true;
2249  const bool transposeRight = false;
2250 // auto timer = Teuchos::TimeMonitor::getNewTimer("mat-mat");
2251 // timer->start();
2252  // transforms can be matrices -- (C,P,D,D): rank 4 -- or scalar weights -- (C,P): rank 2 -- or vector weights -- (C,P,D): rank 3
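 // In the rank-4 case, the composition computes, at each (cell, point),
 //   composedTransform(c,p,a,b) = sum_d leftTransform(c,p,d,a) * rightTransform(c,p,d,b),
 // i.e., leftTransform^T * rightTransform; lower-rank transforms are first recast as matrices below.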
2253  Data<Scalar,DeviceType> composedTransform;
2254  // invalid/empty transforms are used when the identity is intended.
2255  const int leftRank = leftTransform.rank();
2256  const int rightRank = rightTransform.rank();
2257 
2258  if (leftTransform.isValid() && rightTransform.isValid())
2259  {
2260  const bool bothRank4 = (leftRank == 4) && (rightRank == 4);
2261  const bool bothRank3 = (leftRank == 3) && (rightRank == 3);
2262  const bool bothRank2 = (leftRank == 2) && (rightRank == 2);
2263  const bool ranks32 = ((leftRank == 3) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 3));
2264  const bool ranks42 = ((leftRank == 4) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 4));
2265 
2266  if (bothRank4) // (C,P,D,D)
2267  {
2268  composedTransform = Data<Scalar,DeviceType>::allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
2269  composedTransform.storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2270 
2271  // if the composedTransform matrices are full, the following is a good estimate. If they have some diagonal portions, this will overcount.
2272  if (approximateFlops != NULL)
2273  {
2274  *approximateFlops += composedTransform.getUnderlyingViewSize() * (spaceDim - 1) * 2;
2275  }
2276  }
2277  else if (bothRank3) // (C,P,D)
2278  {
2279  // re-cast leftTransform as a rank 4 (C,P,1,D) object -- a 1 x D matrix at each (C,P).
2280  const int newRank = 4;
2281  auto extents = leftTransform.getExtents();
2282  auto variationTypes = leftTransform.getVariationTypes();
2283  extents[3] = extents[2];
2284  extents[2] = 1;
2285  variationTypes[3] = variationTypes[2];
2286  variationTypes[2] = CONSTANT;
2287  auto leftTransformMatrix = leftTransform.shallowCopy(newRank, extents, variationTypes);
2288 
2289  // re-cast rightTransform as a rank 4 (C,P,1,D) object -- a 1 x D matrix at each (C,P)
2290  extents = rightTransform.getExtents();
2291  variationTypes = rightTransform.getVariationTypes();
2292  extents[3] = extents[2];
2293  extents[2] = 1;
2294  variationTypes[3] = variationTypes[2];
2295  variationTypes[2] = CONSTANT;
2296  auto rightTransformMatrix = rightTransform.shallowCopy(newRank, extents, variationTypes);
2297 
2298  composedTransform = Data<Scalar,DeviceType>::allocateMatMatResult(transposeLeft, leftTransformMatrix, transposeRight, rightTransformMatrix); // transposeLeft is true; transposeRight is false
2299  composedTransform.storeMatMat(transposeLeft, leftTransformMatrix, transposeRight, rightTransformMatrix);
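 // with 1 x D operands, (1 x D)^T * (1 x D) is the D x D outer product:
 //   composedTransform(c,p,d1,d2) = leftTransform(c,p,d1) * rightTransform(c,p,d2)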
2300 
2301  if (approximateFlops != NULL)
2302  {
2303  *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
2304  }
2305  }
2306  else if (bothRank2)
2307  {
2308  composedTransform = leftTransform.allocateInPlaceCombinationResult(leftTransform, rightTransform);
2309  composedTransform.storeInPlaceProduct(leftTransform, rightTransform);
2310 
2311  // re-cast composedTransform as a rank 4 (C,P,1,1) object -- a 1 x 1 matrix at each (C,P).
2312  const int newRank = 4;
2313  auto extents = composedTransform.getExtents();
2314  auto variationTypes = composedTransform.getVariationTypes();
2315  composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes);
2316  if (approximateFlops != NULL)
2317  {
2318  *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
2319  }
2320  }
2321  else if (ranks32) // rank 2 / rank 3 combination.
2322  {
2323  const auto & rank3Transform = (leftRank == 3) ? leftTransform : rightTransform;
2324  const auto & rank2Transform = (leftRank == 2) ? leftTransform : rightTransform;
2325 
2326  composedTransform = DataTools::multiplyByCPWeights(rank3Transform, rank2Transform);
2327 
2328  // re-cast composedTransform as a rank 4 object:
2329  // logically, the original rank-3 transform can be understood as a 1xD matrix. The composed transform is leftTransform^T * rightTransform, so:
2330  // - if left has the rank-3 transform, composedTransform should be a (C,P,D,1) object -- a D x 1 matrix at each (C,P).
2331  // - if right has the rank-3 transform, composedTransform should be a (C,P,1,D) object -- a 1 x D matrix at each (C,P).
2332  const int newRank = 4;
2333  auto extents = composedTransform.getExtents();
2334  auto variationTypes = composedTransform.getVariationTypes();
2335  if (leftRank == 3)
2336  {
2337  // extents[3] and variationTypes[3] will already be 1 and CONSTANT, respectively
2338  // extents[3] = 1;
2339  // variationTypes[3] = CONSTANT;
2340  }
2341  else
2342  {
2343  extents[3] = extents[2];
2344  extents[2] = 1;
2345  variationTypes[3] = variationTypes[2];
2346  variationTypes[2] = CONSTANT;
2347  }
2348  composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes);
2349  }
2350  else if (ranks42) // rank 4 / rank 2 combination.
2351  {
2352  if (leftRank == 4)
2353  {
2354  // want to transpose left matrix, and multiply by the values from rightTransform
2355  // start with the multiplication:
2356  auto composedTransformTransposed = DataTools::multiplyByCPWeights(leftTransform, rightTransform);
2357  composedTransform = DataTools::transposeMatrix(composedTransformTransposed);
2358  }
2359  else // (leftRank == 2)
2360  {
2361  composedTransform = DataTools::multiplyByCPWeights(rightTransform, leftTransform);
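  // no transpose is needed here: the rank-2 left transform is a pointwise scalar, and scalars commute
  // with the right transform matrix, so scaling rightTransform by leftTransform's (C,P) weights
  // already equals leftTransform^T * rightTransform.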
2362  }
2363  }
2364  else
2365  {
2366  INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
2367  }
2368  }
2369  else if (leftTransform.isValid())
2370  {
2371  // rightTransform is the identity
2372  switch (leftRank)
2373  {
2374  case 4: composedTransform = DataTools::transposeMatrix(leftTransform); break;
2375  case 3:
2376  {
2377  // - if left has the rank-3 transform, composedTransform should be a (C,P,D,1) object -- a D x 1 matrix at each (C,P).
2378  const int newRank = 4;
2379  auto extents = leftTransform.getExtents();
2380  auto variationTypes = leftTransform.getVariationTypes();
2381 
2382  composedTransform = leftTransform.shallowCopy(newRank, extents, variationTypes);
2383  }
2384  break;
2385  case 2: composedTransform = leftTransform; break;
2386  default:
2387  INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
2388  }
2389  }
2390  else if (rightTransform.isValid())
2391  {
2392  // leftTransform is the identity
2394  switch (rightRank)
2395  {
2396  case 4: composedTransform = rightTransform; break;
2397  case 3:
2398  {
2399  // - if right has the rank-3 transform, composedTransform should be a (C,P,1,D) object -- a 1 x D matrix at each (C,P).
2400  const int newRank = 4;
2401  auto extents = rightTransform.getExtents();
2402  auto variationTypes = rightTransform.getVariationTypes();
2403  extents[3] = extents[2];
2404  variationTypes[3] = variationTypes[2];
2405  extents[2] = 1;
2406  variationTypes[2] = CONSTANT;
2407 
2408  composedTransform = rightTransform.shallowCopy(newRank, extents, variationTypes);
2409  }
2410  break;
2411  case 2: composedTransform = rightTransform; break;
2412  default:
2413  INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
2414  }
2415  }
2416  else
2417  {
2418  // both left and right transforms are identity
2419  Kokkos::Array<ordinal_type,4> extents {basisValuesLeft.numCells(),basisValuesLeft.numPoints(),spaceDim,spaceDim};
2420  Kokkos::Array<DataVariationType,4> variationTypes {CONSTANT,CONSTANT,BLOCK_PLUS_DIAGONAL,BLOCK_PLUS_DIAGONAL};
2421 
2422  Kokkos::View<Scalar*,DeviceType> identityUnderlyingView("Intrepid2::FST::integrate() - identity view",spaceDim);
2423  Kokkos::deep_copy(identityUnderlyingView, 1.0);
2424  composedTransform = Data<Scalar,DeviceType>(identityUnderlyingView,extents,variationTypes);
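  // Sketch of the storage: BLOCK_PLUS_DIAGONAL in the last two dimensions means only the diagonal is
  // stored, and CONSTANT in the cell and point dimensions means it is stored just once; the rank-1
  // view of spaceDim ones therefore represents the D x D identity matrix at every (cell, point)
  // using spaceDim entries rather than C*P*D*D.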
2425  }
2426 
2427 // timer->stop();
2428 // std::cout << "Completed mat-mat in " << timer->totalElapsedTime() << " seconds.\n";
2429 // timer->reset();
2430 
2431  const int leftComponentCount = leftIsVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
2432  const int rightComponentCount = rightIsVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2433 
2434  int leftFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2435  for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2436  {
2437  // "a" keeps track of the spatial dimension over which we are integrating in the left vector
2438  // components are allowed to span several dimensions; we keep track of the offset for the component in a_offset
2439  int a_offset = 0;
2440  bool haveLaunchedContributionToCurrentFamilyLeft = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2441  for (int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2442  {
2443  TensorData<Scalar,DeviceType> leftComponent = leftIsVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal)
2444  : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2445  if (!leftComponent.isValid())
2446  {
2447  // represents zero
2448  a_offset += basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal);
2449  continue;
2450  }
2451 
2452  int rightFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2453  for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2454  {
2455  // "b" keeps track of the spatial dimension over which we are integrating in the right vector
2456  // components are allowed to span several dimensions; we keep track of the offset for the component in b_offset
2457  bool haveLaunchedContributionToCurrentFamilyRight = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2458  int b_offset = 0;
2459  for (int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2460  {
2461  TensorData<Scalar,DeviceType> rightComponent = rightIsVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal)
2462  : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2463  if (!rightComponent.isValid())
2464  {
2465  // represents zero
2466  b_offset += basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal);
2467  continue;
2468  }
2469 
2470  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(leftComponent.numTensorComponents() != rightComponent.numTensorComponents(), std::invalid_argument, "left TensorData and right TensorData have different number of tensor components. This is not supported.");
2471 
2472  const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2473  Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
2474 
2475  // TODO: expose the options for forceNonSpecialized and usePointCacheForRank3Tensor through an IntegrationAlgorithm enumeration.
2476  // AUTOMATIC: let Intrepid2 choose an algorithm based on the inputs (and, perhaps, the execution space)
2477  // STANDARD: don't use sum factorization or axis alignment -- just do the simple contraction, a (p+1)^9 algorithm in 3D
2478  // SUM_FACTORIZATION // (p+1)^7 algorithm in 3D
2479  // SUM_FACTORIZATION_AXIS_ALIGNED // (p+1)^6 algorithm in 3D
2480  // SUM_FACTORIZATION_FORCE_GENERIC_IMPLEMENTATION // mainly intended for testing purposes (specialized implementations perform better when they are provided)
2481  // SUM_FACTORIZATION_WITH_POINT_CACHE // novel (p+1)^7 (in 3D) algorithm in Intrepid2; unclear as yet when and whether this may be a superior approach
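  // For a sense of scale: at polynomial degree p = 4 in 3D, (p+1)^9 = 1,953,125, while (p+1)^7 = 78,125
  // and (p+1)^6 = 15,625 -- sum factorization reduces the per-cell work by a factor of roughly (p+1)^2.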
2482  bool forceNonSpecialized = false; // We might expose this in the integrate() arguments in the future; if we do, it should default to false.
2483  bool usePointCacheForRank3Tensor = true; // EXPERIMENTAL; has better performance under CUDA, but slightly worse performance than standard on serial CPU
2484 
2485  // in one branch or another below, we will launch a parallel kernel that contributes to (leftFamily, rightFamily) field ordinal pairs.
2486  // if we have already launched something that contributes to that part of the integral container, we need a Kokkos fence() to ensure that these do not interfere with each other.
2487  if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2488  {
2489  ExecutionSpace().fence();
2490  }
2491  haveLaunchedContributionToCurrentFamilyLeft = true;
2492  haveLaunchedContributionToCurrentFamilyRight = true;
2493 
2494  if (integralViewRank == 2)
2495  {
2496  if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2497  {
2498  auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2499 
2500  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2501  const int teamSize = functor.teamSize(recommendedTeamSize);
2502 
2503  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
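  // This launch pattern -- query Kokkos for a recommended team size, let the functor clamp it via
  // teamSize(), then rebuild the policy with the clamped value -- repeats for each functor variant below.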
2504 
2505  Kokkos::parallel_for("F_IntegratePointValueCache rank 2", policy, functor);
2506 
2507  if (approximateFlops != NULL)
2508  {
2509  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2510  }
2511  }
2512  else
2513  {
2514  auto functor = Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2515 
2516  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2517  const int teamSize = functor.teamSize(recommendedTeamSize);
2518 
2519  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2520 
2521  Kokkos::parallel_for("F_Integrate rank 2", policy, functor);
2522 
2523  if (approximateFlops != NULL)
2524  {
2525  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2526  }
2527  }
2528  }
2529  else if (integralViewRank == 3)
2530  {
2531  if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2532  {
2533  auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2534 
2535  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2536  const int teamSize = functor.teamSize(recommendedTeamSize);
2537 
2538  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2539 
2540  Kokkos::parallel_for("F_IntegratePointValueCache rank 3", policy, functor);
2541 
2542  if (approximateFlops != NULL)
2543  {
2544  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2545  }
2546  }
2547  else
2548  {
2549  auto functor = Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2550 
2551  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2552  const int teamSize = functor.teamSize(recommendedTeamSize);
2553 
2554  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2555 
2556  Kokkos::parallel_for("F_Integrate rank 3", policy, functor);
2557 
2558  if (approximateFlops != NULL)
2559  {
2560  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2561  }
2562  }
2563  }
2564  b_offset += rightIsVectorValued ? basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
2565  }
2566  rightFieldOrdinalOffset += rightIsVectorValued ? basisValuesRight.vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
2567  }
2568  a_offset += leftIsVectorValued ? basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
2569  }
2570  leftFieldOrdinalOffset += leftIsVectorValued ? basisValuesLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
2571  }
2572  }
2573 // if (approximateFlops != NULL)
2574 // {
2575 // std::cout << "Approximate flop count (new): " << *approximateFlops << std::endl;
2576 // }
2577  ExecutionSpace().fence(); // make sure we've finished writing to integrals container before we return
2578 }
2579 
2580 } // end namespace Intrepid2
2581 
2582 #endif