Intrepid2_IntegrationToolsDef.hpp
// @HEADER
// *****************************************************************************
// Intrepid2 Package
//
// Copyright 2007 NTESS and the Intrepid2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
#define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__

#include "Intrepid2_DataTools.hpp"

#include "Teuchos_TimeMonitor.hpp"

namespace Intrepid2 {

  namespace Impl
  {
    template<class Scalar, class DeviceType, int integralViewRank>
    class F_Integrate
    {
      using ExecutionSpace = typename DeviceType::execution_space;
      using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
      using TeamMember = typename TeamPolicy::member_type;

      using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
      IntegralViewType integralView_;
      TensorData<Scalar,DeviceType> leftComponent_;
      Data<Scalar,DeviceType> composedTransform_;
      TensorData<Scalar,DeviceType> rightComponent_;
      TensorData<Scalar,DeviceType> cellMeasures_;
      int a_offset_;
      int b_offset_;
      int leftComponentSpan_;  // the number of dimensions spanned by the left component
      int rightComponentSpan_; // the number of dimensions spanned by the right component
      int numTensorComponents_;
      int leftFieldOrdinalOffset_;
      int rightFieldOrdinalOffset_;
      bool forceNonSpecialized_; // if true, don't use the specialized (more manual) implementation(s) for particular component counts; the primary use case is testing.

      size_t fad_size_output_ = 0; // 0 if not a Fad type

      Kokkos::Array<int, 7> offsetsForComponentOrdinal_;

      // As an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator.
      // (This also makes it easier to reorder loops, etc., for further optimizations.)
      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;

      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_; // total number of enumeration indices once the arguments prior to startingComponent are fixed
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;

      int maxFieldsLeft_;
      int maxFieldsRight_;
      int maxPointCount_;
    public:
      F_Integrate(Data<Scalar,DeviceType> integralData,
                  TensorData<Scalar,DeviceType> leftComponent,
                  Data<Scalar,DeviceType> composedTransform,
                  TensorData<Scalar,DeviceType> rightComponent,
                  TensorData<Scalar,DeviceType> cellMeasures,
                  int a_offset,
                  int b_offset,
                  int leftFieldOrdinalOffset,
                  int rightFieldOrdinalOffset,
                  bool forceNonSpecialized)
      :
      integralView_(integralData.template getUnderlyingView<integralViewRank>()),
      leftComponent_(leftComponent),
      composedTransform_(composedTransform),
      rightComponent_(rightComponent),
      cellMeasures_(cellMeasures),
      a_offset_(a_offset),
      b_offset_(b_offset),
      leftComponentSpan_(leftComponent.extent_int(2)),
      rightComponentSpan_(rightComponent.extent_int(2)),
      numTensorComponents_(leftComponent.numTensorComponents()),
      leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
      rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
      forceNonSpecialized_(forceNonSpecialized)
      {
        INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");

        // set up bounds containers
        const int FIELD_DIM = 0;
        const int POINT_DIM = 1;
        maxFieldsLeft_  = 0;
        maxFieldsRight_ = 0;
        maxPointCount_  = 0;
        for (int r=0; r<numTensorComponents_; r++)
        {
          leftFieldBounds_[r]  = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsLeft_       = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
          rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsRight_      = std::max(maxFieldsRight_, rightFieldBounds_[r]);
          pointBounds_[r]      = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
          maxPointCount_       = std::max(maxPointCount_, pointBounds_[r]);
        }

        // Set up relative enumeration spans: the total number of enumeration indices once the arguments prior to startingComponent are fixed.
        // These are for *truncated* iterators; hence the -2 rather than -1 for the first startingComponent value.
        int leftRelativeEnumerationSpan  = 1;
        int rightRelativeEnumerationSpan = 1;
        for (int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
        {
          leftRelativeEnumerationSpan  *= leftFieldBounds_[startingComponent];
          rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
          leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
          rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
        }
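
        // A worked example (an illustrative assumption, not data from the code above): with numTensorComponents_ = 3
        // and leftFieldBounds_ = {4,5,6}, the truncated iterators span components 0..1 only, so
        //   leftFieldRelativeEnumerationSpans_[1] = 5        (component 1 free)
        //   leftFieldRelativeEnumerationSpans_[0] = 4*5 = 20 (components 0 and 1 free)
        // The final component never contributes, because the final field indices (i_R, j_R)
        // are fixed by the thread-level parallel_for in the run methods below.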

        // prepare for allocation of temporary storage
        // note: tempStorage goes "backward", starting from the final component, which needs just one entry

        const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
        if (allocateFadStorage)
        {
          fad_size_output_ = dimension_scalar(integralView_);
        }

        const int R = numTensorComponents_ - 1;

        int num_ij = 1; // counts how many entries correspond to components from r to R-1.
        int allocationSoFar = 0;
        offsetsForComponentOrdinal_[R] = allocationSoFar;
        allocationSoFar++; // we store one entry corresponding to R, the final component

        for (int r=R-1; r>0; r--) // the last component is innermost in the for loops (and so requires the least storage)
        {
          const int leftFields  = leftComponent.getTensorComponent(r).extent_int(0);
          const int rightFields = rightComponent.getTensorComponent(r).extent_int(0);

          num_ij *= leftFields * rightFields;

          offsetsForComponentOrdinal_[r] = allocationSoFar;
          allocationSoFar += num_ij;
        }
        offsetsForComponentOrdinal_[0] = allocationSoFar; // the first component stores directly to the final integralView.
      }
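
      // Sketch of the per-thread scratch layout implied by the loop above (illustrative assumption:
      // numTensorComponents_ = 3, with 4 left and 4 right fields in component 1):
      //   offsetsForComponentOrdinal_[2] = 0  -> one entry: the running G_2 (z) integral
      //   offsetsForComponentOrdinal_[1] = 1  -> 4*4 = 16 entries: one G_1 (y) value per (i_1,j_1) pair
      //   offsetsForComponentOrdinal_[0] = 17 -> total per-thread scratch size; component 0 writes
      //                                          straight into integralView_, so no scratch is reserved for it.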

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,maxComponents> &arguments,
                            const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0)
        {
          return -1;
        }
        else
        {
          int r = static_cast<int>(numComponents - 1);
          while (arguments[r] + 1 >= bounds[r])
          {
            arguments[r] = 0; // reset
            r--;
            if (r < 0) break;
          }
          if (r >= 0) ++arguments[r];
          return r;
        }
      }
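
      // incrementArgument advances a multi-index odometer-style and returns the lowest component
      // ordinal it incremented (-1 once iteration is exhausted).  For example (illustration only):
      // with bounds = {2,3} and arguments = {0,2}, a call resets arguments[1] to 0, increments
      // arguments[0], and returns 0, leaving arguments = {1,0}.  nextIncrementResult() below answers
      // "which component *would* increment next?" without mutating the arguments.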

      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                            const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                            const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
                              const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0)
        {
          return -1;
        }
        else
        {
          int r = static_cast<int>(numComponents - 1);
          while (arguments[r] + 1 >= bounds[r])
          {
            r--;
            if (r < 0) break;
          }
          return r;
        }
      }

      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                              const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                              const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = numComponents - 1;
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
                                   const Kokkos::Array<int,maxComponents> &bounds,
                                   const int startIndex) const
      {
        // the following mirrors what is done in TensorData
        if (numComponents == 0)
        {
          return 0;
        }
        else
        {
          int enumerationIndex = 0;
          for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
          {
            enumerationIndex += arguments[r];
            enumerationIndex *= bounds[r-1];
          }
          enumerationIndex += arguments[startIndex];
          return enumerationIndex;
        }
      }
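
      // Worked example (illustrative): with numComponents = 3, bounds = {4,5,6}, startIndex = 0, and
      // arguments = {a0,a1,a2}, the loop computes ((a2 * 5) + a1) * 4 + a0, i.e., the flat index of
      // the multi-index with component 0 varying fastest -- the same enumeration order TensorData uses.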

      // nvcc refuses to compile the below, with the error "explicit specialization is not allowed in the current scope".  Clang is OK with it.  We just do a non-templated version below.
//      template<size_t numTensorComponents>
//      KOKKOS_INLINE_FUNCTION
//      void runSpecialized( const TeamMember & teamMember ) const;

//      template<>
//      KOKKOS_INLINE_FUNCTION
//      void runSpecialized<3>( const TeamMember & teamMember ) const
      KOKKOS_INLINE_FUNCTION
      void runSpecialized3( const TeamMember & teamMember ) const
      {
        constexpr int numTensorComponents = 3;

        Kokkos::Array<int,numTensorComponents> pointBounds;
        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          pointBounds[r]      = pointBounds_[r];
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const int cellDataOrdinal = teamMember.league_rank();
        const int numThreads      = teamMember.team_size();
        const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread

        Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged> scratchView;  // for caching partial integration values
        Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by the (expanded) point; stores M_ab * cell measure
        Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z; // cache the field values for faster access
        if (fad_size_output_ > 0) {
          scratchView   = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
          pointWeights  = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0],  pointBounds[0], fad_size_output_);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1],  pointBounds[1], fad_size_output_);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2],  pointBounds[2], fad_size_output_);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
        }
        else {
          scratchView   = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
          pointWeights  = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));
          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0],  pointBounds[0]);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1],  pointBounds[1]);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2],  pointBounds[2]);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

        constexpr int R = numTensorComponents - 1;

        const int composedTransformRank = composedTransform_.rank();

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            const Data<Scalar,DeviceType> & leftTensorComponent_x  = leftComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & leftTensorComponent_y  = leftComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & leftTensorComponent_z  = leftComponent_.getTensorComponent(2);
            const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);

            const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (const int& fieldOrdinalPointOrdinal) {
              const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
              const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
              if ((fieldOrdinal < leftTensorComponent_x.extent_int(0)) && (pointOrdinal < leftTensorComponent_x.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_x.rank();
                leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < leftTensorComponent_y.extent_int(0)) && (pointOrdinal < leftTensorComponent_y.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_y.rank();
                leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < leftTensorComponent_z.extent_int(0)) && (pointOrdinal < leftTensorComponent_z.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_z.rank();
                leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < rightTensorComponent_x.extent_int(0)) && (pointOrdinal < rightTensorComponent_x.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_x.rank();
                rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
              }
              if ((fieldOrdinal < rightTensorComponent_y.extent_int(0)) && (pointOrdinal < rightTensorComponent_y.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_y.rank();
                rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
              }
              if ((fieldOrdinal < rightTensorComponent_z.extent_int(0)) && (pointOrdinal < rightTensorComponent_z.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_z.rank();
                rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
              }
            });

            if (composedTransform_.underlyingMatchesLogical())
            {
              if (composedTransformRank == 4) // (C,P,D,D)
              {
                const auto & composedTransformView = composedTransform_.getUnderlyingView4();
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
              else // rank 2, then: (C,P)
              {
                const auto & composedTransformView = composedTransform_.getUnderlyingView2();
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
            }
            else
            {
              if (composedTransformRank == 4) // (C,P,D,D)
              {
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
              else // rank 2, then: (C,P)
              {
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
            }

//            approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

            // synchronize threads
            teamMember.team_barrier();

            // Setting scratchView to 0 here is not necessary from an algorithmic point of view, but *might* help with performance (due to a first-touch policy).
            const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
            for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
            {
              scratchView(i) = 0.0;
            }

            // TODO: consider adding an innerLoopRange that is sized to be the maximum of the size of the inner loops we'd like to parallelize over.  (Note that this means we do the work in the outer loop redundantly that many times…)
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
              const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
              const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;

              // As an optimization, we use the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R.
              // The "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds).
              Kokkos::Array<int,numTensorComponents> leftFieldArguments;
              Kokkos::Array<int,numTensorComponents> rightFieldArguments;
              rightFieldArguments[R] = jz;
              leftFieldArguments[R]  = iz;

              Kokkos::Array<int,numTensorComponents> pointArguments;
              for (int i=0; i<numTensorComponents; i++)
              {
                pointArguments[i] = 0;
              }
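
              // What follows is the sum-factorized contraction, read inside-out:
              //   Gz        = sum_lz  left_z(iz,lz) * w(lx,ly,lz) * right_z(jz,lz)   (one scalar per (lx,ly))
              //   Gy(iy,jy) = sum_ly  left_y(iy,ly) * Gz * right_y(jy,ly)            (accumulated over ly)
              //   I(i,j)   += sum_lx  left_x(ix,lx) * Gy(iy,jy) * right_x(jx,lx)     (written to integralView_)
              // so each quadrature point is visited once per (iz,jz) pair rather than once per full (i,j) pair.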

              for (int lx=0; lx<pointBounds[0]; lx++)
              {
                pointArguments[0] = lx;

                // clear Gy scratch:
                // in scratch, Gz (1 entry) comes first, then the Gy entries.
                const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
                const int Gy_end_index   = scratchOffsetForThread + offsetsForComponentOrdinal_[0];

                for (int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
                {
                  scratchView(Gy_index) = 0;
                }

                for (int ly=0; ly<pointBounds[1]; ly++)
                {
                  pointArguments[1] = ly;

                  Scalar * Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
                  *Gz = 0;

                  for (int lz=0; lz < pointBounds[2]; lz++)
                  {
                    const Scalar & leftValue  = leftFields_z(iz,lz);
                    const Scalar & rightValue = rightFields_z(jz,lz);

                    pointArguments[2] = lz;
                    int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);

                    *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;

//                    approximateFlopCount += 3; // 2 multiplies, 1 sum
                  } // lz

                  for (int iy=0; iy<leftFieldBounds_[1]; iy++)
                  {
                    leftFieldArguments[1] = iy;
                    const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);

                    const Scalar & leftValue = leftFields_y(iy,ly);

                    for (int jy=0; jy<rightFieldBounds_[1]; jy++)
                    {
                      rightFieldArguments[1] = jy;

                      const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
                      const Scalar & rightValue = rightFields_y(jy,ly);

                      const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
                      const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;

                      const int Gz_index = 0;
                      const Scalar & Gz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);

                      Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);

                      Gy += leftValue * Gz * rightValue;
//                      approximateFlopCount += 3; // 2 multiplies, 1 sum
                    }
                  }
                } // ly
                for (int ix=0; ix<leftFieldBounds_[0]; ix++)
                {
                  leftFieldArguments[0] = ix;
                  const Scalar & leftValue = leftFields_x(ix,lx);

                  for (int iy=0; iy<leftFieldBounds_[1]; iy++)
                  {
                    leftFieldArguments[1] = iy;

                    const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);

                    for (int jx=0; jx<rightFieldBounds_[0]; jx++)
                    {
                      rightFieldArguments[0] = jx;
                      const Scalar & rightValue = rightFields_x(jx,lx);

                      for (int jy=0; jy<rightFieldBounds_[1]; jy++)
                      {
                        rightFieldArguments[1] = jy;
                        const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);

                        const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];

                        const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
                        Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);

                        // compute enumeration indices to get the field indices into the output view
                        int leftEnumerationIndex  = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments,  leftFieldBounds, 0);
                        int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
                        const int leftFieldIndex  = leftEnumerationIndex  + leftFieldOrdinalOffset_;
                        const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;

                        if (integralViewRank == 3)
                        {
//                          if ((leftFieldIndex==0) && (rightFieldIndex==2))
//                          {
//                            using std::cout;
//                            using std::endl;
//                            cout << "***** Contribution to (0,0,2) *****\n";
//                            cout << "lx = " << lx << endl;
//                            cout << "ix = " << ix << endl;
//                            cout << "iy = " << iy << endl;
//                            cout << "jx = " << jx << endl;
//                            cout << "jy = " << jy << endl;
//                            cout << "iz = " << iz << endl;
//                            cout << "jz = " << jz << endl;
//                            cout << " leftValue = " << leftValue << endl;
//                            cout << "rightValue = " << rightValue << endl;
//                            cout << "Gy = " << Gy << endl;
//
//                            cout << endl;
//                          }

                          // shape (C,F1,F2)
                          integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
                        }
                        else
                        {
                          // shape (F1,F2)
                          integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
                        }
//                        approximateFlopCount += 3; // 2 multiplies, 1 sum
                      } // jy
                    } // jx
                  } // iy
                } // ix
              } // lx
            }); // TeamThreadRange parallel_for - (iz,jz) loop
          }
        }
//        std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
      }
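
      // Note on the specialization above: for the common 3-component (3D tensor-product) case, the
      // x/y/z field caches and hand-unrolled loops avoid the generic argument-iteration machinery
      // that run() below uses; forceNonSpecialized_ exists so tests can compare the two code paths.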

      template<size_t numTensorComponents>
      KOKKOS_INLINE_FUNCTION
      void run( const TeamMember & teamMember ) const
      {
        Kokkos::Array<int,numTensorComponents> pointBounds;
        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          pointBounds[r]      = pointBounds_[r];
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const int cellDataOrdinal = teamMember.league_rank();
        const int numThreads      = teamMember.team_size();
        const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread
        Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged> scratchView;  // for caching partial integration values
        Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by the (expanded) point; stores M_ab * cell measure
        Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values for faster access
        if (fad_size_output_ > 0) {
          scratchView  = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
          pointWeights = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
          leftFields   = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_,  maxPointCount_, fad_size_output_);
          rightFields  = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
        }
        else {
          scratchView  = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
          pointWeights = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));
          leftFields   = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_,  maxPointCount_);
          rightFields  = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

        constexpr int R = numTensorComponents - 1;

        const int composedTransformRank = composedTransform_.rank();

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (const int& r) {
              const Data<Scalar,DeviceType> & leftTensorComponent  = leftComponent_.getTensorComponent(r);
              const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.getTensorComponent(r);
              const int leftFieldCount  = leftTensorComponent.extent_int(0);
              const int pointCount      = leftTensorComponent.extent_int(1);
              const int leftRank        = leftTensorComponent.rank();
              const int rightFieldCount = rightTensorComponent.extent_int(0);
              const int rightRank       = rightTensorComponent.rank();
              for (int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
              {
                // slightly weird logic here, in an effort to avoid branch divergence
                const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
                for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
                {
                  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
                  leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
                }
              }
              for (int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
              {
                // slightly weird logic here, in an effort to avoid branch divergence
                const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
                for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
                {
                  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
                  rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
                }
              }
            });

            if (composedTransformRank == 4)
            {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
              });
            }
            else
            {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
              });
            }
//            approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

            // synchronize threads
            teamMember.team_barrier();

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
              const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
              const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
              const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;

              // As an optimization, we use the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R.
              // The "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds).
              Kokkos::Array<int,numTensorComponents> leftFieldArguments;
              Kokkos::Array<int,numTensorComponents> rightFieldArguments;
              rightFieldArguments[R] = j_R;
              leftFieldArguments[R]  = i_R;

              // TODO: I believe that this can be moved outside the thread parallel_for
              for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
              {
                scratchView(i) = 0.0;
              }
              Kokkos::Array<int,numTensorComponents> pointArguments;
              for (unsigned i=0; i<numTensorComponents; i++)
              {
                pointArguments[i] = 0;
              }

              int r = R;
              while (r >= 0)
              {
                // integrate in the final component dimension; this is where we need the M weight, as well as the weighted measure
                const int pointBounds_R = pointBounds[R];
                int & pointArgument_R = pointArguments[R];
                for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
                {
                  Scalar * G_R;
                  if (R != 0)
                  {
                    G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
                  }
                  else
                  {
                    const int leftFieldIndex  = i_R + leftFieldOrdinalOffset_;
                    const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;

                    if (integralViewRank == 3)
                    {
                      // shape (C,F1,F2)
                      G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
                    }
                    else
                    {
                      // shape (F1,F2)
                      G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
                    }
                  }

                  const int & pointIndex_R = pointArguments[R];

                  const Scalar & leftValue  = leftFields(R,i_R,pointIndex_R);
                  const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);

                  int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);

                  *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;

//                  approximateFlopCount += 3; // 2 multiplies, 1 sum
                } // pointArgument_R

                const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents-1 means that we won't touch the [R] argument, which is treated in the for loop above
                const int r_min  = (r_next >= 0) ? r_next : 0;

                for (int s=R-1; s>=r_min; s--)
                {
                  const int & pointIndex_s = pointArguments[s];

                  // want to cover all the multi-indices from s to R-1
                  for (int s1=s; s1<R; s1++)
                  {
                    leftFieldArguments[s1] = 0;
                  }

                  // i_s, j_s are the indices into the "current" tensor component; these are references, so they actually vary as the arguments are incremented.
                  const int & i_s = leftFieldArguments[s];
                  const int & j_s = rightFieldArguments[s];

                  int sLeft = s; // hereafter, sLeft is the return from the left field increment: the lowest rank that was incremented.  If this is less than s, we've gotten through all the multi-indices from rank s through R-1 (inclusive).
                  const int & rightEnumerationSpan_s  = rightFieldRelativeEnumerationSpans_[s];
                  const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];

                  while (sLeft >= s)
                  {
                    const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);

                    // for the final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
                    const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);

                    const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);

                    for (int s1=s; s1<R; s1++)
                    {
                      rightFieldArguments[s1] = 0;
                    }
                    int sRight = s; // hereafter, sRight is the return from the right field increment: the lowest rank that was incremented.  If this is less than s, we've gotten through all the multi-indices from rank s through R-1 (inclusive).
                    while (sRight >= s)
                    {
                      const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);

                      // for the final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
                      const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);

                      const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);

                      const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;

                      Scalar* G_s;

                      // for the final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage
                      // (above, we have set leftEnumerationIndex_sp, rightEnumerationIndex_sp to be 0 in this case, so G_sp_index will then also be 0)
                      const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;

                      const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);

                      if (s != 0)
                      {
                        G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
                      }
                      else
                      {
                        // compute enumeration indices
                        int leftEnumerationIndex  = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments,  leftFieldBounds, 0);
                        int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
                        const int leftFieldIndex  = leftEnumerationIndex  + leftFieldOrdinalOffset_;
                        const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;

                        if (integralViewRank == 3)
                        {
                          // shape (C,F1,F2)
                          G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
                        }
                        else
                        {
                          // shape (F1,F2)
                          G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
                        }
                      }

                      *G_s += leftValue * G_sp * rightValue;

//                      approximateFlopCount += 3; // 2 multiplies, 1 sum

                      // increment rightField
                      sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
                    }

                    // increment leftField
                    sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
                  }
                }

                // clear tempStorage for r_next+1 to R
                if (r_min+1 <= R)
                {
                  const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
                  for (int i=scratchOffsetForThread; i<endIndex; i++)
                  {
                    scratchView(i) = 0.0;
                  }
//                  auto tempStorageSubview = Kokkos::subview(scratchView, Kokkos::pair<int,int>{0,offsetsForComponentOrdinal_[r_min]});
//                  Kokkos::deep_copy(tempStorageSubview, 0.0);
                }

                // proceed to the next point
                r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents-1 means that we won't touch the [R] argument, which is treated in the G_R integration for loop above
              }
            }); // TeamThreadRange parallel_for
          }
        }
//        std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
      }

      KOKKOS_INLINE_FUNCTION
      void operator()( const TeamMember & teamMember ) const
      {
        switch (numTensorComponents_)
        {
          case 1: run<1>(teamMember); break;
          case 2: run<2>(teamMember); break;
          case 3:
            if (forceNonSpecialized_)
              run<3>(teamMember);
            else
              runSpecialized3(teamMember);
            break;
          case 4: run<4>(teamMember); break;
          case 5: run<5>(teamMember); break;
          case 6: run<6>(teamMember); break;
          case 7: run<7>(teamMember); break;
#ifdef INTREPID2_HAVE_DEBUG
          default:
            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
#endif
        }
      }

      //! returns an approximate count of the floating-point operations performed per cell
      long approximateFlopCountPerCell() const
      {
        // compute flop count on a single cell, then multiply by the number of cells
        int flopCount = 0;

        const int R = numTensorComponents_ - 1;

        // we cache the value of M_ab * cellMeasure at each point.
        // access to cellMeasures involves cellMeasures.numTensorComponents() - 1 multiplies, so the total is the below:
        const int flopsPerPoint_ab = cellMeasures_.numTensorComponents(); // the access involves multiplying all the components together

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            flopCount += flopsPerPoint_ab * cellMeasures_.extent_int(1);

            int flopsPer_i_R_j_R = 0;

            // As an optimization, we use the same argument arrays for both the truncated fields that span s to R-1 and the full ones that go to R.
            // The "R" argument here is ignored by the methods that treat the truncated fields (it's beyond their bounds).
            Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
            leftFieldArguments[R] = 0;

            Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
            for (int i=0; i<numTensorComponents_; i++)
            {
              pointArguments[i] = 0;
            }

            Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
            rightFieldArguments[R] = 0;

            int r = R;
            while (r >= 0)
            {
              // integrate in the final component dimension
              for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
              {
                flopsPer_i_R_j_R += 4;
              }
              // TODO: figure out why the below is not the same thing as the above -- the below overcounts, somehow
//              if (0 < pointBounds_[R])
//              {
//                flopsPer_i_R_j_R += pointBounds_[R] * 4;
//              }

              const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
              const int r_min  = (r_next >= 0) ? r_next : 0;

              for (int s=R-1; s>=r_min; s--)
              {
                // want to cover all the multi-indices from s to R-1: for each we have 2 multiplies and one add (3 flops)
                int numLeftIterations  = leftFieldRelativeEnumerationSpans_[s];
                int numRightIterations = rightFieldRelativeEnumerationSpans_[s];

                flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
              }

              // proceed to the next point
              r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
            }

            flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
          }
        }
//        std::cout << "flop count per cell: " << flopCount << std::endl;
        return flopCount;
      }

      //! returns the team size to use, given the maximum team size that Kokkos allows
      int teamSize(const int &maxTeamSizeFromKokkos) const
      {
        const int R = numTensorComponents_ - 1;
        const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
        return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
      }

      //! Kokkos hook that provides the amount of shared memory, in bytes, required for each team
      size_t team_shmem_size (int team_size) const
      {
        // we use shared memory to create a fast buffer for intermediate values, as well as for fast access to the current cell's field values
        size_t shmem_size = 0;

        if (fad_size_output_ > 0)
        {
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1), fad_size_output_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_,  maxPointCount_, fad_size_output_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
        }
        else
        {
          shmem_size += Kokkos::View<Scalar*,   DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1));
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_,  maxPointCount_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
        }

        return shmem_size;
      }
    };
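
    // Usage sketch (illustrative only; names such as numCells and integralData are assumptions, not
    // part of this header).  IntegrationTools::integrate() drives functors like the one above roughly as follows:
    //
    //   using FunctorType = F_Integrate<Scalar,DeviceType,3>;
    //   FunctorType functor(integralData, leftComponent, composedTransform, rightComponent,
    //                       cellMeasures, a_offset, b_offset, 0, 0, false);
    //   Kokkos::TeamPolicy<typename DeviceType::execution_space> policyForSizing(numCells, Kokkos::AUTO());
    //   const int maxTeamSize = policyForSizing.team_size_max(functor, Kokkos::ParallelForTag());
    //   Kokkos::TeamPolicy<typename DeviceType::execution_space> policy(numCells, functor.teamSize(maxTeamSize)); // one team per cell
    //   Kokkos::parallel_for(policy, functor); // team_shmem_size() above sizes the per-team scratch views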

    template<class Scalar, class DeviceType, int integralViewRank>
    class F_IntegratePointValueCache
    {
      using ExecutionSpace = typename DeviceType::execution_space;
      using TeamPolicy = Kokkos::TeamPolicy<DeviceType>;
      using TeamMember = typename TeamPolicy::member_type;

      using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
      IntegralViewType integralView_;
      TensorData<Scalar,DeviceType> leftComponent_;
      Data<Scalar,DeviceType> composedTransform_;
      TensorData<Scalar,DeviceType> rightComponent_;
      TensorData<Scalar,DeviceType> cellMeasures_;
      int a_offset_;
      int b_offset_;
      int leftComponentSpan_;  // the number of dimensions spanned by the left component
      int rightComponentSpan_; // the number of dimensions spanned by the right component
      int numTensorComponents_;
      int leftFieldOrdinalOffset_;
      int rightFieldOrdinalOffset_;

      size_t fad_size_output_ = 0; // 0 if not a Fad type

      // As an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator.
      // (This also makes it easier to reorder loops, etc., for further optimizations.)
      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;

      int maxFieldsLeft_;
      int maxFieldsRight_;
      int maxPointCount_;
    public:
      F_IntegratePointValueCache(Data<Scalar,DeviceType> integralData,
                                 TensorData<Scalar,DeviceType> leftComponent,
                                 Data<Scalar,DeviceType> composedTransform,
                                 TensorData<Scalar,DeviceType> rightComponent,
                                 TensorData<Scalar,DeviceType> cellMeasures,
                                 int a_offset,
                                 int b_offset,
                                 int leftFieldOrdinalOffset,
                                 int rightFieldOrdinalOffset)
      :
      integralView_(integralData.template getUnderlyingView<integralViewRank>()),
      leftComponent_(leftComponent),
      composedTransform_(composedTransform),
      rightComponent_(rightComponent),
      cellMeasures_(cellMeasures),
      a_offset_(a_offset),
      b_offset_(b_offset),
      leftComponentSpan_(leftComponent.extent_int(2)),
      rightComponentSpan_(rightComponent.extent_int(2)),
      numTensorComponents_(leftComponent.numTensorComponents()),
      leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
      rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
      {
        INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");

        const int FIELD_DIM = 0;
        const int POINT_DIM = 1;
        maxFieldsLeft_  = 0;
        maxFieldsRight_ = 0;
        maxPointCount_  = 0;
        for (int r=0; r<numTensorComponents_; r++)
        {
          leftFieldBounds_[r]  = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsLeft_       = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
          rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsRight_      = std::max(maxFieldsRight_, rightFieldBounds_[r]);
          pointBounds_[r]      = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
          maxPointCount_       = std::max(maxPointCount_, pointBounds_[r]);
        }

        // prepare for allocation of temporary storage
        // note: tempStorage goes "backward", starting from the final component, which needs just one entry

        const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
        if (allocateFadStorage)
        {
          fad_size_output_ = dimension_scalar(integralView_);
        }
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,maxComponents> &arguments,
                            const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                            const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                            const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
                              const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                              const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                              const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = numComponents - 1;
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
                                   const Kokkos::Array<int,maxComponents> &bounds,
                                   const int startIndex) const
      {
        // the following mirrors what is done in TensorData
        if (numComponents == 0)
        {
          return 0;
        }
        int enumerationIndex = 0;
        for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
        {
          enumerationIndex += arguments[r];
          enumerationIndex *= bounds[r-1];
        }
        enumerationIndex += arguments[startIndex];
        return enumerationIndex;
      }

      template<int rank>
      KOKKOS_INLINE_FUNCTION
      enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
      integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
      {
        return integralView(cellDataOrdinal,i,j);
      }

      template<int rank>
      KOKKOS_INLINE_FUNCTION
      enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
      integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
      {
        return integralView(i,j);
      }
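
      // The two overloads above use enable_if_t to dispatch on the compile-time view rank: the rank-3
      // variant indexes (cell,i,j), while the rank-2 variant ignores cellDataOrdinal (the (F1,F2) case).
      // A call such as
      //   integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += value;
      // therefore compiles to the correct indexing without a runtime branch.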
1176 
1178  KOKKOS_INLINE_FUNCTION
1179  void runSpecialized3( const TeamMember & teamMember ) const
1180  {
1181  constexpr int numTensorComponents = 3;
1182 
1183  const int pointBounds_x = pointBounds_[0];
1184  const int pointBounds_y = pointBounds_[1];
1185  const int pointBounds_z = pointBounds_[2];
1186  const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;
1187 
1188  const int leftFieldBounds_x = leftFieldBounds_[0];
1189  const int rightFieldBounds_x = rightFieldBounds_[0];
1190  const int leftFieldBounds_y = leftFieldBounds_[1];
1191  const int rightFieldBounds_y = rightFieldBounds_[1];
1192  const int leftFieldBounds_z = leftFieldBounds_[2];
1193  const int rightFieldBounds_z = rightFieldBounds_[2];
1194 
1195  Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1196  Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1197  for (unsigned r=0; r<numTensorComponents; r++)
1198  {
1199  leftFieldBounds[r] = leftFieldBounds_[r];
1200  rightFieldBounds[r] = rightFieldBounds_[r];
1201  }
1202 
1203  const auto integralView = integralView_;
1204  const auto leftFieldOrdinalOffset = leftFieldOrdinalOffset_;
1205  const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;
1206 
1207  const int cellDataOrdinal = teamMember.league_rank();
1208  const int threadNumber = teamMember.team_rank();
1209 
1210  const int numThreads = teamMember.team_size(); // num threads
1211  const int GyEntryCount = pointBounds_z; // for each thread: store one Gy value per z coordinate
1212  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals; // for caching Gx values: we integrate out the first component dimension for each coordinate in the remaining dimensios
1213  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals; // for caching Gy values (each thread gets a stack, of the same height as tensorComponents - 1)
1214  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1215 
1216  Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
1217  Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
1218  Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
1219  if (fad_size_output_ > 0) {
1220  GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1221  GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
1222  pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1223 
1224  leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x, fad_size_output_);
1225  rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
1226  leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y, fad_size_output_);
1227  rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
1228  leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z, fad_size_output_);
1229  rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
1230  }
1231  else {
1232  GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1233  GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
1234  pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1235 
1236  leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x);
1237  rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
1238  leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y);
1239  rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
1240  leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z);
1241  rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
1242  }
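 // The unmanaged Views above carve the team's single scratch arena up in construction order;
 // team_shmem_size() (further below) must request a matching total, i.e. (schematically):
 //   total = shmem_size(GxIntegrals) + shmem_size(GyIntegrals) + shmem_size(pointWeights)
 //         + shmem_size(leftFields_x) + shmem_size(rightFields_x)
 //         + shmem_size(leftFields_y) + shmem_size(rightFields_y)
 //         + shmem_size(leftFields_z) + shmem_size(rightFields_z)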
1243 
1244 // int approximateFlopCount = 0;
1245 // int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;
1246 
1247  // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1248 
1249  const int composedTransformRank = composedTransform_.rank();
1250 
1251  // synchronize threads
1252  teamMember.team_barrier();
1253 
1254  for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1255  {
1256  const int a = a_offset_ + a_component;
1257  for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1258  {
1259  const int b = b_offset_ + b_component;
1260 
1261  const Data<Scalar,DeviceType> & leftTensorComponent_x = leftComponent_.getTensorComponent(0);
1262  const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
1263  const Data<Scalar,DeviceType> & leftTensorComponent_y = leftComponent_.getTensorComponent(1);
1264  const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
1265  const Data<Scalar,DeviceType> & leftTensorComponent_z = leftComponent_.getTensorComponent(2);
1266  const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);
1267 
1268  const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
1269  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (const int& fieldOrdinal) {
1270  if (fieldOrdinal < leftTensorComponent_x.extent_int(0))
1271  {
1272  const int pointCount = leftTensorComponent_x.extent_int(1);
1273  const int leftRank = leftTensorComponent_x.rank();
1274  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1275  {
1276  leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1277  }
1278  }
1279  if (fieldOrdinal < leftTensorComponent_y.extent_int(0))
1280  {
1281  const int pointCount = leftTensorComponent_y.extent_int(1);
1282  const int leftRank = leftTensorComponent_y.rank();
1283  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1284  {
1285  leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1286  }
1287  }
1288  if (fieldOrdinal < leftTensorComponent_z.extent_int(0))
1289  {
1290  const int pointCount = leftTensorComponent_z.extent_int(1);
1291  const int leftRank = leftTensorComponent_z.rank();
1292  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1293  {
1294  leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1295  }
1296  }
1297  if (fieldOrdinal < rightTensorComponent_x.extent_int(0))
1298  {
1299  const int pointCount = rightTensorComponent_x.extent_int(1);
1300  const int rightRank = rightTensorComponent_x.rank();
1301  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1302  {
1303  rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1304  }
1305  }
1306  if (fieldOrdinal < rightTensorComponent_y.extent_int(0))
1307  {
1308  const int pointCount = rightTensorComponent_y.extent_int(1);
1309  const int rightRank = rightTensorComponent_y.rank();
1310  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1311  {
1312  rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1313  }
1314  }
1315  if (fieldOrdinal < rightTensorComponent_z.extent_int(0))
1316  {
1317  const int pointCount = rightTensorComponent_z.extent_int(1);
1318  const int rightRank = rightTensorComponent_z.rank();
1319  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1320  {
1321  rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1322  }
1323  }
1324  });
1325 
1326  if (composedTransformRank == 4) // (C,P,D,D)
1327  {
1328  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1329  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1330  });
1331  }
1332  else // (C,P)
1333  {
1334  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1335  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1336  });
1337  }
1338 
1339  // synchronize threads
1340  teamMember.team_barrier();
1341 
1342  for (int i0=0; i0<leftFieldBounds_x; i0++)
1343  {
1344  for (int j0=0; j0<rightFieldBounds_x; j0++)
1345  {
1346  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1347  // first component is fastest-moving; we can get a tensorPointEnumerationOffset just by multiplying by the pointBounds in x
1348  const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions; // compute offset for pointWeights container, for which x is the fastest-moving
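 // Worked example (illustrative, with hypothetical bounds): if (pointBounds_x, pointBounds_y, pointBounds_z) = (3,4,5),
 // the expanded point (x,y,z) enumerates as x + 3*(y + 4*z). Fixing the later dimensions --
 // i.e., fixing pointEnumerationIndexLaterDimensions = y + 4*z -- the x points occupy the
 // contiguous range [3*(y + 4*z), 3*(y + 4*z) + 3), which is exactly the offset computed above.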
1349 
1350  Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1351 
1352  Gx = 0;
1353  if (fad_size_output_ == 0)
1354  {
1355  // not a Fad type; we're allowed to use a vector range
1356  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1357  {
1358  integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1359  }, Gx);
1360  }
1361  else
1362  {
1363  for (int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1364  {
1365  Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1366  }
1367  }
1368  });
1369 
1370  // synchronize threads
1371  teamMember.team_barrier();
1372 
1373  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (const int& i1j1) {
1374  const int i1 = i1j1 % leftFieldBounds_y;
1375  const int j1 = i1j1 / leftFieldBounds_y;
1376 
1377  int Gy_index_offset = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1378 
1379  for (int lz=0; lz<pointBounds_z; lz++)
1380  {
1381  int pointEnumerationIndex = lz * pointBounds_y;
1382  if (fad_size_output_ == 0)
1383  {
1384  Scalar Gy_local = 0;
1385 
1386  // not a Fad type; we're allowed to use a vector range
1387  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_y), [&] (const int &ly, Scalar &integralThusFar)
1388  {
1389  const Scalar & leftValue = leftFields_y(i1,ly);
1390  const Scalar & rightValue = rightFields_y(j1,ly);
1391 
1392  integralThusFar += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
1393  }, Gy_local);
1394 
1395  GyIntegrals(Gy_index_offset + lz) = Gy_local;
1396  }
1397  else
1398  {
1399  Scalar & Gy = GyIntegrals(Gy_index_offset + lz);  Gy = 0.0; // zero first: the shared-memory entry may hold a stale value from a previous (i0,j0) iteration
1400  for (int ly=0; ly<pointBounds_y; ly++)
1401  {
1402  const Scalar & leftValue = leftFields_y(i1,ly);
1403  const Scalar & rightValue = rightFields_y(j1,ly);
1404 
1405  Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
1406  }
1407  }
1408  }
1409 
1410  for (int i2=0; i2<leftFieldBounds_z; i2++)
1411  {
1412  for (int j2=0; j2<rightFieldBounds_z; j2++)
1413  {
1414  Scalar Gz = 0.0;
1415 
1416  int Gy_index_offset = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1417 
1418  if (fad_size_output_ == 0)
1419  {
1420  // not a Fad type; we're allowed to use a vector range
1421  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_z), [&] (const int &lz, Scalar &integralThusFar)
1422  {
1423  const Scalar & leftValue = leftFields_z(i2,lz);
1424  const Scalar & rightValue = rightFields_z(j2,lz);
1425 
1426  integralThusFar += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
1427  }, Gz);
1428  }
1429  else
1430  {
1431  for (int lz=0; lz<pointBounds_z; lz++)
1432  {
1433  const Scalar & leftValue = leftFields_z(i2,lz);
1434  const Scalar & rightValue = rightFields_z(j2,lz);
1435 
1436  Gz += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
1437  }
1438  }
1439 
1440  const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1441  const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1442  // the two lines above are an optimized equivalent of the two commented-out lines below; the rewrite was undertaken to work around an Intel compiler segfault, possibly a compiler bug.
1443 // const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset;
1444 // const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset;
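 // Worked example (illustrative, with hypothetical bounds): if leftFieldBounds = (2,3,4), the field tuple
 // (i0,i1,i2) enumerates as i0 + 2*(i1 + 3*i2), which is what the expression for i above
 // computes (plus the leftFieldOrdinalOffset); similarly for j.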
1445 
1446  Kokkos::single (Kokkos::PerThread(teamMember), [&] () {
1447  integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1448  });
1449  }
1450  }
1451  });
1452  // synchronize threads
1453  teamMember.team_barrier();
1454  }
1455  }
1456  }
1457  }
1458  }
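 // To summarize runSpecialized3: it evaluates the separable sum
 //   integral(c,i,j) += sum_z Lz(i2,z)*Rz(j2,z) * [ sum_y Ly(i1,y)*Ry(j1,y) * [ sum_x Lx(i0,x)*Rx(j0,x) * w(c,x,y,z) ] ]
 // where w is pointWeights (composed transform times cell measure). The innermost bracket is
 // cached in GxIntegrals (one entry per (y,z)) and the middle bracket in GyIntegrals (one entry
 // per z), so each partial sum is computed once and reused across the field ordinals that share it.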
1459 
1460  template<size_t numTensorComponents>
1461  KOKKOS_INLINE_FUNCTION
1462  void run( const TeamMember & teamMember ) const
1463  {
1464  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::logic_error, "implementation incomplete");
1465 // Kokkos::Array<int,numTensorComponents> pointBounds;
1466 // Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1467 // Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1468 //
1469 // int pointsInNonzeroComponentDimensions = 1;
1470 // for (unsigned r=0; r<numTensorComponents; r++)
1471 // {
1472 // pointBounds[r] = pointBounds_[r];
1473 // if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1474 // leftFieldBounds[r] = leftFieldBounds_[r];
1475 // rightFieldBounds[r] = rightFieldBounds_[r];
1476 // }
1477 //
1478 // const int cellDataOrdinal = teamMember.league_rank();
1479 // const int numThreads = teamMember.team_size(); // num threads
1480 // const int G_k_StackHeight = numTensorComponents - 1; // per thread
1481 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_0_IntegralsView; // for caching G0 values: we integrate out the first component dimension for each coordinate in the remaining dimensions
1482 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_k_StackView; // for caching G_k values (each thread gets a stack, of the same height as tensorComponents - 1)
1483 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1484 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values at each level for faster access
1485 // if (fad_size_output_ > 0) {
1486 // G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads, fad_size_output_);
1487 // G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1488 // pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1489 // leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1490 // rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1491 // }
1492 // else {
1493 // G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads);
1494 // G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1495 // pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1496 // leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1497 // rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1498 // }
1499 //
1502 //
1503 // constexpr int R = numTensorComponents - 1;
1504 //
1505 // // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1506 //
1507 // // synchronize threads
1508 // teamMember.team_barrier();
1509 //
1510 // for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1511 // {
1512 // const int a = a_offset_ + a_component;
1513 // for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1514 // {
1515 // const int b = b_offset_ + b_component;
1516 //
1517 // const Data<Scalar,DeviceType> & leftFirstComponent = leftComponent_.getTensorComponent(0);
1518 // const Data<Scalar,DeviceType> & rightFirstComponent = rightComponent_.getTensorComponent(0);
1519 //
1520 // const int numLeftFieldsFirst = leftFirstComponent.extent_int(0); // shape (F,P[,D])
1521 // const int numRightFieldsFirst = rightFirstComponent.extent_int(0); // shape (F,P[,D])
1522 //
1523 // const int numPointsFirst = leftFirstComponent.extent_int(1);
1524 //
1525 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1526 // pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1527 // });
1528 //
1529 // // synchronize threads
1530 // teamMember.team_barrier();
1531 //
1532 // for (int i0=0; i0<numLeftFieldsFirst; i0++)
1533 // {
1534 // for (int j0=0; j0<numRightFieldsFirst; j0++)
1535 // {
1536 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1537 // const int fieldOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1538 // const int pointOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1539 // leftFields(pointOrdinal) = leftFirstComponent(fieldOrdinal,pointOrdinal);
1540 // });
1541 //
1542 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numRightFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1543 // const int fieldOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1544 // const int pointOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1545 // rightFields(pointOrdinal) = rightFirstComponent(fieldOrdinal,pointOrdinal);
1546 // });
1547 //
1548 // // synchronize threads
1549 // teamMember.team_barrier();
1550 //
1551 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1552 // Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1553 // int remainingIndex = pointEnumerationIndexLaterDimensions;
1554 //
1555 // for (int d=R-1; d>0; d--) // last component (z in 3D hypercube) is fastest-moving // TODO: consider doing first component as fastest-moving. That would make indexing into pointWeights simpler
1556 // {
1557 // pointArgumentsInLaterDimensions[d] = remainingIndex % pointBounds[d+1];
1558 // remainingIndex /= pointBounds[d+1];
1559 // }
1560 // pointArgumentsInLaterDimensions[0] = remainingIndex;
1561 //
1562 // int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1563 // for (int d=R; d>0; d--)
1564 // {
1565 // tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1566 // tensorPointEnumerationOffset *= pointBounds[d-1];
1567 // }
1568 //
1569 // Scalar integralValue = 0;
1570 // if (fad_size_output_ == 0)
1571 // {
1572 // // not a Fad type; we're allowed to use a vector range
1573 // Kokkos::parallel_reduce("first component integral", Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1574 // {
1575 // integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1576 // }, integralValue);
1577 // }
1578 // else
1579 // {
1580 // for (int pointOrdinal=0; pointOrdinal<numPointsFirst; pointOrdinal++)
1581 // {
1582 // integralValue += leftFields(pointOrdinal) * rightFields(pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1583 // }
1584 // }
1585 //
1586 // G_0_IntegralsView(pointEnumerationIndexLaterDimensions) = integralValue;
1587 // });
1588 //
1589 // // synchronize threads
1590 // teamMember.team_barrier();
1591 //
1592 // // TODO: finish this, probably after having written up the algorithm for arbitrary component count. (I have it written down for 3D.)
1593 // }
1594 // }
1595 // }
1596 // }
1598  }
1599 
1600  KOKKOS_INLINE_FUNCTION
1601  void operator()( const TeamMember & teamMember ) const
1602  {
1603  switch (numTensorComponents_)
1604  {
1605  case 1: run<1>(teamMember); break;
1606  case 2: run<2>(teamMember); break;
1607  case 3: runSpecialized3(teamMember); break;
1608  case 4: run<4>(teamMember); break;
1609  case 5: run<5>(teamMember); break;
1610  case 6: run<6>(teamMember); break;
1611  case 7: run<7>(teamMember); break;
1612 #ifdef INTREPID2_HAVE_DEBUG
1613  default:
1614  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
1615 #endif
1616  }
1617  }
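 // A hedged sketch of how a caller might launch this functor; "functor" and "cellCount"
 // are illustrative placeholders, not members of this class:
 //
 //   Kokkos::TeamPolicy<ExecutionSpace> policy(cellCount /* league size: one team per cell */, Kokkos::AUTO());
 //   Kokkos::parallel_for("sum-factorized integral", policy, functor);
 //
 // Because the functor defines team_shmem_size() (below), Kokkos consults it at launch
 // to size each team's scratch arena.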
1618 
1619  //! returns an estimate of the number of floating point operations per cell
1620  long approximateFlopCountPerCell() const
1621  {
1622  // compute flop count on a single cell
1623  int flopCount = 0;
1624 
1625  constexpr int numTensorComponents = 3;
1626  Kokkos::Array<int,numTensorComponents> pointBounds;
1627  Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1628  Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1629 
1630  int pointsInNonzeroComponentDimensions = 1;
1631  for (unsigned r=0; r<numTensorComponents; r++)
1632  {
1633  pointBounds[r] = pointBounds_[r];
1634  if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1635  leftFieldBounds[r] = leftFieldBounds_[r];
1636  rightFieldBounds[r] = rightFieldBounds_[r];
1637  }
1638 
1639  for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1640  {
1641  for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1642  {
1643 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1644 // pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1645 // });
1646  flopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // one multiply by the transform per point, plus the (numTensorComponents - 1) multiplies that each cellMeasures access performs
1647 
1648  for (int i0=0; i0<leftFieldBounds[0]; i0++)
1649  {
1650  for (int j0=0; j0<rightFieldBounds[0]; j0++)
1651  {
1652  flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3; // 3 flops per integration point in the loop commented out below
1653 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1654 // Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1655 // int remainingIndex = pointEnumerationIndexLaterDimensions;
1656 //
1657 // for (int d=0; d<R-1; d++) // first component is fastest-moving; this is determined by order of access in the lz/ly loop to compute Gy (integrals in y dimension)
1658 // {
1659 // pointArgumentsInLaterDimensions[d] = remainingIndex % pointBounds[d+1]; // d+1 because x dimension is being integrated away
1660 // remainingIndex /= pointBounds[d+1];
1661 // }
1662 // pointArgumentsInLaterDimensions[R-1] = remainingIndex;
1663 //
1664 // int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1665 // for (int d=R; d>0; d--)
1666 // {
1667 // tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1668 // tensorPointEnumerationOffset *= pointBounds[d-1];
1669 // }
1670 //
1671 // Scalar integralValue = 0;
1672 // if (fad_size_output_ == 0)
1673 // {
1674 // // not a Fad type; we're allowed to use a vector range
1675 // Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1676 // {
1677 // integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1678 // }, integralValue);
1679 // }
1680 // else
1681 // {
1682 // for (int x_pointOrdinal=0; x_pointOrdinal<numPointsFirst; x_pointOrdinal++)
1683 // {
1684 // integralValue += leftFields_x(x_pointOrdinal) * rightFields_x(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1685 // }
1686 // }
1687 //
1688 // GxIntegrals(pointEnumerationIndexLaterDimensions) = integralValue;
1689 // });
1690 
1691 
1692  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3; // 3 flops for each Gy += line in the commented-out code below
1693  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3; // 3 flops for each Gz += line in the commented-out code below
1694  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1; // 1 flop for the integralView += line in the commented-out code below
1695 
1696 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds[1] * rightFieldBounds[1]), [&] (const int& i1j1) {
1697 // const int i1 = i1j1 % leftFieldBounds[1];
1698 // const int j1 = i1j1 / leftFieldBounds[1];
1699 //
1700 // int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1701 //
1702 // int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
1703 // for (int lz=0; lz<pointBounds[2]; lz++)
1704 // {
1705 // Scalar & Gy = GyIntegrals(Gy_index);
1706 // Gy = 0.0;
1707 //
1708 // const bool leftRankIs3 = ( leftFields_y.rank() == 3);
1709 // const bool rightRankIs3 = (rightFields_y.rank() == 3);
1710 // for (int ly=0; ly<pointBounds[1]; ly++)
1711 // {
1712 // const Scalar & leftValue = leftRankIs3 ? leftFields_y(i1,ly,a_component) : leftFields_y(i1,ly);
1713 // const Scalar & rightValue = rightRankIs3 ? rightFields_y(j1,ly,b_component) : rightFields_y(j1,ly);
1714 //
1715 // Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1716 //
1717 // pointEnumerationIndex++;
1718 // }
1719 // Gy_index++;
1720 // }
1721 //
1722 // for (int i2=0; i2<leftFieldBounds[2]; i2++)
1723 // {
1724 // for (int j2=0; j2<rightFieldBounds[2]; j2++)
1725 // {
1726 // Scalar Gz = 0.0;
1727 //
1728 // int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1729 //
1730 // const bool leftRankIs3 = ( leftFields_z.rank() == 3);
1731 // const bool rightRankIs3 = (rightFields_z.rank() == 3);
1732 // for (int lz=0; lz<pointBounds[2]; lz++)
1733 // {
1734 // const Scalar & leftValue = leftRankIs3 ? leftFields_z(i2,lz,a_component) : leftFields_z(i2,lz);
1735 // const Scalar & rightValue = rightRankIs3 ? rightFields_z(j2,lz,b_component) : rightFields_z(j2,lz);
1736 //
1737 // Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1738 //
1739 // Gy_index++;
1740 // }
1741 //
1742 // Kokkos::Array<int,3> leftArguments {i0,i1,i2};
1743 // Kokkos::Array<int,3> rightArguments {j0,j1,j2};
1744 //
1745 // const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset_;
1746 // const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset_;
1747 //
1748 // if (integralViewRank == 2)
1749 // {
1750 // integralView_.access(i,j,0) += Gz;
1751 // }
1752 // else
1753 // {
1754 // integralView_.access(cellDataOrdinal,i,j) += Gz;
1755 // }
1756 // }
1757 // }
1758 // });
1759  }
1760  }
1761  }
1762  }
1763  return flopCount;
1764  }
1765 
1766  //! returns the team size to use when launching the functor, given the maximum team size that Kokkos allows
1767  int teamSize(const int &maxTeamSizeFromKokkos) const
1768  {
1769  // TODO: fix this to match the actual parallelism expressed
1770  const int R = numTensorComponents_ - 1;
1771  const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1772  return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1773  }
1774 
1775  //! returns the number of bytes of shared memory required by each team
1776  size_t team_shmem_size (int numThreads) const
1777  {
1778  // we use shared memory to create a fast buffer for intermediate values, as well as for fast access to the current cell's field values
1779  size_t shmem_size = 0;
1780 
1781  const int GyEntryCount = pointBounds_[2]; // for each thread: store one Gy value per z coordinate
1782 
1783  int pointsInNonzeroComponentDimensions = 1;
1784  for (int d=1; d<numTensorComponents_; d++)
1785  {
1786  pointsInNonzeroComponentDimensions *= pointBounds_[d];
1787  }
1788 
1789  if (fad_size_output_ > 0)
1790  {
1791  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_); // GxIntegrals: entries with x integrated away
1792  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_); // GyIntegrals: entries with x,y integrated away
1793  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1), fad_size_output_); // pointWeights
1794 
1795  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_); // leftFields_x
1796  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_); // rightFields_x
1797  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_); // leftFields_y
1798  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_); // rightFields_y
1799  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_); // leftFields_z
1800  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_); // rightFields_z
1801  }
1802  else
1803  {
1804  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions); // GxIntegrals: entries with x integrated away
1805  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads); // GyIntegrals: entries with x,y integrated away
1806  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1)); // pointWeights
1807 
1808  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]); // leftFields_x
1809  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]); // rightFields_x
1810  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]); // leftFields_y
1811  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]); // rightFields_y
1812  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]); // leftFields_z
1813  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]); // rightFields_z
1814  }
1815 
1816  return shmem_size;
1817  }
1818  };
1819 
1820  template<class Scalar, class DeviceType>
1821  class F_RefSpaceIntegral
1822  {
1823  ScalarView<Scalar,DeviceType> integral_;
1824  Data<Scalar,DeviceType> left_;
1825  Data<Scalar,DeviceType> right_;
1826  Data<Scalar,DeviceType> weights_;
1827  ordinal_type dimSpan_;
1828  ordinal_type leftRank_;
1829  ordinal_type rightRank_;
1830  ordinal_type numPoints_;
1831  public:
1832  F_RefSpaceIntegral(ScalarView<Scalar,DeviceType> integralView,
1833  Data<Scalar,DeviceType> left, Data<Scalar,DeviceType> right, Data<Scalar,DeviceType> weights,
1834  ordinal_type dimSpan)
1835  :
1836  integral_(integralView),
1837  left_(left),
1838  right_(right),
1839  weights_(weights),
1840  dimSpan_(dimSpan)
1841  {
1842  leftRank_ = left.rank();
1843  rightRank_ = right.rank();
1844  numPoints_ = weights.extent_int(0);
1845  }
1846 
1847  KOKKOS_INLINE_FUNCTION
1848  void operator()( const ordinal_type & i, const ordinal_type & j ) const
1849  {
1850  Scalar refSpaceIntegral = 0.0;
1851  for (int ptOrdinal=0; ptOrdinal<numPoints_; ptOrdinal++)
1852  {
1853  const Scalar & weight = weights_(ptOrdinal);
1854  for (int a=0; a<dimSpan_; a++)
1855  {
1856  const Scalar & leftValue = ( leftRank_ == 2) ? left_(i,ptOrdinal) : left_(i,ptOrdinal,a);
1857  const Scalar & rightValue = (rightRank_ == 2) ? right_(j,ptOrdinal) : right_(j,ptOrdinal,a);
1858  refSpaceIntegral += leftValue * rightValue * weight;
1859  }
1860  }
1861  integral_(i,j) = refSpaceIntegral;
1862  }
1863  };
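 // In summary, for each (i,j) pair the functor above computes
 //   integral_(i,j) = sum_p weights_(p) * sum_a left_(i,p,a) * right_(j,p,a)
 // where the sum over a collapses to a single term when the component is scalar-valued (rank-2, shape (F,P)).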
1864  }
1865 
1866 template<typename DeviceType>
1867 template<class Scalar>
1868 Data<Scalar,DeviceType> IntegrationTools<DeviceType>::allocateIntegralData(const TransformedBasisValues<Scalar,DeviceType> vectorDataLeft,
1869  const TensorData<Scalar,DeviceType> cellMeasures,
1870  const TransformedBasisValues<Scalar,DeviceType> vectorDataRight)
1871 {
1872  // allocates a (C,F,F) container for storing integral data
1873 
1874  // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1875  // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1876  const bool leftHasOrdinalFilter = vectorDataLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1877  const bool rightHasOrdinalFilter = vectorDataRight.basisValues().ordinalFilter().extent_int(0) > 0;
1878  TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1879 
1880  // determine cellDataExtent and variation type. We currently support CONSTANT, MODULAR, and GENERAL as possible output variation types, depending on the inputs.
1881  // If cellMeasures has non-trivial tensor structure, the rank-1 cell Data object is the first component.
1882  // If cellMeasures has trivial tensor structure, then the first and only component has the cell index in its first dimension.
1883  // I.e., either way the relevant Data object is cellMeasures.getTensorComponent(0)
1884  const int CELL_DIM = 0;
1885  const auto cellMeasureData = cellMeasures.getTensorComponent(0);
1886  const auto leftTransform = vectorDataLeft.transform();
1887 
1888  DimensionInfo combinedCellDimInfo = cellMeasureData.getDimensionInfo(CELL_DIM);
1889  // transforms may be invalid, indicating an identity transform. If so, it will not constrain the output at all.
1890  if (vectorDataLeft.transform().isValid())
1891  {
1892  combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataLeft.transform().getDimensionInfo(CELL_DIM));
1893  }
1894  if (vectorDataRight.transform().isValid())
1895  {
1896  combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataRight.transform().getDimensionInfo(CELL_DIM));
1897  }
1898 
1899  DataVariationType cellVariationType = combinedCellDimInfo.variationType;
1900  int cellDataExtent = combinedCellDimInfo.dataExtent;
1901 
1902  const int numCells = vectorDataLeft.numCells();
1903  const int numFieldsLeft = vectorDataLeft.numFields();
1904  const int numFieldsRight = vectorDataRight.numFields();
1905 
1906  Kokkos::Array<int,3> extents {numCells, numFieldsLeft, numFieldsRight};
1907  Kokkos::Array<DataVariationType,3> variationTypes {cellVariationType,GENERAL,GENERAL};
1908 
1909  if (cellVariationType != CONSTANT)
1910  {
1911  Kokkos::View<Scalar***,DeviceType> data("Intrepid2 integral data",cellDataExtent,numFieldsLeft,numFieldsRight);
1912  return Data<Scalar,DeviceType>(data, extents, variationTypes);
1913  }
1914  else
1915  {
1916  Kokkos::View<Scalar**,DeviceType> data("Intrepid2 integral data",numFieldsLeft,numFieldsRight);
1917  return Data<Scalar,DeviceType>(data, extents, variationTypes);
1918  }
1919 }
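// A hedged usage sketch (basisValuesLeft, cellMeasures, and basisValuesRight are illustrative
// placeholders for containers built elsewhere; default arguments are assumed for sumInto and
// approximateFlops):
//
//   auto integrals = IntegrationTools<DeviceType>::allocateIntegralData(basisValuesLeft, cellMeasures, basisValuesRight);
//   IntegrationTools<DeviceType>::integrate(integrals, basisValuesLeft, cellMeasures, basisValuesRight);
//
// allocateIntegralData() sizes the (C,F,F) container once, taking advantage of any CONSTANT
// variation in the cell dimension; integrate() (below) then fills it.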
1920 
1924 template<typename DeviceType>
1925 template<class Scalar>
1926 void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals, const TransformedBasisValues<Scalar,DeviceType> & basisValuesLeft,
1927  const TensorData<Scalar,DeviceType> & cellMeasures,
1928  const TransformedBasisValues<Scalar,DeviceType> & basisValuesRight, const bool sumInto,
1929  double* approximateFlops)
1930 {
1931  using ExecutionSpace = typename DeviceType::execution_space;
1932 
1933  // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1934  // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1935  const bool leftHasOrdinalFilter = basisValuesLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1936  const bool rightHasOrdinalFilter = basisValuesRight.basisValues().ordinalFilter().extent_int(0) > 0;
1937  TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1938 
1939  if (approximateFlops != NULL)
1940  {
1941  *approximateFlops = 0;
1942  }
1943 
1944  // integral data may have shape (C,F1,F2) or (if the variation type is CONSTANT in the cell dimension) shape (F1,F2)
1945  const int integralViewRank = integrals.getUnderlyingViewRank();
1946 
1947  if (!sumInto)
1948  {
1949  integrals.clear();
1950  }
1951 
1952  const int cellDataExtent = integrals.getDataExtent(0);
1953  const int numFieldsLeft = basisValuesLeft.numFields();
1954  const int numFieldsRight = basisValuesRight.numFields();
1955  const int spaceDim = basisValuesLeft.spaceDim();
1956 
1957  INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.spaceDim() != basisValuesRight.spaceDim(), std::invalid_argument, "basisValuesLeft and basisValuesRight must agree on the space dimension");
1958 
1959  const int leftFamilyCount = basisValuesLeft.basisValues().numFamilies();
1960  const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
1961 
1962  // we require that the number of tensor components in the vectors are the same for each vector entry
1963  // this is not strictly necessary, but it makes implementation easier, and we don't at present anticipate other use cases
1964  int numTensorComponentsLeft = -1;
1965  const bool leftIsVectorValued = basisValuesLeft.vectorData().isValid();
1966 
1967  if (leftIsVectorValued)
1968  {
1969  const auto &refVectorLeft = basisValuesLeft.vectorData();
1970  int numFamiliesLeft = refVectorLeft.numFamilies();
1971  int numVectorComponentsLeft = refVectorLeft.numComponents();
1972  Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
1973  for (int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
1974  {
1975  for (int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
1976  {
1977  const TensorData<Scalar,DeviceType> &tensorData = refVectorLeft.getComponent(familyOrdinal,vectorComponent);
1978  if (tensorData.numTensorComponents() > 0)
1979  {
1980  if (numTensorComponentsLeft == -1)
1981  {
1982  numTensorComponentsLeft = tensorData.numTensorComponents();
1983  }
1984  INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
1985  for (int r=0; r<numTensorComponentsLeft; r++)
1986  {
1987  maxFieldsForComponentLeft[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentLeft[r]);
1988  }
1989  }
1990  }
1991  }
1992  }
1993  else
1994  {
1995  numTensorComponentsLeft = basisValuesLeft.basisValues().tensorData(0).numTensorComponents(); // family ordinal 0
1996  for (int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
1997  {
1998  INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "All families must match in the number of tensor components");
1999  }
2000  }
2001  int numTensorComponentsRight = -1;
2002  const bool rightIsVectorValued = basisValuesRight.vectorData().isValid();
2003 
2004  if (rightIsVectorValued)
2005  {
2006  const auto &refVectorRight = basisValuesRight.vectorData();
2007  int numFamiliesRight = refVectorRight.numFamilies();
2008  int numVectorComponentsRight = refVectorRight.numComponents();
2009  Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
2010  for (int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
2011  {
2012  for (int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
2013  {
2014  const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
2015  if (tensorData.numTensorComponents() > 0)
2016  {
2017  if (numTensorComponentsRight == -1)
2018  {
2019  numTensorComponentsRight = tensorData.numTensorComponents();
2020  }
2021  INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataRight must have the same number of tensor components as every other");
2022  for (int r=0; r<numTensorComponentsRight; r++)
2023  {
2024  maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
2025  }
2026  }
2027  }
2028  }
2029  INTREPID2_TEST_FOR_EXCEPTION(numTensorComponentsRight != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
2030  }
2031  else
2032  {
2033  // check that right tensor component count agrees with left
2034  for (int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
2035  {
2036  INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
2037  }
2038  }
2039  const int numPointTensorComponents = cellMeasures.numTensorComponents() - 1;
2040 
2041  if ((numPointTensorComponents == numTensorComponentsLeft) && basisValuesLeft.axisAligned() && basisValuesRight.axisAligned())
2042  {
2043  // cellMeasures is a non-trivial tensor product, and the pullbacks are all diagonals.
2044 
2045  // in this case, the integrals in each tensorial direction are entirely separable
2046  // allocate some temporary storage for the integrals in each tensorial direction
2047 
2048  // if cellMeasures is a nontrivial tensor product, that means that all cells have the same shape, up to scaling.
2049 
2050  Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
2051  for (int r=0; r<numPointTensorComponents; r++)
2052  {
2053  // first tensorial component of cell measures is the cell dimension; after that we have (P1,P2,…)
2054  pointDimensions[r] = cellMeasures.getTensorComponent(r+1).extent_int(0);
2055  }
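 // Illustrative example: for a 3D tensor-product quadrature with (n1,n2,n3) points per direction,
 // cellMeasures has tensor components (cell weights, P1, P2, P3), so pointDimensions = (n1,n2,n3).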
2056 
2057  // only one of these will be a valid container:
2058  Kokkos::View<Scalar**, DeviceType> integralView2;
2059  Kokkos::View<Scalar***, DeviceType> integralView3;
2060  if (integralViewRank == 3)
2061  {
2062  integralView3 = integrals.getUnderlyingView3();
2063  }
2064  else
2065  {
2066  integralView2 = integrals.getUnderlyingView2();
2067  }
2068  for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2069  {
2070  int a_offset = 0; // left vector component offset
2071  int leftFieldOffset = basisValuesLeft.basisValues().familyFieldOrdinalOffset(leftFamilyOrdinal);
2072 
2073  const int leftVectorComponentCount = leftIsVectorValued ? basisValuesLeft.vectorData().numComponents() : 1;
2074  for (int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
2075  {
2076  TensorData<Scalar,DeviceType> leftComponent = leftIsVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal)
2077  : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2078  if (!leftComponent.isValid())
2079  {
2080  a_offset++; // empty components are understood to take up one dimension
2081  continue;
2082  }
2083  const int leftDimSpan = leftComponent.extent_int(2);
2084 
2085  const int leftComponentFieldCount = leftComponent.extent_int(0);
2086 
2087  for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2088  {
2089  int b_offset = 0; // right vector component offset
2090  int rightFieldOffset = basisValuesRight.basisValues().familyFieldOrdinalOffset(rightFamilyOrdinal);
2091 
2092  const int rightVectorComponentCount = rightIsVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2093  for (int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2094  {
2095  TensorData<Scalar,DeviceType> rightComponent = rightIsVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal)
2096  : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2097  if (!rightComponent.isValid())
2098  {
2099  b_offset++; // empty components are understood to take up one dimension
2100  continue;
2101  }
2102  const int rightDimSpan = rightComponent.extent_int(2);
2103 
2104  const int rightComponentFieldCount = rightComponent.extent_int(0);
2105 
2106  // we only accumulate for a == b (since this is the axis-aligned case). Do the a, b spans intersect?
2107  if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset)) // no intersection
2108  {
2109  b_offset += rightDimSpan;
2110 
2111  continue;
2112  }
2113 
2114  // if the a, b spans intersect, by assumption they should align exactly
2115  INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error, "left and right dimension offsets should match.");
2116  INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2117 
2118  const int d_start = a_offset;
2119  const int d_end = d_start + leftDimSpan;
2120 
2121  using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
2122  ComponentIntegralsArray componentIntegrals;
2123  for (int r=0; r<numPointTensorComponents; r++)
2124  {
2125  /*
2126  Four vector data cases to consider:
2127  1. Both vector data containers are filled with axial components - first component in 3D has form (f,0,0), second (0,f,0), third (0,0,f).
2128  2. Both vector data containers have arbitrary components - in 3D: (f1,f2,f3) where f1 is given by the first component, f2 by the second, f3 by the third.
2129  3. First container is axial, second arbitrary.
2130  4. First is arbitrary, second axial.
2131 
2132  But note that in all four cases, the structure of the integral is the same: you have three vector component integrals that get summed. The actual difference between
2133  the cases does not show up in the reference-space integrals here, but in the accumulation in physical space below, where the tensor field numbering comes into play.
2134 
2135  The choice between axial and arbitrary affects the way the fields are numbered; the arbitrary components' indices refer to the same vector function, so they correspond,
2136  while the axial components refer to distinct scalar functions, so their numbering in the data container is cumulative.
2137  */
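 // Illustrative example: for an H(div) basis on the hexahedron, the vector data is axial --
 // one family contributes (f,0,0), the next (0,g,0), the last (0,0,h) -- so the families'
 // fields are numbered cumulatively. By contrast, for gradients of an H(grad) basis the
 // components (df/dx, df/dy, df/dz) all refer to the same scalar field f, so the component
 // containers share a single field numbering.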
2138 
2139  Data<Scalar,DeviceType> quadratureWeights = cellMeasures.getTensorComponent(r+1);
2140  const int numPoints = pointDimensions[r];
2141 
2142  // It may be worth considering the possibility that some of these components point to the same data -- if so, we could possibly get better data locality by making the corresponding componentIntegral entries point to the same location as well. (And we could avoid some computations here.)
2143 
2144  Data<Scalar,DeviceType> leftTensorComponent = leftComponent.getTensorComponent(r);
2145  Data<Scalar,DeviceType> rightTensorComponent = rightComponent.getTensorComponent(r);
2146 
2147  const int leftTensorComponentDimSpan = leftTensorComponent.extent_int(2);
2148  const int leftTensorComponentFields = leftTensorComponent.extent_int(0);
2149  const int rightTensorComponentDimSpan = rightTensorComponent.extent_int(2);
2150  const int rightTensorComponentFields = rightTensorComponent.extent_int(0);
2151 
2152  INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2153 
2154  for (int d=d_start; d<d_end; d++)
2155  {
2156  ScalarView<Scalar,DeviceType> componentIntegralView;
2157 
2158  const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
2159  if (allocateFadStorage)
2160  {
2161  auto fad_size_output = dimension_scalar(integrals.getUnderlyingView());
2162  componentIntegralView = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output);
2163  }
2164  else
2165  {
2166  componentIntegralView = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields);
2167  }
2168 
2169  auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftTensorComponentFields,rightTensorComponentFields});
2170 
2171  Impl::F_RefSpaceIntegral<Scalar, DeviceType> refSpaceIntegralFunctor(componentIntegralView, leftTensorComponent, rightTensorComponent, quadratureWeights,
2172  leftTensorComponentDimSpan);
2173  Kokkos::parallel_for("compute componentIntegrals", policy, refSpaceIntegralFunctor);
2174 
2175  componentIntegrals[r][d] = componentIntegralView;
2176 
2177  if (approximateFlops != NULL)
2178  {
2179  *approximateFlops += leftTensorComponentFields*rightTensorComponentFields*numPoints*(3); // two multiplies, one add in innermost loop
2180  }
2181  } // d
2182  } // r
2183 
2184  ExecutionSpace().fence();
2185 
2186  Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount}; // separately declared in an effort to get around Intel 17.0.1 compiler weirdness.
2187  Kokkos::Array<int,3> lowerBounds {0,0,0};
2188  auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2189  // TODO: note that for best performance, especially with Fad types, we should replace this parallel for with a Functor and use hierarchical parallelism
2190  Kokkos::parallel_for("compute field integrals", policy,
2191  KOKKOS_LAMBDA (const int &cellDataOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
2192  const Scalar & cellMeasureWeight = cellMeasures.getTensorComponent(0)(cellDataOrdinal);
2193 
2194  TensorArgumentIterator leftTensorIterator(leftComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2195  leftTensorIterator.setEnumerationIndex(leftFieldOrdinal);
2196 
2197  TensorArgumentIterator rightTensorIterator(rightComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2198  rightTensorIterator.setEnumerationIndex(rightFieldOrdinal);
2199 
2200  Scalar integralSum = 0.0;
2201  for (int d=d_start; d<d_end; d++)
2202  {
2203  const Scalar & transformLeft_d = basisValuesLeft.transformWeight(cellDataOrdinal,0,d,d);
2204  const Scalar & transformRight_d = basisValuesRight.transformWeight(cellDataOrdinal,0,d,d);
2205 
2206  const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
2207  // approximateFlopCount++;
2208 
2209  Scalar integral_d = 1.0;
2210 
2211  for (int r=0; r<numPointTensorComponents; r++)
2212  {
2213  integral_d *= componentIntegrals[r][d](leftTensorIterator.argument(r),rightTensorIterator.argument(r));
2214  // approximateFlopCount++; // product
2215  }
2216  integralSum += leftRightTransform_d * integral_d;
2217  // approximateFlopCount += 2; // multiply and sum
2218  } // d
2219 
2220  const int i = leftFieldOrdinal + leftFieldOffset;
2221  const int j = rightFieldOrdinal + rightFieldOffset;
2222 
2223  if (integralViewRank == 3)
2224  {
2225  integralView3(cellDataOrdinal,i,j) += cellMeasureWeight * integralSum;
2226  }
2227  else
2228  {
2229  integralView2(i,j) += cellMeasureWeight * integralSum;
2230  }
2231  });
2232  b_offset += rightDimSpan;
2233  } // rightVectorComponentOrdinal
2234  } // rightFamilyOrdinal
2235  a_offset += leftDimSpan;
2236  } // leftVectorComponentOrdinal
2237  } // leftFamilyOrdinal
2238 
2239  if (approximateFlops != NULL)
2240  {
2241  // TODO: check the accuracy of this
2242  *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2243  }
2244  }
2245  else // general case (not axis-aligned + affine tensor-product structure)
2246  {
2247  // prepare composed transformation matrices
2248  const Data<Scalar,DeviceType> & leftTransform = basisValuesLeft.transform();
2249  const Data<Scalar,DeviceType> & rightTransform = basisValuesRight.transform();
2250  const bool transposeLeft = true;
2251  const bool transposeRight = false;
2252 // auto timer = Teuchos::TimeMonitor::getNewTimer("mat-mat");
2253 // timer->start();
2254  // transforms can be matrices -- (C,P,D,D): rank 4 -- or scalar weights -- (C,P): rank 2 -- or vector weights -- (C,P,D): rank 3
2255  Data<Scalar,DeviceType> composedTransform;
2256  // invalid/empty transforms are used when the identity is intended.
2257  const int leftRank = leftTransform.rank();
2258  const int rightRank = rightTransform.rank();
2259 
2260  if (leftTransform.isValid() && rightTransform.isValid())
2261  {
2262  const bool bothRank4 = (leftRank == 4) && (rightRank == 4);
2263  const bool bothRank3 = (leftRank == 3) && (rightRank == 3);
2264  const bool bothRank2 = (leftRank == 2) && (rightRank == 2);
2265  const bool ranks32 = ((leftRank == 3) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 3));
2266  const bool ranks42 = ((leftRank == 4) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 4));
2267 
2268  if (bothRank4) // (C,P,D,D)
2269  {
2270  composedTransform = Data<Scalar,DeviceType>::allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
2271  composedTransform.storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2272 
2273  // if the composedTransform matrices are full, the following is a good estimate. If they have some diagonal portions, this will overcount.
2274  if (approximateFlops != NULL)
2275  {
2276  *approximateFlops += composedTransform.getUnderlyingViewSize() * (spaceDim - 1) * 2;
2277  }
2278  }
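 // Concretely, the rank-4 composition above computes, at each (cell, point),
 //   composedTransform(c,p,d1,d2) = sum_k leftTransform(c,p,k,d1) * rightTransform(c,p,k,d2),
 // i.e., the D x D matrix product leftTransform^T * rightTransform.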
2279  else if (bothRank3) // (C,P,D)
2280  {
2281  // re-cast leftTransform as a rank 4 (C,P,1,D) object -- a 1 x D matrix at each (C,P).
2282  const int newRank = 4;
2283  auto extents = leftTransform.getExtents();
2284  auto variationTypes = leftTransform.getVariationTypes();
2285  extents[3] = extents[2];
2286  extents[2] = 1;
2287  variationTypes[3] = variationTypes[2];
2288  variationTypes[2] = CONSTANT;
2289  auto leftTransformMatrix = leftTransform.shallowCopy(newRank, extents, variationTypes);
2290 
2291  // re-cast rightTransform as a rank 4 (C,P,1,D) object -- a 1 x D matrix at each (C,P)
2292  extents = rightTransform.getExtents();
2293  variationTypes = rightTransform.getVariationTypes();
2294  extents[3] = extents[2];
2295  extents[2] = 1;
2296  variationTypes[3] = variationTypes[2];
2297  variationTypes[2] = CONSTANT;
2298  auto rightTransformMatrix = rightTransform.shallowCopy(newRank, extents, variationTypes);
2299 
2300  composedTransform = Data<Scalar,DeviceType>::allocateMatMatResult(transposeLeft, leftTransformMatrix, transposeRight, rightTransformMatrix);
2301  composedTransform.storeMatMat(transposeLeft, leftTransformMatrix, transposeRight, rightTransformMatrix);
2302 
2303  if (approximateFlops != NULL)
2304  {
2305  *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
2306  }
2307  }
2308  else if (bothRank2)
2309  {
2310  composedTransform = leftTransform.allocateInPlaceCombinationResult(leftTransform, rightTransform);
2311  composedTransform.storeInPlaceProduct(leftTransform, rightTransform);
2312 
2313  // re-cast composedTransform as a rank 4 (C,P,1,1) object -- a 1 x 1 matrix at each (C,P).
2314  const int newRank = 4;
2315  auto extents = composedTransform.getExtents();
2316  auto variationTypes = composedTransform.getVariationTypes();
2317  composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes);
2318  if (approximateFlops != NULL)
2319  {
2320  *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
2321  }
2322  }
2323  else if (ranks32) // rank 2 / rank 3 combination.
2324  {
2325  const auto & rank3Transform = (leftRank == 3) ? leftTransform : rightTransform;
2326  const auto & rank2Transform = (leftRank == 2) ? leftTransform : rightTransform;
2327 
2328  composedTransform = DataTools::multiplyByCPWeights(rank3Transform, rank2Transform);
2329 
2330  // re-cast composedTransform as a rank 4 object:
2331  // logically, the original rank-3 transform can be understood as a 1xD matrix. The composed transform is leftTransform^T * rightTransform, so:
2332  // - if left has the rank-3 transform, composedTransform should be a (C,P,D,1) object -- a D x 1 matrix at each (C,P).
2333  // - if right has the rank-3 transform, composedTransform should be a (C,P,1,D) object -- a 1 x D matrix at each (C,P).
2334  const int newRank = 4;
2335  auto extents = composedTransform.getExtents();
2336  auto variationTypes = composedTransform.getVariationTypes();
2337  if (leftRank == 3)
2338  {
2339  // extents[3] and variationTypes[3] will already be 1 and CONSTANT, respectively
2340  // extents[3] = 1;
2341  // variationTypes[3] = CONSTANT;
2342  }
2343  else
2344  {
2345  extents[3] = extents[2];
2346  extents[2] = 1;
2347  variationTypes[3] = variationTypes[2];
2348  variationTypes[2] = CONSTANT;
2349  }
2350  composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes);
2351  }
2352  else if (ranks42) // rank 4 / rank 2 combination.
2353  {
2354  if (leftRank == 4)
2355  {
2356  // want to transpose left matrix, and multiply by the values from rightTransform
2357  // start with the multiplication:
2358  auto composedTransformTransposed = DataTools::multiplyByCPWeights(leftTransform, rightTransform);
2359  composedTransform = DataTools::transposeMatrix(composedTransformTransposed);
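 // Why the transpose was needed (sketch): the integral contracts leftTransform^T against
 // rightTransform. multiplyByCPWeights scales the rank-4 left matrix entrywise by the rank-2
 // right weights,
 //   temp(c,p,d1,d2) = leftTransform(c,p,d1,d2) * rightTransform(c,p),
 // and transposeMatrix then swaps the last two dimensions to realize the transpose.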
2360  }
2361  else // (leftRank == 2)
2362  {
2363  composedTransform = DataTools::multiplyByCPWeights(rightTransform, leftTransform);
2364  }
2365  }
2366  else
2367  {
2368  INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
2369  }
2370  }
2371  else if (leftTransform.isValid())
2372  {
2373  // rightTransform is the identity
2374  switch (leftRank)
2375  {
2376  case 4: composedTransform = DataTools::transposeMatrix(leftTransform); break;
2377  case 3:
2378  {
2379  // - if left has the rank-3 transform, composedTransform should be a (C,P,D,1) object -- a D x 1 matrix at each (C,P).
2380  const int newRank = 4;
2381  auto extents = leftTransform.getExtents();
2382  auto variationTypes = leftTransform.getVariationTypes();
2383 
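 // extents[3] and variationTypes[3] are already 1 and CONSTANT for the rank-3 transform,
 // so no modifications are needed before the shallowCopy below yields (C,P,D,1).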
2384  composedTransform = leftTransform.shallowCopy(newRank, extents, variationTypes);
2385  }
2386  break;
2387  case 2: composedTransform = leftTransform; break;
2388  default:
2389  INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
2390  }
2391  }
2392  else if (rightTransform.isValid())
2393  {
2394  // leftTransform is the identity; every case below assigns composedTransform
2396  switch (rightRank)
2397  {
2398  case 4: composedTransform = rightTransform; break;
2399  case 3:
2400  {
2401  // - if right has the rank-3 transform, composedTransform should be a (C,P,1,D) object -- a 1 x D matrix at each (C,P).
2402  const int newRank = 4;
2403  auto extents = rightTransform.getExtents();
2404  auto variationTypes = rightTransform.getVariationTypes();
2405  extents[3] = extents[2];
2406  variationTypes[3] = variationTypes[2];
2407  extents[2] = 1;
2408  variationTypes[2] = CONSTANT;
2409 
2410  composedTransform = rightTransform.shallowCopy(newRank, extents, variationTypes);
2411  }
2412  break;
2413  case 2: composedTransform = rightTransform; break;
2414  default:
2415  INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
2416  }
2417  }
2418  else
2419  {
2420  // both left and right transforms are identity
2421  Kokkos::Array<ordinal_type,4> extents {basisValuesLeft.numCells(),basisValuesLeft.numPoints(),spaceDim,spaceDim};
2422  Kokkos::Array<DataVariationType,4> variationTypes {CONSTANT,CONSTANT,BLOCK_PLUS_DIAGONAL,BLOCK_PLUS_DIAGONAL};
2423 
2424  Kokkos::View<Scalar*,DeviceType> identityUnderlyingView("Intrepid2::FST::integrate() - identity view",spaceDim);
2425  Kokkos::deep_copy(identityUnderlyingView, 1.0);
2426  composedTransform = Data<Scalar,DeviceType>(identityUnderlyingView,extents,variationTypes);
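 // How this encodes the identity (sketch): CONSTANT variation in the C and P dimensions means the
 // single stored vector is shared across all cells and points; BLOCK_PLUS_DIAGONAL in the last two
 // dimensions means only the diagonal is stored. A length-D view filled with 1.0 therefore reads
 // back as the D x D identity matrix at every (c,p), using O(D) storage.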
2427  }
2428 
2429 // timer->stop();
2430 // std::cout << "Completed mat-mat in " << timer->totalElapsedTime() << " seconds.\n";
2431 // timer->reset();
2432 
2433  const int leftFamilyCount = basisValuesLeft. basisValues().numFamilies();
2434  const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
2435  const int leftComponentCount = leftIsVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
2436  const int rightComponentCount = rightIsVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2437 
2438  int leftFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2439  for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2440  {
2441  // "a" keeps track of the spatial dimension over which we are integrating in the left vector
2442  // components are allowed to span several dimensions; we keep track of the offset for the component in a_offset
2443  int a_offset = 0;
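 // Worked example (hypothetical layout): if this family has two components in 3D, the first
 // spanning dimension {0} and the second spanning dimensions {1,2}, then a_offset is 0 for the
 // first component and 1 for the second, so that a_offset selects the correct spatial entries of
 // composedTransform for each component.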
2444  bool haveLaunchedContributionToCurrentFamilyLeft = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2445  for (int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2446  {
2447  TensorData<Scalar,DeviceType> leftComponent = leftIsVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal)
2448  : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2449  if (!leftComponent.isValid())
2450  {
2451  // represents zero
2452  a_offset += basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal);
2453  continue;
2454  }
2455 
2456  int rightFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2457  for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2458  {
2459  // "b" keeps track of the spatial dimension over which we are integrating in the right vector
2460  // components are allowed to span several dimensions; we keep track of the offset for the component in b_offset
2461  bool haveLaunchedContributionToCurrentFamilyRight = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2462  int b_offset = 0;
2463  for (int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2464  {
2465  TensorData<Scalar,DeviceType> rightComponent = rightIsVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal)
2466  : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2467  if (!rightComponent.isValid())
2468  {
2469  // represents zero
2470  b_offset += basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal);
2471  continue;
2472  }
2473 
2474  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(leftComponent.numTensorComponents() != rightComponent.numTensorComponents(), std::invalid_argument, "left TensorData and right TensorData have different numbers of tensor components; this is not supported.");
2475 
2476  const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2477  Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
2478 
2479  // TODO: expose the options for forceNonSpecialized and usePointCacheForRank3Tensor through an IntegrationAlgorithm enumeration.
2480  // AUTOMATIC: let Intrepid2 choose an algorithm based on the inputs (and, perhaps, the execution space)
2481  // STANDARD: don't use sum factorization or axis alignment -- just do the simple contraction, a (p+1)^9 algorithm in 3D
2482  // SUM_FACTORIZATION // (p+1)^7 algorithm in 3D
2483  // SUM_FACTORIZATION_AXIS_ALIGNED // (p+1)^6 algorithm in 3D
2484  // SUM_FACTORIZATION_FORCE_GENERIC_IMPLEMENTATION // mainly intended for testing purposes (specialized implementations perform better when they are provided)
2485  // SUM_FACTORIZATION_WITH_POINT_CACHE // novel (p+1)^7 (in 3D) algorithm in Intrepid2; unclear as yet when and whether this may be a superior approach
2486  bool forceNonSpecialized = false; // We may expose this through integrate()'s arguments in the future; if so, it should default to false.
2487  bool usePointCacheForRank3Tensor = true; // EXPERIMENTAL; performs better under CUDA, but slightly worse than the standard path on serial CPU.
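 // One possible shape for that enumeration (hypothetical sketch, not part of the current API):
 //   enum class IntegrationAlgorithm
 //   {
 //     AUTOMATIC, STANDARD, SUM_FACTORIZATION, SUM_FACTORIZATION_AXIS_ALIGNED,
 //     SUM_FACTORIZATION_FORCE_GENERIC_IMPLEMENTATION, SUM_FACTORIZATION_WITH_POINT_CACHE
 //   };
 // the two bools above would then be subsumed by the last two members.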
2488 
2489  // in one branch or another below, we will launch a parallel kernel that contributes to (leftFamily, rightFamily) field ordinal pairs.
2490  // if we have already launched something that contributes to that part of the integral container, we need a Kokkos fence() to ensure that these do not interfere with each other.
2491  if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2492  {
2493  ExecutionSpace().fence();
2494  }
2495  haveLaunchedContributionToCurrentFamilyLeft = true;
2496  haveLaunchedContributionToCurrentFamilyRight = true;
2497 
2498  if (integralViewRank == 2)
2499  {
2500  if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2501  {
2502  auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2503 
2504  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2505  const int teamSize = functor.teamSize(recommendedTeamSize);
2506 
2507  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
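 // Team-size selection pattern used in this and each branch below (sketch): build a policy with
 // Kokkos::AUTO() to query team_size_recommended() for the functor, let the functor clamp that
 // value via teamSize() (e.g., to respect its shared-memory budget), then rebuild the policy with
 // the final team size before launching.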
2508 
2509  Kokkos::parallel_for("F_IntegratePointValueCache rank 2", policy, functor);
2510 
2511  if (approximateFlops != NULL)
2512  {
2513  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2514  }
2515  }
2516  else
2517  {
2518  auto functor = Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2519 
2520  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2521  const int teamSize = functor.teamSize(recommendedTeamSize);
2522 
2523  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2524 
2525  Kokkos::parallel_for("F_Integrate rank 2", policy, functor);
2526 
2527  if (approximateFlops != NULL)
2528  {
2529  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2530  }
2531  }
2532  }
2533  else if (integralViewRank == 3)
2534  {
2535  if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2536  {
2537  auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2538 
2539  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2540  const int teamSize = functor.teamSize(recommendedTeamSize);
2541 
2542  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2543 
2544  Kokkos::parallel_for("F_IntegratePointValueCache rank 3", policy, functor);
2545 
2546  if (approximateFlops != NULL)
2547  {
2548  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2549  }
2550  }
2551  else
2552  {
2553  auto functor = Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2554 
2555  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2556  const int teamSize = functor.teamSize(recommendedTeamSize);
2557 
2558  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2559 
2560  Kokkos::parallel_for("F_Integrate rank 3", policy, functor);
2561 
2562  if (approximateFlops != NULL)
2563  {
2564  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2565  }
2566  }
2567  }
2568  b_offset += rightIsVectorValued ? basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
2569  }
2570  rightFieldOrdinalOffset += rightIsVectorValued ? basisValuesRight.vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
2571  }
2572  a_offset += leftIsVectorValued ? basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
2573  }
2574  leftFieldOrdinalOffset += leftIsVectorValued ? basisValuesLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
2575  }
2576  }
2577 // if (approximateFlops != NULL)
2578 // {
2579 // std::cout << "Approximate flop count (new): " << *approximateFlops << std::endl;
2580 // }
2581  ExecutionSpace().fence(); // make sure we've finished writing to integrals container before we return
2582 }
2583 
2584 } // end namespace Intrepid2
2585 
2586 #endif