Intrepid2_IntegrationToolsDef.hpp
// @HEADER
// ************************************************************************
//
//                           Intrepid2 Package
//                 Copyright (2007) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Kyungjoo Kim  (kyukim@sandia.gov),
//                    Mauro Perego  (mperego@sandia.gov), or
//                    Nate Roberts  (nvrober@sandia.gov)
//
// ************************************************************************
// @HEADER

#ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
#define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__

#include "Teuchos_TimeMonitor.hpp"

namespace Intrepid2 {

  namespace Impl
  {
    /** \brief Implementation of a general sum-factorization algorithm for integration; uses hierarchical parallelism.
    */
    template<class Scalar, class DeviceType, int integralViewRank>
    class F_Integrate
    {
      using ExecutionSpace = typename DeviceType::execution_space;
      using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
      using TeamMember = typename TeamPolicy::member_type;

      using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
      IntegralViewType integralView_;
      TensorData<Scalar,DeviceType> leftComponent_;
      Data<Scalar,DeviceType> composedTransform_;
      TensorData<Scalar,DeviceType> rightComponent_;
      TensorData<Scalar,DeviceType> cellMeasures_;
      int a_offset_;
      int b_offset_;
      int leftComponentSpan_;  // leftComponentSpan tracks the dimensions spanned by the left component
      int rightComponentSpan_; // rightComponentSpan tracks the dimensions spanned by the right component
      int numTensorComponents_;
      int leftFieldOrdinalOffset_;
      int rightFieldOrdinalOffset_;
      bool forceNonSpecialized_; // if true, don't use the specialized (more manual) implementation(s) for particular component counts. Primary use case is for testing.

      size_t fad_size_output_ = 0; // 0 if not a fad type

      Kokkos::Array<int, 7> offsetsForComponentOrdinal_;

      // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
      // (this also makes it easier to reorder loops, etc., for further optimizations)
      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;

      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_; // total number of enumeration indices with arguments prior to the startingComponent fixed
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;

      int maxFieldsLeft_;
      int maxFieldsRight_;
      int maxPointCount_;
    public:
      F_Integrate(Data<Scalar,DeviceType> integralData,
                  TensorData<Scalar,DeviceType> leftComponent,
                  Data<Scalar,DeviceType> composedTransform,
                  TensorData<Scalar,DeviceType> rightComponent,
                  TensorData<Scalar,DeviceType> cellMeasures,
                  int a_offset,
                  int b_offset,
                  int leftFieldOrdinalOffset,
                  int rightFieldOrdinalOffset,
                  bool forceNonSpecialized)
      :
      integralView_(integralData.template getUnderlyingView<integralViewRank>()),
      leftComponent_(leftComponent),
      composedTransform_(composedTransform),
      rightComponent_(rightComponent),
      cellMeasures_(cellMeasures),
      a_offset_(a_offset),
      b_offset_(b_offset),
      leftComponentSpan_(leftComponent.extent_int(2)),
      rightComponentSpan_(rightComponent.extent_int(2)),
      numTensorComponents_(leftComponent.numTensorComponents()),
      leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
      rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
      forceNonSpecialized_(forceNonSpecialized)
      {
        INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");

        // set up bounds containers
        const int FIELD_DIM = 0;
        const int POINT_DIM = 1;
        maxFieldsLeft_  = 0;
        maxFieldsRight_ = 0;
        maxPointCount_  = 0;
        for (int r=0; r<numTensorComponents_; r++)
        {
          leftFieldBounds_[r]  = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsLeft_       = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
          rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsRight_      = std::max(maxFieldsRight_, rightFieldBounds_[r]);
          pointBounds_[r]      = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
          maxPointCount_       = std::max(maxPointCount_, pointBounds_[r]);
        }

        // set up relative enumeration spans: the total number of enumeration indices with the arguments prior to startingComponent fixed. These are for *truncated* iterators; hence the -2 rather than -1 for the first startingComponent value.
        int leftRelativeEnumerationSpan  = 1;
        int rightRelativeEnumerationSpan = 1;
        for (int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
        {
          leftRelativeEnumerationSpan  *= leftFieldBounds_[startingComponent];
          rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
          leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
          rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
        }
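
        // Example (illustrative): for numTensorComponents_ = 3 (so R = 2) with left field
        // bounds {f0, f1, f2}, the loop above yields leftFieldRelativeEnumerationSpans_[1] = f1
        // and leftFieldRelativeEnumerationSpans_[0] = f0 * f1: span[s] counts the multi-indices
        // over components s..R-1. The final component R is excluded because its field indices
        // (i_R, j_R) are fixed per thread.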

        // prepare for allocation of temporary storage
        // note: tempStorage goes "backward", starting from the final component, which needs just one entry

        const bool allocateFadStorage = !std::is_pod<Scalar>::value;
        if (allocateFadStorage)
        {
          fad_size_output_ = dimension_scalar(integralView_);
        }

        const int R = numTensorComponents_ - 1;

        int num_ij = 1; // counts how many entries there are corresponding to components from r to R-1.
        int allocationSoFar = 0;
        offsetsForComponentOrdinal_[R] = allocationSoFar;
        allocationSoFar++; // we store one entry corresponding to R, the final component

        for (int r=R-1; r>0; r--) // the last component is innermost in the for loops (requires the least storage)
        {
          const int leftFields  = leftComponent.getTensorComponent(r).extent_int(0);
          const int rightFields = rightComponent.getTensorComponent(r).extent_int(0);

          num_ij *= leftFields * rightFields;

          offsetsForComponentOrdinal_[r] = allocationSoFar;
          allocationSoFar += num_ij;
        }
        offsetsForComponentOrdinal_[0] = allocationSoFar; // the first component stores directly to the final integralView.
      }
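
      // Example (illustrative): for numTensorComponents_ = 3, the per-thread scratch layout
      // set up in the constructor above is
      //   offsetsForComponentOrdinal_[2] = 0             -- a single G_2 ("Gz") entry
      //   offsetsForComponentOrdinal_[1] = 1             -- f1L * f1R G_1 ("Gy") entries
      //   offsetsForComponentOrdinal_[0] = 1 + f1L*f1R   -- total allocation; component 0
      //                                                     accumulates directly into integralView_
      // where f1L, f1R are the left/right field counts for tensor component 1.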

      //! Odometer-style increment of the argument tuple, subject to the given bounds. Returns the lowest-numbered component that was incremented, or -1 if the iteration is complete.
      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,maxComponents> &arguments,
                            const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0)
        {
          return -1;
        }
        else
        {
          int r = static_cast<int>(numComponents - 1);
          while (arguments[r] + 1 >= bounds[r])
          {
            arguments[r] = 0; // reset
            r--;
            if (r < 0) break;
          }
          if (r >= 0) ++arguments[r];
          return r;
        }
      }
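
      // Example (illustrative): with bounds = {2,3} and arguments = {0,2}, the call resets
      // arguments[1] to 0 and increments arguments[0], leaving {1,0} and returning 0 (the
      // lowest component touched). Once arguments = {1,2}, the next call returns -1: the
      // iteration is complete.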

      //! Runtime-sized variant of incrementArgument.
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                            const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                            const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      //! Returns the lowest-numbered component that incrementArgument would increment, without modifying the arguments.
      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
                              const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0)
        {
          return -1;
        }
        else
        {
          int r = static_cast<int>(numComponents - 1);
          while (arguments[r] + 1 >= bounds[r])
          {
            r--;
            if (r < 0) break;
          }
          return r;
        }
      }

      //! Runtime-sized variant of nextIncrementResult.
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                              const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                              const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = numComponents - 1;
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      //! Enumeration index of the argument tuple relative to the component at startIndex (components below startIndex are ignored).
      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
                                   const Kokkos::Array<int,maxComponents> &bounds,
                                   const int startIndex) const
      {
        // the following mirrors what is done in TensorData
        if (numComponents == 0)
        {
          return 0;
        }
        else
        {
          int enumerationIndex = 0;
          for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
          {
            enumerationIndex += arguments[r];
            enumerationIndex *= bounds[r-1];
          }
          enumerationIndex += arguments[startIndex];
          return enumerationIndex;
        }
      }
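
      // Example (illustrative): with bounds = {B0,B1,B2} and startIndex = 0, the enumeration
      // index of arguments {a0,a1,a2} is a0 + B0 * (a1 + B1 * a2); the first component varies
      // fastest, matching TensorData's enumeration ordering.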

      // nvcc refuses to compile the following, with the error "explicit specialization is not allowed in the current scope". Clang is OK with it. We just provide a non-templated version below.
//      template<size_t numTensorComponents>
//      KOKKOS_INLINE_FUNCTION
//      void runSpecialized( const TeamMember & teamMember ) const;

//      template<>
//      KOKKOS_INLINE_FUNCTION
//      void runSpecialized<3>( const TeamMember & teamMember ) const
      KOKKOS_INLINE_FUNCTION
      void runSpecialized3( const TeamMember & teamMember ) const
      {
        constexpr int numTensorComponents = 3;

        Kokkos::Array<int,numTensorComponents> pointBounds;
        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          pointBounds[r]      = pointBounds_[r];
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const int cellDataOrdinal = teamMember.league_rank();
        const int numThreads      = teamMember.team_size(); // num threads
        const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread

        Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged> scratchView;  // for caching partial integration values
        Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
        Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z; // cache the field values for faster access
        if (fad_size_output_ > 0) {
          scratchView   = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
          pointWeights  = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0],  pointBounds[0], fad_size_output_);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1],  pointBounds[1], fad_size_output_);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2],  pointBounds[2], fad_size_output_);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
        }
        else {
          scratchView   = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
          pointWeights  = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));
          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0],  pointBounds[0]);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1],  pointBounds[1]);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2],  pointBounds[2]);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

        constexpr int R = numTensorComponents - 1;

        const int composedTransformRank = composedTransform_.rank();

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            const Data<Scalar,DeviceType> & leftTensorComponent_x  = leftComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & leftTensorComponent_y  = leftComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & leftTensorComponent_z  = leftComponent_.getTensorComponent(2);
            const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);

            const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (const int& fieldOrdinalPointOrdinal) {
              const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
              const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
              if ((fieldOrdinal < leftTensorComponent_x.extent_int(0)) && (pointOrdinal < leftTensorComponent_x.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_x.rank();
                leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < leftTensorComponent_y.extent_int(0)) && (pointOrdinal < leftTensorComponent_y.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_y.rank();
                leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < leftTensorComponent_z.extent_int(0)) && (pointOrdinal < leftTensorComponent_z.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_z.rank();
                leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < rightTensorComponent_x.extent_int(0)) && (pointOrdinal < rightTensorComponent_x.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_x.rank();
                rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
              }
              if ((fieldOrdinal < rightTensorComponent_y.extent_int(0)) && (pointOrdinal < rightTensorComponent_y.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_y.rank();
                rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
              }
              if ((fieldOrdinal < rightTensorComponent_z.extent_int(0)) && (pointOrdinal < rightTensorComponent_z.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_z.rank();
                rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
              }
            });

            if (composedTransform_.underlyingMatchesLogical())
            {
              if (composedTransformRank == 4) // (C,P,D,D)
              {
                const auto & composedTransformView = composedTransform_.getUnderlyingView4();
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
              else // rank 2: (C,P)
              {
                const auto & composedTransformView = composedTransform_.getUnderlyingView2();
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
            }
            else
            {
              if (composedTransformRank == 4) // (C,P,D,D)
              {
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
              else // rank 2: (C,P)
              {
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
            }

//            approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

            // synchronize threads
            teamMember.team_barrier();

            // Setting scratchView to 0 here is not necessary from an algorithmic point of view, but *might* help with performance (due to a first-touch policy)
            const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
            for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
            {
              scratchView(i) = 0.0;
            }

            // TODO: consider adding an innerLoopRange that is sized to be the maximum of the size of the inner loops we'd like to parallelize over. (Note that this means we do the work in the outer loop redundantly that many times…)
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
              const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
              const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;

              // as an optimization, we use the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
              // (the "R" argument here is ignored by the methods that treat the truncated field; it's beyond their bounds)
              Kokkos::Array<int,numTensorComponents> leftFieldArguments;
              Kokkos::Array<int,numTensorComponents> rightFieldArguments;
              rightFieldArguments[R] = jz;
              leftFieldArguments[R]  = iz;

              Kokkos::Array<int,numTensorComponents> pointArguments;
              for (int i=0; i<numTensorComponents; i++)
              {
                pointArguments[i] = 0;
              }

              for (int lx=0; lx<pointBounds[0]; lx++)
              {
                pointArguments[0] = lx;

                // clear Gy scratch:
                // in scratch, Gz (1 entry) comes first, then the Gy entries.
                const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
                const int Gy_end_index   = scratchOffsetForThread + offsetsForComponentOrdinal_[0];

                for (int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
                {
                  scratchView(Gy_index) = 0;
                }

                for (int ly=0; ly<pointBounds[1]; ly++)
                {
                  pointArguments[1] = ly;

                  Scalar * Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
                  *Gz = 0;

                  for (int lz=0; lz < pointBounds[2]; lz++)
                  {
                    const Scalar & leftValue  = leftFields_z(iz,lz);
                    const Scalar & rightValue = rightFields_z(jz,lz);

                    pointArguments[2] = lz;
                    int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);

                    *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;

//                    approximateFlopCount += 3; // 2 multiplies, 1 sum
                  } // lz

                  for (int iy=0; iy<leftFieldBounds_[1]; iy++)
                  {
                    leftFieldArguments[1] = iy;
                    const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);

                    const Scalar & leftValue = leftFields_y(iy,ly);

                    for (int jy=0; jy<rightFieldBounds_[1]; jy++)
                    {
                      rightFieldArguments[1] = jy;

                      const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
                      const Scalar & rightValue = rightFields_y(jy,ly);

                      const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
                      const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;

                      const int Gz_index = 0;
                      const Scalar & Gz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);

                      Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);

                      Gy += leftValue * Gz * rightValue;
//                      approximateFlopCount += 3; // 2 multiplies, 1 sum
                    }
                  }
                } // ly
                for (int ix=0; ix<leftFieldBounds_[0]; ix++)
                {
                  leftFieldArguments[0] = ix;
                  const Scalar & leftValue = leftFields_x(ix,lx);

                  for (int iy=0; iy<leftFieldBounds_[1]; iy++)
                  {
                    leftFieldArguments[1] = iy;

                    const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);

                    for (int jx=0; jx<rightFieldBounds_[0]; jx++)
                    {
                      rightFieldArguments[0] = jx;
                      const Scalar & rightValue = rightFields_x(jx,lx);

                      for (int jy=0; jy<rightFieldBounds_[1]; jy++)
                      {
                        rightFieldArguments[1] = jy;
                        const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);

                        const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];

                        const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
                        Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);

                        // compute enumeration indices to get field indices into the output view
                        int leftEnumerationIndex  = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments,  leftFieldBounds, 0);
                        int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
                        const int leftFieldIndex  = leftEnumerationIndex  + leftFieldOrdinalOffset_;
                        const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;

                        if (integralViewRank == 3)
                        {
//                          if ((leftFieldIndex==0) && (rightFieldIndex==2))
//                          {
//                            using std::cout;
//                            using std::endl;
//                            cout << "***** Contribution to (0,0,2) *****\n";
//                            cout << "lx = " << lx << endl;
//                            cout << "ix = " << ix << endl;
//                            cout << "iy = " << iy << endl;
//                            cout << "jx = " << jx << endl;
//                            cout << "jy = " << jy << endl;
//                            cout << "iz = " << iz << endl;
//                            cout << "jz = " << jz << endl;
//                            cout << " leftValue = " << leftValue << endl;
//                            cout << "rightValue = " << rightValue << endl;
//                            cout << "Gy = " << Gy << endl;
//
//                            cout << endl;
//                          }

                          // shape (C,F1,F2)
                          integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
                        }
                        else
                        {
                          // shape (F1,F2)
                          integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
                        }
//                        approximateFlopCount += 3; // 2 multiplies, 1 sum
                      } // jy
                    } // jx
                  } // iy
                } // ix
              } // lx
            }); // TeamThreadRange parallel_for - (iz,jz) loop
          }
        }
//        std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
      }
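
      // The loop nest in runSpecialized3 implements the sum factorization
      //   I(i,j) = sum_lx Lx(ix,lx) Rx(jx,lx) * [ sum_ly Ly(iy,ly) Ry(jy,ly) * [ sum_lz Lz(iz,lz) w(lx,ly,lz) Rz(jz,lz) ] ]
      // where w is pointWeights (transform times cell measure), Gz caches the innermost
      // bracket, and Gy caches the middle bracket for each (iy,jy) pair. For each (iz,jz)
      // thread, each quadrature weight is thus read once rather than once per field pair.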

      template<size_t numTensorComponents>
      KOKKOS_INLINE_FUNCTION
      void run( const TeamMember & teamMember ) const
      {
        Kokkos::Array<int,numTensorComponents> pointBounds;
        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          pointBounds[r]      = pointBounds_[r];
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const int cellDataOrdinal = teamMember.league_rank();
        const int numThreads      = teamMember.team_size(); // num threads
        const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread
        Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged> scratchView;  // for caching partial integration values
        Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
        Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values for faster access
        if (fad_size_output_ > 0) {
          scratchView  = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
          pointWeights = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
          leftFields   = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_,  maxPointCount_, fad_size_output_);
          rightFields  = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
        }
        else {
          scratchView  = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
          pointWeights = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));
          leftFields   = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_,  maxPointCount_);
          rightFields  = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

        constexpr int R = numTensorComponents - 1;

        const int composedTransformRank = composedTransform_.rank();

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (const int& r) {
              const Data<Scalar,DeviceType> & leftTensorComponent  = leftComponent_.getTensorComponent(r);
              const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.getTensorComponent(r);
              const int leftFieldCount  = leftTensorComponent.extent_int(0);
              const int pointCount      = leftTensorComponent.extent_int(1);
              const int leftRank        = leftTensorComponent.rank();
              const int rightFieldCount = rightTensorComponent.extent_int(0);
              const int rightRank       = rightTensorComponent.rank();
              for (int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
              {
                // slightly weird logic here in an effort to avoid branch divergence
                const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
                for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
                {
                  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
                  leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
                }
              }
              for (int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
              {
                // slightly weird logic here in an effort to avoid branch divergence
                const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
                for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
                {
                  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
                  rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
                }
              }
            });

            if (composedTransformRank == 4)
            {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
              });
            }
            else
            {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
              });
            }
//            approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

            // synchronize threads
            teamMember.team_barrier();

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
              const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
              const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
              const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;

              // as an optimization, we use the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
              // (the "R" argument here is ignored by the methods that treat the truncated field; it's beyond their bounds)
              Kokkos::Array<int,numTensorComponents> leftFieldArguments;
              Kokkos::Array<int,numTensorComponents> rightFieldArguments;
              rightFieldArguments[R] = j_R;
              leftFieldArguments[R]  = i_R;

              // TODO: I believe that this can be moved outside the thread parallel_for
              for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
              {
                scratchView(i) = 0.0;
              }
              Kokkos::Array<int,numTensorComponents> pointArguments;
              for (unsigned i=0; i<numTensorComponents; i++)
              {
                pointArguments[i] = 0;
              }

              int r = R;
              while (r >= 0)
              {
                // integrate in the final component dimension; this is where we need the M weight, as well as the weighted measure
                const int pointBounds_R = pointBounds[R];
                int & pointArgument_R = pointArguments[R];
                for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
                {
                  Scalar * G_R;
                  if (R != 0)
                  {
                    G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
                  }
                  else
                  {
                    const int leftFieldIndex  = i_R + leftFieldOrdinalOffset_;
                    const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;

                    if (integralViewRank == 3)
                    {
                      // shape (C,F1,F2)
                      G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
                    }
                    else
                    {
                      // shape (F1,F2)
                      G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
                    }
                  }

                  const int & pointIndex_R = pointArguments[R];

                  const Scalar & leftValue  = leftFields(R,i_R,pointIndex_R);
                  const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);

                  int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);

                  *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;

//                  approximateFlopCount += 3; // 2 multiplies, 1 sum
                } // pointArgument_R

                const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents-1 means that we won't touch the [R] argument, which is treated in the for loop above
                const int r_min  = (r_next >= 0) ? r_next : 0;

                for (int s=R-1; s>=r_min; s--)
                {
                  const int & pointIndex_s = pointArguments[s];

                  // we want to cover all the multi-indices from s to R-1
                  for (int s1=s; s1<R; s1++)
                  {
                    leftFieldArguments[s1] = 0;
                  }

                  // i_s, j_s are the indices into the "current" tensor component; these are references, so they actually vary as the arguments are incremented.
                  const int & i_s = leftFieldArguments[s];
                  const int & j_s = rightFieldArguments[s];

                  int sLeft = s; // hereafter, sLeft is the return from the left field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indices from rank s through R-1 (inclusive).
                  const int & rightEnumerationSpan_s  = rightFieldRelativeEnumerationSpans_[s];
                  const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];

                  while (sLeft >= s)
                  {
                    const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);

                    // for the final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
                    const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);

                    const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);

                    for (int s1=s; s1<R; s1++)
                    {
                      rightFieldArguments[s1] = 0;
                    }
                    int sRight = s; // hereafter, sRight is the return from the right field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indices from rank s through R-1 (inclusive).
                    while (sRight >= s)
                    {
                      const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);

                      // for the final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
                      const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);

                      const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);

                      const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;

                      Scalar* G_s;

                      // for the final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage
                      // (above, we have set leftEnumerationIndex_sp and rightEnumerationIndex_sp to be 0 in this case, so G_sp_index will then also be 0)
                      const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;

                      const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);

                      if (s != 0)
                      {
                        G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
                      }
                      else
                      {
                        // compute enumeration indices
                        int leftEnumerationIndex  = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments,  leftFieldBounds, 0);
                        int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
                        const int leftFieldIndex  = leftEnumerationIndex  + leftFieldOrdinalOffset_;
                        const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;

                        if (integralViewRank == 3)
                        {
                          // shape (C,F1,F2)
                          G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
                        }
                        else
                        {
                          // shape (F1,F2)
                          G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
                        }
                      }

                      *G_s += leftValue * G_sp * rightValue;

//                      approximateFlopCount += 3; // 2 multiplies, 1 sum

                      // increment rightField
                      sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
                    }

                    // increment leftField
                    sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
                  }
                }

                // clear tempStorage for r_next+1 to R
                if (r_min+1 <= R)
                {
                  const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
                  for (int i=scratchOffsetForThread; i<endIndex; i++)
                  {
                    scratchView(i) = 0.0;
                  }
//                  auto tempStorageSubview = Kokkos::subview(scratchView, Kokkos::pair<int,int>{0,offsetsForComponentOrdinal_[r_min]});
//                  Kokkos::deep_copy(tempStorageSubview, 0.0);
                }

                // proceed to the next point
                r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents-1 means that we won't touch the [R] argument, which is treated in the G_R integration for loop above
              }
            }); // TeamThreadRange parallel_for
          }
        }
//        std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
      }

      KOKKOS_INLINE_FUNCTION
      void operator()( const TeamMember & teamMember ) const
      {
        switch (numTensorComponents_)
        {
          case 1: run<1>(teamMember); break;
          case 2: run<2>(teamMember); break;
          case 3:
            if (forceNonSpecialized_)
              run<3>(teamMember);
            else
              runSpecialized3(teamMember);
            break;
          case 4: run<4>(teamMember); break;
          case 5: run<5>(teamMember); break;
          case 6: run<6>(teamMember); break;
          case 7: run<7>(teamMember); break;
#ifdef INTREPID2_HAVE_DEBUG
          default:
            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
#endif
        }
      }
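
      // The switch above maps the runtime tensor-component count to a compile-time template
      // argument, so that the Kokkos::Array bounds inside run<> have compile-time extents;
      // the cases go up to 7, matching the extent of offsetsForComponentOrdinal_.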

      //! returns an approximate count of the floating point operations performed for a single cell (the caller multiplies by the cell count)
      int approximateFlopCountPerCell() const
      {
        // compute flop count on a single cell, then multiply by the number of cells
        int flopCount = 0;

        const int R = numTensorComponents_ - 1;

        // we cache the value of M_ab * cellMeasure at each point.
        // access to cellMeasures involves cellMeasures.numTensorComponents() - 1 multiplies, so the total is the below:
        const int flopsPerPoint_ab = cellMeasures_.numTensorComponents(); // the access involves multiplying all the components together

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            flopCount += flopsPerPoint_ab * cellMeasures_.extent_int(1);

            int flopsPer_i_R_j_R = 0;

            // as an optimization, we use the same argument arrays for both the truncated fields that span s to R-1 and the full ones that go to R
            // (the "R" argument here is ignored by the methods that treat the truncated field; it's beyond their bounds)
            Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
            leftFieldArguments[R] = 0;

            Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
            for (int i=0; i<numTensorComponents_; i++)
            {
              pointArguments[i] = 0;
            }

            Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
            rightFieldArguments[R] = 0;

            int r = R;
            while (r >= 0)
            {
              // integrate in the final component dimension
              for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
              {
                flopsPer_i_R_j_R += 4;
              }
              // TODO: figure out why the below is not the same thing as the above -- the below overcounts, somehow
//              if (0 < pointBounds_[R])
//              {
//                flopsPer_i_R_j_R += pointBounds_[R] * 4;
//              }

              const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
              const int r_min  = (r_next >= 0) ? r_next : 0;

              for (int s=R-1; s>=r_min; s--)
              {
                // we want to cover all the multi-indices from s to R-1: for each we have 2 multiplies and one add (3 flops)
                int numLeftIterations  = leftFieldRelativeEnumerationSpans_[s];
                int numRightIterations = rightFieldRelativeEnumerationSpans_[s];

                flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
              }

              // proceed to the next point
              r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
            }

            flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
          }
        }
//        std::cout << "flop count per cell: " << flopCount << std::endl;
        return flopCount;
      }

      //! returns the team size to use, given the maximum team size that Kokkos allows
      int teamSize(const int &maxTeamSizeFromKokkos) const
      {
        const int R = numTensorComponents_ - 1;
        const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
        return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
      }

      //! provides the shared memory capacity required for the specified team size
      size_t team_shmem_size (int team_size) const
      {
        // we use shared memory to create a fast buffer for intermediate values, as well as for fast access to the current cell's field values
        size_t shmem_size = 0;

        if (fad_size_output_ > 0)
        {
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1), fad_size_output_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_,  maxPointCount_, fad_size_output_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
        }
        else
        {
          shmem_size += Kokkos::View<Scalar*,   DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1));
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_,  maxPointCount_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
        }

        return shmem_size;
      }
    };
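
    // Usage sketch (illustrative; not part of this file): the functor is launched with one
    // league entry per cell, and Kokkos consults team_shmem_size() automatically for
    // team-level scratch. Assuming a constructed functor f and a cell count cellCount:
    //
    //   Kokkos::TeamPolicy<ExecutionSpace> policy(cellCount, Kokkos::AUTO());
    //   const int maxTeamSize = policy.team_size_max(f, Kokkos::ParallelForTag());
    //   policy = Kokkos::TeamPolicy<ExecutionSpace>(cellCount, f.teamSize(maxTeamSize));
    //   Kokkos::parallel_for(policy, f);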

    /** \brief Sum-factorized integration functor that caches the point values M_ab * cellMeasure in shared memory; uses hierarchical parallelism.
    */
    template<class Scalar, class DeviceType, int integralViewRank>
    class F_IntegratePointValueCache
    {
      using ExecutionSpace = typename DeviceType::execution_space;
      using TeamPolicy = Kokkos::TeamPolicy<DeviceType>;
      using TeamMember = typename TeamPolicy::member_type;

      using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
      IntegralViewType integralView_;
      TensorData<Scalar,DeviceType> leftComponent_;
      Data<Scalar,DeviceType> composedTransform_;
      TensorData<Scalar,DeviceType> rightComponent_;
      TensorData<Scalar,DeviceType> cellMeasures_;
      int a_offset_;
      int b_offset_;
      int leftComponentSpan_;  // leftComponentSpan tracks the dimensions spanned by the left component
      int rightComponentSpan_; // rightComponentSpan tracks the dimensions spanned by the right component
      int numTensorComponents_;
      int leftFieldOrdinalOffset_;
      int rightFieldOrdinalOffset_;

      size_t fad_size_output_ = 0; // 0 if not a fad type

      // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
      // (this also makes it easier to reorder loops, etc., for further optimizations)
      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;

      int maxFieldsLeft_;
      int maxFieldsRight_;
      int maxPointCount_;
    public:
      F_IntegratePointValueCache(Data<Scalar,DeviceType> integralData,
                                 TensorData<Scalar,DeviceType> leftComponent,
                                 Data<Scalar,DeviceType> composedTransform,
                                 TensorData<Scalar,DeviceType> rightComponent,
                                 TensorData<Scalar,DeviceType> cellMeasures,
                                 int a_offset,
                                 int b_offset,
                                 int leftFieldOrdinalOffset,
                                 int rightFieldOrdinalOffset)
      :
      integralView_(integralData.template getUnderlyingView<integralViewRank>()),
      leftComponent_(leftComponent),
      composedTransform_(composedTransform),
      rightComponent_(rightComponent),
      cellMeasures_(cellMeasures),
      a_offset_(a_offset),
      b_offset_(b_offset),
      leftComponentSpan_(leftComponent.extent_int(2)),
      rightComponentSpan_(rightComponent.extent_int(2)),
      numTensorComponents_(leftComponent.numTensorComponents()),
      leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
      rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
      {
        INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");

        const int FIELD_DIM = 0;
        const int POINT_DIM = 1;
        maxFieldsLeft_  = 0;
        maxFieldsRight_ = 0;
        maxPointCount_  = 0;
        for (int r=0; r<numTensorComponents_; r++)
        {
          leftFieldBounds_[r]  = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsLeft_       = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
          rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsRight_      = std::max(maxFieldsRight_, rightFieldBounds_[r]);
          pointBounds_[r]      = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
          maxPointCount_       = std::max(maxPointCount_, pointBounds_[r]);
        }

        // prepare for allocation of temporary storage
        // note: tempStorage goes "backward", starting from the final component, which needs just one entry

        const bool allocateFadStorage = !std::is_pod<Scalar>::value;
        if (allocateFadStorage)
        {
          fad_size_output_ = dimension_scalar(integralView_);
        }
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,maxComponents> &arguments,
                            const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      //! Runtime-sized variant of incrementArgument.
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                            const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                            const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
                              const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      //! Runtime-sized variant of nextIncrementResult.
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                              const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                              const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = numComponents - 1;
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
                                   const Kokkos::Array<int,maxComponents> &bounds,
                                   const int startIndex) const
      {
        // the following mirrors what is done in TensorData
        if (numComponents == 0)
        {
          return 0;
        }
        int enumerationIndex = 0;
        for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
        {
          enumerationIndex += arguments[r];
          enumerationIndex *= bounds[r-1];
        }
        enumerationIndex += arguments[startIndex];
        return enumerationIndex;
      }

      template<int rank>
      KOKKOS_INLINE_FUNCTION
      enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
      integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
      {
        return integralView(cellDataOrdinal,i,j);
      }

      template<int rank>
      KOKKOS_INLINE_FUNCTION
      enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
      integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
      {
        return integralView(i,j);
      }
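
      // The enable_if_t conditions above select exactly one overload per integralViewRank:
      // the rank-3 overload indexes (cell, leftField, rightField), while the rank-2 overload
      // ignores cellDataOrdinal and indexes (leftField, rightField).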
1209 
1211  KOKKOS_INLINE_FUNCTION
1212  void runSpecialized3( const TeamMember & teamMember ) const
1213  {
1214  constexpr int numTensorComponents = 3;
1215 
1216  const int pointBounds_x = pointBounds_[0];
1217  const int pointBounds_y = pointBounds_[1];
1218  const int pointBounds_z = pointBounds_[2];
1219  const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;
1220 
1221  const int leftFieldBounds_x = leftFieldBounds_[0];
1222  const int rightFieldBounds_x = rightFieldBounds_[0];
1223  const int leftFieldBounds_y = leftFieldBounds_[1];
1224  const int rightFieldBounds_y = rightFieldBounds_[1];
1225  const int leftFieldBounds_z = leftFieldBounds_[2];
1226  const int rightFieldBounds_z = rightFieldBounds_[2];
1227 
1228  Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1229  Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1230  for (unsigned r=0; r<numTensorComponents; r++)
1231  {
1232  leftFieldBounds[r] = leftFieldBounds_[r];
1233  rightFieldBounds[r] = rightFieldBounds_[r];
1234  }
1235 
1236  const auto integralView = integralView_;
1237  const auto leftFieldOrdinalOffset = leftFieldOrdinalOffset_;
1238  const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;
1239 
1240  const int cellDataOrdinal = teamMember.league_rank();
1241  const int threadNumber = teamMember.team_rank();
1242 
1243  const int numThreads = teamMember.team_size(); // num threads
1244  const int GyEntryCount = pointBounds_z; // for each thread: store one Gy value per z coordinate
1245  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals; // for caching Gx values: we integrate out the first component dimension for each coordinate in the remaining dimensios
1246  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals; // for caching Gy values (each thread gets a stack, of the same height as tensorComponents - 1)
1247  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GzIntegral; // for one Gz value that we sum into before summing into the destination matrix
1248  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1249 
1250  Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
1251  Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
1252  Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
1253  if (fad_size_output_ > 0) {
1254  GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1255  GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
1256  GzIntegral = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads, fad_size_output_);
1257  pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1258 
1259  leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x, fad_size_output_);
1260  rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
1261  leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y, fad_size_output_);
1262  rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
1263  leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z, fad_size_output_);
1264  rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
1265  }
1266  else {
1267  GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1268  GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
1269  GzIntegral = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads);
1270  pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1271 
1272  leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x);
1273  rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
1274  leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y);
1275  rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
1276  leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z);
1277  rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
1278  }
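 // A note on the two allocation branches above: when Scalar is a Fad (derivative-carrying) type, each
 // entry holds fad_size_output_ derivative components in addition to its value, so the unmanaged scratch
 // views take that as an extra trailing constructor argument; for plain scalar types it must be omitted.
 // Schematically:
 //   Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(shmem, N, fadSize); // Fad scalar
 //   Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(shmem, N);          // POD scalar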
1279 
1280 // int approximateFlopCount = 0;
1281 // int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;
1282 
1283  // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1284 
1285  const int composedTransformRank = composedTransform_.rank();
1286 
1287  // synchronize threads
1288  teamMember.team_barrier();
1289 
1290  for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1291  {
1292  const int a = a_offset_ + a_component;
1293  for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1294  {
1295  const int b = b_offset_ + b_component;
1296 
1297  const Data<Scalar,DeviceType> & leftTensorComponent_x = leftComponent_.getTensorComponent(0);
1298  const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
1299  const Data<Scalar,DeviceType> & leftTensorComponent_y = leftComponent_.getTensorComponent(1);
1300  const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
1301  const Data<Scalar,DeviceType> & leftTensorComponent_z = leftComponent_.getTensorComponent(2);
1302  const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);
1303 
1304  const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
1305  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (const int& fieldOrdinal) {
1306  if (fieldOrdinal < leftTensorComponent_x.extent_int(0))
1307  {
1308  const int pointCount = leftTensorComponent_x.extent_int(1);
1309  const int leftRank = leftTensorComponent_x.rank();
1310  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1311  {
1312  leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1313  }
1314  }
1315  if (fieldOrdinal < leftTensorComponent_y.extent_int(0))
1316  {
1317  const int pointCount = leftTensorComponent_y.extent_int(1);
1318  const int leftRank = leftTensorComponent_y.rank();
1319  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1320  {
1321  leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1322  }
1323  }
1324  if (fieldOrdinal < leftTensorComponent_z.extent_int(0))
1325  {
1326  const int pointCount = leftTensorComponent_z.extent_int(1);
1327  const int leftRank = leftTensorComponent_z.rank();
1328  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1329  {
1330  leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1331  }
1332  }
1333  if (fieldOrdinal < rightTensorComponent_x.extent_int(0))
1334  {
1335  const int pointCount = rightTensorComponent_x.extent_int(1);
1336  const int rightRank = rightTensorComponent_x.rank();
1337  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1338  {
1339  rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
1340  }
1341  }
1342  if (fieldOrdinal < rightTensorComponent_y.extent_int(0))
1343  {
1344  const int pointCount = rightTensorComponent_y.extent_int(1);
1345  const int rightRank = rightTensorComponent_y.rank();
1346  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1347  {
1348  rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
1349  }
1350  }
1351  if (fieldOrdinal < rightTensorComponent_z.extent_int(0))
1352  {
1353  const int pointCount = rightTensorComponent_z.extent_int(1);
1354  const int rightRank = rightTensorComponent_z.rank();
1355  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1356  {
1357  rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
1358  }
1359  }
1360  });
1361 
1362  if (composedTransformRank == 4) // (C,P,D,D)
1363  {
1364  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1365  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1366  });
1367  }
1368  else // (C,P)
1369  {
1370  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1371  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1372  });
1373  }
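 // At this point pointWeights(p) holds composedTransform_(c,p,a,b) * cellMeasures_(c,p) (or the (C,P)
 // variant when the transform is scalar-valued) -- i.e., the (a,b) entry of the composed transform times
 // the measure-weighted quadrature weight. The sum-factorized loops below read the quadrature data
 // exclusively through this container.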
1374 
1375  // synchronize threads
1376  teamMember.team_barrier();
1377 
1378  for (int i0=0; i0<leftFieldBounds_x; i0++)
1379  {
1380  for (int j0=0; j0<rightFieldBounds_x; j0++)
1381  {
1382  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1383  // first component is fastest-moving; we can get a tensorPointEnumerationOffset just by multiplying by the pointBounds in x
1384  const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions; // compute offset for pointWeights container, for which x is the fastest-moving
1385 
1386  Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1387 
1388  Gx = 0;
1389  if (fad_size_output_ == 0)
1390  {
1391  // not a Fad type; we're allowed to use a vector range
1392  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1393  {
1394  integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1395  }, Gx);
1396  }
1397  else
1398  {
1399  for (int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1400  {
1401  Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1402  }
1403  }
1404  });
1405 
1406  // synchronize threads
1407  teamMember.team_barrier();
1408 
1409  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (const int& i1j1) {
1410  const int i1 = i1j1 % leftFieldBounds_y;
1411  const int j1 = i1j1 / leftFieldBounds_y;
1412 
1413  int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1414 
1415  int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
1416  for (int lz=0; lz<pointBounds_z; lz++)
1417  {
1418  Scalar & Gy = GyIntegrals(Gy_index);
1419  Gy = 0.0;
1420 
1421  for (int ly=0; ly<pointBounds_y; ly++)
1422  {
1423  const Scalar & leftValue = leftFields_y(i1,ly);
1424  const Scalar & rightValue = rightFields_y(j1,ly);
1425 
1426  Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1427 
1428  pointEnumerationIndex++;
1429  }
1430  Gy_index++;
1431  }
1432 
1433  Scalar & Gz = GzIntegral(threadNumber); // one entry per thread
1434  for (int i2=0; i2<leftFieldBounds_z; i2++)
1435  {
1436  for (int j2=0; j2<rightFieldBounds_z; j2++)
1437  {
1438  Gz = 0.0;
1439 
1440  int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1441 
1442  for (int lz=0; lz<pointBounds_z; lz++)
1443  {
1444  const Scalar & leftValue = leftFields_z(i2,lz);
1445  const Scalar & rightValue = rightFields_z(j2,lz);
1446 
1447  Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1448 
1449  Gy_index++;
1450  }
1451 
1452  const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1453  const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1454  // the above are an optimization of the below, undertaken on the occasion of a weird Intel compiler segfault, possibly a compiler bug.
1455 // const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset;
1456 // const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset;
1457 
1458  integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1459  }
1460  }
1461  });
1462  // synchronize threads
1463  teamMember.team_barrier();
1464  }
1465  }
1466  }
1467  }
1468  }
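 // In outline, runSpecialized3() evaluates the factored triple sum
 //   integral(c,i,j) += sum_lz Lz(i2,lz) Rz(j2,lz) [ sum_ly Ly(i1,ly) Ry(j1,ly) [ sum_lx Lx(i0,lx) Rx(j0,lx) w(c,lx,ly,lz) ] ]
 // where w is the pointWeights container and Lx/Ly/Lz, Rx/Ry/Rz are the cached one-dimensional field values.
 // The inner bracket is Gx, the middle bracket Gy, and the outer sum Gz; reusing Gx across all (i1,i2,j1,j2)
 // and Gy across all (i2,j2) is what reduces the naive (p+1)^9 contraction in 3D to roughly (p+1)^7 work.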
1469 
1470  template<size_t numTensorComponents>
1471  KOKKOS_INLINE_FUNCTION
1472  void run( const TeamMember & teamMember ) const
1473  {
1474  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::logic_error, "implementation incomplete");
1475 // Kokkos::Array<int,numTensorComponents> pointBounds;
1476 // Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1477 // Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1478 //
1479 // int pointsInNonzeroComponentDimensions = 1;
1480 // for (unsigned r=0; r<numTensorComponents; r++)
1481 // {
1482 // pointBounds[r] = pointBounds_[r];
1483 // if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1484 // leftFieldBounds[r] = leftFieldBounds_[r];
1485 // rightFieldBounds[r] = rightFieldBounds_[r];
1486 // }
1487 //
1488 // const int cellDataOrdinal = teamMember.league_rank();
1489 // const int numThreads = teamMember.team_size(); // num threads
1490 // const int G_k_StackHeight = numTensorComponents - 1; // per thread
1491 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_0_IntegralsView; // for caching G0 values: we integrate out the first component dimension for each coordinate in the remaining dimensions
1492 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_k_StackView; // for caching G_k values (each thread gets a stack, of the same height as tensorComponents - 1)
1493 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1494 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values at each level for faster access
1495 // if (fad_size_output_ > 0) {
1496 // G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads, fad_size_output_);
1497 // G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1498 // pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1499 // leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1500 // rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1501 // }
1502 // else {
1503 // G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads);
1504 // G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1505 // pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1506 // leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1507 // rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1508 // }
1509 //
1512 //
1513 // constexpr int R = numTensorComponents - 1;
1514 //
1515 // // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1516 //
1517 // // synchronize threads
1518 // teamMember.team_barrier();
1519 //
1520 // for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1521 // {
1522 // const int a = a_offset_ + a_component;
1523 // for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1524 // {
1525 // const int b = b_offset_ + b_component;
1526 //
1527 // const Data<Scalar,DeviceType> & leftFirstComponent = leftComponent_.getTensorComponent(0);
1528 // const Data<Scalar,DeviceType> & rightFirstComponent = rightComponent_.getTensorComponent(0);
1529 //
1530 // const int numLeftFieldsFirst = leftFirstComponent.extent_int(0); // shape (F,P[,D])
1531 // const int numRightFieldsFirst = rightFirstComponent.extent_int(0); // shape (F,P[,D])
1532 //
1533 // const int numPointsFirst = leftFirstComponent.extent_int(1);
1534 //
1535 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1536 // pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1537 // });
1538 //
1539 // // synchronize threads
1540 // teamMember.team_barrier();
1541 //
1542 // for (int i0=0; i0<numLeftFieldsFirst; i0++)
1543 // {
1544 // for (int j0=0; j0<numRightFieldsFirst; j0++)
1545 // {
1546 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1547 // const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1548 // const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1549 // leftFields(pointOrdinal) = leftFirstComponent(fieldOrdinal,pointOrdinal);
1550 // });
1551 //
1552 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numRightFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1553 // const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1554 // const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1555 // rightFields(pointOrdinal) = rightFirstComponent(fieldOrdinal,pointOrdinal);
1556 // });
1557 //
1558 // // synchronize threads
1559 // teamMember.team_barrier();
1560 //
1561 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1562 // Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1563 // int remainingIndex = pointEnumerationIndexLaterDimensions;
1564 //
1565 // for (int d=R-1; d>0; d--) // last component (z in 3D hypercube) is fastest-moving // TODO: consider doing first component as fastest-moving. That would make indexing into pointWeights simpler
1566 // {
1567 // pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1];
1568 // remainingIndex /= pointBounds[d+1];
1569 // }
1570 // pointArgumentsInLaterDimensions[0] = remainingIndex;
1571 //
1572 // int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1573 // for (int d=R; d>0; d--)
1574 // {
1575 // tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1576 // tensorPointEnumerationOffset *= pointBounds[d-1];
1577 // }
1578 //
1579 // Scalar integralValue = 0;
1580 // if (fad_size_output_ == 0)
1581 // {
1582 // // not a Fad type; we're allowed to use a vector range
1583 // Kokkos::parallel_reduce("first component integral", Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1584 // {
1585 // integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1586 // }, integralValue);
1587 // }
1588 // else
1589 // {
1590 // for (int pointOrdinal=0; pointOrdinal<numPointsFirst; pointOrdinal++)
1591 // {
1592 // integralValue += leftFields(pointOrdinal) * rightFields(pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1593 // }
1594 // }
1595 //
1596 // G_0_IntegralsView(pointEnumerationIndexLaterDimensions) = integralValue;
1597 // });
1598 //
1599 // // synchronize threads
1600 // teamMember.team_barrier();
1601 //
1602 // // TODO: finish this, probably after having written up the algorithm for arbitrary component count. (I have it written down for 3D.)
1603 // }
1604 // }
1605 // }
1606 // }
1608  }
1609 
1610  KOKKOS_INLINE_FUNCTION
1611  void operator()( const TeamMember & teamMember ) const
1612  {
1613  switch (numTensorComponents_)
1614  {
1615  case 1: run<1>(teamMember); break;
1616  case 2: run<2>(teamMember); break;
1617  case 3: runSpecialized3(teamMember); break;
1618  case 4: run<4>(teamMember); break;
1619  case 5: run<5>(teamMember); break;
1620  case 6: run<6>(teamMember); break;
1621  case 7: run<7>(teamMember); break;
1622 #ifdef HAVE_INTREPID2_DEBUG
1623  default:
1624  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
1625 #endif
1626  }
1627  }
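 // This functor is designed to be launched with one team per cell; a usage sketch (construction details
 // elided, "functor" standing for an instance of this struct):
 //   Kokkos::TeamPolicy<ExecutionSpace> policy(cellDataExtent, Kokkos::AUTO(), vectorSize);
 //   Kokkos::parallel_for(policy, functor); // team scratch is sized via team_shmem_size() below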
1628 
1630  long approximateFlopCountPerCell() const
1631  {
1632  // compute flop count on a single cell
1633  int flopCount = 0;
1634 
1635  constexpr int numTensorComponents = 3;
1636  Kokkos::Array<int,numTensorComponents> pointBounds;
1637  Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1638  Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1639 
1640  int pointsInNonzeroComponentDimensions = 1;
1641  for (unsigned r=0; r<numTensorComponents; r++)
1642  {
1643  pointBounds[r] = pointBounds_[r];
1644  if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1645  leftFieldBounds[r] = leftFieldBounds_[r];
1646  rightFieldBounds[r] = rightFieldBounds_[r];
1647  }
1648 
1649  for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1650  {
1651  for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1652  {
1653 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1654 // pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1655 // });
1656  flopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1657 
1658  for (int i0=0; i0<leftFieldBounds[0]; i0++)
1659  {
1660  for (int j0=0; j0<rightFieldBounds[0]; j0++)
1661  {
1662  flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3; // 3 flops per integration point in the loop commented out below
1663 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1664 // Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1665 // int remainingIndex = pointEnumerationIndexLaterDimensions;
1666 //
1667 // for (int d=0; d<R-1; d++) // first component is fastest-moving; this is determined by order of access in the lz/ly loop to compute Gy (integrals in y dimension)
1668 // {
1669 // pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1]; // d+1 because x dimension is being integrated away
1670 // remainingIndex /= pointBounds[d+1];
1671 // }
1672 // pointArgumentsInLaterDimensions[R-1] = remainingIndex;
1673 //
1674 // int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1675 // for (int d=R; d>0; d--)
1676 // {
1677 // tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1678 // tensorPointEnumerationOffset *= pointBounds[d-1];
1679 // }
1680 //
1681 // Scalar integralValue = 0;
1682 // if (fad_size_output_ == 0)
1683 // {
1684 // // not a Fad type; we're allowed to use a vector range
1685 // Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1686 // {
1687 // integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1688 // }, integralValue);
1689 // }
1690 // else
1691 // {
1692 // for (int x_pointOrdinal=0; x_pointOrdinal<numPointsFirst; x_pointOrdinal++)
1693 // {
1694 // integralValue += leftFields_x(x_pointOrdinal) * rightFields_x(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1695 // }
1696 // }
1697 //
1698 // GxIntegrals(pointEnumerationIndexLaterDimensions) = integralValue;
1699 // });
1700 
1701 
1702  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3; // 3 flops for each Gy += line in the below
1703  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3; // 3 flops for each Gz += line in the below
1704  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1; // 1 flops for the integralView += line below
1705 
1706 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds[1] * rightFieldBounds[1]), [&] (const int& i1j1) {
1707 // const int i1 = i1j1 % leftFieldBounds[1];
1708 // const int j1 = i1j1 / leftFieldBounds[1];
1709 //
1710 // int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1711 //
1712 // int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
1713 // for (int lz=0; lz<pointBounds[2]; lz++)
1714 // {
1715 // Scalar & Gy = GyIntegrals(Gy_index);
1716 // Gy = 0.0;
1717 //
1718 // const bool leftRankIs3 = ( leftFields_y.rank() == 3);
1719 // const bool rightRankIs3 = (rightFields_y.rank() == 3);
1720 // for (int ly=0; ly<pointBounds[1]; ly++)
1721 // {
1722 // const Scalar & leftValue = leftRankIs3 ? leftFields_y(i1,ly,a_component) : leftFields_y(i1,ly);
1723 // const Scalar & rightValue = rightRankIs3 ? rightFields_y(j1,ly,b_component) : rightFields_y(j1,ly);
1724 //
1725 // Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1726 //
1727 // pointEnumerationIndex++;
1728 // }
1729 // Gy_index++;
1730 // }
1731 //
1732 // for (int i2=0; i2<leftFieldBounds[2]; i2++)
1733 // {
1734 // for (int j2=0; j2<rightFieldBounds[2]; j2++)
1735 // {
1736 // Scalar Gz = 0.0;
1737 //
1738 // int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1739 //
1740 // const bool leftRankIs3 = ( leftFields_z.rank() == 3);
1741 // const bool rightRankIs3 = (rightFields_z.rank() == 3);
1742 // for (int lz=0; lz<pointBounds[2]; lz++)
1743 // {
1744 // const Scalar & leftValue = leftRankIs3 ? leftFields_z(i2,lz,a_component) : leftFields_z(i2,lz);
1745 // const Scalar & rightValue = rightRankIs3 ? rightFields_z(j2,lz,b_component) : rightFields_z(j2,lz);
1746 //
1747 // Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1748 //
1749 // Gy_index++;
1750 // }
1751 //
1752 // Kokkos::Array<int,3> leftArguments {i0,i1,i2};
1753 // Kokkos::Array<int,3> rightArguments {j0,j1,j2};
1754 //
1755 // const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset_;
1756 // const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset_;
1757 //
1758 // if (integralViewRank == 2)
1759 // {
1760 // integralView_.access(i,j,0) += Gz;
1761 // }
1762 // else
1763 // {
1764 // integralView_.access(cellDataOrdinal,i,j) += Gz;
1765 // }
1766 // }
1767 // }
1768 // });
1769  }
1770  }
1771  }
1772  }
1773  return flopCount;
1774  }
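 // As a rough check on the totals above: with ~f fields and ~p points in each direction, the count is about
 // f^2 * (3 p^3 + 3 f^2 p^2 + 3 f^4 p + f^4) flops per (a,b) pair; with f ~ p+1, the Gz accumulation term
 // dominates at ~3 (p+1)^7, consistent with the (p+1)^7 operation count expected of sum factorization in 3D.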
1775 
1777  int teamSize(const int &maxTeamSizeFromKokkos) const
1778  {
1779  // TODO: fix this to match the actual parallelism expressed
1780  const int R = numTensorComponents_ - 1;
1781  const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1782  return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1783  }
1784 
1786  size_t team_shmem_size (int numThreads) const
1787  {
1788  // we use shared memory to create a fast buffer for intermediate values, as well as fast access to the current-cell's field values
1789  size_t shmem_size = 0;
1790 
1791  const int GyEntryCount = pointBounds_[2]; // for each thread: store one Gy value per z coordinate
1792 
1793  int pointsInNonzeroComponentDimensions = 1;
1794  for (int d=1; d<numTensorComponents_; d++)
1795  {
1796  pointsInNonzeroComponentDimensions *= pointBounds_[d];
1797  }
1798 
1799  if (fad_size_output_ > 0)
1800  {
1801  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_); // GxIntegrals: entries with x integrated away
1802  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_); // GyIntegrals: entries with x,y integrated away
1803  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads, fad_size_output_); // GzIntegral: entry with x,y,z integrated away
1804  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1), fad_size_output_); // pointWeights
1805 
1806  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_); // leftFields_x
1807  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_); // rightFields_x
1808  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_); // leftFields_y
1809  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_); // rightFields_y
1810  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_); // leftFields_z
1811  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_); // rightFields_z
1812  }
1813  else
1814  {
1815  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions); // GxIntegrals: entries with x integrated away
1816  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads); // GyIntegrals: entries with x,y integrated away
1817  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads); // GzIntegral: entry with x,y,z integrated away
1818  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1)); // pointWeights
1819 
1820  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]); // leftFields_x
1821  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]); // rightFields_x
1822  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]); // leftFields_y
1823  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]); // rightFields_y
1824  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]); // leftFields_z
1825  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]); // rightFields_z
1826  }
1827 
1828  return shmem_size;
1829  }
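 // Kokkos consults this member automatically when the functor is launched with a TeamPolicy, so the
 // requests here must stay in sync with the unmanaged views constructed in runSpecialized3(): each
 // shmem_size(...) term above corresponds one-to-one with a scratch view allocated there.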
1830  };
1831 
1832  template<class Scalar, class DeviceType>
1833  class F_RefSpaceIntegral
1834  {
1835  ScalarView<Scalar,DeviceType> integral_;
1836  Data<Scalar,DeviceType> left_;
1837  Data<Scalar,DeviceType> right_;
1838  Data<Scalar,DeviceType> weights_;
1839  ordinal_type dimSpan_;
1840  ordinal_type leftRank_;
1841  ordinal_type rightRank_;
1842  ordinal_type numPoints_;
1843  public:
1844  F_RefSpaceIntegral(ScalarView<Scalar,DeviceType> integralView,
1845  Data<Scalar,DeviceType> left, Data<Scalar,DeviceType> right, Data<Scalar,DeviceType> weights,
1846  ordinal_type dimSpan)
1847  :
1848  integral_(integralView),
1849  left_(left),
1850  right_(right),
1851  weights_(weights),
1852  dimSpan_(dimSpan)
1853  {
1854  leftRank_ = left.rank();
1855  rightRank_ = right.rank();
1856  numPoints_ = weights.extent_int(0);
1857  }
1858 
1859  KOKKOS_INLINE_FUNCTION
1860  void operator()( const ordinal_type & i, const ordinal_type & j ) const
1861  {
1862  Scalar refSpaceIntegral = 0.0;
1863  for (int ptOrdinal=0; ptOrdinal<numPoints_; ptOrdinal++)
1864  {
1865  const Scalar & weight = weights_(ptOrdinal);
1866  for (int a=0; a<dimSpan_; a++)
1867  {
1868  const Scalar & leftValue = ( leftRank_ == 2) ? left_(i,ptOrdinal) : left_(i,ptOrdinal,a);
1869  const Scalar & rightValue = (rightRank_ == 2) ? right_(j,ptOrdinal) : right_(j,ptOrdinal,a);
1870  refSpaceIntegral += leftValue * rightValue * weight;
1871  }
1872  }
1873  integral_(i,j) = refSpaceIntegral;
1874  }
1875  };
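 // In effect, F_RefSpaceIntegral fills the reference-space Gram matrix of one tensor component:
 //   integral(i,j) = sum_p w(p) * sum_a left(i,p,a) * right(j,p,a)
 // where the a-sum collapses to a single term when the field containers are rank 2 (scalar-valued).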
1876  }
1877 
1878 template<typename DeviceType>
1879 template<class Scalar>
1880 Data<Scalar,DeviceType> IntegrationTools<DeviceType>::allocateIntegralData(const TransformedBasisValues<Scalar,DeviceType> vectorDataLeft,
1881  const TensorData<Scalar,DeviceType> cellMeasures,
1882  const TransformedBasisValues<Scalar,DeviceType> vectorDataRight)
1883 {
1884  // allocates a (C,F,F) container for storing integral data
1885 
1886  // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1887  // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1888  const bool leftHasOrdinalFilter = vectorDataLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1889  const bool rightHasOrdinalFilter = vectorDataRight.basisValues().ordinalFilter().extent_int(0) > 0;
1890  TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1891 
1892  // determine cellDataExtent and variation type. We currently support CONSTANT, MODULAR, and GENERAL as possible output variation types, depending on the inputs.
1893  // If cellMeasures has non-trivial tensor structure, the rank-1 cell Data object is the first component.
1894  // If cellMeasures has trivial tensor structure, then the first and only component has the cell index in its first dimension.
1895  // I.e., either way the relevant Data object is cellMeasures.getTensorComponent(0)
1896  const int CELL_DIM = 0;
1897  const auto cellMeasureData = cellMeasures.getTensorComponent(0);
1898  const auto leftTransform = vectorDataLeft.transform();
1899 
1900  DimensionInfo combinedCellDimInfo = cellMeasureData.getDimensionInfo(CELL_DIM);
1901  // transforms may be invalid, indicating an identity transform. If so, it will not constrain the output at all.
1902  if (vectorDataLeft.transform().isValid())
1903  {
1904  combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataLeft.transform().getDimensionInfo(CELL_DIM));
1905  }
1906  if (vectorDataRight.transform().isValid())
1907  {
1908  combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataRight.transform().getDimensionInfo(CELL_DIM));
1909  }
1910 
1911  DataVariationType cellVariationType = combinedCellDimInfo.variationType;
1912  int cellDataExtent = combinedCellDimInfo.dataExtent;
1913 
1914  const int numCells = vectorDataLeft.numCells();
1915  const int numFieldsLeft = vectorDataLeft.numFields();
1916  const int numFieldsRight = vectorDataRight.numFields();
1917 
1918  Kokkos::Array<int,3> extents {numCells, numFieldsLeft, numFieldsRight};
1919  Kokkos::Array<DataVariationType,3> variationTypes {cellVariationType,GENERAL,GENERAL};
1920 
1921  if (cellVariationType != CONSTANT)
1922  {
1923  Kokkos::View<Scalar***,DeviceType> data("Intrepid2 integral data",cellDataExtent,numFieldsLeft,numFieldsRight);
1924  return Data<Scalar,DeviceType>(data, extents, variationTypes);
1925  }
1926  else
1927  {
1928  Kokkos::View<Scalar**,DeviceType> data("Intrepid2 integral data",numFieldsLeft,numFieldsRight);
1929  return Data<Scalar,DeviceType>(data, extents, variationTypes);
1930  }
1931 }
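 // A typical calling sequence (container names here are hypothetical):
 //   auto integrals = IntegrationTools<DeviceType>::allocateIntegralData(transformedLeft, cellMeasures, transformedRight);
 //   IntegrationTools<DeviceType>::integrate(integrals, transformedLeft, cellMeasures, transformedRight);
 // The returned Data object has logical shape (C,F1,F2); when the cell variation is CONSTANT, the
 // underlying allocation is just (F1,F2).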
1932 
1936 template<typename DeviceType>
1937 template<class Scalar>
1938 void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals, const TransformedBasisValues<Scalar,DeviceType> & basisValuesLeft,
1939  const TensorData<Scalar,DeviceType> & cellMeasures,
1940  const TransformedBasisValues<Scalar,DeviceType> & basisValuesRight, const bool sumInto,
1941  double* approximateFlops)
1942 {
1943  using ExecutionSpace = typename DeviceType::execution_space;
1944 
1945  // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1946  // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1947  const bool leftHasOrdinalFilter = basisValuesLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1948  const bool rightHasOrdinalFilter = basisValuesRight.basisValues().ordinalFilter().extent_int(0) > 0;
1949  TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1950 
1951  if (approximateFlops != NULL)
1952  {
1953  *approximateFlops = 0;
1954  }
1955 
1956  // integral data may have shape (C,F1,F2) or (if the variation type is CONSTANT in the cell dimension) shape (F1,F2)
1957  const int integralViewRank = integrals.getUnderlyingViewRank();
1958 
1959  if (!sumInto)
1960  {
1961  integrals.clear();
1962  }
1963 
1964  const int cellDataExtent = integrals.getDataExtent(0);
1965  const int numFieldsLeft = basisValuesLeft.numFields();
1966  const int numFieldsRight = basisValuesRight.numFields();
1967  const int spaceDim = basisValuesLeft.spaceDim();
1968 
1969  INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.spaceDim() != basisValuesRight.spaceDim(), std::invalid_argument, "basisValuesLeft and basisValuesRight must agree on the space dimension");
1970 
1971  const int leftFamilyCount = basisValuesLeft.basisValues().numFamilies();
1972  const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
1973 
1974  // we require that the number of tensor components in the vectors are the same for each vector entry
1975  // this is not strictly necessary, but it makes implementation easier, and we don't at present anticipate other use cases
1976  int numTensorComponentsLeft = -1;
1977  const bool isVectorValued = basisValuesLeft.vectorData().isValid();
1978  if (isVectorValued)
1979  {
1980  const bool rightIsVectorValued = basisValuesRight.vectorData().isValid();
1981  INTREPID2_TEST_FOR_EXCEPTION(!rightIsVectorValued, std::invalid_argument, "left and right must either both be vector-valued, or both scalar-valued");
1982  const auto &refVectorLeft = basisValuesLeft.vectorData();
1983  int numFamiliesLeft = refVectorLeft.numFamilies();
1984  int numVectorComponentsLeft = refVectorLeft.numComponents();
1985  Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
1986  Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
1987  for (int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
1988  {
1989  for (int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
1990  {
1991  const TensorData<Scalar,DeviceType> &tensorData = refVectorLeft.getComponent(familyOrdinal,vectorComponent);
1992  if (tensorData.numTensorComponents() > 0)
1993  {
1994  if (numTensorComponentsLeft == -1)
1995  {
1996  numTensorComponentsLeft = tensorData.numTensorComponents();
1997  }
1998  INTREPID2_TEST_FOR_EXCEPTION(numTensorComponentsLeft != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
1999  for (int r=0; r<numTensorComponentsLeft; r++)
2000  {
2001  maxFieldsForComponentLeft[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentLeft[r]);
2002  }
2003  }
2004  }
2005  }
2006  int numTensorComponentsRight = -1;
2007  const auto &refVectorRight = basisValuesRight.vectorData();
2008  int numFamiliesRight = refVectorRight.numFamilies();
2009  int numVectorComponentsRight = refVectorRight.numComponents();
2010  for (int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
2011  {
2012  for (int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
2013  {
2014  const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
2015  if (tensorData.numTensorComponents() > 0)
2016  {
2017  if (numTensorComponentsRight == -1)
2018  {
2019  numTensorComponentsRight = tensorData.numTensorComponents();
2020  }
2021  INTREPID2_TEST_FOR_EXCEPTION(numTensorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataRight must have the same number of tensor components as every other");
2022  for (int r=0; r<numTensorComponentsRight; r++)
2023  {
2024  maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
2025  }
2026  }
2027  }
2028  }
2029  INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != numVectorComponentsRight, std::invalid_argument, "Left and right vector entries must have the same number of tensorial components");
2030  }
2031  else
2032  {
2033  numTensorComponentsLeft = basisValuesLeft.basisValues().tensorData(0).numTensorComponents(); // family ordinal 0
2034  for (int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
2035  {
2036  INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "All families must match in the number of tensor components");
2037  }
2038 
2039  // check that right tensor component count also agrees
2040  for (int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
2041  {
2042  INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
2043  }
2044  }
2045  const int numPointTensorComponents = cellMeasures.numTensorComponents() - 1;
2046 
2047  if ((numPointTensorComponents == numTensorComponentsLeft) && basisValuesLeft.axisAligned() && basisValuesRight.axisAligned())
2048  {
2049  // cellMeasures is a non-trivial tensor product, and the pullbacks are all diagonals.
2050 
2051  // in this case, the integrals in each tensorial direction are entirely separable
2052  // allocate some temporary storage for the integrals in each tensorial direction
2053 
2054  // if cellMeasures is a nontrivial tensor product, that means that all cells have the same shape, up to scaling.
2055 
2056  Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
2057  for (int r=0; r<numPointTensorComponents; r++)
2058  {
2059  // first tensorial component of cell measures is the cell dimension; after that we have (P1,P2,…)
2060  pointDimensions[r] = cellMeasures.getTensorComponent(r+1).extent_int(0);
2061  }
2062 
2063  // only one of these will be a valid container:
2064  Kokkos::View<Scalar**, DeviceType> integralView2;
2065  Kokkos::View<Scalar***, DeviceType> integralView3;
2066  if (integralViewRank == 3)
2067  {
2068  integralView3 = integrals.getUnderlyingView3();
2069  }
2070  else
2071  {
2072  integralView2 = integrals.getUnderlyingView2();
2073  }
2074  for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2075  {
2076  int a_offset = 0; // left vector component offset
2077  int leftFieldOffset = basisValuesLeft.basisValues().familyFieldOrdinalOffset(leftFamilyOrdinal);
2078 
2079  const int leftVectorComponentCount = isVectorValued ? basisValuesLeft.vectorData().numComponents() : 1;
2080  for (int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
2081  {
2082  TensorData<Scalar,DeviceType> leftComponent = isVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal)
2083  : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2084  if (!leftComponent.isValid())
2085  {
2086  a_offset++; // empty components are understood to take up one dimension
2087  continue;
2088  }
2089  const int leftDimSpan = leftComponent.extent_int(2);
2090 
2091  const int leftComponentFieldCount = leftComponent.extent_int(0);
2092 
2093  for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2094  {
2095  int b_offset = 0; // right vector component offset
2096  int rightFieldOffset = basisValuesRight.basisValues().familyFieldOrdinalOffset(rightFamilyOrdinal);
2097 
2098  const int rightVectorComponentCount = isVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2099  for (int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2100  {
2101  TensorData<Scalar,DeviceType> rightComponent = isVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal)
2102  : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2103  if (!rightComponent.isValid())
2104  {
2105  b_offset++; // empty components are understood to take up one dimension
2106  continue;
2107  }
2108  const int rightDimSpan = rightComponent.extent_int(2);
2109 
2110  const int rightComponentFieldCount = rightComponent.extent_int(0);
2111 
2112  // we only accumulate for a == b (since this is the axis-aligned case). Do the a, b spans intersect?
2113  if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset)) // no intersection
2114  {
2115  b_offset += rightDimSpan;
2116 
2117  continue;
2118  }
2119 
2120  // if the a, b spans intersect, by assumption they should align exactly
2121  INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error, "left and right dimension offsets should match.");
2122  INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2123 
2124  const int d_start = a_offset;
2125  const int d_end = d_start + leftDimSpan;
2126 
2127  using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
2128  ComponentIntegralsArray componentIntegrals;
2129  for (int r=0; r<numPointTensorComponents; r++)
2130  {
2131  /*
2132  Four vector data cases to consider:
2133  1. Both vector data containers are filled with axial components - first component in 3D has form (f,0,0), second (0,f,0), third (0,0,f).
2134  2. Both vector data containers have arbitrary components - in 3D: (f1,f2,f3) where f1 is given by the first component, f2 by the second, f3 by the third.
2135  3. First container is axial, second arbitrary.
2136  4. First is arbitrary, second axial.
2137 
2138  But note that in all four cases, the structure of the integral is the same: you have three vector component integrals that get summed. The actual difference between
2139  the cases does not show up in the reference-space integrals here, but in the accumulation in physical space below, where the tensor field numbering comes into play.
2140 
2141  The choice between axial and arbitrary affects the way the fields are numbered; the arbitrary components' indices refer to the same vector function, so they correspond,
2142  while the axial components refer to distinct scalar functions, so their numbering in the data container is cumulative.
2143  */
2144 
2145  Data<Scalar,DeviceType> quadratureWeights = cellMeasures.getTensorComponent(r+1);
2146  const int numPoints = pointDimensions[r];
2147 
2148  // It may be worth considering the possibility that some of these components point to the same data -- if so, we could possibly get better data locality by making the corresponding componentIntegral entries point to the same location as well. (And we could avoid some computations here.)
2149 
2150  Data<Scalar,DeviceType> leftTensorComponent = leftComponent.getTensorComponent(r);
2151  Data<Scalar,DeviceType> rightTensorComponent = rightComponent.getTensorComponent(r);
2152 
2153  const int leftTensorComponentDimSpan = leftTensorComponent.extent_int(2);
2154  const int leftTensorComponentFields = leftTensorComponent.extent_int(0);
2155  const int rightTensorComponentDimSpan = rightTensorComponent.extent_int(2);
2156  const int rightTensorComponentFields = rightTensorComponent.extent_int(0);
2157 
2158  INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2159 
2160  for (int d=d_start; d<d_end; d++)
2161  {
2162  ScalarView<Scalar,DeviceType> componentIntegralView;
2163 
2164  const bool allocateFadStorage = !std::is_pod<Scalar>::value;
2165  if (allocateFadStorage)
2166  {
2167  auto fad_size_output = dimension_scalar(integrals.getUnderlyingView());
2168  componentIntegralView = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output);
2169  }
2170  else
2171  {
2172  componentIntegralView = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields);
2173  }
2174 
2175  auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftTensorComponentFields,rightTensorComponentFields});
2176 
2177  Impl::F_RefSpaceIntegral<Scalar, DeviceType> refSpaceIntegralFunctor(componentIntegralView, leftTensorComponent, rightTensorComponent, quadratureWeights,
2178  leftTensorComponentDimSpan);
2179  Kokkos::parallel_for("compute componentIntegrals", policy, refSpaceIntegralFunctor);
2180 
2181  componentIntegrals[r][d] = componentIntegralView;
2182 
2183  if (approximateFlops != NULL)
2184  {
2185  *approximateFlops += leftTensorComponentFields*rightTensorComponentFields*numPoints*(3); // two multiplies, one add in innermost loop
2186  }
2187  } // d
2188  } // r
2189 
2190  ExecutionSpace().fence();
2191 
2192  Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount}; // separately declared in effort to get around Intel 17.0.1 compiler weirdness.
2193  Kokkos::Array<int,3> lowerBounds {0,0,0};
2194  auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2195  // TODO: note that for best performance, especially with Fad types, we should replace this parallel for with a Functor and use hierarchical parallelism
2196  Kokkos::parallel_for("compute field integrals", policy,
2197  KOKKOS_LAMBDA (const int &cellDataOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
2198  const Scalar & cellMeasureWeight = cellMeasures.getTensorComponent(0)(cellDataOrdinal);
2199 
2200  TensorArgumentIterator leftTensorIterator(leftComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2201  leftTensorIterator.setEnumerationIndex(leftFieldOrdinal);
2202 
2203  TensorArgumentIterator rightTensorIterator(rightComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2204  rightTensorIterator.setEnumerationIndex(rightFieldOrdinal);
2205 
2206  Scalar integralSum = 0.0;
2207  for (int d=d_start; d<d_end; d++)
2208  {
2209  const Scalar & transformLeft_d = basisValuesLeft.transformWeight(cellDataOrdinal,0,d,d);
2210  const Scalar & transformRight_d = basisValuesRight.transformWeight(cellDataOrdinal,0,d,d);
2211 
2212  const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
2213  // approximateFlopCount++;
2214 
2215  Scalar integral_d = 1.0;
2216 
2217  for (int r=0; r<numPointTensorComponents; r++)
2218  {
2219  integral_d *= componentIntegrals[r][d](leftTensorIterator.argument(r),rightTensorIterator.argument(r));
2220  // approximateFlopCount++; // product
2221  }
2222  integralSum += leftRightTransform_d * integral_d;
2223  // approximateFlopCount += 2; // multiply and sum
2224  }
2225 
2226  const int i = leftFieldOrdinal + leftFieldOffset;
2227  const int j = rightFieldOrdinal + rightFieldOffset;
2228 
2229  if (integralViewRank == 3)
2230  {
2231  integralView3(cellDataOrdinal,i,j) += cellMeasureWeight * integralSum;
2232  }
2233  else
2234  {
2235  integralView2(i,j) += cellMeasureWeight * integralSum;
2236  }
2237  });
2238  b_offset += rightDimSpan;
2239  } // rightVectorComponentOrdinal
2240  } // rightFamilyOrdinal
2241  a_offset += leftDimSpan;
2242  } // leftVectorComponentOrdinal
2243  } // leftFamilyOrdinal
2244 
2245  if (approximateFlops != NULL)
2246  {
2247  // TODO: check the accuracy of this
2248  *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2249  }
2250  }
2251  else // general case (not axis-aligned + affine tensor-product structure)
2252  {
2253  // prepare composed transformation matrices
2254  const Data<Scalar,DeviceType> & leftTransform = basisValuesLeft.transform();
2255  const Data<Scalar,DeviceType> & rightTransform = basisValuesRight.transform();
2256  const bool transposeLeft = true;
2257  const bool transposeRight = false;
2258 // auto timer = Teuchos::TimeMonitor::getNewTimer("mat-mat");
2259 // timer->start();
2260  // transforms can be matrices -- (C,P,D,D): rank 4 -- or scalar weights -- (C,P): rank 2
2261  const bool matrixTransform = (leftTransform.rank() == 4) || (rightTransform.rank() == 4);
2262  Data<Scalar,DeviceType> composedTransform;
2263  // invalid/empty transforms are used when the identity is intended.
2264  if (leftTransform.isValid() && rightTransform.isValid())
2265  {
2266  if (matrixTransform)
2267  {
2268  composedTransform = leftTransform.allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
2269  composedTransform.storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2270 
2271  // if the composedTransform matrices are full, the following is a good estimate. If they have some diagonal portions, this will overcount.
2272  if (approximateFlops != NULL)
2273  {
2274  *approximateFlops += composedTransform.getUnderlyingViewSize() * (spaceDim - 1) * 2;
2275  }
2276  }
2277  else
2278  {
2279  composedTransform = leftTransform.allocateInPlaceCombinationResult(leftTransform, rightTransform);
2280  composedTransform.storeInPlaceProduct(leftTransform, rightTransform);
2281 
2282  // re-cast composedTransform as a rank 4 (C,P,D,D) object -- a 1 x 1 matrix at each (C,P).
2283  const int newRank = 4;
2284  auto extents = composedTransform.getExtents();
2285  auto variationTypes = composedTransform.getVariationTypes();
2286  composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes);
2287  if (approximateFlops != NULL)
2288  {
2289  *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
2290  }
2291  }
2292  }
2293  else if (leftTransform.isValid())
2294  {
2295  // rightTransform is the identity
2296  composedTransform = leftTransform;
2297  }
2298  else if (rightTransform.isValid())
2299  {
2300  // leftTransform is the identity
2301  composedTransform = rightTransform;
2302  }
2303  else
2304  {
2305  // both left and right transforms are identity
2306  Kokkos::Array<ordinal_type,4> extents {basisValuesLeft.numCells(),basisValuesLeft.numPoints(),spaceDim,spaceDim};
2307  Kokkos::Array<DataVariationType,4> variationTypes {CONSTANT,CONSTANT,BLOCK_PLUS_DIAGONAL,BLOCK_PLUS_DIAGONAL};
2308 
2309  Kokkos::View<Scalar*,DeviceType> identityUnderlyingView("Intrepid2::FST::integrate() - identity view",spaceDim);
2310  Kokkos::deep_copy(identityUnderlyingView, 1.0);
2311  composedTransform = Data<Scalar,DeviceType>(identityUnderlyingView,extents,variationTypes);
2312  }
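 // Although only spaceDim ones are stored, the CONSTANT and BLOCK_PLUS_DIAGONAL variation types let this
 // Data object present itself as a (C,P,D,D) identity matrix at every cell and point: e.g., in 3D the
 // single {1,1,1} view stands in for the 3x3 identity everywhere, with off-diagonal entries implicitly zero.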
2313 
2314 // timer->stop();
2315 // std::cout << "Completed mat-mat in " << timer->totalElapsedTime() << " seconds.\n";
2316 // timer->reset();
2317 
2318  const int leftFamilyCount = basisValuesLeft. basisValues().numFamilies();
2319  const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
2320  const int leftComponentCount = isVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
2321  const int rightComponentCount = isVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2322 
2323  int leftFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2324  for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2325  {
2326  // "a" keeps track of the spatial dimension over which we are integrating in the left vector
2327  // components are allowed to span several dimensions; we keep track of the offset for the component in a_offset
2328  int a_offset = 0;
2329  bool haveLaunchedContributionToCurrentFamilyLeft = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2330  for (int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2331  {
2332  TensorData<Scalar,DeviceType> leftComponent = isVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal)
2333  : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2334  if (!leftComponent.isValid())
2335  {
2336  // represents zero
2337  a_offset += basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal);
2338  continue;
2339  }
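  // (Here, and in the right-component loop below, an invalid component denotes an
  // identically-zero block of the vector -- e.g., a vector-valued family whose values
  // occupy only some of the spatial slots -- so its contribution to the integral is skipped,
  // while the dimension offset still advances.)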
2340 
2341  int rightFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2342  for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2343  {
2344  // "b" keeps track of the spatial dimension over which we are integrating in the right vector
2345  // components are allowed to span several dimensions; we keep track of the offset for the component in b_offset
2346  bool haveLaunchedContributionToCurrentFamilyRight = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2347  int b_offset = 0;
2348  for (int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2349  {
2350  TensorData<Scalar,DeviceType> rightComponent = isVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal)
2351  : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2352  if (!rightComponent.isValid())
2353  {
2354  // represents zero
2355  b_offset += basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal);
2356  continue;
2357  }
2358 
2359  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(leftComponent.numTensorComponents() != rightComponent.numTensorComponents(), std::invalid_argument, "left TensorData and right TensorData have different number of tensor components. This is not supported.");
2360 
2361  const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2362  Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
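  // The policy is first constructed with Kokkos::AUTO() team size; in each branch below we
  // query team_size_recommended() for the concrete functor, let the functor clamp the result
  // via teamSize() (e.g., to respect its shared-memory needs), and rebuild the policy accordingly.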
2363 
2364  // TODO: expose the options for forceNonSpecialized and usePointCacheForRank3Tensor through an IntegrationAlgorithm enumeration.
2365  // AUTOMATIC: let Intrepid2 choose an algorithm based on the inputs (and, perhaps, the execution space)
2366  // STANDARD: don't use sum factorization or axis alignment -- just do the simple contraction, a (p+1)^9 algorithm in 3D
2367  // SUM_FACTORIZATION // (p+1)^7 algorithm in 3D
2368  // SUM_FACTORIZATION_AXIS_ALIGNED // (p+1)^6 algorithm in 3D
2369  // SUM_FACTORIZATION_FORCE_GENERIC_IMPLEMENTATION // mainly intended for testing purposes (specialized implementations perform better when they are provided)
2370  // SUM_FACTORIZATION_WITH_POINT_CACHE // novel (p+1)^7 (in 3D) algorithm in Intrepid2; unclear as yet when and whether this may be a superior approach
2371  bool forceNonSpecialized = false; // We may expose this through integrate()'s arguments in the future; false should remain the default.
2372  bool usePointCacheForRank3Tensor = true; // EXPERIMENTAL: performs better under CUDA, but slightly worse than the standard algorithm on serial CPU.
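  // A sketch of what such an enumeration might look like (hypothetical; not yet part of Intrepid2):
  // enum class IntegrationAlgorithm : int
  // {
  //   AUTOMATIC, STANDARD, SUM_FACTORIZATION, SUM_FACTORIZATION_AXIS_ALIGNED,
  //   SUM_FACTORIZATION_FORCE_GENERIC_IMPLEMENTATION, SUM_FACTORIZATION_WITH_POINT_CACHE
  // };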
2373 
2374  // in one branch or another below, we will launch a parallel kernel that contributes to (leftFamily, rightFamily) field ordinal pairs.
2375  // if we have already launched something that contributes to that part of the integral container, we need a Kokkos::fence() to ensure that the kernels do not interfere with each other.
2376  if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2377  {
2378  ExecutionSpace().fence();
2379  }
2380  haveLaunchedContributionToCurrentFamilyLeft = true;
2381  haveLaunchedContributionToCurrentFamilyRight = true;
2382 
2383  if (integralViewRank == 2)
2384  {
2385  if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2386  {
2387  auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2388 
2389  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2390  const int teamSize = functor.teamSize(recommendedTeamSize);
2391 
2392  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2393 
2394  Kokkos::parallel_for("F_IntegratePointValueCache rank 2", policy, functor);
2395 
2396  if (approximateFlops != NULL)
2397  {
2398  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2399  }
2400  }
2401  else
2402  {
2403  auto functor = Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2404 
2405  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2406  const int teamSize = functor.teamSize(recommendedTeamSize);
2407 
2408  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2409 
2410  Kokkos::parallel_for("F_Integrate rank 2", policy, functor);
2411 
2412  if (approximateFlops != NULL)
2413  {
2414  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2415  }
2416  }
2417  }
2418  else if (integralViewRank == 3)
2419  {
2420  if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2421  {
2422  auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2423 
2424  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2425  const int teamSize = functor.teamSize(recommendedTeamSize);
2426 
2427  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2428 
2429  Kokkos::parallel_for("F_IntegratePointValueCache rank 3", policy, functor);
2430 
2431  if (approximateFlops != NULL)
2432  {
2433  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2434  }
2435  }
2436  else
2437  {
2438  auto functor = Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2439 
2440  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2441  const int teamSize = functor.teamSize(recommendedTeamSize);
2442 
2443  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2444 
2445  Kokkos::parallel_for("F_Integrate rank 3", policy, functor);
2446 
2447  if (approximateFlops != NULL)
2448  {
2449  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2450  }
2451  }
2452  }
2453  b_offset += isVectorValued ? basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
2454  }
2455  rightFieldOrdinalOffset += isVectorValued ? basisValuesRight.vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
2456  }
2457  a_offset += isVectorValued ? basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
2458  }
2459  leftFieldOrdinalOffset += isVectorValued ? basisValuesLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
2460  }
2461  }
2462 // if (approximateFlops != NULL)
2463 // {
2464 // std::cout << "Approximate flop count (new): " << *approximateFlops << std::endl;
2465 // }
2466  ExecutionSpace().fence(); // make sure we've finished writing to integrals container before we return
2467 }
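// Example usage (an illustrative sketch, assuming IntegrationTools<DeviceType>; "transformedValues"
// and "cellMeasures" stand in for containers computed elsewhere):
//
//   auto integrals = IntegrationTools<DeviceType>::allocateIntegralData(transformedValues, cellMeasures, transformedValues);
//   double flopEstimate = 0.0;
//   IntegrationTools<DeviceType>::integrate(integrals, transformedValues, cellMeasures, transformedValues,
//                                           /*sumInto*/ false, &flopEstimate);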
2468 
2469 } // end namespace Intrepid2
2470 
2471 #endif