15 #ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
16 #define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
22 #include "Teuchos_TimeMonitor.hpp"
31 template<
class Scalar,
class DeviceType,
int integralViewRank>
34 using ExecutionSpace =
typename DeviceType::execution_space;
35 using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
36 using TeamMember =
typename TeamPolicy::member_type;
38 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
39 IntegralViewType integralView_;
46 int leftComponentSpan_;
47 int rightComponentSpan_;
48 int numTensorComponents_;
49 int leftFieldOrdinalOffset_;
50 int rightFieldOrdinalOffset_;
51 bool forceNonSpecialized_;
53 size_t fad_size_output_ = 0;
55 Kokkos::Array<int, 7> offsetsForComponentOrdinal_;
59 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
60 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
61 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
63 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_;
64 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;
77 int leftFieldOrdinalOffset,
78 int rightFieldOrdinalOffset,
79 bool forceNonSpecialized)
81 integralView_(integralData.template getUnderlyingView<integralViewRank>()),
82 leftComponent_(leftComponent),
83 composedTransform_(composedTransform),
84 rightComponent_(rightComponent),
85 cellMeasures_(cellMeasures),
88 leftComponentSpan_(leftComponent.
extent_int(2)),
89 rightComponentSpan_(rightComponent.
extent_int(2)),
91 leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
92 rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
93 forceNonSpecialized_(forceNonSpecialized)
95 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument,
"Left and right components must have matching number of tensorial components");
98 const int FIELD_DIM = 0;
99 const int POINT_DIM = 1;
103 for (
int r=0; r<numTensorComponents_; r++)
105 leftFieldBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
106 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
107 rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
108 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
109 pointBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
110 maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
114 int leftRelativeEnumerationSpan = 1;
115 int rightRelativeEnumerationSpan = 1;
116 for (
int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
118 leftRelativeEnumerationSpan *= leftFieldBounds_[startingComponent];
119 rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
120 leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
121 rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
127 const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
128 if (allocateFadStorage)
130 fad_size_output_ = dimension_scalar(integralView_);
133 const int R = numTensorComponents_ - 1;
136 int allocationSoFar = 0;
137 offsetsForComponentOrdinal_[R] = allocationSoFar;
140 for (
int r=R-1; r>0; r--)
145 num_ij *= leftFields * rightFields;
147 offsetsForComponentOrdinal_[r] = allocationSoFar;
148 allocationSoFar += num_ij;
150 offsetsForComponentOrdinal_[0] = allocationSoFar;
153 template<
size_t maxComponents,
size_t numComponents = maxComponents>
154 KOKKOS_INLINE_FUNCTION
155 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
156 const Kokkos::Array<int,maxComponents> &bounds)
const
158 if (numComponents == 0)
164 int r =
static_cast<int>(numComponents - 1);
165 while (arguments[r] + 1 >= bounds[r])
171 if (r >= 0) ++arguments[r];
177 KOKKOS_INLINE_FUNCTION
179 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
180 const int &numComponents)
const
182 if (numComponents == 0)
return -1;
183 int r =
static_cast<int>(numComponents - 1);
184 while (arguments[r] + 1 >= bounds[r])
190 if (r >= 0) ++arguments[r];
194 template<
size_t maxComponents,
size_t numComponents = maxComponents>
195 KOKKOS_INLINE_FUNCTION
196 int nextIncrementResult(
const Kokkos::Array<int,maxComponents> &arguments,
197 const Kokkos::Array<int,maxComponents> &bounds)
const
199 if (numComponents == 0)
205 int r =
static_cast<int>(numComponents - 1);
206 while (arguments[r] + 1 >= bounds[r])
216 KOKKOS_INLINE_FUNCTION
218 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
219 const int &numComponents)
const
221 if (numComponents == 0)
return -1;
222 int r = numComponents - 1;
223 while (arguments[r] + 1 >= bounds[r])
231 template<
size_t maxComponents,
size_t numComponents = maxComponents>
232 KOKKOS_INLINE_FUNCTION
233 int relativeEnumerationIndex(
const Kokkos::Array<int,maxComponents> &arguments,
234 const Kokkos::Array<int,maxComponents> &bounds,
235 const int startIndex)
const
238 if (numComponents == 0)
244 int enumerationIndex = 0;
245 for (
size_t r=numComponents-1; r>
static_cast<size_t>(startIndex); r--)
247 enumerationIndex += arguments[r];
248 enumerationIndex *= bounds[r-1];
250 enumerationIndex += arguments[startIndex];
251 return enumerationIndex;
265 KOKKOS_INLINE_FUNCTION
268 constexpr
int numTensorComponents = 3;
270 Kokkos::Array<int,numTensorComponents> pointBounds;
271 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
272 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
273 for (
unsigned r=0; r<numTensorComponents; r++)
275 pointBounds[r] = pointBounds_[r];
276 leftFieldBounds[r] = leftFieldBounds_[r];
277 rightFieldBounds[r] = rightFieldBounds_[r];
280 const int cellDataOrdinal = teamMember.league_rank();
281 const int numThreads = teamMember.team_size();
282 const int scratchViewSize = offsetsForComponentOrdinal_[0];
284 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView;
285 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
286 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z;
287 if (fad_size_output_ > 0) {
288 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
289 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
290 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0], fad_size_output_);
291 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
292 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1], fad_size_output_);
293 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
294 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2], fad_size_output_);
295 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
298 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
299 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
300 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0]);
301 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
302 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1]);
303 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
304 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2]);
305 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
311 constexpr
int R = numTensorComponents - 1;
313 const int composedTransformRank = composedTransform_.
rank();
315 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
317 const int a = a_offset_ + a_component;
318 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
320 const int b = b_offset_ + b_component;
325 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
326 const int numRightFieldsFinal = rightFinalComponent.
extent_int(0);
335 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
336 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (
const int& fieldOrdinalPointOrdinal) {
337 const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
338 const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
339 if ((fieldOrdinal < leftTensorComponent_x.
extent_int(0)) && (pointOrdinal < leftTensorComponent_x.
extent_int(1)))
341 const int leftRank = leftTensorComponent_x.
rank();
342 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
344 if ((fieldOrdinal < leftTensorComponent_y.
extent_int(0)) && (pointOrdinal < leftTensorComponent_y.
extent_int(1)))
346 const int leftRank = leftTensorComponent_y.
rank();
347 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
349 if ((fieldOrdinal < leftTensorComponent_z.
extent_int(0)) && (pointOrdinal < leftTensorComponent_z.
extent_int(1)))
351 const int leftRank = leftTensorComponent_z.
rank();
352 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
354 if ((fieldOrdinal < rightTensorComponent_x.
extent_int(0)) && (pointOrdinal < rightTensorComponent_x.
extent_int(1)))
356 const int rightRank = rightTensorComponent_x.
rank();
357 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
359 if ((fieldOrdinal < rightTensorComponent_y.
extent_int(0)) && (pointOrdinal < rightTensorComponent_y.
extent_int(1)))
361 const int rightRank = rightTensorComponent_y.
rank();
362 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
364 if ((fieldOrdinal < rightTensorComponent_z.
extent_int(0)) && (pointOrdinal < rightTensorComponent_z.
extent_int(1)))
366 const int rightRank = rightTensorComponent_z.
rank();
367 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
373 if (composedTransformRank == 4)
376 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (
const int& pointOrdinal) {
377 pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
383 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (
const int& pointOrdinal) {
384 pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
390 if (composedTransformRank == 4)
392 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
393 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
398 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
399 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
407 teamMember.team_barrier();
410 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
411 for (
int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
413 scratchView(i) = 0.0;
417 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (
const int& leftRightFieldEnumeration) {
418 const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
419 const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;
423 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
424 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
425 rightFieldArguments[R] = jz;
426 leftFieldArguments[R] = iz;
428 Kokkos::Array<int,numTensorComponents> pointArguments;
429 for (
int i=0; i<numTensorComponents; i++)
431 pointArguments[i] = 0;
434 for (
int lx=0; lx<pointBounds[0]; lx++)
436 pointArguments[0] = lx;
440 const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
441 const int Gy_end_index = scratchOffsetForThread + offsetsForComponentOrdinal_[0];
443 for (
int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
445 scratchView(Gy_index) = 0;
448 for (
int ly=0; ly<pointBounds[1]; ly++)
450 pointArguments[1] = ly;
452 Scalar * Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
455 for (
int lz=0; lz < pointBounds[2]; lz++)
457 const Scalar & leftValue = leftFields_z(iz,lz);
458 const Scalar & rightValue = rightFields_z(jz,lz);
460 pointArguments[2] = lz;
461 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
463 *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
468 for (
int iy=0; iy<leftFieldBounds_[1]; iy++)
470 leftFieldArguments[1] = iy;
471 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
473 const Scalar & leftValue = leftFields_y(iy,ly);
475 for (
int jy=0; jy<rightFieldBounds_[1]; jy++)
477 rightFieldArguments[1] = jy;
479 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
480 const Scalar & rightValue = rightFields_y(jy,ly);
482 const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
483 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
485 const int Gz_index = 0;
486 const Scalar & aGz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);
488 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
490 Gy += leftValue * aGz * rightValue;
495 for (
int ix=0; ix<leftFieldBounds_[0]; ix++)
497 leftFieldArguments[0] = ix;
498 const Scalar & leftValue = leftFields_x(ix,lx);
500 for (
int iy=0; iy<leftFieldBounds_[1]; iy++)
502 leftFieldArguments[1] = iy;
504 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
506 for (
int jx=0; jx<rightFieldBounds_[0]; jx++)
508 rightFieldArguments[0] = jx;
509 const Scalar & rightValue = rightFields_x(jx,lx);
511 for (
int jy=0; jy<rightFieldBounds_[1]; jy++)
513 rightFieldArguments[1] = jy;
514 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
516 const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
518 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
519 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
522 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
523 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
524 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
525 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
527 if (integralViewRank == 3)
549 integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
554 integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
568 template<
size_t numTensorComponents>
569 KOKKOS_INLINE_FUNCTION
570 void run(
const TeamMember & teamMember )
const
572 Kokkos::Array<int,numTensorComponents> pointBounds;
573 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
574 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
575 for (
unsigned r=0; r<numTensorComponents; r++)
577 pointBounds[r] = pointBounds_[r];
578 leftFieldBounds[r] = leftFieldBounds_[r];
579 rightFieldBounds[r] = rightFieldBounds_[r];
582 const int cellDataOrdinal = teamMember.league_rank();
583 const int numThreads = teamMember.team_size();
584 const int scratchViewSize = offsetsForComponentOrdinal_[0];
585 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView;
586 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
587 Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields;
588 if (fad_size_output_ > 0) {
589 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
590 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
591 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_, fad_size_output_);
592 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
595 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
596 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
597 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_);
598 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
604 constexpr
int R = numTensorComponents - 1;
606 const int composedTransformRank = composedTransform_.
rank();
608 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
610 const int a = a_offset_ + a_component;
611 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
613 const int b = b_offset_ + b_component;
615 const Data<Scalar,DeviceType> & leftFinalComponent = leftComponent_.getTensorComponent(R);
616 const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);
618 const int numLeftFieldsFinal = leftFinalComponent.extent_int(0);
619 const int numRightFieldsFinal = rightFinalComponent.extent_int(0);
621 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (
const int& r) {
622 const Data<Scalar,DeviceType> & leftTensorComponent = leftComponent_.getTensorComponent(r);
623 const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.getTensorComponent(r);
624 const int leftFieldCount = leftTensorComponent.extent_int(0);
625 const int pointCount = leftTensorComponent.extent_int(1);
626 const int leftRank = leftTensorComponent.rank();
627 const int rightFieldCount = rightTensorComponent.extent_int(0);
628 const int rightRank = rightTensorComponent.rank();
629 for (
int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
632 const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
633 for (
int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
635 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
636 leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
639 for (
int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
642 const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
643 for (
int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
645 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
646 rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
651 if (composedTransformRank == 4)
653 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
654 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
659 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
660 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
666 teamMember.team_barrier();
668 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (
const int& leftRightFieldEnumeration) {
669 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
670 const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
671 const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;
675 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
676 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
677 rightFieldArguments[R] = j_R;
678 leftFieldArguments[R] = i_R;
681 for (
int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
683 scratchView(i) = 0.0;
685 Kokkos::Array<int,numTensorComponents> pointArguments;
686 for (
unsigned i=0; i<numTensorComponents; i++)
688 pointArguments[i] = 0;
695 const int pointBounds_R = pointBounds[R];
696 int & pointArgument_R = pointArguments[R];
697 for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
702 G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
706 const int leftFieldIndex = i_R + leftFieldOrdinalOffset_;
707 const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;
709 if (integralViewRank == 3)
712 G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
717 G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
721 const int & pointIndex_R = pointArguments[R];
723 const Scalar & leftValue = leftFields(R,i_R,pointIndex_R);
724 const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);
726 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
728 *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
733 const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds);
734 const int r_min = (r_next >= 0) ? r_next : 0;
736 for (
int s=R-1; s>=r_min; s--)
738 const int & pointIndex_s = pointArguments[s];
741 for (
int s1=s; s1<R; s1++)
743 leftFieldArguments[s1] = 0;
747 const int & i_s = leftFieldArguments[s];
748 const int & j_s = rightFieldArguments[s];
751 const int & rightEnumerationSpan_s = rightFieldRelativeEnumerationSpans_[s];
752 const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];
756 const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);
759 const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);
761 const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);
763 for (
int s1=s; s1<R; s1++)
765 rightFieldArguments[s1] = 0;
770 const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);
773 const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);
775 const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);
777 const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;
783 const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;
785 const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);
790 G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
795 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
796 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
797 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
798 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
800 if (integralViewRank == 3)
803 G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
808 G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
812 *G_s += leftValue * G_sp * rightValue;
817 sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
821 sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
828 const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
829 for (
int i=scratchOffsetForThread; i<endIndex; i++)
831 scratchView(i) = 0.0;
838 r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds);
846 KOKKOS_INLINE_FUNCTION
847 void operator()(
const TeamMember & teamMember )
const
849 switch (numTensorComponents_)
851 case 1: run<1>(teamMember);
break;
852 case 2: run<2>(teamMember);
break;
854 if (forceNonSpecialized_)
859 case 4: run<4>(teamMember);
break;
860 case 5: run<5>(teamMember);
break;
861 case 6: run<6>(teamMember);
break;
862 case 7: run<7>(teamMember);
break;
863 #ifdef INTREPID2_HAVE_DEBUG
876 const int R = numTensorComponents_ - 1;
880 const int flopsPerPoint_ab = cellMeasures_.numTensorComponents();
882 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
884 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
889 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
890 const int numRightFieldsFinal = rightFinalComponent.
extent_int(0);
892 flopCount += flopsPerPoint_ab * cellMeasures_.extent_int(1);
894 int flopsPer_i_R_j_R = 0;
898 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
899 leftFieldArguments[R] = 0;
901 Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
902 for (
int i=0; i<numTensorComponents_; i++)
904 pointArguments[i] = 0;
909 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
910 rightFieldArguments[R] = 0;
916 for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
918 flopsPer_i_R_j_R += 4;
926 const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
927 const int r_min = (r_next >= 0) ? r_next : 0;
929 for (
int s=R-1; s>=r_min; s--)
932 int numLeftIterations = leftFieldRelativeEnumerationSpans_[s];
933 int numRightIterations = rightFieldRelativeEnumerationSpans_[s];
935 flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
939 r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
942 flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
950 int teamSize(
const int &maxTeamSizeFromKokkos)
const
952 const int R = numTensorComponents_ - 1;
953 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
954 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
961 size_t shmem_size = 0;
963 if (fad_size_output_ > 0)
965 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
966 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.
extent_int(1), fad_size_output_);
967 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_, fad_size_output_);
968 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
972 shmem_size += Kokkos::View<Scalar*, DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
973 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.
extent_int(1));
974 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_);
975 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
991 template<
class Scalar,
class DeviceType,
int integralViewRank>
994 using ExecutionSpace =
typename DeviceType::execution_space;
995 using TeamPolicy = Kokkos::TeamPolicy<DeviceType>;
996 using TeamMember =
typename TeamPolicy::member_type;
998 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
999 IntegralViewType integralView_;
1006 int leftComponentSpan_;
1007 int rightComponentSpan_;
1008 int numTensorComponents_;
1009 int leftFieldOrdinalOffset_;
1010 int rightFieldOrdinalOffset_;
1012 size_t fad_size_output_ = 0;
1016 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
1017 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
1018 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
1021 int maxFieldsRight_;
1031 int leftFieldOrdinalOffset,
1032 int rightFieldOrdinalOffset)
1034 integralView_(integralData.template getUnderlyingView<integralViewRank>()),
1035 leftComponent_(leftComponent),
1036 composedTransform_(composedTransform),
1037 rightComponent_(rightComponent),
1038 cellMeasures_(cellMeasures),
1039 a_offset_(a_offset),
1040 b_offset_(b_offset),
1041 leftComponentSpan_(leftComponent.
extent_int(2)),
1042 rightComponentSpan_(rightComponent.
extent_int(2)),
1044 leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
1045 rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
1047 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument,
"Left and right components must have matching number of tensorial components");
1049 const int FIELD_DIM = 0;
1050 const int POINT_DIM = 1;
1052 maxFieldsRight_ = 0;
1054 for (
int r=0; r<numTensorComponents_; r++)
1056 leftFieldBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
1057 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
1058 rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
1059 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
1060 pointBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
1061 maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
1067 const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
1068 if (allocateFadStorage)
1070 fad_size_output_ = dimension_scalar(integralView_);
1074 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1075 KOKKOS_INLINE_FUNCTION
1076 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
1077 const Kokkos::Array<int,maxComponents> &bounds)
const
1079 if (numComponents == 0)
return -1;
1080 int r =
static_cast<int>(numComponents - 1);
1081 while (arguments[r] + 1 >= bounds[r])
1087 if (r >= 0) ++arguments[r];
1092 KOKKOS_INLINE_FUNCTION
1094 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1095 const int &numComponents)
const
1097 if (numComponents == 0)
return -1;
1098 int r =
static_cast<int>(numComponents - 1);
1099 while (arguments[r] + 1 >= bounds[r])
1105 if (r >= 0) ++arguments[r];
1109 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1110 KOKKOS_INLINE_FUNCTION
1111 int nextIncrementResult(
const Kokkos::Array<int,maxComponents> &arguments,
1112 const Kokkos::Array<int,maxComponents> &bounds)
const
1114 if (numComponents == 0)
return -1;
1115 int r =
static_cast<int>(numComponents - 1);
1116 while (arguments[r] + 1 >= bounds[r])
1125 KOKKOS_INLINE_FUNCTION
1127 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1128 const int &numComponents)
const
1130 if (numComponents == 0)
return -1;
1131 int r = numComponents - 1;
1132 while (arguments[r] + 1 >= bounds[r])
1140 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1141 KOKKOS_INLINE_FUNCTION
1142 int relativeEnumerationIndex(
const Kokkos::Array<int,maxComponents> &arguments,
1143 const Kokkos::Array<int,maxComponents> &bounds,
1144 const int startIndex)
const
1147 if (numComponents == 0)
1151 int enumerationIndex = 0;
1152 for (
size_t r=numComponents-1; r>
static_cast<size_t>(startIndex); r--)
1154 enumerationIndex += arguments[r];
1155 enumerationIndex *= bounds[r-1];
1157 enumerationIndex += arguments[startIndex];
1158 return enumerationIndex;
1162 KOKKOS_INLINE_FUNCTION
1163 enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
1164 integralViewEntry(
const IntegralViewType& integralView,
const int &cellDataOrdinal,
const int &i,
const int &j)
const
1166 return integralView(cellDataOrdinal,i,j);
1170 KOKKOS_INLINE_FUNCTION
1171 enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
1172 integralViewEntry(
const IntegralViewType& integralView,
const int &cellDataOrdinal,
const int &i,
const int &j)
const
1174 return integralView(i,j);
1178 KOKKOS_INLINE_FUNCTION
1181 constexpr
int numTensorComponents = 3;
1183 const int pointBounds_x = pointBounds_[0];
1184 const int pointBounds_y = pointBounds_[1];
1185 const int pointBounds_z = pointBounds_[2];
1186 const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;
1188 const int leftFieldBounds_x = leftFieldBounds_[0];
1189 const int rightFieldBounds_x = rightFieldBounds_[0];
1190 const int leftFieldBounds_y = leftFieldBounds_[1];
1191 const int rightFieldBounds_y = rightFieldBounds_[1];
1192 const int leftFieldBounds_z = leftFieldBounds_[2];
1193 const int rightFieldBounds_z = rightFieldBounds_[2];
1195 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1196 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1197 for (
unsigned r=0; r<numTensorComponents; r++)
1199 leftFieldBounds[r] = leftFieldBounds_[r];
1200 rightFieldBounds[r] = rightFieldBounds_[r];
1203 const auto integralView = integralView_;
1204 const auto leftFieldOrdinalOffset = leftFieldOrdinalOffset_;
1205 const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;
1207 const int cellDataOrdinal = teamMember.league_rank();
1208 const int threadNumber = teamMember.team_rank();
1210 const int numThreads = teamMember.team_size();
1211 const int GyEntryCount = pointBounds_z;
1212 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals;
1213 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals;
1214 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
1216 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
1217 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
1218 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
1219 if (fad_size_output_ > 0) {
1220 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1221 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
1222 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
1224 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x, fad_size_output_);
1225 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
1226 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y, fad_size_output_);
1227 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
1228 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z, fad_size_output_);
1229 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
1232 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1233 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
1234 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
1236 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x);
1237 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
1238 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y);
1239 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
1240 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z);
1241 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
1249 const int composedTransformRank = composedTransform_.
rank();
1252 teamMember.team_barrier();
1254 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
1256 const int a = a_offset_ + a_component;
1257 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
1259 const int b = b_offset_ + b_component;
1268 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
1269 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (
const int& fieldOrdinal) {
1270 if (fieldOrdinal < leftTensorComponent_x.
extent_int(0))
1272 const int pointCount = leftTensorComponent_x.
extent_int(1);
1273 const int leftRank = leftTensorComponent_x.
rank();
1274 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1276 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1279 if (fieldOrdinal < leftTensorComponent_y.
extent_int(0))
1281 const int pointCount = leftTensorComponent_y.
extent_int(1);
1282 const int leftRank = leftTensorComponent_y.
rank();
1283 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1285 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1288 if (fieldOrdinal < leftTensorComponent_z.
extent_int(0))
1290 const int pointCount = leftTensorComponent_z.
extent_int(1);
1291 const int leftRank = leftTensorComponent_z.
rank();
1292 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1294 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1297 if (fieldOrdinal < rightTensorComponent_x.
extent_int(0))
1299 const int pointCount = rightTensorComponent_x.
extent_int(1);
1300 const int rightRank = rightTensorComponent_x.
rank();
1301 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1303 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1306 if (fieldOrdinal < rightTensorComponent_y.
extent_int(0))
1308 const int pointCount = rightTensorComponent_y.
extent_int(1);
1309 const int rightRank = rightTensorComponent_y.
rank();
1310 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1312 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1315 if (fieldOrdinal < rightTensorComponent_z.
extent_int(0))
1317 const int pointCount = rightTensorComponent_z.
extent_int(1);
1318 const int rightRank = rightTensorComponent_z.
rank();
1319 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1321 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1326 if (composedTransformRank == 4)
1328 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
1329 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1334 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
1335 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1340 teamMember.team_barrier();
1342 for (
int i0=0; i0<leftFieldBounds_x; i0++)
1344 for (
int j0=0; j0<rightFieldBounds_x; j0++)
1346 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (
const int& pointEnumerationIndexLaterDimensions) {
1348 const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions;
1350 Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1353 if (fad_size_output_ == 0)
1356 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (
const int &x_pointOrdinal, Scalar &integralThusFar)
1358 integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1363 for (
int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1365 Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1371 teamMember.team_barrier();
1373 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (
const int& i1j1) {
1374 const int i1 = i1j1 % leftFieldBounds_y;
1375 const int j1 = i1j1 / leftFieldBounds_y;
1377 int Gy_index_offset = GyEntryCount * threadNumber;
1379 for (
int lz=0; lz<pointBounds_z; lz++)
1381 int pointEnumerationIndex = lz * pointBounds_y;
1382 if (fad_size_output_ == 0)
1384 Scalar Gy_local = 0;
1387 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_y), [&] (
const int &ly, Scalar &integralThusFar)
1389 const Scalar & leftValue = leftFields_y(i1,ly);
1390 const Scalar & rightValue = rightFields_y(j1,ly);
1392 integralThusFar += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
1395 GyIntegrals(Gy_index_offset + lz) = Gy_local;
1399 Scalar & Gy = GyIntegrals(Gy_index_offset + lz);
1400 for (
int ly=0; ly<pointBounds_y; ly++)
1402 const Scalar & leftValue = leftFields_y(i1,ly);
1403 const Scalar & rightValue = rightFields_y(j1,ly);
1405 Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
1410 for (
int i2=0; i2<leftFieldBounds_z; i2++)
1412 for (
int j2=0; j2<rightFieldBounds_z; j2++)
1416 if (fad_size_output_ == 0)
1419 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_z), [&] (
const int &lz, Scalar &integralThusFar)
1421 const Scalar & leftValue = leftFields_z(i2,lz);
1422 const Scalar & rightValue = rightFields_z(j2,lz);
1424 integralThusFar += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
1429 for (
int lz=0; lz<pointBounds_z; lz++)
1431 const Scalar & leftValue = leftFields_z(i2,lz);
1432 const Scalar & rightValue = rightFields_z(j2,lz);
1434 Gz += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
1438 const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1439 const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1444 Kokkos::single (Kokkos::PerThread(teamMember), [&] () {
1445 integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1451 teamMember.team_barrier();
1458 template<
size_t numTensorComponents>
1459 KOKKOS_INLINE_FUNCTION
1460 void run(
const TeamMember & teamMember )
const
1598 KOKKOS_INLINE_FUNCTION
1599 void operator()(
const TeamMember & teamMember )
const
1601 switch (numTensorComponents_)
1603 case 1: run<1>(teamMember);
break;
1604 case 2: run<2>(teamMember);
break;
1606 case 4: run<4>(teamMember);
break;
1607 case 5: run<5>(teamMember);
break;
1608 case 6: run<6>(teamMember);
break;
1609 case 7: run<7>(teamMember);
break;
1610 #ifdef INTREPID2_HAVE_DEBUG
1623 constexpr
int numTensorComponents = 3;
1624 Kokkos::Array<int,numTensorComponents> pointBounds;
1625 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1626 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1628 int pointsInNonzeroComponentDimensions = 1;
1629 for (
unsigned r=0; r<numTensorComponents; r++)
1631 pointBounds[r] = pointBounds_[r];
1632 if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1633 leftFieldBounds[r] = leftFieldBounds_[r];
1634 rightFieldBounds[r] = rightFieldBounds_[r];
1637 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
1639 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
1644 flopCount += composedTransform_.
extent_int(1) * cellMeasures_.numTensorComponents();
1646 for (
int i0=0; i0<leftFieldBounds[0]; i0++)
1648 for (
int j0=0; j0<rightFieldBounds[0]; j0++)
1650 flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3;
1690 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3;
1691 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3;
1692 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1;
1768 const int R = numTensorComponents_ - 1;
1769 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1770 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1777 size_t shmem_size = 0;
1779 const int GyEntryCount = pointBounds_[2];
1781 int pointsInNonzeroComponentDimensions = 1;
1782 for (
int d=1; d<numTensorComponents_; d++)
1784 pointsInNonzeroComponentDimensions *= pointBounds_[d];
1787 if (fad_size_output_ > 0)
1789 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_);
1790 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_);
1791 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.
extent_int(1), fad_size_output_);
1793 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_);
1794 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_);
1795 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_);
1796 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_);
1797 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_);
1798 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_);
1802 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions);
1803 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads);
1804 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.
extent_int(1));
1806 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]);
1807 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]);
1808 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]);
1809 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]);
1810 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]);
1811 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]);
1818 template<
class Scalar,
class DeviceType>
1821 ScalarView<Scalar,DeviceType> integral_;
1825 ordinal_type dimSpan_;
1826 ordinal_type leftRank_;
1827 ordinal_type rightRank_;
1828 ordinal_type numPoints_;
1832 ordinal_type dimSpan)
1834 integral_(integralView),
1840 leftRank_ = left.
rank();
1841 rightRank_ = right.
rank();
1845 KOKKOS_INLINE_FUNCTION
1846 void operator()(
const ordinal_type & i,
const ordinal_type & j )
const
1848 Scalar refSpaceIntegral = 0.0;
1849 for (
int ptOrdinal=0; ptOrdinal<numPoints_; ptOrdinal++)
1851 const Scalar & weight = weights_(ptOrdinal);
1852 for (
int a=0; a<dimSpan_; a++)
1854 const Scalar & leftValue = ( leftRank_ == 2) ? left_(i,ptOrdinal) : left_(i,ptOrdinal,a);
1855 const Scalar & rightValue = (rightRank_ == 2) ? right_(j,ptOrdinal) : right_(j,ptOrdinal,a);
1856 refSpaceIntegral += leftValue * rightValue * weight;
1859 integral_(i,j) = refSpaceIntegral;
1864 template<
typename DeviceType>
1865 template<
class Scalar>
1874 const bool leftHasOrdinalFilter = vectorDataLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1875 const bool rightHasOrdinalFilter = vectorDataRight.basisValues().ordinalFilter().extent_int(0) > 0;
1876 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument,
"Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1882 const int CELL_DIM = 0;
1884 const auto leftTransform = vectorDataLeft.
transform();
1886 DimensionInfo combinedCellDimInfo = cellMeasureData.getDimensionInfo(CELL_DIM);
1890 combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataLeft.
transform().
getDimensionInfo(CELL_DIM));
1894 combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataRight.
transform().
getDimensionInfo(CELL_DIM));
1897 DataVariationType cellVariationType = combinedCellDimInfo.variationType;
1898 int cellDataExtent = combinedCellDimInfo.dataExtent;
1900 const int numCells = vectorDataLeft.
numCells();
1901 const int numFieldsLeft = vectorDataLeft.
numFields();
1902 const int numFieldsRight = vectorDataRight.
numFields();
1904 Kokkos::Array<int,3> extents {numCells, numFieldsLeft, numFieldsRight};
1905 Kokkos::Array<DataVariationType,3> variationTypes {cellVariationType,GENERAL,GENERAL};
1907 if (cellVariationType != CONSTANT)
1909 Kokkos::View<Scalar***,DeviceType> data(
"Intrepid2 integral data",cellDataExtent,numFieldsLeft,numFieldsRight);
1914 Kokkos::View<Scalar**,DeviceType> data(
"Intrepid2 integral data",numFieldsLeft,numFieldsRight);
1922 template<
typename DeviceType>
1923 template<
class Scalar>
1927 double* approximateFlops)
1929 using ExecutionSpace =
typename DeviceType::execution_space;
1933 const bool leftHasOrdinalFilter = basisValuesLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1934 const bool rightHasOrdinalFilter = basisValuesRight.basisValues().ordinalFilter().extent_int(0) > 0;
1935 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument,
"Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1937 if (approximateFlops != NULL)
1939 *approximateFlops = 0;
1951 const int numFieldsLeft = basisValuesLeft.
numFields();
1952 const int numFieldsRight = basisValuesRight.
numFields();
1953 const int spaceDim = basisValuesLeft.
spaceDim();
1955 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.
spaceDim() != basisValuesRight.
spaceDim(), std::invalid_argument,
"vectorDataLeft and vectorDataRight must agree on the space dimension");
1957 const int leftFamilyCount = basisValuesLeft.basisValues().numFamilies();
1958 const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
1962 int numTensorComponentsLeft = -1;
1963 const bool leftIsVectorValued = basisValuesLeft.
vectorData().isValid();
1965 if (leftIsVectorValued)
1967 const auto &refVectorLeft = basisValuesLeft.
vectorData();
1968 int numFamiliesLeft = refVectorLeft.numFamilies();
1969 int numVectorComponentsLeft = refVectorLeft.numComponents();
1970 Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
1971 for (
int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
1973 for (
int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
1978 if (numTensorComponentsLeft == -1)
1982 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.
numTensorComponents(), std::invalid_argument,
"Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
1983 for (
int r=0; r<numTensorComponentsLeft; r++)
1993 numTensorComponentsLeft = basisValuesLeft.basisValues().tensorData(0).numTensorComponents();
1994 for (
int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
1996 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument,
"All families must match in the number of tensor components");
1999 int numTensorComponentsRight = -1;
2000 const bool rightIsVectorValued = basisValuesRight.
vectorData().isValid();
2002 if (rightIsVectorValued)
2004 const auto &refVectorRight = basisValuesRight.
vectorData();
2005 int numFamiliesRight = refVectorRight.numFamilies();
2006 int numVectorComponentsRight = refVectorRight.numComponents();
2007 Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
2008 for (
int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
2010 for (
int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
2012 const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
2013 if (tensorData.numTensorComponents() > 0)
2015 if (numTensorComponentsRight == -1)
2017 numTensorComponentsRight = tensorData.numTensorComponents();
2019 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument,
"Each valid entry in vectorDataRight must have the same number of tensor components as every other");
2020 for (
int r=0; r<numTensorComponentsRight; r++)
2022 maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
2027 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponentsRight != numTensorComponentsLeft, std::invalid_argument,
"Right families must match left in the number of tensor components");
2032 for (
int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
2034 INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument,
"Right families must match left in the number of tensor components");
2039 if ((numPointTensorComponents == numTensorComponentsLeft) && basisValuesLeft.
axisAligned() && basisValuesRight.
axisAligned())
2048 Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
2049 for (
int r=0; r<numPointTensorComponents; r++)
2056 Kokkos::View<Scalar**, DeviceType> integralView2;
2057 Kokkos::View<Scalar***, DeviceType> integralView3;
2058 if (integralViewRank == 3)
2066 for (
int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2069 int leftFieldOffset = basisValuesLeft.basisValues().familyFieldOrdinalOffset(leftFamilyOrdinal);
2071 const int leftVectorComponentCount = leftIsVectorValued ? basisValuesLeft.
vectorData().numComponents() : 1;
2072 for (
int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
2075 : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2081 const int leftDimSpan = leftComponent.
extent_int(2);
2083 const int leftComponentFieldCount = leftComponent.
extent_int(0);
2085 for (
int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2088 int rightFieldOffset = basisValuesRight.
vectorData().familyFieldOrdinalOffset(rightFamilyOrdinal);
2090 const int rightVectorComponentCount = rightIsVectorValued ? basisValuesRight.
vectorData().numComponents() : 1;
2091 for (
int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2094 : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2095 if (!rightComponent.
isValid())
2100 const int rightDimSpan = rightComponent.
extent_int(2);
2102 const int rightComponentFieldCount = rightComponent.
extent_int(0);
2105 if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset))
2107 b_offset += rightDimSpan;
2113 INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error,
"left and right dimension offsets should match.");
2114 INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument,
"left and right components must span the same number of dimensions.");
2116 const int d_start = a_offset;
2117 const int d_end = d_start + leftDimSpan;
2119 using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>,
Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
2120 ComponentIntegralsArray componentIntegrals;
2121 for (
int r=0; r<numPointTensorComponents; r++)
2138 const int numPoints = pointDimensions[r];
2145 const int leftTensorComponentDimSpan = leftTensorComponent.
extent_int(2);
2146 const int leftTensorComponentFields = leftTensorComponent.
extent_int(0);
2147 const int rightTensorComponentDimSpan = rightTensorComponent.
extent_int(2);
2148 const int rightTensorComponentFields = rightTensorComponent.
extent_int(0);
2150 INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument,
"left and right components must span the same number of dimensions.");
2152 for (
int d=d_start; d<d_end; d++)
2154 ScalarView<Scalar,DeviceType> componentIntegralView;
2156 const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
2157 if (allocateFadStorage)
2160 componentIntegralView = ScalarView<Scalar,DeviceType>(
"componentIntegrals for tensor component " + std::to_string(r) +
", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output);
2164 componentIntegralView = ScalarView<Scalar,DeviceType>(
"componentIntegrals for tensor component " + std::to_string(r) +
", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields);
2167 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftTensorComponentFields,rightTensorComponentFields});
2170 leftTensorComponentDimSpan);
2171 Kokkos::parallel_for(
"compute componentIntegrals", policy, refSpaceIntegralFunctor);
2173 componentIntegrals[r][d] = componentIntegralView;
2175 if (approximateFlops != NULL)
2177 *approximateFlops += leftTensorComponentFields*rightTensorComponentFields*numPoints*(3);
2182 ExecutionSpace().fence();
2184 Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount};
2185 Kokkos::Array<int,3> lowerBounds {0,0,0};
2186 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2188 Kokkos::parallel_for(
"compute field integrals", policy,
2189 KOKKOS_LAMBDA (
const int &cellDataOrdinal,
const int &leftFieldOrdinal,
const int &rightFieldOrdinal) {
2190 const Scalar & cellMeasureWeight = cellMeasures.
getTensorComponent(0)(cellDataOrdinal);
2198 Scalar integralSum = 0.0;
2199 for (
int d=d_start; d<d_end; d++)
2201 const Scalar & transformLeft_d = basisValuesLeft.
transformWeight(cellDataOrdinal,0,d,d);
2202 const Scalar & transformRight_d = basisValuesRight.
transformWeight(cellDataOrdinal,0,d,d);
2204 const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
2207 Scalar integral_d = 1.0;
2209 for (
int r=0; r<numPointTensorComponents; r++)
2211 integral_d *= componentIntegrals[r][d](leftTensorIterator.
argument(r),rightTensorIterator.
argument(r));
2214 integralSum += leftRightTransform_d * integral_d;
2217 const int i = leftFieldOrdinal + leftFieldOffset;
2218 const int j = rightFieldOrdinal + rightFieldOffset;
2220 if (integralViewRank == 3)
2222 integralView3(cellDataOrdinal,i,j) += cellMeasureWeight * integralSum;
2226 integralView2(i,j) += cellMeasureWeight * integralSum;
2230 b_offset += rightDimSpan;
2233 a_offset += leftDimSpan;
2237 if (approximateFlops != NULL)
2240 *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2248 const bool transposeLeft =
true;
2249 const bool transposeRight =
false;
2255 const int leftRank = leftTransform.
rank();
2256 const int rightRank = rightTransform.
rank();
2260 const bool bothRank4 = (leftRank == 4) && (rightRank == 4);
2261 const bool bothRank3 = (leftRank == 3) && (rightRank == 3);
2262 const bool bothRank2 = (leftRank == 2) && (rightRank == 2);
2263 const bool ranks32 = ((leftRank == 3) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 3));
2264 const bool ranks42 = ((leftRank == 4) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 4));
2269 composedTransform.
storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2272 if (approximateFlops != NULL)
2280 const int newRank = 4;
2283 extents[3] = extents[2];
2285 variationTypes[3] = variationTypes[2];
2286 variationTypes[2] = CONSTANT;
2287 auto leftTransformMatrix = leftTransform.
shallowCopy(newRank, extents, variationTypes);
2292 extents[3] = extents[2];
2294 variationTypes[3] = variationTypes[2];
2295 variationTypes[2] = CONSTANT;
2296 auto rightTransformMatrix = rightTransform.
shallowCopy(newRank, extents, variationTypes);
2299 composedTransform.
storeMatMat(transposeLeft, leftTransformMatrix, transposeRight, rightTransformMatrix);
2301 if (approximateFlops != NULL)
2312 const int newRank = 4;
2313 auto extents = composedTransform.
getExtents();
2315 composedTransform = composedTransform.
shallowCopy(newRank, extents, variationTypes);
2316 if (approximateFlops != NULL)
2323 const auto & rank3Transform = (leftRank == 3) ? leftTransform : rightTransform;
2324 const auto & rank2Transform = (leftRank == 2) ? leftTransform : rightTransform;
2332 const int newRank = 4;
2333 auto extents = composedTransform.
getExtents();
2343 extents[3] = extents[2];
2345 variationTypes[3] = variationTypes[2];
2346 variationTypes[2] = CONSTANT;
2348 composedTransform = composedTransform.
shallowCopy(newRank, extents, variationTypes);
2366 INTREPID2_TEST_FOR_EXCEPTION(
true, std::invalid_argument,
"Unsupported transform combination");
2369 else if (leftTransform.
isValid())
2378 const int newRank = 4;
2382 composedTransform = leftTransform.
shallowCopy(newRank, extents, variationTypes);
2385 case 2: composedTransform = leftTransform;
break;
2387 INTREPID2_TEST_FOR_EXCEPTION(
true, std::invalid_argument,
"Unsupported transform combination");
2390 else if (rightTransform.
isValid())
2393 composedTransform = rightTransform;
2396 case 4: composedTransform = rightTransform;
break;
2400 const int newRank = 4;
2403 extents[3] = extents[2];
2404 variationTypes[3] = variationTypes[2];
2406 variationTypes[2] = CONSTANT;
2408 composedTransform = rightTransform.
shallowCopy(newRank, extents, variationTypes);
2411 case 2: composedTransform = rightTransform;
break;
2413 INTREPID2_TEST_FOR_EXCEPTION(
true, std::invalid_argument,
"Unsupported transform combination");
2419 Kokkos::Array<ordinal_type,4> extents {basisValuesLeft.
numCells(),basisValuesLeft.
numPoints(),spaceDim,spaceDim};
2420 Kokkos::Array<DataVariationType,4> variationTypes {CONSTANT,CONSTANT,BLOCK_PLUS_DIAGONAL,BLOCK_PLUS_DIAGONAL};
2422 Kokkos::View<Scalar*,DeviceType> identityUnderlyingView(
"Intrepid2::FST::integrate() - identity view",spaceDim);
2423 Kokkos::deep_copy(identityUnderlyingView, 1.0);
2431 const int leftComponentCount = leftIsVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
2432 const int rightComponentCount = rightIsVectorValued ? basisValuesRight.
vectorData().numComponents() : 1;
2434 int leftFieldOrdinalOffset = 0;
2435 for (
int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2440 bool haveLaunchedContributionToCurrentFamilyLeft =
false;
2441 for (
int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2444 : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2448 a_offset += basisValuesLeft.
vectorData().numDimsForComponent(leftComponentOrdinal);
2452 int rightFieldOrdinalOffset = 0;
2453 for (
int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2457 bool haveLaunchedContributionToCurrentFamilyRight =
false;
2459 for (
int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2462 : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2463 if (!rightComponent.
isValid())
2466 b_offset += basisValuesRight.
vectorData().numDimsForComponent(rightComponentOrdinal);
2472 const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2473 Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
2482 bool forceNonSpecialized =
false;
2483 bool usePointCacheForRank3Tensor =
true;
2487 if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2489 ExecutionSpace().fence();
2491 haveLaunchedContributionToCurrentFamilyLeft =
true;
2492 haveLaunchedContributionToCurrentFamilyRight =
true;
2494 if (integralViewRank == 2)
2498 auto functor =
Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2500 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2501 const int teamSize = functor.
teamSize(recommendedTeamSize);
2503 policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2505 Kokkos::parallel_for(
"F_IntegratePointValueCache rank 2", policy, functor);
2507 if (approximateFlops != NULL)
2509 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2514 auto functor =
Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2516 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2517 const int teamSize = functor.
teamSize(recommendedTeamSize);
2519 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2521 Kokkos::parallel_for(
"F_Integrate rank 2", policy, functor);
2523 if (approximateFlops != NULL)
2525 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2529 else if (integralViewRank == 3)
2533 auto functor =
Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2535 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2536 const int teamSize = functor.
teamSize(recommendedTeamSize);
2538 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2540 Kokkos::parallel_for(
"F_IntegratePointValueCache rank 3", policy, functor);
2542 if (approximateFlops != NULL)
2544 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2549 auto functor =
Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2551 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2552 const int teamSize = functor.
teamSize(recommendedTeamSize);
2554 policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2556 Kokkos::parallel_for(
"F_Integrate rank 3", policy, functor);
2558 if (approximateFlops != NULL)
2560 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2564 b_offset += rightIsVectorValued ? basisValuesRight.
vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
2566 rightFieldOrdinalOffset += rightIsVectorValued ? basisValuesRight.
vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
2568 a_offset += leftIsVectorValued ? basisValuesLeft.
vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
2570 leftFieldOrdinalOffset += leftIsVectorValued ? basisValuesLeft.
vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
2577 ExecutionSpace().fence();
void clear() const
Copies 0.0 to the underlying View.
KOKKOS_INLINE_FUNCTION ordinal_type getUnderlyingViewRank() const
returns the rank of the View that stores the unique data
KOKKOS_INLINE_FUNCTION const ordinal_type & argument(const ordinal_type &r) const
KOKKOS_INLINE_FUNCTION int incrementArgument(Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of incrementArgument; gets used by approximate flop count.
static Data< DataScalar, DeviceType > allocateInPlaceCombinationResult(const Data< DataScalar, DeviceType > &A, const Data< DataScalar, DeviceType > &B)
KOKKOS_INLINE_FUNCTION int nextIncrementResult(const Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of nextIncrementResult; gets used by approximate flop count.
#define INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(test, x, msg)
int teamSize(const int &maxTeamSizeFromKokkos) const
returns the team size that should be provided to the policy constructor, based on the Kokkos maximum ...
Implementation of a general sum factorization algorithm, using a novel approach developed by Roberts...
long approximateFlopCountPerCell() const
returns an estimate of the number of floating point operations per cell (counting sums...
size_t team_shmem_size(int team_size) const
Provide the shared memory capacity.
int teamSize(const int &maxTeamSizeFromKokkos) const
returns the team size that should be provided to the policy constructor, based on the Kokkos maximum ...
KOKKOS_INLINE_FUNCTION Kokkos::Array< int, 7 > getExtents() const
Returns an array containing the logical extents in each dimension.
KOKKOS_INLINE_FUNCTION const Kokkos::Array< DataVariationType, 7 > & getVariationTypes() const
Returns an array with the variation types in each logical dimension.
KOKKOS_INLINE_FUNCTION enable_if_t< rank==1, const Kokkos::View< typename RankExpander< DataScalar, rank >::value_type, DeviceType > & > getUnderlyingView() const
Returns the underlying view. Throws an exception if the underlying view is not rank 1...
KOKKOS_INLINE_FUNCTION bool underlyingMatchesLogical() const
Returns true if the underlying container has exactly the same rank and extents as the logical contain...
long approximateFlopCountPerCell() const
returns an estimate of the number of floating point operations per cell (counting sums...
KOKKOS_INLINE_FUNCTION const Data< Scalar, DeviceType > & getTensorComponent(const ordinal_type &r) const
Returns the requested tensor component.
KOKKOS_INLINE_FUNCTION constexpr bool isValid() const
Returns true for containers that have data; false for those that don't (e.g., those that have been co...
Defines TensorArgumentIterator, which allows systematic enumeration of a TensorData object...
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar ****, DeviceType > & getUnderlyingView4() const
returns the View that stores the unique data. For rank-4 underlying containers.
Struct expressing all variation information about a Data object in a single dimension, including its logical extent and storage extent.
Data shallowCopy(const int rank, const Kokkos::Array< int, 7 > &extents, const Kokkos::Array< DataVariationType, 7 > &variationTypes) const
Creates a new Data object with the same underlying view, but with the specified logical rank...
KOKKOS_INLINE_FUNCTION DimensionInfo getDimensionInfo(const int &dim) const
Returns an object fully specifying the indicated dimension. This is used in determining appropriate s...
KOKKOS_INLINE_FUNCTION int incrementArgument(Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of incrementArgument; gets used by approximate flop count.
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar ***, DeviceType > & getUnderlyingView3() const
returns the View that stores the unique data. For rank-3 underlying containers.
KOKKOS_INLINE_FUNCTION constexpr bool isValid() const
returns true for containers that have data; false for those that don't (namely, those that have been ...
KOKKOS_INLINE_FUNCTION void setEnumerationIndex(const ordinal_type &enumerationIndex)
Sets the enumeration index; this refers to a 1D enumeration of the possible in-bound arguments...
KOKKOS_INLINE_FUNCTION void runSpecialized3(const TeamMember &teamMember) const
runSpecialized implementations are hand-coded variants of run() for a particular number of components...
void storeMatMat(const bool transposeA, const Data< DataScalar, DeviceType > &A_MatData, const bool transposeB, const Data< DataScalar, DeviceType > &B_MatData)
KOKKOS_INLINE_FUNCTION ordinal_type getUnderlyingViewSize() const
returns the number of entries in the View that stores the unique data
KOKKOS_INLINE_FUNCTION void runSpecialized3(const TeamMember &teamMember) const
Hand-coded 3-component version.
Defines the Intrepid2::FunctorIterator class, as well as the Intrepid2::functor_returns_ref SFINAE he...
void storeInPlaceProduct(const Data< DataScalar, DeviceType > &A, const Data< DataScalar, DeviceType > &B)
stores the in-place (entrywise) product, A .* B, into this container.
KOKKOS_INLINE_FUNCTION unsigned rank() const
Returns the logical rank of the Data container.
KOKKOS_INLINE_FUNCTION int extent_int(const int &r) const
Returns the logical extent in the specified dimension.
KOKKOS_INLINE_FUNCTION int getDataExtent(const ordinal_type &d) const
returns the true extent of the data corresponding to the logical dimension provided; if the data does...
View-like interface to tensor data; tensor components are stored separately and multiplied together a...
KOKKOS_INLINE_FUNCTION std::enable_if< std::is_integral< iType >::value, ordinal_type >::type extent_int(const iType &d) const
Returns the logical extent in the requested dimension.
Allows systematic enumeration of all entries in a TensorData object, tracking indices for each tensor...
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar **, DeviceType > & getUnderlyingView2() const
returns the View that stores the unique data. For rank-2 underlying containers.
KOKKOS_INLINE_FUNCTION int nextIncrementResult(const Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of nextIncrementResult; gets used by approximate flop count.
size_t team_shmem_size(int numThreads) const
Provide the shared memory capacity.
Implementation of a general sum factorization algorithm, abstracted from the algorithm described by M...
KOKKOS_INLINE_FUNCTION ordinal_type numTensorComponents() const
Return the number of tensorial components.
static Data< DataScalar, DeviceType > allocateMatMatResult(const bool transposeA, const Data< DataScalar, DeviceType > &A_MatData, const bool transposeB, const Data< DataScalar, DeviceType > &B_MatData)
static constexpr ordinal_type MaxTensorComponents
Maximum number of tensor/Cartesian products that can be taken: this allows hypercube basis in 7D to b...