49 #ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
50 #define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
55 #include "Teuchos_TimeMonitor.hpp"
64 template<
class Scalar,
class DeviceType,
int integralViewRank>
67 using ExecutionSpace =
typename DeviceType::execution_space;
68 using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
69 using TeamMember =
typename TeamPolicy::member_type;
71 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
72 IntegralViewType integralView_;
79 int leftComponentSpan_;
80 int rightComponentSpan_;
81 int numTensorComponents_;
82 int leftFieldOrdinalOffset_;
83 int rightFieldOrdinalOffset_;
84 bool forceNonSpecialized_;
86 size_t fad_size_output_ = 0;
88 Kokkos::Array<int, 7> offsetsForComponentOrdinal_;
92 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
93 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
94 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
96 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_;
97 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;
110 int leftFieldOrdinalOffset,
111 int rightFieldOrdinalOffset,
112 bool forceNonSpecialized)
114 integralView_(integralData.template getUnderlyingView<integralViewRank>()),
115 leftComponent_(leftComponent),
116 composedTransform_(composedTransform),
117 rightComponent_(rightComponent),
118 cellMeasures_(cellMeasures),
121 leftComponentSpan_(leftComponent.
extent_int(2)),
122 rightComponentSpan_(rightComponent.
extent_int(2)),
124 leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
125 rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
126 forceNonSpecialized_(forceNonSpecialized)
128 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument,
"Left and right components must have matching number of tensorial components");
131 const int FIELD_DIM = 0;
132 const int POINT_DIM = 1;
136 for (
int r=0; r<numTensorComponents_; r++)
138 leftFieldBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
139 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
140 rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
141 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
142 pointBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
143 maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
147 int leftRelativeEnumerationSpan = 1;
148 int rightRelativeEnumerationSpan = 1;
149 for (
int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
151 leftRelativeEnumerationSpan *= leftFieldBounds_[startingComponent];
152 rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
153 leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
154 rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
160 const bool allocateFadStorage = !std::is_pod<Scalar>::value;
161 if (allocateFadStorage)
163 fad_size_output_ = dimension_scalar(integralView_);
166 const int R = numTensorComponents_ - 1;
169 int allocationSoFar = 0;
170 offsetsForComponentOrdinal_[R] = allocationSoFar;
173 for (
int r=R-1; r>0; r--)
178 num_ij *= leftFields * rightFields;
180 offsetsForComponentOrdinal_[r] = allocationSoFar;
181 allocationSoFar += num_ij;
183 offsetsForComponentOrdinal_[0] = allocationSoFar;
186 template<
size_t maxComponents,
size_t numComponents = maxComponents>
187 KOKKOS_INLINE_FUNCTION
188 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
189 const Kokkos::Array<int,maxComponents> &bounds)
const
191 if (numComponents == 0)
197 int r =
static_cast<int>(numComponents - 1);
198 while (arguments[r] + 1 >= bounds[r])
204 if (r >= 0) ++arguments[r];
210 KOKKOS_INLINE_FUNCTION
212 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
213 const int &numComponents)
const
215 if (numComponents == 0)
return -1;
216 int r =
static_cast<int>(numComponents - 1);
217 while (arguments[r] + 1 >= bounds[r])
223 if (r >= 0) ++arguments[r];
227 template<
size_t maxComponents,
size_t numComponents = maxComponents>
228 KOKKOS_INLINE_FUNCTION
229 int nextIncrementResult(
const Kokkos::Array<int,maxComponents> &arguments,
230 const Kokkos::Array<int,maxComponents> &bounds)
const
232 if (numComponents == 0)
238 int r =
static_cast<int>(numComponents - 1);
239 while (arguments[r] + 1 >= bounds[r])
249 KOKKOS_INLINE_FUNCTION
251 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
252 const int &numComponents)
const
254 if (numComponents == 0)
return -1;
255 int r = numComponents - 1;
256 while (arguments[r] + 1 >= bounds[r])
264 template<
size_t maxComponents,
size_t numComponents = maxComponents>
265 KOKKOS_INLINE_FUNCTION
266 int relativeEnumerationIndex(
const Kokkos::Array<int,maxComponents> &arguments,
267 const Kokkos::Array<int,maxComponents> &bounds,
268 const int startIndex)
const
271 if (numComponents == 0)
277 int enumerationIndex = 0;
278 for (
size_t r=numComponents-1; r>
static_cast<size_t>(startIndex); r--)
280 enumerationIndex += arguments[r];
281 enumerationIndex *= bounds[r-1];
283 enumerationIndex += arguments[startIndex];
284 return enumerationIndex;
298 KOKKOS_INLINE_FUNCTION
301 constexpr
int numTensorComponents = 3;
303 Kokkos::Array<int,numTensorComponents> pointBounds;
304 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
305 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
306 for (
unsigned r=0; r<numTensorComponents; r++)
308 pointBounds[r] = pointBounds_[r];
309 leftFieldBounds[r] = leftFieldBounds_[r];
310 rightFieldBounds[r] = rightFieldBounds_[r];
313 const int cellDataOrdinal = teamMember.league_rank();
314 const int numThreads = teamMember.team_size();
315 const int scratchViewSize = offsetsForComponentOrdinal_[0];
317 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView;
318 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
319 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z;
320 if (fad_size_output_ > 0) {
321 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
322 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
323 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0], fad_size_output_);
324 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
325 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1], fad_size_output_);
326 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
327 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2], fad_size_output_);
328 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
331 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
332 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
333 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0]);
334 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
335 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1]);
336 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
337 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2]);
338 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
344 constexpr
int R = numTensorComponents - 1;
346 const int composedTransformRank = composedTransform_.
rank();
348 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
350 const int a = a_offset_ + a_component;
351 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
353 const int b = b_offset_ + b_component;
358 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
359 const int numRightFieldsFinal = rightFinalComponent.
extent_int(0);
368 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
369 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (
const int& fieldOrdinalPointOrdinal) {
370 const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
371 const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
372 if ((fieldOrdinal < leftTensorComponent_x.
extent_int(0)) && (pointOrdinal < leftTensorComponent_x.
extent_int(1)))
374 const int leftRank = leftTensorComponent_x.
rank();
375 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
377 if ((fieldOrdinal < leftTensorComponent_y.
extent_int(0)) && (pointOrdinal < leftTensorComponent_y.
extent_int(1)))
379 const int leftRank = leftTensorComponent_y.
rank();
380 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
382 if ((fieldOrdinal < leftTensorComponent_z.
extent_int(0)) && (pointOrdinal < leftTensorComponent_z.
extent_int(1)))
384 const int leftRank = leftTensorComponent_z.
rank();
385 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
387 if ((fieldOrdinal < rightTensorComponent_x.
extent_int(0)) && (pointOrdinal < rightTensorComponent_x.
extent_int(1)))
389 const int rightRank = rightTensorComponent_x.
rank();
390 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
392 if ((fieldOrdinal < rightTensorComponent_y.
extent_int(0)) && (pointOrdinal < rightTensorComponent_y.
extent_int(1)))
394 const int rightRank = rightTensorComponent_y.
rank();
395 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
397 if ((fieldOrdinal < rightTensorComponent_z.
extent_int(0)) && (pointOrdinal < rightTensorComponent_z.
extent_int(1)))
399 const int rightRank = rightTensorComponent_z.
rank();
400 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
406 if (composedTransformRank == 4)
409 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (
const int& pointOrdinal) {
410 pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
416 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (
const int& pointOrdinal) {
417 pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
423 if (composedTransformRank == 4)
425 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
426 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
431 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
432 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
440 teamMember.team_barrier();
443 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
444 for (
int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
446 scratchView(i) = 0.0;
450 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (
const int& leftRightFieldEnumeration) {
451 const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
452 const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;
456 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
457 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
458 rightFieldArguments[R] = jz;
459 leftFieldArguments[R] = iz;
461 Kokkos::Array<int,numTensorComponents> pointArguments;
462 for (
int i=0; i<numTensorComponents; i++)
464 pointArguments[i] = 0;
467 for (
int lx=0; lx<pointBounds[0]; lx++)
469 pointArguments[0] = lx;
473 const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
474 const int Gy_end_index = scratchOffsetForThread + offsetsForComponentOrdinal_[0];
476 for (
int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
478 scratchView(Gy_index) = 0;
481 for (
int ly=0; ly<pointBounds[1]; ly++)
483 pointArguments[1] = ly;
485 Scalar * Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
488 for (
int lz=0; lz < pointBounds[2]; lz++)
490 const Scalar & leftValue = leftFields_z(iz,lz);
491 const Scalar & rightValue = rightFields_z(jz,lz);
493 pointArguments[2] = lz;
494 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
496 *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
501 for (
int iy=0; iy<leftFieldBounds_[1]; iy++)
503 leftFieldArguments[1] = iy;
504 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
506 const Scalar & leftValue = leftFields_y(iy,ly);
508 for (
int jy=0; jy<rightFieldBounds_[1]; jy++)
510 rightFieldArguments[1] = jy;
512 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
513 const Scalar & rightValue = rightFields_y(jy,ly);
515 const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
516 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
518 const int Gz_index = 0;
519 const Scalar & Gz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);
521 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
523 Gy += leftValue * Gz * rightValue;
528 for (
int ix=0; ix<leftFieldBounds_[0]; ix++)
530 leftFieldArguments[0] = ix;
531 const Scalar & leftValue = leftFields_x(ix,lx);
533 for (
int iy=0; iy<leftFieldBounds_[1]; iy++)
535 leftFieldArguments[1] = iy;
537 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
539 for (
int jx=0; jx<rightFieldBounds_[0]; jx++)
541 rightFieldArguments[0] = jx;
542 const Scalar & rightValue = rightFields_x(jx,lx);
544 for (
int jy=0; jy<rightFieldBounds_[1]; jy++)
546 rightFieldArguments[1] = jy;
547 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
549 const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
551 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
552 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
555 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
556 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
557 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
558 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
560 if (integralViewRank == 3)
582 integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
587 integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
601 template<
size_t numTensorComponents>
602 KOKKOS_INLINE_FUNCTION
603 void run(
const TeamMember & teamMember )
const
605 Kokkos::Array<int,numTensorComponents> pointBounds;
606 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
607 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
608 for (
unsigned r=0; r<numTensorComponents; r++)
610 pointBounds[r] = pointBounds_[r];
611 leftFieldBounds[r] = leftFieldBounds_[r];
612 rightFieldBounds[r] = rightFieldBounds_[r];
615 const int cellDataOrdinal = teamMember.league_rank();
616 const int numThreads = teamMember.team_size();
617 const int scratchViewSize = offsetsForComponentOrdinal_[0];
618 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView;
619 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
620 Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields;
621 if (fad_size_output_ > 0) {
622 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
623 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
624 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_, fad_size_output_);
625 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
628 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
629 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
630 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_);
631 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
637 constexpr
int R = numTensorComponents - 1;
639 const int composedTransformRank = composedTransform_.
rank();
641 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
643 const int a = a_offset_ + a_component;
644 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
646 const int b = b_offset_ + b_component;
648 const Data<Scalar,DeviceType> & leftFinalComponent = leftComponent_.getTensorComponent(R);
649 const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);
651 const int numLeftFieldsFinal = leftFinalComponent.extent_int(0);
652 const int numRightFieldsFinal = rightFinalComponent.extent_int(0);
654 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (
const int& r) {
655 const Data<Scalar,DeviceType> & leftTensorComponent = leftComponent_.getTensorComponent(r);
656 const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.getTensorComponent(r);
657 const int leftFieldCount = leftTensorComponent.extent_int(0);
658 const int pointCount = leftTensorComponent.extent_int(1);
659 const int leftRank = leftTensorComponent.rank();
660 const int rightFieldCount = rightTensorComponent.extent_int(0);
661 const int rightRank = rightTensorComponent.rank();
662 for (
int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
665 const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
666 for (
int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
668 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
669 leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
672 for (
int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
675 const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
676 for (
int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
678 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
679 rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
684 if (composedTransformRank == 4)
686 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
687 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
692 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
693 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
699 teamMember.team_barrier();
701 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (
const int& leftRightFieldEnumeration) {
702 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
703 const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
704 const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;
708 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
709 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
710 rightFieldArguments[R] = j_R;
711 leftFieldArguments[R] = i_R;
714 for (
int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
716 scratchView(i) = 0.0;
718 Kokkos::Array<int,numTensorComponents> pointArguments;
719 for (
unsigned i=0; i<numTensorComponents; i++)
721 pointArguments[i] = 0;
728 const int pointBounds_R = pointBounds[R];
729 int & pointArgument_R = pointArguments[R];
730 for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
735 G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
739 const int leftFieldIndex = i_R + leftFieldOrdinalOffset_;
740 const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;
742 if (integralViewRank == 3)
745 G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
750 G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
754 const int & pointIndex_R = pointArguments[R];
756 const Scalar & leftValue = leftFields(R,i_R,pointIndex_R);
757 const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);
759 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
761 *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
766 const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds);
767 const int r_min = (r_next >= 0) ? r_next : 0;
769 for (
int s=R-1; s>=r_min; s--)
771 const int & pointIndex_s = pointArguments[s];
774 for (
int s1=s; s1<R; s1++)
776 leftFieldArguments[s1] = 0;
780 const int & i_s = leftFieldArguments[s];
781 const int & j_s = rightFieldArguments[s];
784 const int & rightEnumerationSpan_s = rightFieldRelativeEnumerationSpans_[s];
785 const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];
789 const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);
792 const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);
794 const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);
796 for (
int s1=s; s1<R; s1++)
798 rightFieldArguments[s1] = 0;
803 const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);
806 const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);
808 const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);
810 const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;
816 const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;
818 const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);
823 G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
828 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
829 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
830 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
831 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
833 if (integralViewRank == 3)
836 G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
841 G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
845 *G_s += leftValue * G_sp * rightValue;
850 sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
854 sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
861 const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
862 for (
int i=scratchOffsetForThread; i<endIndex; i++)
864 scratchView(i) = 0.0;
871 r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds);
879 KOKKOS_INLINE_FUNCTION
880 void operator()(
const TeamMember & teamMember )
const
882 switch (numTensorComponents_)
884 case 1: run<1>(teamMember);
break;
885 case 2: run<2>(teamMember);
break;
887 if (forceNonSpecialized_)
892 case 4: run<4>(teamMember);
break;
893 case 5: run<5>(teamMember);
break;
894 case 6: run<6>(teamMember);
break;
895 case 7: run<7>(teamMember);
break;
896 #ifdef INTREPID2_HAVE_DEBUG
909 const int R = numTensorComponents_ - 1;
913 const int flopsPerPoint_ab = cellMeasures_.numTensorComponents();
915 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
917 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
922 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
923 const int numRightFieldsFinal = rightFinalComponent.
extent_int(0);
925 flopCount += flopsPerPoint_ab * cellMeasures_.extent_int(1);
927 int flopsPer_i_R_j_R = 0;
931 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
932 leftFieldArguments[R] = 0;
934 Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
935 for (
int i=0; i<numTensorComponents_; i++)
937 pointArguments[i] = 0;
942 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
943 rightFieldArguments[R] = 0;
949 for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
951 flopsPer_i_R_j_R += 4;
959 const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
960 const int r_min = (r_next >= 0) ? r_next : 0;
962 for (
int s=R-1; s>=r_min; s--)
965 int numLeftIterations = leftFieldRelativeEnumerationSpans_[s];
966 int numRightIterations = rightFieldRelativeEnumerationSpans_[s];
968 flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
972 r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
975 flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
983 int teamSize(
const int &maxTeamSizeFromKokkos)
const
985 const int R = numTensorComponents_ - 1;
986 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
987 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
994 size_t shmem_size = 0;
996 if (fad_size_output_ > 0)
998 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
999 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.
extent_int(1), fad_size_output_);
1000 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_, fad_size_output_);
1001 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
1005 shmem_size += Kokkos::View<Scalar*, DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
1006 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.
extent_int(1));
1007 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_);
1008 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
1024 template<
class Scalar,
class DeviceType,
int integralViewRank>
1027 using ExecutionSpace =
typename DeviceType::execution_space;
1028 using TeamPolicy = Kokkos::TeamPolicy<DeviceType>;
1029 using TeamMember =
typename TeamPolicy::member_type;
1031 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
1032 IntegralViewType integralView_;
1039 int leftComponentSpan_;
1040 int rightComponentSpan_;
1041 int numTensorComponents_;
1042 int leftFieldOrdinalOffset_;
1043 int rightFieldOrdinalOffset_;
1045 size_t fad_size_output_ = 0;
1049 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
1050 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
1051 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
1054 int maxFieldsRight_;
1064 int leftFieldOrdinalOffset,
1065 int rightFieldOrdinalOffset)
1067 integralView_(integralData.template getUnderlyingView<integralViewRank>()),
1068 leftComponent_(leftComponent),
1069 composedTransform_(composedTransform),
1070 rightComponent_(rightComponent),
1071 cellMeasures_(cellMeasures),
1072 a_offset_(a_offset),
1073 b_offset_(b_offset),
1074 leftComponentSpan_(leftComponent.
extent_int(2)),
1075 rightComponentSpan_(rightComponent.
extent_int(2)),
1077 leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
1078 rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
1080 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument,
"Left and right components must have matching number of tensorial components");
1082 const int FIELD_DIM = 0;
1083 const int POINT_DIM = 1;
1085 maxFieldsRight_ = 0;
1087 for (
int r=0; r<numTensorComponents_; r++)
1089 leftFieldBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
1090 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
1091 rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
1092 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
1093 pointBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
1094 maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
1100 const bool allocateFadStorage = !std::is_pod<Scalar>::value;
1101 if (allocateFadStorage)
1103 fad_size_output_ = dimension_scalar(integralView_);
1107 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1108 KOKKOS_INLINE_FUNCTION
1109 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
1110 const Kokkos::Array<int,maxComponents> &bounds)
const
1112 if (numComponents == 0)
return -1;
1113 int r =
static_cast<int>(numComponents - 1);
1114 while (arguments[r] + 1 >= bounds[r])
1120 if (r >= 0) ++arguments[r];
1125 KOKKOS_INLINE_FUNCTION
1127 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1128 const int &numComponents)
const
1130 if (numComponents == 0)
return -1;
1131 int r =
static_cast<int>(numComponents - 1);
1132 while (arguments[r] + 1 >= bounds[r])
1138 if (r >= 0) ++arguments[r];
1142 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1143 KOKKOS_INLINE_FUNCTION
1144 int nextIncrementResult(
const Kokkos::Array<int,maxComponents> &arguments,
1145 const Kokkos::Array<int,maxComponents> &bounds)
const
1147 if (numComponents == 0)
return -1;
1148 int r =
static_cast<int>(numComponents - 1);
1149 while (arguments[r] + 1 >= bounds[r])
1158 KOKKOS_INLINE_FUNCTION
1160 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1161 const int &numComponents)
const
1163 if (numComponents == 0)
return -1;
1164 int r = numComponents - 1;
1165 while (arguments[r] + 1 >= bounds[r])
1173 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1174 KOKKOS_INLINE_FUNCTION
1175 int relativeEnumerationIndex(
const Kokkos::Array<int,maxComponents> &arguments,
1176 const Kokkos::Array<int,maxComponents> &bounds,
1177 const int startIndex)
const
1180 if (numComponents == 0)
1184 int enumerationIndex = 0;
1185 for (
size_t r=numComponents-1; r>
static_cast<size_t>(startIndex); r--)
1187 enumerationIndex += arguments[r];
1188 enumerationIndex *= bounds[r-1];
1190 enumerationIndex += arguments[startIndex];
1191 return enumerationIndex;
1195 KOKKOS_INLINE_FUNCTION
1196 enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
1197 integralViewEntry(
const IntegralViewType& integralView,
const int &cellDataOrdinal,
const int &i,
const int &j)
const
1199 return integralView(cellDataOrdinal,i,j);
1203 KOKKOS_INLINE_FUNCTION
1204 enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
1205 integralViewEntry(
const IntegralViewType& integralView,
const int &cellDataOrdinal,
const int &i,
const int &j)
const
1207 return integralView(i,j);
1211 KOKKOS_INLINE_FUNCTION
1214 constexpr
int numTensorComponents = 3;
1216 const int pointBounds_x = pointBounds_[0];
1217 const int pointBounds_y = pointBounds_[1];
1218 const int pointBounds_z = pointBounds_[2];
1219 const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;
1221 const int leftFieldBounds_x = leftFieldBounds_[0];
1222 const int rightFieldBounds_x = rightFieldBounds_[0];
1223 const int leftFieldBounds_y = leftFieldBounds_[1];
1224 const int rightFieldBounds_y = rightFieldBounds_[1];
1225 const int leftFieldBounds_z = leftFieldBounds_[2];
1226 const int rightFieldBounds_z = rightFieldBounds_[2];
1228 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1229 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1230 for (
unsigned r=0; r<numTensorComponents; r++)
1232 leftFieldBounds[r] = leftFieldBounds_[r];
1233 rightFieldBounds[r] = rightFieldBounds_[r];
1236 const auto integralView = integralView_;
1237 const auto leftFieldOrdinalOffset = leftFieldOrdinalOffset_;
1238 const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;
1240 const int cellDataOrdinal = teamMember.league_rank();
1241 const int threadNumber = teamMember.team_rank();
1243 const int numThreads = teamMember.team_size();
1244 const int GyEntryCount = pointBounds_z;
1245 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals;
1246 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals;
1247 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GzIntegral;
1248 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
1250 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
1251 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
1252 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
1253 if (fad_size_output_ > 0) {
1254 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1255 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
1256 GzIntegral = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads, fad_size_output_);
1257 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
1259 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x, fad_size_output_);
1260 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
1261 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y, fad_size_output_);
1262 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
1263 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z, fad_size_output_);
1264 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
1267 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1268 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
1269 GzIntegral = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads);
1270 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
1272 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x);
1273 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
1274 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y);
1275 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
1276 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z);
1277 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
1285 const int composedTransformRank = composedTransform_.
rank();
1288 teamMember.team_barrier();
1290 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
1292 const int a = a_offset_ + a_component;
1293 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
1295 const int b = b_offset_ + b_component;
1304 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
1305 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (
const int& fieldOrdinal) {
1306 if (fieldOrdinal < leftTensorComponent_x.
extent_int(0))
1308 const int pointCount = leftTensorComponent_x.
extent_int(1);
1309 const int leftRank = leftTensorComponent_x.
rank();
1310 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1312 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1315 if (fieldOrdinal < leftTensorComponent_y.
extent_int(0))
1317 const int pointCount = leftTensorComponent_y.
extent_int(1);
1318 const int leftRank = leftTensorComponent_y.
rank();
1319 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1321 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1324 if (fieldOrdinal < leftTensorComponent_z.
extent_int(0))
1326 const int pointCount = leftTensorComponent_z.
extent_int(1);
1327 const int leftRank = leftTensorComponent_z.
rank();
1328 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1330 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1333 if (fieldOrdinal < rightTensorComponent_x.
extent_int(0))
1335 const int pointCount = rightTensorComponent_x.
extent_int(1);
1336 const int rightRank = rightTensorComponent_x.
rank();
1337 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1339 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1342 if (fieldOrdinal < rightTensorComponent_y.
extent_int(0))
1344 const int pointCount = rightTensorComponent_y.
extent_int(1);
1345 const int rightRank = rightTensorComponent_y.
rank();
1346 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1348 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1351 if (fieldOrdinal < rightTensorComponent_z.
extent_int(0))
1353 const int pointCount = rightTensorComponent_z.
extent_int(1);
1354 const int rightRank = rightTensorComponent_z.
rank();
1355 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1357 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1362 if (composedTransformRank == 4)
1364 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
1365 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1370 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
1371 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1376 teamMember.team_barrier();
1378 for (
int i0=0; i0<leftFieldBounds_x; i0++)
1380 for (
int j0=0; j0<rightFieldBounds_x; j0++)
1382 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (
const int& pointEnumerationIndexLaterDimensions) {
1384 const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions;
1386 Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1389 if (fad_size_output_ == 0)
1392 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (
const int &x_pointOrdinal, Scalar &integralThusFar)
1394 integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1399 for (
int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1401 Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1407 teamMember.team_barrier();
1409 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (
const int& i1j1) {
1410 const int i1 = i1j1 % leftFieldBounds_y;
1411 const int j1 = i1j1 / leftFieldBounds_y;
1413 int Gy_index = GyEntryCount * threadNumber;
1415 int pointEnumerationIndex = 0;
1416 for (
int lz=0; lz<pointBounds_z; lz++)
1418 Scalar & Gy = GyIntegrals(Gy_index);
1421 for (
int ly=0; ly<pointBounds_y; ly++)
1423 const Scalar & leftValue = leftFields_y(i1,ly);
1424 const Scalar & rightValue = rightFields_y(j1,ly);
1426 Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1428 pointEnumerationIndex++;
1433 Scalar & Gz = GzIntegral(threadNumber);
1434 for (
int i2=0; i2<leftFieldBounds_z; i2++)
1436 for (
int j2=0; j2<rightFieldBounds_z; j2++)
1440 int Gy_index = GyEntryCount * threadNumber;
1442 for (
int lz=0; lz<pointBounds_z; lz++)
1444 const Scalar & leftValue = leftFields_z(i2,lz);
1445 const Scalar & rightValue = rightFields_z(j2,lz);
1447 Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1452 const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1453 const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1458 integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1463 teamMember.team_barrier();
1470 template<
size_t numTensorComponents>
1471 KOKKOS_INLINE_FUNCTION
1472 void run(
const TeamMember & teamMember )
const
1610 KOKKOS_INLINE_FUNCTION
1611 void operator()(
const TeamMember & teamMember )
const
1613 switch (numTensorComponents_)
1615 case 1: run<1>(teamMember);
break;
1616 case 2: run<2>(teamMember);
break;
1618 case 4: run<4>(teamMember);
break;
1619 case 5: run<5>(teamMember);
break;
1620 case 6: run<6>(teamMember);
break;
1621 case 7: run<7>(teamMember);
break;
1622 #ifdef INTREPID2_HAVE_DEBUG
1635 constexpr
int numTensorComponents = 3;
1636 Kokkos::Array<int,numTensorComponents> pointBounds;
1637 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1638 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1640 int pointsInNonzeroComponentDimensions = 1;
1641 for (
unsigned r=0; r<numTensorComponents; r++)
1643 pointBounds[r] = pointBounds_[r];
1644 if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1645 leftFieldBounds[r] = leftFieldBounds_[r];
1646 rightFieldBounds[r] = rightFieldBounds_[r];
1649 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
1651 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
1656 flopCount += composedTransform_.
extent_int(1) * cellMeasures_.numTensorComponents();
1658 for (
int i0=0; i0<leftFieldBounds[0]; i0++)
1660 for (
int j0=0; j0<rightFieldBounds[0]; j0++)
1662 flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3;
1702 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3;
1703 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3;
1704 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1;
1780 const int R = numTensorComponents_ - 1;
1781 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1782 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1789 size_t shmem_size = 0;
1791 const int GyEntryCount = pointBounds_[2];
1793 int pointsInNonzeroComponentDimensions = 1;
1794 for (
int d=1; d<numTensorComponents_; d++)
1796 pointsInNonzeroComponentDimensions *= pointBounds_[d];
1799 if (fad_size_output_ > 0)
1801 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_);
1802 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_);
1803 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads, fad_size_output_);
1804 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.
extent_int(1), fad_size_output_);
1806 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_);
1807 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_);
1808 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_);
1809 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_);
1810 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_);
1811 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_);
1815 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions);
1816 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads);
1817 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads);
1818 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.
extent_int(1));
1820 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]);
1821 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]);
1822 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]);
1823 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]);
1824 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]);
1825 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]);
1832 template<
class Scalar,
class DeviceType>
1835 ScalarView<Scalar,DeviceType> integral_;
1839 ordinal_type dimSpan_;
1840 ordinal_type leftRank_;
1841 ordinal_type rightRank_;
1842 ordinal_type numPoints_;
1846 ordinal_type dimSpan)
1848 integral_(integralView),
1854 leftRank_ = left.
rank();
1855 rightRank_ = right.
rank();
1859 KOKKOS_INLINE_FUNCTION
1860 void operator()(
const ordinal_type & i,
const ordinal_type & j )
const
1862 Scalar refSpaceIntegral = 0.0;
1863 for (
int ptOrdinal=0; ptOrdinal<numPoints_; ptOrdinal++)
1865 const Scalar & weight = weights_(ptOrdinal);
1866 for (
int a=0; a<dimSpan_; a++)
1868 const Scalar & leftValue = ( leftRank_ == 2) ? left_(i,ptOrdinal) : left_(i,ptOrdinal,a);
1869 const Scalar & rightValue = (rightRank_ == 2) ? right_(j,ptOrdinal) : right_(j,ptOrdinal,a);
1870 refSpaceIntegral += leftValue * rightValue * weight;
1873 integral_(i,j) = refSpaceIntegral;
1878 template<
typename DeviceType>
1879 template<
class Scalar>
1888 const bool leftHasOrdinalFilter = vectorDataLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1889 const bool rightHasOrdinalFilter = vectorDataRight.basisValues().ordinalFilter().extent_int(0) > 0;
1890 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument,
"Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1896 const int CELL_DIM = 0;
1898 const auto leftTransform = vectorDataLeft.
transform();
1900 DimensionInfo combinedCellDimInfo = cellMeasureData.getDimensionInfo(CELL_DIM);
1904 combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataLeft.
transform().
getDimensionInfo(CELL_DIM));
1908 combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataRight.
transform().
getDimensionInfo(CELL_DIM));
1911 DataVariationType cellVariationType = combinedCellDimInfo.variationType;
1912 int cellDataExtent = combinedCellDimInfo.dataExtent;
1914 const int numCells = vectorDataLeft.
numCells();
1915 const int numFieldsLeft = vectorDataLeft.
numFields();
1916 const int numFieldsRight = vectorDataRight.
numFields();
1918 Kokkos::Array<int,3> extents {numCells, numFieldsLeft, numFieldsRight};
1919 Kokkos::Array<DataVariationType,3> variationTypes {cellVariationType,GENERAL,GENERAL};
1921 if (cellVariationType != CONSTANT)
1923 Kokkos::View<Scalar***,DeviceType> data(
"Intrepid2 integral data",cellDataExtent,numFieldsLeft,numFieldsRight);
1928 Kokkos::View<Scalar**,DeviceType> data(
"Intrepid2 integral data",numFieldsLeft,numFieldsRight);
1936 template<
typename DeviceType>
1937 template<
class Scalar>
1941 double* approximateFlops)
1943 using ExecutionSpace =
typename DeviceType::execution_space;
1947 const bool leftHasOrdinalFilter = basisValuesLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1948 const bool rightHasOrdinalFilter = basisValuesRight.basisValues().ordinalFilter().extent_int(0) > 0;
1949 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument,
"Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1951 if (approximateFlops != NULL)
1953 *approximateFlops = 0;
1965 const int numFieldsLeft = basisValuesLeft.
numFields();
1966 const int numFieldsRight = basisValuesRight.
numFields();
1967 const int spaceDim = basisValuesLeft.
spaceDim();
1969 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.
spaceDim() != basisValuesRight.
spaceDim(), std::invalid_argument,
"vectorDataLeft and vectorDataRight must agree on the space dimension");
1971 const int leftFamilyCount = basisValuesLeft.basisValues().
numFamilies();
1972 const int rightFamilyCount = basisValuesRight.basisValues().
numFamilies();
1976 int numTensorComponentsLeft = -1;
1977 const bool isVectorValued = basisValuesLeft.
vectorData().isValid();
1980 const bool rightIsVectorValued = basisValuesRight.
vectorData().isValid();
1981 INTREPID2_TEST_FOR_EXCEPTION(!rightIsVectorValued, std::invalid_argument,
"left and right must either both be vector-valued, or both scalar-valued");
1982 const auto &refVectorLeft = basisValuesLeft.
vectorData();
1983 int numFamiliesLeft = refVectorLeft.numFamilies();
1984 int numVectorComponentsLeft = refVectorLeft.numComponents();
1985 Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
1986 Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
1987 for (
int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
1989 for (
int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
1994 if (numTensorComponentsLeft == -1)
1998 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.
numTensorComponents(), std::invalid_argument,
"Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
1999 for (
int r=0; r<numTensorComponentsLeft; r++)
2006 int numTensorComponentsRight = -1;
2007 const auto &refVectorRight = basisValuesRight.
vectorData();
2008 int numFamiliesRight = refVectorRight.numFamilies();
2009 int numVectorComponentsRight = refVectorRight.numComponents();
2010 for (
int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
2012 for (
int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
2014 const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
2015 if (tensorData.numTensorComponents() > 0)
2017 if (numTensorComponentsRight == -1)
2019 numTensorComponentsRight = tensorData.numTensorComponents();
2021 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument,
"Each valid entry in vectorDataRight must have the same number of tensor components as every other");
2022 for (
int r=0; r<numTensorComponentsRight; r++)
2024 maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
2029 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != numVectorComponentsRight, std::invalid_argument,
"Left and right vector entries must have the same number of tensorial components");
2034 for (
int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
2036 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().
tensorData(familyOrdinal).
numTensorComponents() != numTensorComponentsLeft, std::invalid_argument,
"All families must match in the number of tensor components");
2040 for (
int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
2042 INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().
tensorData(familyOrdinal).
numTensorComponents() != numTensorComponentsLeft, std::invalid_argument,
"Right families must match left in the number of tensor components");
2047 if ((numPointTensorComponents == numTensorComponentsLeft) && basisValuesLeft.
axisAligned() && basisValuesRight.
axisAligned())
2056 Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
2057 for (
int r=0; r<numPointTensorComponents; r++)
2064 Kokkos::View<Scalar**, DeviceType> integralView2;
2065 Kokkos::View<Scalar***, DeviceType> integralView3;
2066 if (integralViewRank == 3)
2074 for (
int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2079 const int leftVectorComponentCount = isVectorValued ? basisValuesLeft.
vectorData().numComponents() : 1;
2080 for (
int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
2083 : basisValuesLeft.basisValues().
tensorData(leftFamilyOrdinal);
2089 const int leftDimSpan = leftComponent.
extent_int(2);
2091 const int leftComponentFieldCount = leftComponent.
extent_int(0);
2093 for (
int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2096 int rightFieldOffset = basisValuesRight.
vectorData().familyFieldOrdinalOffset(rightFamilyOrdinal);
2098 const int rightVectorComponentCount = isVectorValued ? basisValuesRight.
vectorData().numComponents() : 1;
2099 for (
int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2102 : basisValuesRight.basisValues().
tensorData(rightFamilyOrdinal);
2103 if (!rightComponent.
isValid())
2108 const int rightDimSpan = rightComponent.
extent_int(2);
2110 const int rightComponentFieldCount = rightComponent.
extent_int(0);
2113 if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset))
2115 b_offset += rightDimSpan;
2121 INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error,
"left and right dimension offsets should match.");
2122 INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument,
"left and right components must span the same number of dimensions.");
2124 const int d_start = a_offset;
2125 const int d_end = d_start + leftDimSpan;
2127 using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>,
Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
2128 ComponentIntegralsArray componentIntegrals;
2129 for (
int r=0; r<numPointTensorComponents; r++)
2146 const int numPoints = pointDimensions[r];
2153 const int leftTensorComponentDimSpan = leftTensorComponent.
extent_int(2);
2154 const int leftTensorComponentFields = leftTensorComponent.
extent_int(0);
2155 const int rightTensorComponentDimSpan = rightTensorComponent.
extent_int(2);
2156 const int rightTensorComponentFields = rightTensorComponent.
extent_int(0);
2158 INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument,
"left and right components must span the same number of dimensions.");
2160 for (
int d=d_start; d<d_end; d++)
2162 ScalarView<Scalar,DeviceType> componentIntegralView;
2164 const bool allocateFadStorage = !std::is_pod<Scalar>::value;
2165 if (allocateFadStorage)
2168 componentIntegralView = ScalarView<Scalar,DeviceType>(
"componentIntegrals for tensor component " + std::to_string(r) +
", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output);
2172 componentIntegralView = ScalarView<Scalar,DeviceType>(
"componentIntegrals for tensor component " + std::to_string(r) +
", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields);
2175 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftTensorComponentFields,rightTensorComponentFields});
2178 leftTensorComponentDimSpan);
2179 Kokkos::parallel_for(
"compute componentIntegrals", policy, refSpaceIntegralFunctor);
2181 componentIntegrals[r][d] = componentIntegralView;
2183 if (approximateFlops != NULL)
2185 *approximateFlops += leftTensorComponentFields*rightTensorComponentFields*numPoints*(3);
2190 ExecutionSpace().fence();
2192 Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount};
2193 Kokkos::Array<int,3> lowerBounds {0,0,0};
2194 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2196 Kokkos::parallel_for(
"compute field integrals", policy,
2197 KOKKOS_LAMBDA (
const int &cellDataOrdinal,
const int &leftFieldOrdinal,
const int &rightFieldOrdinal) {
2198 const Scalar & cellMeasureWeight = cellMeasures.
getTensorComponent(0)(cellDataOrdinal);
2206 Scalar integralSum = 0.0;
2207 for (
int d=d_start; d<d_end; d++)
2209 const Scalar & transformLeft_d = basisValuesLeft.
transformWeight(cellDataOrdinal,0,d,d);
2210 const Scalar & transformRight_d = basisValuesRight.
transformWeight(cellDataOrdinal,0,d,d);
2212 const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
2215 Scalar integral_d = 1.0;
2217 for (
int r=0; r<numPointTensorComponents; r++)
2219 integral_d *= componentIntegrals[r][d](leftTensorIterator.
argument(r),rightTensorIterator.
argument(r));
2222 integralSum += leftRightTransform_d * integral_d;
2225 const int i = leftFieldOrdinal + leftFieldOffset;
2226 const int j = rightFieldOrdinal + rightFieldOffset;
2228 if (integralViewRank == 3)
2230 integralView3(cellDataOrdinal,i,j) += cellMeasureWeight * integralSum;
2234 integralView2(i,j) += cellMeasureWeight * integralSum;
2238 b_offset += rightDimSpan;
2241 a_offset += leftDimSpan;
2245 if (approximateFlops != NULL)
2248 *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2256 const bool transposeLeft =
true;
2257 const bool transposeRight =
false;
2261 const bool matrixTransform = (leftTransform.
rank() == 4) || (rightTransform.
rank() == 4);
2266 if (matrixTransform)
2268 composedTransform = leftTransform.
allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
2269 composedTransform.
storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2272 if (approximateFlops != NULL)
2283 const int newRank = 4;
2284 auto extents = composedTransform.
getExtents();
2286 composedTransform = composedTransform.
shallowCopy(newRank, extents, variationTypes);
2287 if (approximateFlops != NULL)
2293 else if (leftTransform.
isValid())
2296 composedTransform = leftTransform;
2298 else if (rightTransform.
isValid())
2301 composedTransform = rightTransform;
2306 Kokkos::Array<ordinal_type,4> extents {basisValuesLeft.
numCells(),basisValuesLeft.
numPoints(),spaceDim,spaceDim};
2307 Kokkos::Array<DataVariationType,4> variationTypes {CONSTANT,CONSTANT,BLOCK_PLUS_DIAGONAL,BLOCK_PLUS_DIAGONAL};
2309 Kokkos::View<Scalar*,DeviceType> identityUnderlyingView(
"Intrepid2::FST::integrate() - identity view",spaceDim);
2310 Kokkos::deep_copy(identityUnderlyingView, 1.0);
2318 const int leftFamilyCount = basisValuesLeft. basisValues().numFamilies();
2319 const int rightFamilyCount = basisValuesRight.basisValues().
numFamilies();
2320 const int leftComponentCount = isVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
2321 const int rightComponentCount = isVectorValued ? basisValuesRight.
vectorData().numComponents() : 1;
2323 int leftFieldOrdinalOffset = 0;
2324 for (
int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2329 bool haveLaunchedContributionToCurrentFamilyLeft =
false;
2330 for (
int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2333 : basisValuesLeft.basisValues().
tensorData(leftFamilyOrdinal);
2337 a_offset += basisValuesLeft.
vectorData().numDimsForComponent(leftComponentOrdinal);
2341 int rightFieldOrdinalOffset = 0;
2342 for (
int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2346 bool haveLaunchedContributionToCurrentFamilyRight =
false;
2348 for (
int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2351 : basisValuesRight.basisValues().
tensorData(rightFamilyOrdinal);
2352 if (!rightComponent.
isValid())
2355 b_offset += basisValuesRight.
vectorData().numDimsForComponent(rightComponentOrdinal);
2361 const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2362 Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
2371 bool forceNonSpecialized =
false;
2372 bool usePointCacheForRank3Tensor =
true;
2376 if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2378 ExecutionSpace().fence();
2380 haveLaunchedContributionToCurrentFamilyLeft =
true;
2381 haveLaunchedContributionToCurrentFamilyRight =
true;
2383 if (integralViewRank == 2)
2387 auto functor =
Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2389 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2390 const int teamSize = functor.
teamSize(recommendedTeamSize);
2392 policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2394 Kokkos::parallel_for(
"F_IntegratePointValueCache rank 2", policy, functor);
2396 if (approximateFlops != NULL)
2398 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2403 auto functor =
Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2405 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2406 const int teamSize = functor.
teamSize(recommendedTeamSize);
2408 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2410 Kokkos::parallel_for(
"F_Integrate rank 2", policy, functor);
2412 if (approximateFlops != NULL)
2414 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2418 else if (integralViewRank == 3)
2422 auto functor =
Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2424 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2425 const int teamSize = functor.
teamSize(recommendedTeamSize);
2427 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2429 Kokkos::parallel_for(
"F_IntegratePointValueCache rank 3", policy, functor);
2431 if (approximateFlops != NULL)
2433 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2438 auto functor =
Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2440 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2441 const int teamSize = functor.
teamSize(recommendedTeamSize);
2443 policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2445 Kokkos::parallel_for(
"F_Integrate rank 3", policy, functor);
2447 if (approximateFlops != NULL)
2449 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2453 b_offset += isVectorValued ? basisValuesRight.
vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
2455 rightFieldOrdinalOffset += isVectorValued ? basisValuesRight.
vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
2457 a_offset += isVectorValued ? basisValuesLeft.
vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
2459 leftFieldOrdinalOffset += isVectorValued ? basisValuesLeft.
vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
2466 ExecutionSpace().fence();
void clear() const
Copies 0.0 to the underlying View.
KOKKOS_INLINE_FUNCTION ordinal_type getUnderlyingViewRank() const
returns the rank of the View that stores the unique data
KOKKOS_INLINE_FUNCTION const ordinal_type & argument(const ordinal_type &r) const
KOKKOS_INLINE_FUNCTION int incrementArgument(Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of incrementArgument; gets used by approximate flop count.
static Data< DataScalar, DeviceType > allocateInPlaceCombinationResult(const Data< DataScalar, DeviceType > &A, const Data< DataScalar, DeviceType > &B)
KOKKOS_INLINE_FUNCTION int numFamilies() const
For valid vectorData, returns the number of families in vectorData; otherwise, returns number of Tens...
KOKKOS_INLINE_FUNCTION int nextIncrementResult(const Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of nextIncrementResult; gets used by approximate flop count.
#define INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(test, x, msg)
int teamSize(const int &maxTeamSizeFromKokkos) const
returns the team size that should be provided to the policy constructor, based on the Kokkos maximum ...
Implementation of a general sum factorization algorithm, using a novel approach developed by Roberts...
long approximateFlopCountPerCell() const
returns an estimate of the number of floating point operations per cell (counting sums...
size_t team_shmem_size(int team_size) const
Provide the shared memory capacity.
int teamSize(const int &maxTeamSizeFromKokkos) const
returns the team size that should be provided to the policy constructor, based on the Kokkos maximum ...
KOKKOS_INLINE_FUNCTION Kokkos::Array< int, 7 > getExtents() const
Returns an array containing the logical extents in each dimension.
KOKKOS_INLINE_FUNCTION const Kokkos::Array< DataVariationType, 7 > & getVariationTypes() const
Returns an array with the variation types in each logical dimension.
KOKKOS_INLINE_FUNCTION enable_if_t< rank==1, const Kokkos::View< typename RankExpander< DataScalar, rank >::value_type, DeviceType > & > getUnderlyingView() const
Returns the underlying view. Throws an exception if the underlying view is not rank 1...
KOKKOS_INLINE_FUNCTION bool underlyingMatchesLogical() const
Returns true if the underlying container has exactly the same rank and extents as the logical contain...
long approximateFlopCountPerCell() const
returns an estimate of the number of floating point operations per cell (counting sums...
KOKKOS_INLINE_FUNCTION const Data< Scalar, DeviceType > & getTensorComponent(const ordinal_type &r) const
Returns the requested tensor component.
KOKKOS_INLINE_FUNCTION constexpr bool isValid() const
Returns true for containers that have data; false for those that don't (e.g., those that have been co...
Defines TensorArgumentIterator, which allows systematic enumeration of a TensorData object...
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar ****, DeviceType > & getUnderlyingView4() const
returns the View that stores the unique data. For rank-4 underlying containers.
Struct expressing all variation information about a Data object in a single dimension, including its logical extent and storage extent.
Data shallowCopy(const int rank, const Kokkos::Array< int, 7 > &extents, const Kokkos::Array< DataVariationType, 7 > &variationTypes) const
Creates a new Data object with the same underlying view, but with the specified logical rank...
KOKKOS_INLINE_FUNCTION DimensionInfo getDimensionInfo(const int &dim) const
Returns an object fully specifying the indicated dimension. This is used in determining appropriate s...
KOKKOS_INLINE_FUNCTION int incrementArgument(Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of incrementArgument; gets used by approximate flop count.
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar ***, DeviceType > & getUnderlyingView3() const
returns the View that stores the unique data. For rank-3 underlying containers.
KOKKOS_INLINE_FUNCTION constexpr bool isValid() const
returns true for containers that have data; false for those that don't (namely, those that have been ...
KOKKOS_INLINE_FUNCTION void setEnumerationIndex(const ordinal_type &enumerationIndex)
Sets the enumeration index; this refers to a 1D enumeration of the possible in-bound arguments...
KOKKOS_INLINE_FUNCTION void runSpecialized3(const TeamMember &teamMember) const
runSpecialized implementations are hand-coded variants of run() for a particular number of components...
void storeMatMat(const bool transposeA, const Data< DataScalar, DeviceType > &A_MatData, const bool transposeB, const Data< DataScalar, DeviceType > &B_MatData)
KOKKOS_INLINE_FUNCTION ordinal_type getUnderlyingViewSize() const
returns the number of entries in the View that stores the unique data
KOKKOS_INLINE_FUNCTION void runSpecialized3(const TeamMember &teamMember) const
Hand-coded 3-component version.
Defines the Intrepid2::FunctorIterator class, as well as the Intrepid2::functor_returns_ref SFINAE he...
void storeInPlaceProduct(const Data< DataScalar, DeviceType > &A, const Data< DataScalar, DeviceType > &B)
stores the in-place (entrywise) product, A .* B, into this container.
TensorDataType & tensorData()
TensorData accessor for single-family scalar data.
KOKKOS_INLINE_FUNCTION unsigned rank() const
Returns the logical rank of the Data container.
KOKKOS_INLINE_FUNCTION int extent_int(const int &r) const
Returns the logical extent in the specified dimension.
KOKKOS_INLINE_FUNCTION int getDataExtent(const ordinal_type &d) const
returns the true extent of the data corresponding to the logical dimension provided; if the data does...
KOKKOS_INLINE_FUNCTION int familyFieldOrdinalOffset(const int &familyOrdinal) const
Returns the field ordinal offset for the specified family.
View-like interface to tensor data; tensor components are stored separately and multiplied together a...
KOKKOS_INLINE_FUNCTION std::enable_if< std::is_integral< iType >::value, ordinal_type >::type extent_int(const iType &d) const
Returns the logical extent in the requested dimension.
Allows systematic enumeration of all entries in a TensorData object, tracking indices for each tensor...
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar **, DeviceType > & getUnderlyingView2() const
returns the View that stores the unique data. For rank-2 underlying containers.
KOKKOS_INLINE_FUNCTION int nextIncrementResult(const Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of nextIncrementResult; gets used by approximate flop count.
size_t team_shmem_size(int numThreads) const
Provide the shared memory capacity.
Implementation of a general sum factorization algorithm, abstracted from the algorithm described by M...
KOKKOS_INLINE_FUNCTION ordinal_type numTensorComponents() const
Return the number of tensorial components.
static Data< DataScalar, DeviceType > allocateMatMatResult(const bool transposeA, const Data< DataScalar, DeviceType > &A_MatData, const bool transposeB, const Data< DataScalar, DeviceType > &B_MatData)
static constexpr ordinal_type MaxTensorComponents
Maximum number of tensor/Cartesian products that can be taken: this allows hypercube basis in 7D to b...