15 #ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
16 #define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
22 #include "Teuchos_TimeMonitor.hpp"
31 template<
class Scalar,
class DeviceType,
int integralViewRank>
34 using ExecutionSpace =
typename DeviceType::execution_space;
35 using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
36 using TeamMember =
typename TeamPolicy::member_type;
38 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
39 IntegralViewType integralView_;
46 int leftComponentSpan_;
47 int rightComponentSpan_;
48 int numTensorComponents_;
49 int leftFieldOrdinalOffset_;
50 int rightFieldOrdinalOffset_;
51 bool forceNonSpecialized_;
53 size_t fad_size_output_ = 0;
55 Kokkos::Array<int, 7> offsetsForComponentOrdinal_;
59 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
60 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
61 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
63 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_;
64 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;
77 int leftFieldOrdinalOffset,
78 int rightFieldOrdinalOffset,
79 bool forceNonSpecialized)
81 integralView_(integralData.template getUnderlyingView<integralViewRank>()),
82 leftComponent_(leftComponent),
83 composedTransform_(composedTransform),
84 rightComponent_(rightComponent),
85 cellMeasures_(cellMeasures),
88 leftComponentSpan_(leftComponent.
extent_int(2)),
89 rightComponentSpan_(rightComponent.
extent_int(2)),
91 leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
92 rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
93 forceNonSpecialized_(forceNonSpecialized)
95 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument,
"Left and right components must have matching number of tensorial components");
98 const int FIELD_DIM = 0;
99 const int POINT_DIM = 1;
103 for (
int r=0; r<numTensorComponents_; r++)
105 leftFieldBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
106 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
107 rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
108 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
109 pointBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
110 maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
114 int leftRelativeEnumerationSpan = 1;
115 int rightRelativeEnumerationSpan = 1;
116 for (
int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
118 leftRelativeEnumerationSpan *= leftFieldBounds_[startingComponent];
119 rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
120 leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
121 rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
127 const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
128 if (allocateFadStorage)
130 fad_size_output_ = dimension_scalar(integralView_);
133 const int R = numTensorComponents_ - 1;
136 int allocationSoFar = 0;
137 offsetsForComponentOrdinal_[R] = allocationSoFar;
140 for (
int r=R-1; r>0; r--)
145 num_ij *= leftFields * rightFields;
147 offsetsForComponentOrdinal_[r] = allocationSoFar;
148 allocationSoFar += num_ij;
150 offsetsForComponentOrdinal_[0] = allocationSoFar;
153 template<
size_t maxComponents,
size_t numComponents = maxComponents>
154 KOKKOS_INLINE_FUNCTION
155 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
156 const Kokkos::Array<int,maxComponents> &bounds)
const
158 if (numComponents == 0)
164 int r =
static_cast<int>(numComponents - 1);
165 while (arguments[r] + 1 >= bounds[r])
171 if (r >= 0) ++arguments[r];
177 KOKKOS_INLINE_FUNCTION
179 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
180 const int &numComponents)
const
182 if (numComponents == 0)
return -1;
183 int r =
static_cast<int>(numComponents - 1);
184 while (arguments[r] + 1 >= bounds[r])
190 if (r >= 0) ++arguments[r];
194 template<
size_t maxComponents,
size_t numComponents = maxComponents>
195 KOKKOS_INLINE_FUNCTION
196 int nextIncrementResult(
const Kokkos::Array<int,maxComponents> &arguments,
197 const Kokkos::Array<int,maxComponents> &bounds)
const
199 if (numComponents == 0)
205 int r =
static_cast<int>(numComponents - 1);
206 while (arguments[r] + 1 >= bounds[r])
216 KOKKOS_INLINE_FUNCTION
218 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
219 const int &numComponents)
const
221 if (numComponents == 0)
return -1;
222 int r = numComponents - 1;
223 while (arguments[r] + 1 >= bounds[r])
231 template<
size_t maxComponents,
size_t numComponents = maxComponents>
232 KOKKOS_INLINE_FUNCTION
233 int relativeEnumerationIndex(
const Kokkos::Array<int,maxComponents> &arguments,
234 const Kokkos::Array<int,maxComponents> &bounds,
235 const int startIndex)
const
238 if (numComponents == 0)
244 int enumerationIndex = 0;
245 for (
size_t r=numComponents-1; r>
static_cast<size_t>(startIndex); r--)
247 enumerationIndex += arguments[r];
248 enumerationIndex *= bounds[r-1];
250 enumerationIndex += arguments[startIndex];
251 return enumerationIndex;
265 KOKKOS_INLINE_FUNCTION
268 constexpr
int numTensorComponents = 3;
270 Kokkos::Array<int,numTensorComponents> pointBounds;
271 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
272 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
273 for (
unsigned r=0; r<numTensorComponents; r++)
275 pointBounds[r] = pointBounds_[r];
276 leftFieldBounds[r] = leftFieldBounds_[r];
277 rightFieldBounds[r] = rightFieldBounds_[r];
280 const int cellDataOrdinal = teamMember.league_rank();
281 const int numThreads = teamMember.team_size();
282 const int scratchViewSize = offsetsForComponentOrdinal_[0];
284 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView;
285 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
286 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z;
287 if (fad_size_output_ > 0) {
288 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
289 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
290 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0], fad_size_output_);
291 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
292 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1], fad_size_output_);
293 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
294 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2], fad_size_output_);
295 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
298 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
299 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
300 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0]);
301 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
302 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1]);
303 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
304 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2]);
305 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
311 constexpr
int R = numTensorComponents - 1;
313 const int composedTransformRank = composedTransform_.
rank();
315 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
317 const int a = a_offset_ + a_component;
318 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
320 const int b = b_offset_ + b_component;
325 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
326 const int numRightFieldsFinal = rightFinalComponent.
extent_int(0);
335 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
336 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (
const int& fieldOrdinalPointOrdinal) {
337 const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
338 const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
339 if ((fieldOrdinal < leftTensorComponent_x.
extent_int(0)) && (pointOrdinal < leftTensorComponent_x.
extent_int(1)))
341 const int leftRank = leftTensorComponent_x.
rank();
342 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
344 if ((fieldOrdinal < leftTensorComponent_y.
extent_int(0)) && (pointOrdinal < leftTensorComponent_y.
extent_int(1)))
346 const int leftRank = leftTensorComponent_y.
rank();
347 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
349 if ((fieldOrdinal < leftTensorComponent_z.
extent_int(0)) && (pointOrdinal < leftTensorComponent_z.
extent_int(1)))
351 const int leftRank = leftTensorComponent_z.
rank();
352 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
354 if ((fieldOrdinal < rightTensorComponent_x.
extent_int(0)) && (pointOrdinal < rightTensorComponent_x.
extent_int(1)))
356 const int rightRank = rightTensorComponent_x.
rank();
357 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
359 if ((fieldOrdinal < rightTensorComponent_y.
extent_int(0)) && (pointOrdinal < rightTensorComponent_y.
extent_int(1)))
361 const int rightRank = rightTensorComponent_y.
rank();
362 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
364 if ((fieldOrdinal < rightTensorComponent_z.
extent_int(0)) && (pointOrdinal < rightTensorComponent_z.
extent_int(1)))
366 const int rightRank = rightTensorComponent_z.
rank();
367 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
373 if (composedTransformRank == 4)
376 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (
const int& pointOrdinal) {
377 pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
383 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (
const int& pointOrdinal) {
384 pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
390 if (composedTransformRank == 4)
392 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
393 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
398 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
399 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
407 teamMember.team_barrier();
410 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
411 for (
int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
413 scratchView(i) = 0.0;
417 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (
const int& leftRightFieldEnumeration) {
418 const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
419 const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;
423 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
424 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
425 rightFieldArguments[R] = jz;
426 leftFieldArguments[R] = iz;
428 Kokkos::Array<int,numTensorComponents> pointArguments;
429 for (
int i=0; i<numTensorComponents; i++)
431 pointArguments[i] = 0;
434 for (
int lx=0; lx<pointBounds[0]; lx++)
436 pointArguments[0] = lx;
440 const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
441 const int Gy_end_index = scratchOffsetForThread + offsetsForComponentOrdinal_[0];
443 for (
int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
445 scratchView(Gy_index) = 0;
448 for (
int ly=0; ly<pointBounds[1]; ly++)
450 pointArguments[1] = ly;
452 Scalar * Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
455 for (
int lz=0; lz < pointBounds[2]; lz++)
457 const Scalar & leftValue = leftFields_z(iz,lz);
458 const Scalar & rightValue = rightFields_z(jz,lz);
460 pointArguments[2] = lz;
461 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
463 *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
468 for (
int iy=0; iy<leftFieldBounds_[1]; iy++)
470 leftFieldArguments[1] = iy;
471 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
473 const Scalar & leftValue = leftFields_y(iy,ly);
475 for (
int jy=0; jy<rightFieldBounds_[1]; jy++)
477 rightFieldArguments[1] = jy;
479 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
480 const Scalar & rightValue = rightFields_y(jy,ly);
482 const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
483 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
485 const int Gz_index = 0;
486 const Scalar & Gz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);
488 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
490 Gy += leftValue * Gz * rightValue;
495 for (
int ix=0; ix<leftFieldBounds_[0]; ix++)
497 leftFieldArguments[0] = ix;
498 const Scalar & leftValue = leftFields_x(ix,lx);
500 for (
int iy=0; iy<leftFieldBounds_[1]; iy++)
502 leftFieldArguments[1] = iy;
504 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
506 for (
int jx=0; jx<rightFieldBounds_[0]; jx++)
508 rightFieldArguments[0] = jx;
509 const Scalar & rightValue = rightFields_x(jx,lx);
511 for (
int jy=0; jy<rightFieldBounds_[1]; jy++)
513 rightFieldArguments[1] = jy;
514 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
516 const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
518 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
519 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
522 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
523 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
524 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
525 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
527 if (integralViewRank == 3)
549 integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
554 integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
568 template<
size_t numTensorComponents>
569 KOKKOS_INLINE_FUNCTION
570 void run(
const TeamMember & teamMember )
const
572 Kokkos::Array<int,numTensorComponents> pointBounds;
573 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
574 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
575 for (
unsigned r=0; r<numTensorComponents; r++)
577 pointBounds[r] = pointBounds_[r];
578 leftFieldBounds[r] = leftFieldBounds_[r];
579 rightFieldBounds[r] = rightFieldBounds_[r];
582 const int cellDataOrdinal = teamMember.league_rank();
583 const int numThreads = teamMember.team_size();
584 const int scratchViewSize = offsetsForComponentOrdinal_[0];
585 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView;
586 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
587 Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields;
588 if (fad_size_output_ > 0) {
589 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
590 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
591 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_, fad_size_output_);
592 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
595 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
596 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
597 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_);
598 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
604 constexpr
int R = numTensorComponents - 1;
606 const int composedTransformRank = composedTransform_.
rank();
608 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
610 const int a = a_offset_ + a_component;
611 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
613 const int b = b_offset_ + b_component;
615 const Data<Scalar,DeviceType> & leftFinalComponent = leftComponent_.getTensorComponent(R);
616 const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);
618 const int numLeftFieldsFinal = leftFinalComponent.extent_int(0);
619 const int numRightFieldsFinal = rightFinalComponent.extent_int(0);
621 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (
const int& r) {
622 const Data<Scalar,DeviceType> & leftTensorComponent = leftComponent_.getTensorComponent(r);
623 const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.getTensorComponent(r);
624 const int leftFieldCount = leftTensorComponent.extent_int(0);
625 const int pointCount = leftTensorComponent.extent_int(1);
626 const int leftRank = leftTensorComponent.rank();
627 const int rightFieldCount = rightTensorComponent.extent_int(0);
628 const int rightRank = rightTensorComponent.rank();
629 for (
int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
632 const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
633 for (
int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
635 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
636 leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
639 for (
int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
642 const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
643 for (
int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
645 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
646 rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
651 if (composedTransformRank == 4)
653 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
654 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
659 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
660 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
666 teamMember.team_barrier();
668 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (
const int& leftRightFieldEnumeration) {
669 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
670 const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
671 const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;
675 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
676 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
677 rightFieldArguments[R] = j_R;
678 leftFieldArguments[R] = i_R;
681 for (
int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
683 scratchView(i) = 0.0;
685 Kokkos::Array<int,numTensorComponents> pointArguments;
686 for (
unsigned i=0; i<numTensorComponents; i++)
688 pointArguments[i] = 0;
695 const int pointBounds_R = pointBounds[R];
696 int & pointArgument_R = pointArguments[R];
697 for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
702 G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
706 const int leftFieldIndex = i_R + leftFieldOrdinalOffset_;
707 const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;
709 if (integralViewRank == 3)
712 G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
717 G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
721 const int & pointIndex_R = pointArguments[R];
723 const Scalar & leftValue = leftFields(R,i_R,pointIndex_R);
724 const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);
726 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
728 *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
733 const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds);
734 const int r_min = (r_next >= 0) ? r_next : 0;
736 for (
int s=R-1; s>=r_min; s--)
738 const int & pointIndex_s = pointArguments[s];
741 for (
int s1=s; s1<R; s1++)
743 leftFieldArguments[s1] = 0;
747 const int & i_s = leftFieldArguments[s];
748 const int & j_s = rightFieldArguments[s];
751 const int & rightEnumerationSpan_s = rightFieldRelativeEnumerationSpans_[s];
752 const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];
756 const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);
759 const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);
761 const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);
763 for (
int s1=s; s1<R; s1++)
765 rightFieldArguments[s1] = 0;
770 const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);
773 const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);
775 const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);
777 const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;
783 const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;
785 const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);
790 G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
795 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
796 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
797 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
798 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
800 if (integralViewRank == 3)
803 G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
808 G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
812 *G_s += leftValue * G_sp * rightValue;
817 sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
821 sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
828 const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
829 for (
int i=scratchOffsetForThread; i<endIndex; i++)
831 scratchView(i) = 0.0;
838 r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds);
846 KOKKOS_INLINE_FUNCTION
847 void operator()(
const TeamMember & teamMember )
const
849 switch (numTensorComponents_)
851 case 1: run<1>(teamMember);
break;
852 case 2: run<2>(teamMember);
break;
854 if (forceNonSpecialized_)
859 case 4: run<4>(teamMember);
break;
860 case 5: run<5>(teamMember);
break;
861 case 6: run<6>(teamMember);
break;
862 case 7: run<7>(teamMember);
break;
863 #ifdef INTREPID2_HAVE_DEBUG
876 const int R = numTensorComponents_ - 1;
880 const int flopsPerPoint_ab = cellMeasures_.numTensorComponents();
882 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
884 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
889 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
890 const int numRightFieldsFinal = rightFinalComponent.
extent_int(0);
892 flopCount += flopsPerPoint_ab * cellMeasures_.extent_int(1);
894 int flopsPer_i_R_j_R = 0;
898 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
899 leftFieldArguments[R] = 0;
901 Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
902 for (
int i=0; i<numTensorComponents_; i++)
904 pointArguments[i] = 0;
909 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
910 rightFieldArguments[R] = 0;
916 for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
918 flopsPer_i_R_j_R += 4;
926 const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
927 const int r_min = (r_next >= 0) ? r_next : 0;
929 for (
int s=R-1; s>=r_min; s--)
932 int numLeftIterations = leftFieldRelativeEnumerationSpans_[s];
933 int numRightIterations = rightFieldRelativeEnumerationSpans_[s];
935 flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
939 r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
942 flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
950 int teamSize(
const int &maxTeamSizeFromKokkos)
const
952 const int R = numTensorComponents_ - 1;
953 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
954 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
961 size_t shmem_size = 0;
963 if (fad_size_output_ > 0)
965 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
966 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.
extent_int(1), fad_size_output_);
967 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_, fad_size_output_);
968 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
972 shmem_size += Kokkos::View<Scalar*, DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
973 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.
extent_int(1));
974 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_);
975 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
991 template<
class Scalar,
class DeviceType,
int integralViewRank>
994 using ExecutionSpace =
typename DeviceType::execution_space;
995 using TeamPolicy = Kokkos::TeamPolicy<DeviceType>;
996 using TeamMember =
typename TeamPolicy::member_type;
998 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
999 IntegralViewType integralView_;
1006 int leftComponentSpan_;
1007 int rightComponentSpan_;
1008 int numTensorComponents_;
1009 int leftFieldOrdinalOffset_;
1010 int rightFieldOrdinalOffset_;
1012 size_t fad_size_output_ = 0;
1016 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
1017 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
1018 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
1021 int maxFieldsRight_;
1031 int leftFieldOrdinalOffset,
1032 int rightFieldOrdinalOffset)
1034 integralView_(integralData.template getUnderlyingView<integralViewRank>()),
1035 leftComponent_(leftComponent),
1036 composedTransform_(composedTransform),
1037 rightComponent_(rightComponent),
1038 cellMeasures_(cellMeasures),
1039 a_offset_(a_offset),
1040 b_offset_(b_offset),
1041 leftComponentSpan_(leftComponent.
extent_int(2)),
1042 rightComponentSpan_(rightComponent.
extent_int(2)),
1044 leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
1045 rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
1047 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument,
"Left and right components must have matching number of tensorial components");
1049 const int FIELD_DIM = 0;
1050 const int POINT_DIM = 1;
1052 maxFieldsRight_ = 0;
1054 for (
int r=0; r<numTensorComponents_; r++)
1056 leftFieldBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
1057 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
1058 rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
1059 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
1060 pointBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
1061 maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
1067 const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
1068 if (allocateFadStorage)
1070 fad_size_output_ = dimension_scalar(integralView_);
1074 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1075 KOKKOS_INLINE_FUNCTION
1076 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
1077 const Kokkos::Array<int,maxComponents> &bounds)
const
1079 if (numComponents == 0)
return -1;
1080 int r =
static_cast<int>(numComponents - 1);
1081 while (arguments[r] + 1 >= bounds[r])
1087 if (r >= 0) ++arguments[r];
1092 KOKKOS_INLINE_FUNCTION
1094 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1095 const int &numComponents)
const
1097 if (numComponents == 0)
return -1;
1098 int r =
static_cast<int>(numComponents - 1);
1099 while (arguments[r] + 1 >= bounds[r])
1105 if (r >= 0) ++arguments[r];
1109 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1110 KOKKOS_INLINE_FUNCTION
1111 int nextIncrementResult(
const Kokkos::Array<int,maxComponents> &arguments,
1112 const Kokkos::Array<int,maxComponents> &bounds)
const
1114 if (numComponents == 0)
return -1;
1115 int r =
static_cast<int>(numComponents - 1);
1116 while (arguments[r] + 1 >= bounds[r])
1125 KOKKOS_INLINE_FUNCTION
1127 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1128 const int &numComponents)
const
1130 if (numComponents == 0)
return -1;
1131 int r = numComponents - 1;
1132 while (arguments[r] + 1 >= bounds[r])
1140 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1141 KOKKOS_INLINE_FUNCTION
1142 int relativeEnumerationIndex(
const Kokkos::Array<int,maxComponents> &arguments,
1143 const Kokkos::Array<int,maxComponents> &bounds,
1144 const int startIndex)
const
1147 if (numComponents == 0)
1151 int enumerationIndex = 0;
1152 for (
size_t r=numComponents-1; r>
static_cast<size_t>(startIndex); r--)
1154 enumerationIndex += arguments[r];
1155 enumerationIndex *= bounds[r-1];
1157 enumerationIndex += arguments[startIndex];
1158 return enumerationIndex;
1162 KOKKOS_INLINE_FUNCTION
1163 enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
1164 integralViewEntry(
const IntegralViewType& integralView,
const int &cellDataOrdinal,
const int &i,
const int &j)
const
1166 return integralView(cellDataOrdinal,i,j);
1170 KOKKOS_INLINE_FUNCTION
1171 enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
1172 integralViewEntry(
const IntegralViewType& integralView,
const int &cellDataOrdinal,
const int &i,
const int &j)
const
1174 return integralView(i,j);
1178 KOKKOS_INLINE_FUNCTION
1181 constexpr
int numTensorComponents = 3;
1183 const int pointBounds_x = pointBounds_[0];
1184 const int pointBounds_y = pointBounds_[1];
1185 const int pointBounds_z = pointBounds_[2];
1186 const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;
1188 const int leftFieldBounds_x = leftFieldBounds_[0];
1189 const int rightFieldBounds_x = rightFieldBounds_[0];
1190 const int leftFieldBounds_y = leftFieldBounds_[1];
1191 const int rightFieldBounds_y = rightFieldBounds_[1];
1192 const int leftFieldBounds_z = leftFieldBounds_[2];
1193 const int rightFieldBounds_z = rightFieldBounds_[2];
1195 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1196 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1197 for (
unsigned r=0; r<numTensorComponents; r++)
1199 leftFieldBounds[r] = leftFieldBounds_[r];
1200 rightFieldBounds[r] = rightFieldBounds_[r];
1203 const auto integralView = integralView_;
1204 const auto leftFieldOrdinalOffset = leftFieldOrdinalOffset_;
1205 const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;
1207 const int cellDataOrdinal = teamMember.league_rank();
1208 const int threadNumber = teamMember.team_rank();
1210 const int numThreads = teamMember.team_size();
1211 const int GyEntryCount = pointBounds_z;
1212 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals;
1213 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals;
1214 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
1216 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
1217 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
1218 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
1219 if (fad_size_output_ > 0) {
1220 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1221 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
1222 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
1224 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x, fad_size_output_);
1225 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
1226 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y, fad_size_output_);
1227 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
1228 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z, fad_size_output_);
1229 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
1232 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1233 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
1234 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
1236 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x);
1237 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
1238 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y);
1239 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
1240 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z);
1241 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
1249 const int composedTransformRank = composedTransform_.
rank();
1252 teamMember.team_barrier();
1254 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
1256 const int a = a_offset_ + a_component;
1257 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
1259 const int b = b_offset_ + b_component;
1268 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
1269 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (
const int& fieldOrdinal) {
1270 if (fieldOrdinal < leftTensorComponent_x.
extent_int(0))
1272 const int pointCount = leftTensorComponent_x.
extent_int(1);
1273 const int leftRank = leftTensorComponent_x.
rank();
1274 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1276 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1279 if (fieldOrdinal < leftTensorComponent_y.
extent_int(0))
1281 const int pointCount = leftTensorComponent_y.
extent_int(1);
1282 const int leftRank = leftTensorComponent_y.
rank();
1283 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1285 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1288 if (fieldOrdinal < leftTensorComponent_z.
extent_int(0))
1290 const int pointCount = leftTensorComponent_z.
extent_int(1);
1291 const int leftRank = leftTensorComponent_z.
rank();
1292 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1294 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1297 if (fieldOrdinal < rightTensorComponent_x.
extent_int(0))
1299 const int pointCount = rightTensorComponent_x.
extent_int(1);
1300 const int rightRank = rightTensorComponent_x.
rank();
1301 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1303 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1306 if (fieldOrdinal < rightTensorComponent_y.
extent_int(0))
1308 const int pointCount = rightTensorComponent_y.
extent_int(1);
1309 const int rightRank = rightTensorComponent_y.
rank();
1310 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1312 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1315 if (fieldOrdinal < rightTensorComponent_z.
extent_int(0))
1317 const int pointCount = rightTensorComponent_z.
extent_int(1);
1318 const int rightRank = rightTensorComponent_z.
rank();
1319 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1321 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1326 if (composedTransformRank == 4)
1328 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
1329 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1334 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
1335 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1340 teamMember.team_barrier();
1342 for (
int i0=0; i0<leftFieldBounds_x; i0++)
1344 for (
int j0=0; j0<rightFieldBounds_x; j0++)
1346 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (
const int& pointEnumerationIndexLaterDimensions) {
1348 const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions;
1350 Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1353 if (fad_size_output_ == 0)
1356 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (
const int &x_pointOrdinal, Scalar &integralThusFar)
1358 integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1363 for (
int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1365 Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1371 teamMember.team_barrier();
1373 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (
const int& i1j1) {
1374 const int i1 = i1j1 % leftFieldBounds_y;
1375 const int j1 = i1j1 / leftFieldBounds_y;
1377 int Gy_index_offset = GyEntryCount * threadNumber;
1379 for (
int lz=0; lz<pointBounds_z; lz++)
1381 int pointEnumerationIndex = lz * pointBounds_y;
1382 if (fad_size_output_ == 0)
1384 Scalar Gy_local = 0;
1387 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_y), [&] (
const int &ly, Scalar &integralThusFar)
1389 const Scalar & leftValue = leftFields_y(i1,ly);
1390 const Scalar & rightValue = rightFields_y(j1,ly);
1392 integralThusFar += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
1395 GyIntegrals(Gy_index_offset + lz) = Gy_local;
1399 Scalar & Gy = GyIntegrals(Gy_index_offset + lz);
1400 for (
int ly=0; ly<pointBounds_y; ly++)
1402 const Scalar & leftValue = leftFields_y(i1,ly);
1403 const Scalar & rightValue = rightFields_y(j1,ly);
1405 Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
1410 for (
int i2=0; i2<leftFieldBounds_z; i2++)
1412 for (
int j2=0; j2<rightFieldBounds_z; j2++)
1416 int Gy_index_offset = GyEntryCount * threadNumber;
1418 if (fad_size_output_ == 0)
1421 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_z), [&] (
const int &lz, Scalar &integralThusFar)
1423 const Scalar & leftValue = leftFields_z(i2,lz);
1424 const Scalar & rightValue = rightFields_z(j2,lz);
1426 integralThusFar += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
1431 for (
int lz=0; lz<pointBounds_z; lz++)
1433 const Scalar & leftValue = leftFields_z(i2,lz);
1434 const Scalar & rightValue = rightFields_z(j2,lz);
1436 Gz += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
1440 const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1441 const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1446 Kokkos::single (Kokkos::PerThread(teamMember), [&] () {
1447 integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1453 teamMember.team_barrier();
1460 template<
size_t numTensorComponents>
1461 KOKKOS_INLINE_FUNCTION
1462 void run(
const TeamMember & teamMember )
const
1600 KOKKOS_INLINE_FUNCTION
1601 void operator()(
const TeamMember & teamMember )
const
1603 switch (numTensorComponents_)
1605 case 1: run<1>(teamMember);
break;
1606 case 2: run<2>(teamMember);
break;
1608 case 4: run<4>(teamMember);
break;
1609 case 5: run<5>(teamMember);
break;
1610 case 6: run<6>(teamMember);
break;
1611 case 7: run<7>(teamMember);
break;
1612 #ifdef INTREPID2_HAVE_DEBUG
1625 constexpr
int numTensorComponents = 3;
1626 Kokkos::Array<int,numTensorComponents> pointBounds;
1627 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1628 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1630 int pointsInNonzeroComponentDimensions = 1;
1631 for (
unsigned r=0; r<numTensorComponents; r++)
1633 pointBounds[r] = pointBounds_[r];
1634 if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1635 leftFieldBounds[r] = leftFieldBounds_[r];
1636 rightFieldBounds[r] = rightFieldBounds_[r];
1639 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
1641 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
1646 flopCount += composedTransform_.
extent_int(1) * cellMeasures_.numTensorComponents();
1648 for (
int i0=0; i0<leftFieldBounds[0]; i0++)
1650 for (
int j0=0; j0<rightFieldBounds[0]; j0++)
1652 flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3;
1692 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3;
1693 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3;
1694 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1;
1770 const int R = numTensorComponents_ - 1;
1771 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1772 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1779 size_t shmem_size = 0;
1781 const int GyEntryCount = pointBounds_[2];
1783 int pointsInNonzeroComponentDimensions = 1;
1784 for (
int d=1; d<numTensorComponents_; d++)
1786 pointsInNonzeroComponentDimensions *= pointBounds_[d];
1789 if (fad_size_output_ > 0)
1791 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_);
1792 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_);
1793 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.
extent_int(1), fad_size_output_);
1795 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_);
1796 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_);
1797 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_);
1798 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_);
1799 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_);
1800 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_);
1804 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions);
1805 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads);
1806 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.
extent_int(1));
1808 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]);
1809 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]);
1810 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]);
1811 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]);
1812 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]);
1813 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]);
1820 template<
class Scalar,
class DeviceType>
1823 ScalarView<Scalar,DeviceType> integral_;
1827 ordinal_type dimSpan_;
1828 ordinal_type leftRank_;
1829 ordinal_type rightRank_;
1830 ordinal_type numPoints_;
1834 ordinal_type dimSpan)
1836 integral_(integralView),
1842 leftRank_ = left.
rank();
1843 rightRank_ = right.
rank();
1847 KOKKOS_INLINE_FUNCTION
1848 void operator()(
const ordinal_type & i,
const ordinal_type & j )
const
1850 Scalar refSpaceIntegral = 0.0;
1851 for (
int ptOrdinal=0; ptOrdinal<numPoints_; ptOrdinal++)
1853 const Scalar & weight = weights_(ptOrdinal);
1854 for (
int a=0; a<dimSpan_; a++)
1856 const Scalar & leftValue = ( leftRank_ == 2) ? left_(i,ptOrdinal) : left_(i,ptOrdinal,a);
1857 const Scalar & rightValue = (rightRank_ == 2) ? right_(j,ptOrdinal) : right_(j,ptOrdinal,a);
1858 refSpaceIntegral += leftValue * rightValue * weight;
1861 integral_(i,j) = refSpaceIntegral;
1866 template<
typename DeviceType>
1867 template<
class Scalar>
1876 const bool leftHasOrdinalFilter = vectorDataLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1877 const bool rightHasOrdinalFilter = vectorDataRight.basisValues().ordinalFilter().extent_int(0) > 0;
1878 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument,
"Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1884 const int CELL_DIM = 0;
1886 const auto leftTransform = vectorDataLeft.
transform();
1888 DimensionInfo combinedCellDimInfo = cellMeasureData.getDimensionInfo(CELL_DIM);
1892 combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataLeft.
transform().
getDimensionInfo(CELL_DIM));
1896 combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataRight.
transform().
getDimensionInfo(CELL_DIM));
1899 DataVariationType cellVariationType = combinedCellDimInfo.variationType;
1900 int cellDataExtent = combinedCellDimInfo.dataExtent;
1902 const int numCells = vectorDataLeft.
numCells();
1903 const int numFieldsLeft = vectorDataLeft.
numFields();
1904 const int numFieldsRight = vectorDataRight.
numFields();
1906 Kokkos::Array<int,3> extents {numCells, numFieldsLeft, numFieldsRight};
1907 Kokkos::Array<DataVariationType,3> variationTypes {cellVariationType,GENERAL,GENERAL};
1909 if (cellVariationType != CONSTANT)
1911 Kokkos::View<Scalar***,DeviceType> data(
"Intrepid2 integral data",cellDataExtent,numFieldsLeft,numFieldsRight);
1916 Kokkos::View<Scalar**,DeviceType> data(
"Intrepid2 integral data",numFieldsLeft,numFieldsRight);
1924 template<
typename DeviceType>
1925 template<
class Scalar>
1929 double* approximateFlops)
1931 using ExecutionSpace =
typename DeviceType::execution_space;
1935 const bool leftHasOrdinalFilter = basisValuesLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1936 const bool rightHasOrdinalFilter = basisValuesRight.basisValues().ordinalFilter().extent_int(0) > 0;
1937 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument,
"Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1939 if (approximateFlops != NULL)
1941 *approximateFlops = 0;
1953 const int numFieldsLeft = basisValuesLeft.
numFields();
1954 const int numFieldsRight = basisValuesRight.
numFields();
1955 const int spaceDim = basisValuesLeft.
spaceDim();
1957 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.
spaceDim() != basisValuesRight.
spaceDim(), std::invalid_argument,
"vectorDataLeft and vectorDataRight must agree on the space dimension");
1959 const int leftFamilyCount = basisValuesLeft.basisValues().numFamilies();
1960 const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
1964 int numTensorComponentsLeft = -1;
1965 const bool leftIsVectorValued = basisValuesLeft.
vectorData().isValid();
1967 if (leftIsVectorValued)
1969 const auto &refVectorLeft = basisValuesLeft.
vectorData();
1970 int numFamiliesLeft = refVectorLeft.numFamilies();
1971 int numVectorComponentsLeft = refVectorLeft.numComponents();
1972 Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
1973 for (
int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
1975 for (
int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
1980 if (numTensorComponentsLeft == -1)
1984 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.
numTensorComponents(), std::invalid_argument,
"Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
1985 for (
int r=0; r<numTensorComponentsLeft; r++)
1995 numTensorComponentsLeft = basisValuesLeft.basisValues().tensorData(0).numTensorComponents();
1996 for (
int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
1998 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument,
"All families must match in the number of tensor components");
2001 int numTensorComponentsRight = -1;
2002 const bool rightIsVectorValued = basisValuesRight.
vectorData().isValid();
2004 if (rightIsVectorValued)
2006 const auto &refVectorRight = basisValuesRight.
vectorData();
2007 int numFamiliesRight = refVectorRight.numFamilies();
2008 int numVectorComponentsRight = refVectorRight.numComponents();
2009 Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
2010 for (
int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
2012 for (
int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
2014 const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
2015 if (tensorData.numTensorComponents() > 0)
2017 if (numTensorComponentsRight == -1)
2019 numTensorComponentsRight = tensorData.numTensorComponents();
2021 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument,
"Each valid entry in vectorDataRight must have the same number of tensor components as every other");
2022 for (
int r=0; r<numTensorComponentsRight; r++)
2024 maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
2029 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponentsRight != numTensorComponentsLeft, std::invalid_argument,
"Right families must match left in the number of tensor components");
2034 for (
int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
2036 INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument,
"Right families must match left in the number of tensor components");
2041 if ((numPointTensorComponents == numTensorComponentsLeft) && basisValuesLeft.
axisAligned() && basisValuesRight.
axisAligned())
2050 Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
2051 for (
int r=0; r<numPointTensorComponents; r++)
2058 Kokkos::View<Scalar**, DeviceType> integralView2;
2059 Kokkos::View<Scalar***, DeviceType> integralView3;
2060 if (integralViewRank == 3)
2068 for (
int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2071 int leftFieldOffset = basisValuesLeft.basisValues().familyFieldOrdinalOffset(leftFamilyOrdinal);
2073 const int leftVectorComponentCount = leftIsVectorValued ? basisValuesLeft.
vectorData().numComponents() : 1;
2074 for (
int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
2077 : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2083 const int leftDimSpan = leftComponent.
extent_int(2);
2085 const int leftComponentFieldCount = leftComponent.
extent_int(0);
2087 for (
int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2090 int rightFieldOffset = basisValuesRight.
vectorData().familyFieldOrdinalOffset(rightFamilyOrdinal);
2092 const int rightVectorComponentCount = rightIsVectorValued ? basisValuesRight.
vectorData().numComponents() : 1;
2093 for (
int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2096 : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2097 if (!rightComponent.
isValid())
2102 const int rightDimSpan = rightComponent.
extent_int(2);
2104 const int rightComponentFieldCount = rightComponent.
extent_int(0);
2107 if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset))
2109 b_offset += rightDimSpan;
2115 INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error,
"left and right dimension offsets should match.");
2116 INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument,
"left and right components must span the same number of dimensions.");
2118 const int d_start = a_offset;
2119 const int d_end = d_start + leftDimSpan;
2121 using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>,
Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
2122 ComponentIntegralsArray componentIntegrals;
2123 for (
int r=0; r<numPointTensorComponents; r++)
2140 const int numPoints = pointDimensions[r];
2147 const int leftTensorComponentDimSpan = leftTensorComponent.
extent_int(2);
2148 const int leftTensorComponentFields = leftTensorComponent.
extent_int(0);
2149 const int rightTensorComponentDimSpan = rightTensorComponent.
extent_int(2);
2150 const int rightTensorComponentFields = rightTensorComponent.
extent_int(0);
2152 INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument,
"left and right components must span the same number of dimensions.");
2154 for (
int d=d_start; d<d_end; d++)
2156 ScalarView<Scalar,DeviceType> componentIntegralView;
2158 const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
2159 if (allocateFadStorage)
2162 componentIntegralView = ScalarView<Scalar,DeviceType>(
"componentIntegrals for tensor component " + std::to_string(r) +
", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output);
2166 componentIntegralView = ScalarView<Scalar,DeviceType>(
"componentIntegrals for tensor component " + std::to_string(r) +
", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields);
2169 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftTensorComponentFields,rightTensorComponentFields});
2172 leftTensorComponentDimSpan);
2173 Kokkos::parallel_for(
"compute componentIntegrals", policy, refSpaceIntegralFunctor);
2175 componentIntegrals[r][d] = componentIntegralView;
2177 if (approximateFlops != NULL)
2179 *approximateFlops += leftTensorComponentFields*rightTensorComponentFields*numPoints*(3);
2184 ExecutionSpace().fence();
2186 Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount};
2187 Kokkos::Array<int,3> lowerBounds {0,0,0};
2188 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2190 Kokkos::parallel_for(
"compute field integrals", policy,
2191 KOKKOS_LAMBDA (
const int &cellDataOrdinal,
const int &leftFieldOrdinal,
const int &rightFieldOrdinal) {
2192 const Scalar & cellMeasureWeight = cellMeasures.
getTensorComponent(0)(cellDataOrdinal);
2200 Scalar integralSum = 0.0;
2201 for (
int d=d_start; d<d_end; d++)
2203 const Scalar & transformLeft_d = basisValuesLeft.
transformWeight(cellDataOrdinal,0,d,d);
2204 const Scalar & transformRight_d = basisValuesRight.
transformWeight(cellDataOrdinal,0,d,d);
2206 const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
2209 Scalar integral_d = 1.0;
2211 for (
int r=0; r<numPointTensorComponents; r++)
2213 integral_d *= componentIntegrals[r][d](leftTensorIterator.
argument(r),rightTensorIterator.
argument(r));
2216 integralSum += leftRightTransform_d * integral_d;
2219 const int i = leftFieldOrdinal + leftFieldOffset;
2220 const int j = rightFieldOrdinal + rightFieldOffset;
2222 if (integralViewRank == 3)
2224 integralView3(cellDataOrdinal,i,j) += cellMeasureWeight * integralSum;
2228 integralView2(i,j) += cellMeasureWeight * integralSum;
2232 b_offset += rightDimSpan;
2235 a_offset += leftDimSpan;
2239 if (approximateFlops != NULL)
2242 *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2250 const bool transposeLeft =
true;
2251 const bool transposeRight =
false;
2257 const int leftRank = leftTransform.
rank();
2258 const int rightRank = rightTransform.
rank();
2262 const bool bothRank4 = (leftRank == 4) && (rightRank == 4);
2263 const bool bothRank3 = (leftRank == 3) && (rightRank == 3);
2264 const bool bothRank2 = (leftRank == 2) && (rightRank == 2);
2265 const bool ranks32 = ((leftRank == 3) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 3));
2266 const bool ranks42 = ((leftRank == 4) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 4));
2271 composedTransform.
storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2274 if (approximateFlops != NULL)
2282 const int newRank = 4;
2285 extents[3] = extents[2];
2287 variationTypes[3] = variationTypes[2];
2288 variationTypes[2] = CONSTANT;
2289 auto leftTransformMatrix = leftTransform.
shallowCopy(newRank, extents, variationTypes);
2294 extents[3] = extents[2];
2296 variationTypes[3] = variationTypes[2];
2297 variationTypes[2] = CONSTANT;
2298 auto rightTransformMatrix = rightTransform.
shallowCopy(newRank, extents, variationTypes);
2301 composedTransform.
storeMatMat(transposeLeft, leftTransformMatrix, transposeRight, rightTransformMatrix);
2303 if (approximateFlops != NULL)
2314 const int newRank = 4;
2315 auto extents = composedTransform.
getExtents();
2317 composedTransform = composedTransform.
shallowCopy(newRank, extents, variationTypes);
2318 if (approximateFlops != NULL)
2325 const auto & rank3Transform = (leftRank == 3) ? leftTransform : rightTransform;
2326 const auto & rank2Transform = (leftRank == 2) ? leftTransform : rightTransform;
2334 const int newRank = 4;
2335 auto extents = composedTransform.
getExtents();
2345 extents[3] = extents[2];
2347 variationTypes[3] = variationTypes[2];
2348 variationTypes[2] = CONSTANT;
2350 composedTransform = composedTransform.
shallowCopy(newRank, extents, variationTypes);
2368 INTREPID2_TEST_FOR_EXCEPTION(
true, std::invalid_argument,
"Unsupported transform combination");
2371 else if (leftTransform.
isValid())
2380 const int newRank = 4;
2384 composedTransform = leftTransform.
shallowCopy(newRank, extents, variationTypes);
2387 case 2: composedTransform = leftTransform;
break;
2389 INTREPID2_TEST_FOR_EXCEPTION(
true, std::invalid_argument,
"Unsupported transform combination");
2392 else if (rightTransform.
isValid())
2395 composedTransform = rightTransform;
2398 case 4: composedTransform = rightTransform;
break;
2402 const int newRank = 4;
2405 extents[3] = extents[2];
2406 variationTypes[3] = variationTypes[2];
2408 variationTypes[2] = CONSTANT;
2410 composedTransform = rightTransform.
shallowCopy(newRank, extents, variationTypes);
2413 case 2: composedTransform = rightTransform;
break;
2415 INTREPID2_TEST_FOR_EXCEPTION(
true, std::invalid_argument,
"Unsupported transform combination");
2421 Kokkos::Array<ordinal_type,4> extents {basisValuesLeft.
numCells(),basisValuesLeft.
numPoints(),spaceDim,spaceDim};
2422 Kokkos::Array<DataVariationType,4> variationTypes {CONSTANT,CONSTANT,BLOCK_PLUS_DIAGONAL,BLOCK_PLUS_DIAGONAL};
2424 Kokkos::View<Scalar*,DeviceType> identityUnderlyingView(
"Intrepid2::FST::integrate() - identity view",spaceDim);
2425 Kokkos::deep_copy(identityUnderlyingView, 1.0);
2433 const int leftFamilyCount = basisValuesLeft. basisValues().numFamilies();
2434 const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
2435 const int leftComponentCount = leftIsVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
2436 const int rightComponentCount = rightIsVectorValued ? basisValuesRight.
vectorData().numComponents() : 1;
2438 int leftFieldOrdinalOffset = 0;
2439 for (
int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2444 bool haveLaunchedContributionToCurrentFamilyLeft =
false;
2445 for (
int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2448 : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2452 a_offset += basisValuesLeft.
vectorData().numDimsForComponent(leftComponentOrdinal);
2456 int rightFieldOrdinalOffset = 0;
2457 for (
int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2461 bool haveLaunchedContributionToCurrentFamilyRight =
false;
2463 for (
int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2466 : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2467 if (!rightComponent.
isValid())
2470 b_offset += basisValuesRight.
vectorData().numDimsForComponent(rightComponentOrdinal);
2476 const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2477 Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
2486 bool forceNonSpecialized =
false;
2487 bool usePointCacheForRank3Tensor =
true;
2491 if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2493 ExecutionSpace().fence();
2495 haveLaunchedContributionToCurrentFamilyLeft =
true;
2496 haveLaunchedContributionToCurrentFamilyRight =
true;
2498 if (integralViewRank == 2)
2502 auto functor =
Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2504 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2505 const int teamSize = functor.
teamSize(recommendedTeamSize);
2507 policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2509 Kokkos::parallel_for(
"F_IntegratePointValueCache rank 2", policy, functor);
2511 if (approximateFlops != NULL)
2513 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2518 auto functor =
Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2520 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2521 const int teamSize = functor.
teamSize(recommendedTeamSize);
2523 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2525 Kokkos::parallel_for(
"F_Integrate rank 2", policy, functor);
2527 if (approximateFlops != NULL)
2529 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2533 else if (integralViewRank == 3)
2537 auto functor =
Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2539 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2540 const int teamSize = functor.
teamSize(recommendedTeamSize);
2542 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2544 Kokkos::parallel_for(
"F_IntegratePointValueCache rank 3", policy, functor);
2546 if (approximateFlops != NULL)
2548 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2553 auto functor =
Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2555 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2556 const int teamSize = functor.
teamSize(recommendedTeamSize);
2558 policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2560 Kokkos::parallel_for(
"F_Integrate rank 3", policy, functor);
2562 if (approximateFlops != NULL)
2564 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2568 b_offset += rightIsVectorValued ? basisValuesRight.
vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
2570 rightFieldOrdinalOffset += rightIsVectorValued ? basisValuesRight.
vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
2572 a_offset += leftIsVectorValued ? basisValuesLeft.
vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
2574 leftFieldOrdinalOffset += leftIsVectorValued ? basisValuesLeft.
vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
2581 ExecutionSpace().fence();
void clear() const
Copies 0.0 to the underlying View.
KOKKOS_INLINE_FUNCTION ordinal_type getUnderlyingViewRank() const
returns the rank of the View that stores the unique data
KOKKOS_INLINE_FUNCTION const ordinal_type & argument(const ordinal_type &r) const
KOKKOS_INLINE_FUNCTION int incrementArgument(Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of incrementArgument; gets used by approximate flop count.
static Data< DataScalar, DeviceType > allocateInPlaceCombinationResult(const Data< DataScalar, DeviceType > &A, const Data< DataScalar, DeviceType > &B)
KOKKOS_INLINE_FUNCTION int nextIncrementResult(const Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of nextIncrementResult; gets used by approximate flop count.
#define INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(test, x, msg)
int teamSize(const int &maxTeamSizeFromKokkos) const
returns the team size that should be provided to the policy constructor, based on the Kokkos maximum ...
Implementation of a general sum factorization algorithm, using a novel approach developed by Roberts...
long approximateFlopCountPerCell() const
returns an estimate of the number of floating point operations per cell (counting sums...
size_t team_shmem_size(int team_size) const
Provide the shared memory capacity.
int teamSize(const int &maxTeamSizeFromKokkos) const
returns the team size that should be provided to the policy constructor, based on the Kokkos maximum ...
KOKKOS_INLINE_FUNCTION Kokkos::Array< int, 7 > getExtents() const
Returns an array containing the logical extents in each dimension.
KOKKOS_INLINE_FUNCTION const Kokkos::Array< DataVariationType, 7 > & getVariationTypes() const
Returns an array with the variation types in each logical dimension.
KOKKOS_INLINE_FUNCTION enable_if_t< rank==1, const Kokkos::View< typename RankExpander< DataScalar, rank >::value_type, DeviceType > & > getUnderlyingView() const
Returns the underlying view. Throws an exception if the underlying view is not rank 1...
KOKKOS_INLINE_FUNCTION bool underlyingMatchesLogical() const
Returns true if the underlying container has exactly the same rank and extents as the logical contain...
long approximateFlopCountPerCell() const
returns an estimate of the number of floating point operations per cell (counting sums...
KOKKOS_INLINE_FUNCTION const Data< Scalar, DeviceType > & getTensorComponent(const ordinal_type &r) const
Returns the requested tensor component.
KOKKOS_INLINE_FUNCTION constexpr bool isValid() const
Returns true for containers that have data; false for those that don't (e.g., those that have been co...
Defines TensorArgumentIterator, which allows systematic enumeration of a TensorData object...
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar ****, DeviceType > & getUnderlyingView4() const
returns the View that stores the unique data. For rank-4 underlying containers.
Struct expressing all variation information about a Data object in a single dimension, including its logical extent and storage extent.
Data shallowCopy(const int rank, const Kokkos::Array< int, 7 > &extents, const Kokkos::Array< DataVariationType, 7 > &variationTypes) const
Creates a new Data object with the same underlying view, but with the specified logical rank...
KOKKOS_INLINE_FUNCTION DimensionInfo getDimensionInfo(const int &dim) const
Returns an object fully specifying the indicated dimension. This is used in determining appropriate s...
KOKKOS_INLINE_FUNCTION int incrementArgument(Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of incrementArgument; gets used by approximate flop count.
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar ***, DeviceType > & getUnderlyingView3() const
returns the View that stores the unique data. For rank-3 underlying containers.
KOKKOS_INLINE_FUNCTION constexpr bool isValid() const
returns true for containers that have data; false for those that don't (namely, those that have been ...
KOKKOS_INLINE_FUNCTION void setEnumerationIndex(const ordinal_type &enumerationIndex)
Sets the enumeration index; this refers to a 1D enumeration of the possible in-bound arguments...
KOKKOS_INLINE_FUNCTION void runSpecialized3(const TeamMember &teamMember) const
runSpecialized implementations are hand-coded variants of run() for a particular number of components...
void storeMatMat(const bool transposeA, const Data< DataScalar, DeviceType > &A_MatData, const bool transposeB, const Data< DataScalar, DeviceType > &B_MatData)
KOKKOS_INLINE_FUNCTION ordinal_type getUnderlyingViewSize() const
returns the number of entries in the View that stores the unique data
KOKKOS_INLINE_FUNCTION void runSpecialized3(const TeamMember &teamMember) const
Hand-coded 3-component version.
Defines the Intrepid2::FunctorIterator class, as well as the Intrepid2::functor_returns_ref SFINAE he...
void storeInPlaceProduct(const Data< DataScalar, DeviceType > &A, const Data< DataScalar, DeviceType > &B)
stores the in-place (entrywise) product, A .* B, into this container.
KOKKOS_INLINE_FUNCTION unsigned rank() const
Returns the logical rank of the Data container.
KOKKOS_INLINE_FUNCTION int extent_int(const int &r) const
Returns the logical extent in the specified dimension.
KOKKOS_INLINE_FUNCTION int getDataExtent(const ordinal_type &d) const
returns the true extent of the data corresponding to the logical dimension provided; if the data does...
View-like interface to tensor data; tensor components are stored separately and multiplied together a...
KOKKOS_INLINE_FUNCTION std::enable_if< std::is_integral< iType >::value, ordinal_type >::type extent_int(const iType &d) const
Returns the logical extent in the requested dimension.
Allows systematic enumeration of all entries in a TensorData object, tracking indices for each tensor...
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar **, DeviceType > & getUnderlyingView2() const
returns the View that stores the unique data. For rank-2 underlying containers.
KOKKOS_INLINE_FUNCTION int nextIncrementResult(const Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of nextIncrementResult; gets used by approximate flop count.
size_t team_shmem_size(int numThreads) const
Provide the shared memory capacity.
Implementation of a general sum factorization algorithm, abstracted from the algorithm described by M...
KOKKOS_INLINE_FUNCTION ordinal_type numTensorComponents() const
Return the number of tensorial components.
static Data< DataScalar, DeviceType > allocateMatMatResult(const bool transposeA, const Data< DataScalar, DeviceType > &A_MatData, const bool transposeB, const Data< DataScalar, DeviceType > &B_MatData)
static constexpr ordinal_type MaxTensorComponents
Maximum number of tensor/Cartesian products that can be taken: this allows hypercube basis in 7D to b...