46 #ifndef MUELU_SEMICOARSENPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_SEMICOARSENPFACTORY_KOKKOS_DEF_HPP
51 #include <Kokkos_Core.hpp>
53 #include <KokkosBatched_LU_Decl.hpp>
54 #include <KokkosBatched_LU_Serial_Impl.hpp>
55 #include <KokkosBatched_Trsm_Decl.hpp>
56 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
57 #include <KokkosBatched_Util.hpp>
58 #include <KokkosSparse_CrsMatrix.hpp>
60 #include <Xpetra_CrsMatrixWrap.hpp>
63 #include <Xpetra_MultiVectorFactory.hpp>
73 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
75 RCP<const ParameterList>
80 std::string name =
"semicoarsen: coarsen rate";
83 "A", Teuchos::null,
"Generating factory of the matrix A");
85 "Nullspace", Teuchos::null,
"Generating factory of the nullspace");
87 "Coordinates", Teuchos::null,
"Generating factory for coordinates");
90 "LineDetection_VertLineIds", Teuchos::null,
91 "Generating factory for LineDetection vertical line ids");
93 "LineDetection_Layers", Teuchos::null,
94 "Generating factory for LineDetection layer ids");
96 "CoarseNumZLayers", Teuchos::null,
97 "Generating factory for number of coarse z-layers");
99 return validParamList;
108 Input(fineLevel,
"A");
109 Input(fineLevel,
"Nullspace");
111 Input(fineLevel,
"LineDetection_VertLineIds");
112 Input(fineLevel,
"LineDetection_Layers");
113 Input(fineLevel,
"CoarseNumZLayers");
121 bTransferCoordinates_ =
true;
123 }
else if (bTransferCoordinates_ ==
true) {
128 if (myCoordsFact == Teuchos::null) {
143 return BuildP(fineLevel, coarseLevel);
157 Get<RCP<MultiVector>>(fineLevel,
"Nullspace");
161 LO CoarsenRate = as<LO>(pL.get<
int>(
"semicoarsen: coarsen rate"));
164 "semicoarsen: coarsen rate must be greater than 1");
167 LO BlkSize = A->GetFixedBlockSize();
169 LO Ndofs = rowMap->getLocalNumElements();
170 LO Nnodes = Ndofs / BlkSize;
174 LO FineNumZLayers = Get<LO>(fineLevel,
"CoarseNumZLayers");
176 Get<Teuchos::ArrayRCP<LO>>(fineLevel,
"LineDetection_VertLineIds");
178 Get<Teuchos::ArrayRCP<LO>>(fineLevel,
"LineDetection_Layers");
182 "Cannot coarsen further");
184 LO CoarseNumZLayers =
185 (
LO)floor(((coordT)(FineNumZLayers + 1)) / ((coordT)CoarsenRate) - 1.0);
186 if (CoarseNumZLayers < 1)
187 CoarseNumZLayers = 1;
192 BuildSemiCoarsenP(coarseLevel, Ndofs, Nnodes, BlkSize, FineNumZLayers,
193 CoarseNumZLayers, TLayerId, TVertLineId, A, fineNullspace, P,
199 coarseLevel.
Set(
"NumZLayers", Teuchos::as<LO>(CoarseNumZLayers),
203 Set(coarseLevel,
"P", P);
204 Set(coarseLevel,
"Nullspace", coarseNullspace);
207 if (bTransferCoordinates_) {
213 if (fineLevel.GetLevelID() == 0 &&
218 if (myCoordsFact == Teuchos::null) {
219 myCoordsFact = fineLevel.GetFactoryManager()->GetFactory(
"Coordinates");
221 if (fineLevel.IsAvailable(
"Coordinates", myCoordsFact.
get())) {
223 fineLevel.Get<
RCP<xdMV>>(
"Coordinates", myCoordsFact.
get());
227 TEUCHOS_TEST_FOR_EXCEPTION(fineCoords == Teuchos::null,
229 "No Coordinates found provided by the user.");
231 TEUCHOS_TEST_FOR_EXCEPTION(fineCoords->getNumVectors() != 3,
233 "Three coordinates arrays must be supplied if "
234 "line detection orientation not given.");
236 fineCoords->getDataNonConst(0);
238 fineCoords->getDataNonConst(1);
240 fineCoords->getDataNonConst(2);
244 typename Teuchos::ScalarTraits<Scalar>::coordinateType zval_max =
246 typename Teuchos::ScalarTraits<Scalar>::coordinateType>::one() /
248 typename Teuchos::ScalarTraits<Scalar>::coordinateType>::sfmin();
249 typename Teuchos::ScalarTraits<Scalar>::coordinateType zval_min =
251 typename Teuchos::ScalarTraits<Scalar>::coordinateType>::one() /
253 typename Teuchos::ScalarTraits<Scalar>::coordinateType>::sfmin();
254 for (
auto it = z.
begin(); it != z.
end(); ++it) {
261 LO myCoarseZLayers = Teuchos::as<LO>(CoarseNumZLayers);
264 myZLayerCoords = Teuchos::arcp<
265 typename Teuchos::ScalarTraits<Scalar>::coordinateType>(
267 if (myCoarseZLayers == 1) {
268 myZLayerCoords[0] = zval_min;
270 typename Teuchos::ScalarTraits<Scalar>::coordinateType dz =
271 (zval_max - zval_min) / (myCoarseZLayers - 1);
272 for (LO k = 0; k < myCoarseZLayers; ++k) {
273 myZLayerCoords[k] = k * dz;
281 LO numVertLines = Nnodes / FineNumZLayers;
282 LO numLocalCoarseNodes = numVertLines * myCoarseZLayers;
285 fineCoords->getMap()->lib(),
287 Teuchos::as<size_t>(numLocalCoarseNodes),
288 fineCoords->getMap()->getIndexBase(), fineCoords->getMap()->getComm());
290 typename Teuchos::ScalarTraits<Scalar>::coordinateType, LO, GO,
291 NO>::Build(coarseCoordMap, fineCoords->getNumVectors());
292 coarseCoords->putScalar(-1.0);
294 coarseCoords->getDataNonConst(0);
296 coarseCoords->getDataNonConst(1);
298 coarseCoords->getDataNonConst(2);
301 LO cntCoarseNodes = 0;
302 for (LO vt = 0; vt < TVertLineId.size(); ++vt) {
304 LO curVertLineId = TVertLineId[vt];
306 if (cx[curVertLineId * myCoarseZLayers] == -1.0 &&
307 cy[curVertLineId * myCoarseZLayers] == -1.0) {
309 for (LO n = 0; n < myCoarseZLayers; ++n) {
310 cx[curVertLineId * myCoarseZLayers + n] = x[vt];
311 cy[curVertLineId * myCoarseZLayers + n] = y[vt];
312 cz[curVertLineId * myCoarseZLayers + n] = myZLayerCoords[n];
314 cntCoarseNodes += myCoarseZLayers;
317 TEUCHOS_TEST_FOR_EXCEPTION(cntCoarseNodes > numLocalCoarseNodes,
319 "number of coarse nodes is inconsistent.");
320 if (cntCoarseNodes == numLocalCoarseNodes)
325 Set(coarseLevel,
"Coordinates", coarseCoords);
334 BuildSemiCoarsenP(
Level &coarseLevel,
const LO NFRows,
const LO NFNodes,
335 const LO DofsPerNode,
const LO NFLayers,
341 using impl_SC =
typename Kokkos::ArithTraits<SC>::val_type;
342 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
343 using LOView1D = Kokkos::View<LO *, DeviceType>;
344 using LOView2D = Kokkos::View<LO **, DeviceType>;
348 const auto FCol2LayerVector =
350 const auto localTemp =
353 if (importer == Teuchos::null)
354 importer = ImportFactory::Build(Amat->getDomainMap(), Amat->getColMap());
357 const auto localTempHost = localTemp->getHostLocalView(Xpetra::Access::ReadWrite);
358 for (
int row = 0; row < NFRows; row++)
359 localTempHost(row, 0) = LayerId[row / DofsPerNode];
360 const auto localTempView = localTemp->getDeviceLocalView(Xpetra::Access::ReadWrite);
361 Kokkos::deep_copy(localTempView, localTempHost);
362 FCol2LayerVector->doImport(*localTemp, *(importer),
Xpetra::INSERT);
364 const auto FCol2LayerView = FCol2LayerVector->getDeviceLocalView(Xpetra::Access::ReadOnly);
365 const auto FCol2Layer = Kokkos::subview(FCol2LayerView, Kokkos::ALL(), 0);
369 const auto FCol2DofVector =
373 const auto localTempHost = localTemp->getHostLocalView(Xpetra::Access::ReadWrite);
374 for (
int row = 0; row < NFRows; row++)
375 localTempHost(row, 0) = row % DofsPerNode;
376 const auto localTempView = localTemp->getDeviceLocalView(Xpetra::Access::ReadWrite);
377 Kokkos::deep_copy(localTempView, localTempHost);
378 FCol2DofVector->doImport(*localTemp, *(importer),
Xpetra::INSERT);
380 const auto FCol2DofView = FCol2DofVector->getDeviceLocalView(Xpetra::Access::ReadOnly);
381 const auto FCol2Dof = Kokkos::subview(FCol2DofView, Kokkos::ALL(), 0);
387 NVertLines = VertLineId[0];
388 for (
int node = 1; node < NFNodes; ++node)
389 if (VertLineId[node] > NVertLines)
390 NVertLines = VertLineId[node];
394 LOView2D LineLayer2Node(
"LineLayer2Node", NVertLines, NFLayers);
395 typename LOView2D::HostMirror LineLayer2NodeHost =
396 Kokkos::create_mirror_view(LineLayer2Node);
397 for (
int node = 0; node < NFNodes; ++node)
398 LineLayer2NodeHost(VertLineId[node], LayerId[node]) = node;
399 Kokkos::deep_copy(LineLayer2Node, LineLayer2NodeHost);
402 LOView1D CLayer2FLayer(
"CLayer2FLayer", NCLayers);
403 typename LOView1D::HostMirror CLayer2FLayerHost =
404 Kokkos::create_mirror_view(CLayer2FLayer);
406 const LO FirstStride =
407 (
LO)ceil(((coordT)(NFLayers + 1)) / ((coordT)(NCLayers + 1)));
408 const coordT RestStride =
409 ((coordT)(NFLayers - FirstStride + 1)) / ((coordT)NCLayers);
411 (
LO)floor((((coordT)(NFLayers - FirstStride + 1)) / RestStride) + .00001);
413 "sizes do not match.");
414 coordT stride = (coordT)FirstStride;
415 for (
int clayer = 0; clayer < NCLayers; ++clayer) {
416 CLayer2FLayerHost(clayer) = (
LO)floor(stride) - 1;
417 stride += RestStride;
419 Kokkos::deep_copy(CLayer2FLayer, CLayer2FLayerHost);
423 int MaxStencilSize = 1;
424 LOView1D CLayer2StartLayer(
"CLayer2StartLayer", NCLayers);
425 LOView1D CLayer2StencilSize(
"CLayer2StencilSize", NCLayers);
426 typename LOView1D::HostMirror CLayer2StartLayerHost =
427 Kokkos::create_mirror_view(CLayer2StartLayer);
428 typename LOView1D::HostMirror CLayer2StencilSizeHost =
429 Kokkos::create_mirror_view(CLayer2StencilSize);
430 for (
int clayer = 0; clayer < NCLayers; ++clayer) {
431 const int startLayer = (clayer > 0) ? CLayer2FLayerHost(clayer - 1) + 1 : 0;
432 const int stencilSize = (clayer < NCLayers - 1)
433 ? CLayer2FLayerHost(clayer + 1) - startLayer
434 : NFLayers - startLayer;
436 if (MaxStencilSize < stencilSize)
437 MaxStencilSize = stencilSize;
438 CLayer2StartLayerHost(clayer) = startLayer;
439 CLayer2StencilSizeHost(clayer) = stencilSize;
441 Kokkos::deep_copy(CLayer2StartLayer, CLayer2StartLayerHost);
442 Kokkos::deep_copy(CLayer2StencilSize, CLayer2StencilSizeHost);
449 int Nmax = MaxStencilSize * DofsPerNode;
450 Kokkos::View<impl_SC ***, DeviceType> BandMat(
451 "BandMat", NVertLines, Nmax, Nmax);
452 Kokkos::View<impl_SC ***, DeviceType> BandSol(
453 "BandSol", NVertLines, Nmax, DofsPerNode);
459 for (
int clayer = 0; clayer < NCLayers; ++clayer)
460 NnzP += CLayer2StencilSizeHost(clayer);
461 NnzP *= NVertLines * DofsPerNode * DofsPerNode;
462 Kokkos::View<impl_SC *, DeviceType> Pvals(
"Pvals", NnzP);
463 Kokkos::View<LO *, DeviceType> Pcols(
"Pcols", NnzP);
468 Kokkos::View<size_t *, DeviceType> Pptr(
"Pptr", NFRows + 1);
469 typename Kokkos::View<size_t *, DeviceType>::HostMirror PptrHost =
470 Kokkos::create_mirror_view(Pptr);
471 Kokkos::deep_copy(PptrHost, 0);
472 for (
int line = 0; line < NVertLines; ++line) {
473 for (
int clayer = 0; clayer < NCLayers; ++clayer) {
474 const int stencilSize = CLayer2StencilSizeHost(clayer);
475 const int startLayer = CLayer2StartLayerHost(clayer);
476 for (
int snode = 0; snode < stencilSize; ++snode) {
477 for (
int dofi = 0; dofi < DofsPerNode; ++dofi) {
478 const int layer = startLayer + snode;
479 const int AmatBlkRow = LineLayer2NodeHost(line, layer);
480 const int AmatRow = AmatBlkRow * DofsPerNode + dofi;
481 PptrHost(AmatRow + 1) += DofsPerNode;
486 for (
int i = 2; i < NFRows + 1; ++i)
487 PptrHost(i) += PptrHost(i - 1);
490 "Number of nonzeros in P does not match");
491 Kokkos::deep_copy(Pptr, PptrHost);
495 Kokkos::View<LO *, Kokkos::DefaultHostExecutionSpace> layerBuckets(
496 "layerBuckets", NFLayers);
497 Kokkos::deep_copy(layerBuckets, 0);
498 LOView2D CLayerSNode2PptrOffset(
"CLayerSNode2PptrOffset", NCLayers,
500 typename LOView2D::HostMirror CLayerSNode2PptrOffsetHost =
501 Kokkos::create_mirror_view(CLayerSNode2PptrOffset);
502 for (
int clayer = 0; clayer < NCLayers; ++clayer) {
503 const int stencilSize = CLayer2StencilSizeHost(clayer);
504 const int startLayer = CLayer2StartLayerHost(clayer);
505 for (
int snode = 0; snode < stencilSize; ++snode) {
506 const int layer = startLayer + snode;
507 CLayerSNode2PptrOffsetHost(clayer, snode) = layerBuckets(layer);
508 layerBuckets(layer)++;
511 Kokkos::deep_copy(CLayerSNode2PptrOffset, CLayerSNode2PptrOffsetHost);
516 const auto localAmat = Amat->getLocalMatrixDevice();
517 const auto zero = impl_ATS::zero();
518 const auto one = impl_ATS::one();
520 using range_policy = Kokkos::RangePolicy<execution_space>;
521 Kokkos::parallel_for(
522 "MueLu::SemiCoarsenPFactory_kokkos::BuildSemiCoarsenP Fill P",
523 range_policy(0, NVertLines), KOKKOS_LAMBDA(
const int line) {
524 for (
int clayer = 0; clayer < NCLayers; ++clayer) {
527 Kokkos::subview(BandSol, line, Kokkos::ALL(), Kokkos::ALL());
528 for (
int row = 0; row < Nmax; ++row)
529 for (
int dof = 0; dof < DofsPerNode; ++dof)
530 bandSol(row, dof) = zero;
533 const int stencilSize = CLayer2StencilSize(clayer);
534 const int N = stencilSize * DofsPerNode;
536 Kokkos::subview(BandMat, line, Kokkos::ALL(), Kokkos::ALL());
537 for (
int row = 0; row < Nmax; ++row)
538 for (
int col = 0; col < Nmax; ++col)
540 (row == col && row >= N) ? one : zero;
543 const int flayer = CLayer2FLayer(clayer);
544 const int startLayer = CLayer2StartLayer(clayer);
545 for (
int snode = 0; snode < stencilSize; ++snode) {
546 const int layer = startLayer + snode;
547 if (layer == flayer) {
548 for (
int dof = 0; dof < DofsPerNode; ++dof) {
549 const int row = snode * DofsPerNode + dof;
550 bandMat(row, row) = one;
551 bandSol(row, dof) = one;
554 const int AmatBlkRow = LineLayer2Node(line, layer);
555 for (
int dofi = 0; dofi < DofsPerNode; ++dofi) {
557 const int AmatRow = AmatBlkRow * DofsPerNode + dofi;
558 const auto localAmatRow = localAmat.rowConst(AmatRow);
559 const int AmatRowLeng = localAmatRow.length;
561 const int row = snode * DofsPerNode + dofi;
562 for (
int dofj = 0; dofj < DofsPerNode; ++dofj) {
563 const int col = snode * DofsPerNode + dofj;
568 for (
int i = 0; i < AmatRowLeng; ++i) {
569 const int colidx = localAmatRow.colidx(i);
570 if (FCol2Layer(colidx) == layer &&
571 FCol2Dof(colidx) == dofj)
572 val += localAmatRow.value(i);
574 bandMat(row, col) = val;
580 for (
int i = 0; i < AmatRowLeng; ++i) {
581 const int colidx = localAmatRow.colidx(i);
582 if (FCol2Layer(colidx) == layer - 1 &&
583 FCol2Dof(colidx) == dofj)
584 val += localAmatRow.value(i);
586 bandMat(row, col - DofsPerNode) = val;
589 if (snode < stencilSize - 1) {
593 for (
int i = 0; i < AmatRowLeng; ++i) {
594 const int colidx = localAmatRow.colidx(i);
595 if (FCol2Layer(colidx) == layer + 1 &&
596 FCol2Dof(colidx) == dofj)
597 val += localAmatRow.value(i);
599 bandMat(row, col + DofsPerNode) = val;
607 namespace KB = KokkosBatched;
608 using lu_type =
typename KB::SerialLU<KB::Algo::LU::Unblocked>;
609 lu_type::invoke(bandMat);
611 typename KB::SerialTrsm<KB::Side::Left, KB::Uplo::Lower,
612 KB::Trans::NoTranspose, KB::Diag::Unit,
613 KB::Algo::Trsm::Unblocked>;
614 trsv_l_type::invoke(one, bandMat, bandSol);
615 using trsv_u_type =
typename KB::SerialTrsm<
616 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose,
617 KB::Diag::NonUnit, KB::Algo::Trsm::Unblocked>;
618 trsv_u_type::invoke(one, bandMat, bandSol);
621 for (
int snode = 0; snode < stencilSize; ++snode) {
622 for (
int dofj = 0; dofj < DofsPerNode; ++dofj) {
623 for (
int dofi = 0; dofi < DofsPerNode; ++dofi) {
624 const int layer = startLayer + snode;
625 const int AmatBlkRow = LineLayer2Node(line, layer);
626 const int AmatRow = AmatBlkRow * DofsPerNode + dofi;
628 const int pptrOffset = CLayerSNode2PptrOffset(clayer, snode);
630 Pptr(AmatRow) + pptrOffset * DofsPerNode + dofj;
633 line * NCLayers + clayer;
634 Pcols(pptr) = col * DofsPerNode + dofj;
635 Pvals(pptr) = bandSol(snode * DofsPerNode + dofi, dofj);
647 std::vector<size_t> stridingInfo_{(size_t)DofsPerNode};
649 rowMap->lib(), NCLayers * itemp, NCLayers * NVertLines * DofsPerNode, 0,
650 stridingInfo_, rowMap->getComm(), -1, 0);
651 P =
rcp(
new CrsMatrixWrap(rowMap, coarseMap, 0));
652 RCP<CrsMatrix> PCrs = rcp_dynamic_cast<CrsMatrixWrap>(P)->getCrsMatrix();
653 PCrs->setAllValues(Pptr, Pcols, Pvals);
654 PCrs->expertStaticFillComplete(coarseMap, Amat->getDomainMap());
657 if (Amat->IsView(
"stridedMaps") ==
true)
658 P->CreateView(
"stridedMaps", Amat->getRowMap(
"stridedMaps"), coarseMap);
660 P->CreateView(
"stridedMaps", P->getRangeMap(), coarseMap);
664 MultiVectorFactory::Build(coarseMap, fineNullspace->getNumVectors());
665 const int numVectors = fineNullspace->getNumVectors();
666 const auto fineNullspaceView = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadOnly);
667 const auto coarseNullspaceView = coarseNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
668 using range_policy = Kokkos::RangePolicy<execution_space>;
669 Kokkos::parallel_for(
670 "MueLu::SemiCoarsenPFactory_kokkos::BuildSemiCoarsenP Inject Nullspace",
671 range_policy(0, NVertLines), KOKKOS_LAMBDA(
const int line) {
672 for (
int clayer = 0; clayer < NCLayers; ++clayer) {
673 const int layer = CLayer2FLayer(clayer);
674 const int AmatBlkRow =
675 LineLayer2Node(line, layer);
677 line * NCLayers + clayer;
678 for (
int k = 0; k < numVectors; ++k) {
679 for (
int dofi = 0; dofi < DofsPerNode; ++dofi) {
680 const int fRow = AmatBlkRow * DofsPerNode + dofi;
681 const int cRow = col * DofsPerNode + dofi;
682 coarseNullspaceView(cRow, k) = fineNullspaceView(fRow, k);
691 #define MUELU_SEMICOARSENPFACTORY_KOKKOS_SHORT
692 #endif // MUELU_SEMICOARSENPFACTORY_KOKKOS_DEF_HPP
void BuildP(Level &fineLevel, Level &coarseLevel) const
Abstract Build method.
ParameterList & setEntry(const std::string &name, U &&entry)
MueLu::DefaultLocalOrdinal LocalOrdinal
void Build(Level &fineLevel, Level &coarseLevel) const
Build an object with this factory.
ParameterList & set(std::string const &name, T const &value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
static const NoFactory * get()
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Class that holds all level-specific information.
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
void Set(const std::string &ename, const T &entry, const FactoryBase *factory=NoFactory::get())
static RCP< Vector > Build(const Teuchos::RCP< const Map > &map, bool zeroOut=true)
static const Teuchos::ParameterEntry & getEntry(const std::string &name)
Returns default entry from the "master" list corresponding to the specified name. ...
const RCP< const FactoryManagerBase > GetFactoryManager()
returns the current factory manager
int GetLevelID() const
Return level number.
Exception throws to report errors in the internal logical of the program.
Prolongator factory performing semi-coarsening.
void DeclareInput(const std::string &ename, const FactoryBase *factory, const FactoryBase *requestedBy=NoFactory::get())
Callback from FactoryBase::CallDeclareInput() and FactoryBase::DeclareInput()
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need's value has been saved.