46 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP 
   47 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP 
   49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR 
   55 #include "MueLu_Aggregates_kokkos.hpp" 
   56 #include "MueLu_AmalgamationFactory_kokkos.hpp" 
   57 #include "MueLu_AmalgamationInfo_kokkos.hpp" 
   58 #include "MueLu_CoarseMapFactory_kokkos.hpp" 
   60 #include "MueLu_NullspaceFactory_kokkos.hpp" 
   61 #include "MueLu_PerfUtils.hpp" 
   63 #include "MueLu_Utilities_kokkos.hpp" 
   69     template<
class LocalOrdinal, 
class View>
 
   70     class ReduceMaxFunctor{
 
   72       ReduceMaxFunctor(View view) : view_(view) { }
 
   74       KOKKOS_INLINE_FUNCTION
 
   80       KOKKOS_INLINE_FUNCTION
 
   87       KOKKOS_INLINE_FUNCTION
 
   96     template<
class LOType, 
class GOType, 
class SCType,
class DeviceType, 
class NspType, 
class aggRowsType, 
class maxAggDofSizeType, 
class agg2RowMapLOType, 
class statusType, 
class rowsType, 
class rowsAuxType, 
class colsAuxType, 
class valsAuxType>
 
   97     class LocalQRDecompFunctor {
 
  103       typedef typename DeviceType::execution_space execution_space;
 
  104       typedef Kokkos::ArithTraits<SC> ATS;
 
  105       typedef typename ATS::magnitudeType Magnitude;
 
  115       maxAggDofSizeType maxAggDofSize; 
 
  116       agg2RowMapLOType agg2RowMapLO;
 
  117       statusType statusAtomic;
 
  124       LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, 
bool doQRStep_) :
 
  128         maxAggDofSize(maxAggDofSize_),
 
  129         agg2RowMapLO(agg2RowMapLO_),
 
  130         statusAtomic(statusAtomic_),
 
  138       KOKKOS_INLINE_FUNCTION
 
  140         auto agg = thread.league_rank();
 
  143         auto aggSize = aggRows(agg+1) - aggRows(agg);
 
  145         const SC one     = ATS::one();
 
  146         const SC two     = one + one;
 
  147         const SC zero    = ATS::zero();
 
  148         const auto zeroM = ATS::magnitude(zero);
 
  151         int n = fineNS.extent(1);
 
  154         Xpetra::global_size_t offset = agg * n;
 
  159           shared_matrix r(thread.team_shmem(), m, n);     
 
  160           for (
int j = 0; j < n; j++)
 
  161             for (
int k = 0; k < m; k++)
 
  162               r(k,j) = fineNS(agg2RowMapLO(aggRows(agg)+k),j);
 
  165           for (
int i = 0; i < m; i++) {
 
  166             for (
int j = 0; j < n; j++)
 
  167               printf(
" %5.3lf ", r(i,j));
 
  173           shared_matrix q(thread.team_shmem(), m, m);     
 
  175             bool isSingular = 
false;
 
  179             for (
int i = 0; i < m; i++) {
 
  180               for (
int j = 0; j < m; j++)
 
  185             for (
int k = 0; k < n; k++) {  
 
  187               Magnitude s = zeroM, norm, norm_x;
 
  188               for (
int i = k+1; i < m; i++)
 
  189                 s += 
pow(ATS::magnitude(r(i,k)), 2);
 
  190               norm = 
sqrt(
pow(ATS::magnitude(r(k,k)), 2) + s);
 
  199               norm_x = 
sqrt(
pow(ATS::magnitude(r(k,k)), 2) + s);
 
  200               if (norm_x == zeroM) {
 
  208               for (
int i = k; i < m; i++)
 
  212               for (
int j = k+1; j < n; j++) {
 
  215                 for (
int i = k; i < m; i++)
 
  216                   si += r(i,k) * r(i,j);
 
  217                 for (
int i = k; i < m; i++)
 
  218                   r(i,j) -= two*si * r(i,k);
 
  222               for (
int j = k; j < m; j++) {
 
  225                 for (
int i = k; i < m; i++)
 
  226                   si += r(i,k) * qt(i,j);
 
  227                 for (
int i = k; i < m; i++)
 
  228                   qt(i,j) -= two*si * r(i,k);
 
  233               for (
int i = k+1; i < m; i++)
 
  239             for (
int i = 0; i < m; i++)
 
  240               for (
int j = 0; j < i; j++) {
 
  248             for (
int j = 0; j < n; j++)
 
  249               for (
int k = 0; k <= j; k++)
 
  250                 coarseNS(offset+k,j) = r(k,j);
 
  253               statusAtomic(1) = 
true;
 
  288             for (
int j = 0; j < n; j++)
 
  289               for (
int k = 0; k < n; k++)
 
  291                   coarseNS(offset+k,j) = r(k,j);
 
  293                   coarseNS(offset+k,j) = (k == j ? one : zero);
 
  296             for (
int i = 0; i < m; i++)
 
  297               for (
int j = 0; j < n; j++)
 
  298                 q(i,j) = (j == i ? one : zero);
 
  302           for (
int j = 0; j < m; j++) {
 
  303             LO localRow = agg2RowMapLO(aggRows(agg)+j);
 
  304             size_t rowStart = rowsAux(localRow);
 
  306             for (
int k = 0; k < n; k++) {
 
  308               if (q(j,k) != zero) {
 
  309                 colsAux(rowStart+lnnz) = offset + k;
 
  310                 valsAux(rowStart+lnnz) = q(j,k);
 
  314             rows(localRow+1) = lnnz;
 
  320           for (
int i = 0; i < m; i++) {
 
  321             for (
int j = 0; j < n; j++)
 
  322               printf(
" %5.3lf ", coarseNS(i,j));
 
  327           for (
int i = 0; i < aggSize; i++) {
 
  328             for (
int j = 0; j < aggSize; j++)
 
  329               printf(
" %5.3lf ", q(i,j));
 
  343           for (
int j = 0; j < m; j++) {
 
  344             LO localRow = agg2RowMapLO(aggRows(agg)+j);
 
  345             size_t rowStart = rowsAux(localRow);
 
  347             for (
int k = 0; k < n; k++) {
 
  348               const SC qr_jk = fineNS(localRow,k);
 
  351                 colsAux(rowStart+lnnz) = offset + k;
 
  352                 valsAux(rowStart+lnnz) = qr_jk;
 
  356             rows(localRow+1) = lnnz;
 
  360           for (
int j = 0; j < n; j++)
 
  361             coarseNS(offset+j,j) = one;
 
  368       size_t team_shmem_size( 
int  )
 const {
 
  370           int m = maxAggDofSize;
 
  371           int n = fineNS.extent(1);
 
  372           return shared_matrix::shmem_size(m, n) +    
 
  373             shared_matrix::shmem_size(m, m);     
 
  381   template <
class Scalar, 
class LocalOrdinal, 
class GlobalOrdinal, 
class DeviceType>
 
  382   RCP<const ParameterList> TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::GetValidParameterList()
 const {
 
  383     RCP<ParameterList> validParamList = 
rcp(
new ParameterList());
 
  385 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name)) 
  388 #undef SET_VALID_ENTRY 
  390     validParamList->set< RCP<const FactoryBase> >(
"A",                  Teuchos::null, 
"Generating factory of the matrix A");
 
  391     validParamList->set< RCP<const FactoryBase> >(
"Aggregates",         Teuchos::null, 
"Generating factory of the aggregates");
 
  392     validParamList->set< RCP<const FactoryBase> >(
"Nullspace",          Teuchos::null, 
"Generating factory of the nullspace");
 
  393     validParamList->set< RCP<const FactoryBase> >(
"Scaled Nullspace",   Teuchos::null, 
"Generating factory of the scaled nullspace");
 
  394     validParamList->set< RCP<const FactoryBase> >(
"UnAmalgamationInfo", Teuchos::null, 
"Generating factory of UnAmalgamationInfo");
 
  395     validParamList->set< RCP<const FactoryBase> >(
"CoarseMap",          Teuchos::null, 
"Generating factory of the coarse map");
 
  396     validParamList->set< RCP<const FactoryBase> >(
"Coordinates",        Teuchos::null, 
"Generating factory of the coordinates");
 
  399     ParameterList norecurse;
 
  400     norecurse.disableRecursiveValidation();
 
  401     validParamList->set<ParameterList> (
"matrixmatrix: kernel params", norecurse, 
"MatrixMatrix kernel parameters");
 
  403     return validParamList;
 
  406   template <
class Scalar, 
class LocalOrdinal, 
class GlobalOrdinal, 
class DeviceType>
 
  407   void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::DeclareInput(Level& fineLevel, Level& )
 const {
 
  409     const ParameterList& pL = GetParameterList();
 
  411     std::string nspName = 
"Nullspace";
 
  412     if(pL.isParameter(
"Nullspace name")) nspName = pL.get<std::string>(
"Nullspace name");
 
  414     Input(fineLevel, 
"A");
 
  415     Input(fineLevel, 
"Aggregates");
 
  416     Input(fineLevel, nspName);
 
  417     Input(fineLevel, 
"UnAmalgamationInfo");
 
  418     Input(fineLevel, 
"CoarseMap");
 
  419     if( fineLevel.GetLevelID() == 0 &&
 
  421         pL.get<
bool>(
"tentative: build coarse coordinates") ) {       
 
  422       bTransferCoordinates_ = 
true;                                   
 
  423       Input(fineLevel, 
"Coordinates");
 
  424     } 
else if (bTransferCoordinates_) {
 
  425       Input(fineLevel, 
"Coordinates");
 
  429   template <
class Scalar,
class LocalOrdinal, 
class GlobalOrdinal, 
class DeviceType>
 
  430   void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::Build(Level& fineLevel, Level& coarseLevel)
 const {
 
  431     return BuildP(fineLevel, coarseLevel);
 
  434   template <
class Scalar,
class LocalOrdinal, 
class GlobalOrdinal, 
class DeviceType>
 
  435   void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::BuildP(Level& fineLevel, Level& coarseLevel)
 const {
 
  436     FactoryMonitor m(*
this, 
"Build", coarseLevel);
 
  439     typedef Xpetra::MultiVectorFactory<coordinate_type,LO,GO,NO> RealValuedMultiVectorFactory;
 
  440     const ParameterList& pL = GetParameterList();
 
  441     std::string nspName = 
"Nullspace";
 
  442     if(pL.isParameter(
"Nullspace name")) nspName = pL.get<std::string>(
"Nullspace name");
 
  444     auto A             = Get< RCP<Matrix> >                  (fineLevel, 
"A");
 
  445     auto aggregates    = Get< RCP<Aggregates_kokkos> >       (fineLevel, 
"Aggregates");
 
  446     auto amalgInfo     = Get< RCP<AmalgamationInfo_kokkos> > (fineLevel, 
"UnAmalgamationInfo");
 
  447     auto fineNullspace = Get< RCP<MultiVector> >             (fineLevel, nspName);
 
  448     auto coarseMap     = Get< RCP<const Map> >               (fineLevel, 
"CoarseMap");
 
  449     RCP<RealValuedMultiVector> fineCoords;
 
  450     if(bTransferCoordinates_) {
 
  451       fineCoords = Get< RCP<RealValuedMultiVector> >(fineLevel, 
"Coordinates");
 
  454     RCP<Matrix>      Ptentative;
 
  455     RCP<MultiVector> coarseNullspace;
 
  456     RCP<RealValuedMultiVector> coarseCoords;
 
  458     if(bTransferCoordinates_) {
 
  459       ArrayView<const GO> elementAList = coarseMap->getNodeElementList();
 
  460       GO                  indexBase    = coarseMap->getIndexBase();
 
  463       if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
 
  464         blkSize = rcp_dynamic_cast<
const StridedMap>(coarseMap)->getFixedBlockSize();
 
  466       Array<GO>           elementList;
 
  467       ArrayView<const GO> elementListView;
 
  471         elementListView = elementAList;
 
  474         auto numElements = elementAList.size() / blkSize;
 
  476         elementList.resize(numElements);
 
  479         for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
 
  480           elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;
 
  482         elementListView = elementList;
 
  485       auto uniqueMap      = fineCoords->getMap();
 
  487                                               elementListView, indexBase, coarseMap->getComm());
 
  488       coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
 
  491       RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
 
  492       if (aggregates->AggregatesCrossProcessors()) {
 
  493         auto nonUniqueMap = aggregates->GetMap();
 
  494         auto importer     = ImportFactory::Build(uniqueMap, nonUniqueMap);
 
  496         ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
 
  497         ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
 
  502       auto aggGraph = aggregates->GetGraph();
 
  503       auto numAggs  = aggGraph.numRows();
 
  505       auto fineCoordsView   = fineCoords  ->template getLocalView<DeviceType>();
 
  506       auto coarseCoordsView = coarseCoords->template getLocalView<DeviceType>();
 
  510         SubFactoryMonitor m2(*
this, 
"AverageCoords", coarseLevel);
 
  512         const auto dim = fineCoords->getNumVectors();
 
  514         typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
 
  515         for (
size_t j = 0; j < dim; j++) {
 
  517                                KOKKOS_LAMBDA(
const LO i) {
 
  521                                  auto aggregate = aggGraph.rowConst(i);
 
  523                                  coordinate_type sum = 0.0; 
 
  524                                  for (
size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
 
  525                                    sum += fineCoordsRandomView(aggregate(colID),j);
 
  527                                  coarseCoordsView(i,j) = sum / aggregate.length;
 
  533     if (!aggregates->AggregatesCrossProcessors())
 
  534       BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
 
  536       BuildPcoupled  (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
 
  546     if (A->IsView(
"stridedMaps") == 
true)
 
  547       Ptentative->CreateView(
"stridedMaps", A->getRowMap(
"stridedMaps"), coarseMap);
 
  549       Ptentative->CreateView(
"stridedMaps", Ptentative->getRangeMap(),   coarseMap);
 
  551     if(bTransferCoordinates_) {
 
  552       Set(coarseLevel, 
"Coordinates", coarseCoords);
 
  554     Set(coarseLevel, 
"Nullspace", coarseNullspace);
 
  555     Set(coarseLevel, 
"P",         Ptentative);
 
  558       RCP<ParameterList> params = 
rcp(
new ParameterList());
 
  559       params->set(
"printLoadBalancingInfo", 
true);
 
  564   template <
class Scalar,
class LocalOrdinal, 
class GlobalOrdinal, 
class DeviceType>
 
  565   void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
 
  566   BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
 
  567                   RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
 
  568                   RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
 
  569                   RCP<MultiVector>& coarseNullspace, 
const int levelID)
 const {
 
  570     auto rowMap = A->getRowMap();
 
  571     auto colMap = A->getColMap();
 
  573     const size_t numRows  = rowMap->getNodeNumElements();
 
  574     const size_t NSDim    = fineNullspace->getNumVectors();
 
  576     typedef Kokkos::ArithTraits<SC>     ATS;
 
  577     using impl_ATS = Kokkos::ArithTraits<typename ATS::val_type>;
 
  578     const SC zero = ATS::zero(), one = ATS::one();
 
  582     typename Aggregates_kokkos::local_graph_type aggGraph;
 
  584       SubFactoryMonitor m2(*
this, 
"Get Aggregates graph", coarseLevel);
 
  585       aggGraph = aggregates->GetGraph();
 
  587     auto aggRows  = aggGraph.row_map;
 
  588     auto aggCols  = aggGraph.entries;
 
  595       SubFactoryMonitor m2(*
this, 
"Check good map", coarseLevel);
 
  596       goodMap = isGoodMap(*rowMap, *colMap);
 
  600         "MueLu: TentativePFactory_kokkos: for now works only with good maps " 
  601         "(i.e. \"matching\" row and column maps)");
 
  610     LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
 
  612     amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
 
  613     GO globalOffset = amalgInfo->GlobalOffset();
 
  616     auto         procWinner    = aggregates->GetProcWinner()  ->template getLocalView<DeviceType>();
 
  617     auto         vertex2AggId  = aggregates->GetVertex2AggId()->template getLocalView<DeviceType>();
 
  618     const size_t numAggregates = aggregates->GetNumAggregates();
 
  620     int myPID = aggregates->GetMap()->getComm()->getRank();
 
  625     typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
 
  626     AggSizeType aggDofSizes;
 
  628     if (stridedBlockSize == 1) {
 
  629       SubFactoryMonitor m2(*
this, 
"Calc AggSizes", coarseLevel);
 
  632       aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates+1);
 
  634       auto sizesConst = aggregates->ComputeAggregateSizes();
 
  638       SubFactoryMonitor m2(*
this, 
"Calc AggSizes", coarseLevel);
 
  641       aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates + 1);
 
  643       auto nodeMap = aggregates->GetMap()->getLocalMap();
 
  644       auto dofMap  = colMap->getLocalMap();
 
  647         KOKKOS_LAMBDA(
const LO agg) {
 
  648           auto aggRowView = aggGraph.rowConst(agg);
 
  651           for (LO colID = 0; colID < aggRowView.length; colID++) {
 
  652             GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
 
  654             for (LO k = 0; k < stridedBlockSize; k++) {
 
  655               GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
 
  657               if (dofMap.getLocalElement(dofGID) != INVALID)
 
  661           aggDofSizes(agg+1) = size;
 
  668     ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
 
  669     Kokkos::parallel_reduce(
"MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
 
  673     Kokkos::parallel_scan(
"MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
 
  674       KOKKOS_LAMBDA(
const LO i, LO& update, 
const bool& final_pass) {
 
  675         update += aggDofSizes(i);
 
  677           aggDofSizes(i) = update;
 
  684       SubFactoryMonitor m2(*
this, 
"Create Agg2RowMap", coarseLevel);
 
  686       AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing(
"aggOffsets"), numAggregates);
 
  689       Kokkos::parallel_for(
"MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
 
  690         KOKKOS_LAMBDA(
const LO lnode) {
 
  691           if (procWinner(lnode, 0) == myPID) {
 
  693             auto aggID = vertex2AggId(lnode,0);
 
  695             auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
 
  699             for (LO k = 0; k < stridedBlockSize; k++)
 
  700               agg2RowMapLO(offset + k) = lnode*stridedBlockSize + k;
 
  707     coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
 
  710     auto fineNS   = fineNullspace  ->template getLocalView<DeviceType>();
 
  711     auto coarseNS = coarseNullspace->template getLocalView<DeviceType>();
 
  715     typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type    local_matrix_type;
 
  716     typedef typename local_matrix_type::row_map_type::non_const_type   rows_type;
 
  717     typedef typename local_matrix_type::index_type::non_const_type     cols_type;
 
  718     typedef typename local_matrix_type::values_type::non_const_type    vals_type;
 
  723     status_type status(
"status");
 
  725     typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
 
  726     typename AppendTrait<status_type,      Kokkos::Atomic>      ::type statusAtomic = status;
 
  728     const ParameterList& pL = GetParameterList();
 
  729     const bool& doQRStep = pL.get<
bool>(
"tentative: calculate qr");
 
  731       GetOStream(
Runtime1) << 
"TentativePFactory : bypassing local QR phase" << std::endl;
 
  733         GetOStream(
Warnings0) << 
"TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
 
  736     size_t nnzEstimate = numRows * NSDim;
 
  737     rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing(
"Ptent_aux_rows"), numRows+1);
 
  738     cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing(
"Ptent_aux_cols"), nnzEstimate);
 
  739     vals_type valsAux(
"Ptent_aux_vals", nnzEstimate);
 
  740     rows_type rows(
"Ptent_rows", numRows+1);
 
  743       SubFactoryMonitor m2(*
this, 
"Stage 0 (InitViews)", coarseLevel);
 
  748                            KOKKOS_LAMBDA(
const LO row) {
 
  749                              rowsAux(row) = row*NSDim;
 
  752                            KOKKOS_LAMBDA(
const LO j) {
 
  753                              colsAux(j) = INVALID;
 
  763       SubFactoryMonitor m2(*
this, 
"Stage 1 (LocalQR)", coarseLevel);
 
  774             auto agg = thread.league_rank();
 
  777             LO aggSize = aggRows(agg+1) - aggRows(agg);
 
  782             auto norm = impl_ATS::magnitude(zero);
 
  787             for (decltype(aggSize) k = 0; k < aggSize; k++) {
 
  788               auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
 
  795               statusAtomic(1) = 
true;
 
  800             coarseNS(agg, 0) = norm;
 
  803             for (decltype(aggSize) k = 0; k < aggSize; k++) {
 
  804               LO localRow = agg2RowMapLO(aggRows(agg)+k);
 
  805               SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
 
  807               rows(localRow+1) = 1;
 
  808               colsAux(localRow) = agg;
 
  809               valsAux(localRow) = localVal;
 
  814         typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
 
  816         for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
 
  818             std::ostringstream oss;
 
  819             oss << 
"MueLu::TentativePFactory::MakeTentative: ";
 
  821               case 0: oss << 
"!goodMap is not implemented";               
break;
 
  822               case 1: oss << 
"fine level NS part has a zero column";      
break;
 
  824             throw Exceptions::RuntimeError(oss.str());
 
  830             auto agg = thread.league_rank();
 
  833             LO aggSize = aggRows(agg+1) - aggRows(agg);
 
  836             coarseNS(agg, 0) = one;
 
  839             for (decltype(aggSize) k = 0; k < aggSize; k++) {
 
  840               LO localRow = agg2RowMapLO(aggRows(agg)+k);
 
  841               SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0);
 
  843               rows(localRow+1) = 1;
 
  844               colsAux(localRow) = agg;
 
  845               valsAux(localRow) = localVal;
 
  852                               KOKKOS_LAMBDA(
const LO i, 
size_t &nnz_count) {
 
  853                                 nnz_count += rows(i);
 
  866         SubFactoryMonitor m2 = SubFactoryMonitor(*
this, doQRStep ? 
"Stage 1 (LocalQR)" : 
"Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
 
  872             decltype(aggDofSizes ), decltype(maxAggSize), decltype(agg2RowMapLO),
 
  873             decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
 
  875                 localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
 
  876                                rows, rowsAux, colsAux, valsAux, doQRStep);
 
  880       typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
 
  882       for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
 
  884           std::ostringstream oss;
 
  885           oss << 
"MueLu::TentativePFactory::MakeTentative: ";
 
  887             case 0: oss << 
"!goodMap is not implemented";               
break;
 
  888             case 1: oss << 
"fine level NS part has a zero column";      
break;
 
  890           throw Exceptions::RuntimeError(oss.str());
 
  901     if (nnz != nnzEstimate) {
 
  904         SubFactoryMonitor m2(*
this, 
"Stage 2 (CompressRows)", coarseLevel);
 
  906         Kokkos::parallel_scan(
"MueLu:TentativePF:Build:compress_rows", range_type(0,numRows+1),
 
  907                               KOKKOS_LAMBDA(
const LO i, LO& upd, 
const bool& 
final) {
 
  915         SubFactoryMonitor m2(*
this, 
"Stage 2 (CompressCols)", coarseLevel);
 
  917         cols = cols_type(
"Ptent_cols", nnz);
 
  918         vals = vals_type(
"Ptent_vals", nnz);
 
  924                              KOKKOS_LAMBDA(
const LO i) {
 
  925                                LO rowStart = rows(i);
 
  928                                for (
auto j = rowsAux(i); j < rowsAux(i+1); j++)
 
  929                                  if (colsAux(j) != INVALID) {
 
  930                                    cols(rowStart+lnnz) = colsAux(j);
 
  931                                    vals(rowStart+lnnz) = valsAux(j);
 
  943     GetOStream(
Runtime1) << 
"TentativePFactory : aggregates do not cross process boundaries" << std::endl;
 
  947       SubFactoryMonitor m2(*
this, 
"Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
 
  949       local_matrix_type lclMatrix = local_matrix_type(
"A", numRows, coarseMap->getNodeNumElements(), nnz, vals, rows, cols);
 
  952       RCP<ParameterList> FCparams;
 
  953       if (pL.isSublist(
"matrixmatrix: kernel params"))
 
  954         FCparams = 
rcp(
new ParameterList(pL.sublist(
"matrixmatrix: kernel params")));
 
  956         FCparams = 
rcp(
new ParameterList);
 
  959       FCparams->set(
"compute global constants", FCparams->get(
"compute global constants", 
false));
 
  960       FCparams->set(
"Timer Label",              std::string(
"MueLu::TentativeP-") + 
toString(levelID));
 
  962       auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
 
  963       Ptentative = 
rcp(
new CrsMatrixWrap(PtentCrs));
 
  967   template <
class Scalar,
class LocalOrdinal, 
class GlobalOrdinal, 
class DeviceType>
 
  968   void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
 
  969   BuildPcoupled(RCP<Matrix> , RCP<Aggregates_kokkos> ,
 
  970                 RCP<AmalgamationInfo_kokkos> , RCP<MultiVector> ,
 
  971                 RCP<const Map> , RCP<Matrix>& ,
 
  972                 RCP<MultiVector>& )
 const {
 
  973     throw Exceptions::RuntimeError(
"MueLu: Construction of coupled tentative P is not implemented");
 
  976   template <
class Scalar,
class LocalOrdinal, 
class GlobalOrdinal, 
class DeviceType>
 
  977   bool TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
 
  978   isGoodMap(
const Map& rowMap, 
const Map& colMap)
 const {
 
  979     auto rowLocalMap = rowMap.getLocalMap();
 
  980     auto colLocalMap = colMap.getLocalMap();
 
  982     const size_t numRows = rowLocalMap.getNodeNumElements();
 
  983     const size_t numCols = colLocalMap.getNodeNumElements();
 
  985     if (numCols < numRows)
 
  990       KOKKOS_LAMBDA(
const LO i, 
size_t &diff) {
 
  991         diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
 
  994     return (numDiff == 0);
 
  999 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT 
 1000 #endif // HAVE_MUELU_KOKKOS_REFACTOR 
 1001 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP 
Important warning messages (one line) 
 
MueLu::DefaultLocalOrdinal LocalOrdinal
 
void deep_copy(const View< DT, DP...> &dst, typename ViewTraits< DT, DP...>::const_value_type &value, typename std::enable_if< std::is_same< typename ViewTraits< DT, DP...>::specialize, void >::value >::type *=nullptr)
 
std::string toString(const T &what)
Little helper function to convert non-string types to strings. 
 
KOKKOS_FORCEINLINE_FUNCTION constexpr pair< T1, T2 > make_pair(T1 x, T2 y)
 
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
 
KOKKOS_INLINE_FUNCTION Kokkos::complex< RealType > pow(const complex< RealType > &x, const RealType &e)
 
static const NoFactory * get()
 
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
 
MueLu::DefaultScalar Scalar
 
MueLu::DefaultGlobalOrdinal GlobalOrdinal
 
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename std::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=nullptr)
 
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
 
KOKKOS_INLINE_FUNCTION Kokkos::complex< RealType > sqrt(const complex< RealType > &x)
 
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename std::enable_if< Kokkos::Impl::is_execution_policy< ExecPolicy >::value >::type *=nullptr)
 
Description of what is happening (more verbose) 
 
#define SET_VALID_ENTRY(name)