#ifndef __TPETRA_DISTRIBUTORLOWERTRIANGULARBLOCK_HPP
#define __TPETRA_DISTRIBUTORLOWERTRIANGULARBLOCK_HPP

#include "Tpetra_Distributor.hpp"
#include "Tpetra_Map.hpp"
#include "Tpetra_Operator.hpp"
#include "Tpetra_Vector.hpp"
#include "Tpetra_CrsMatrix.hpp"
template <typename gno_t, typename scalar_t>
class DistributionLowerTriangularBlock : public Distribution<gno_t, scalar_t> {
 public:
  using Distribution<gno_t, scalar_t>::me;
  using Distribution<gno_t, scalar_t>::np;
  using Distribution<gno_t, scalar_t>::comm;
  using Distribution<gno_t, scalar_t>::nrows;
  using typename Distribution<gno_t, scalar_t>::NZindex_t;
  using typename Distribution<gno_t, scalar_t>::LocalNZmap_t;
  DistributionLowerTriangularBlock(size_t nrows_,
      const Teuchos::RCP<const Teuchos::Comm<int> > &comm_,
      const Teuchos::ParameterList &params)
    : Distribution<gno_t, scalar_t>(nrows_, comm_, params)
    , initialDist(nrows_, comm_, params)
    , sortByDegree(false)
    , permMatrix(Teuchos::null)
    , redistributed(false)
    , chunksComputed(false)
  {
    // np must be a triangular number:  np = q*(q+1)/2 for some integer q.
    // Work with npnp = 2*np = q*(q+1) to find the number of chunks q.
    int npnp = 2 * np;
    nChunks = int(std::sqrt(float(npnp)));
    while (nChunks * (nChunks + 1) < npnp) nChunks++;

    TEUCHOS_TEST_FOR_EXCEPTION(nChunks * (nChunks + 1) != npnp, std::logic_error,
        "Number of processors np = " << np <<
        " must satisfy np = q(q+1)/2 for some q" <<
        " for LowerTriangularBlock distribution");

    nChunksPerRow = double(nChunks) / double(nrows);
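
    // For example, np = 6 satisfies 6 = 3*4/2, so nChunks = 3:  the rows and
    // columns are each cut into 3 chunks, and the 6 chunk blocks on or below
    // the diagonal are assigned one per rank.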
    const Teuchos::ParameterEntry *pe = params.getEntryPtr("sortByDegree");
    if (pe != NULL) sortByDegree = pe->getValue<bool>(&sortByDegree);

    pe = params.getEntryPtr("readPerProcess");
    if (pe != NULL) redistributed = pe->getValue<bool>(&redistributed);
    if (me == 0) std::cout << "\n LowerTriangularBlock Distribution: "
                           << "\n nChunks = " << nChunks
                           << std::endl;
  }
  enum DistributionType DistType() { return LowerTriangularBlock; }
  bool areChunksComputed() { return chunksComputed; }
  Teuchos::Array<gno_t> getChunkCuts() {
    if (chunksComputed) return chunkCuts;
    throw std::runtime_error(
        "Error: Requested chunk cuts have not been computed yet.");
  }
  inline bool VecMine(gno_t i) { return initialDist.VecMine(i); }
  bool Mine(gno_t i, gno_t j) {
    if (redistributed)  // after redistribution, ownership follows the chunk layout
      return (procFromChunks(i, j) == me);
    else                // before redistribution, use the initial 1D distribution
      return initialDist.Mine(i, j);
  }
  inline bool Mine(gno_t i, gno_t j, int p) { return Mine(i, j); }
  // Move nonzeros from the initial 1D row distribution into the
  // lower-triangular block (chunk) distribution.
  void Redistribute(LocalNZmap_t &localNZ) {
    gno_t myFirstRow = initialDist.getFirstRow(me);
    gno_t nMyRows = initialDist.getNumRow(me);
    Teuchos::Array<gno_t> nnzPerRow(nMyRows, 0);

    // Counts and displacements for gathering per-row nonzero counts to rank 0.
    Teuchos::Array<int> rcvcnt(np);
    Teuchos::Array<int> disp(np);
    for (int sum = 0, p = 0; p < np; p++) {
      int prows = initialDist.getNumRow(p);
      rcvcnt[p] = prows;
      disp[p] = sum;
      sum += prows;
    }
    Teuchos::Array<gno_t> permuteIndex;
    Teuchos::Array<gno_t> sortedOrder;
    Teuchos::Array<gno_t> globalRowBuf;
    globalRowBuf.resize(nrows, 0);
    // Count nonzeros in each locally owned row (original global row IDs).
    for (auto it = localNZ.begin(); it != localNZ.end(); it++) {
      gno_t I = it->first.first;
      nnzPerRow[I - myFirstRow]++;
    }
    // Gather all per-row counts to rank 0.
    Teuchos::gatherv<int, gno_t>(nnzPerRow.getRawPtr(), nMyRows,
                                 globalRowBuf.getRawPtr(),
                                 rcvcnt.getRawPtr(), disp.getRawPtr(),
                                 0, *comm);
    permuteIndex.resize(nrows);
    sortedOrder.resize(nrows);
    for (size_t i = 0; i != nrows; i++) sortedOrder[i] = i;
    // Sort row indices by descending nonzero count (degree).
    std::sort(sortedOrder.begin(), sortedOrder.end(),
              [&](const size_t &a, const size_t &b) {
                return (globalRowBuf[a] > globalRowBuf[b]);
              });
    // permuteIndex maps an original row ID to its new (degree-sorted) position.
    for (size_t i = 0; i < nrows; i++) {
      permuteIndex[sortedOrder[i]] = i;
    }
    Teuchos::broadcast<int, gno_t>(*comm, 0, permuteIndex(0, nrows));
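
    // With sortByDegree enabled, rows (and, symmetrically, columns) are
    // relabeled so that higher-degree rows receive smaller indices.  The same
    // permutation is applied to both row and column IDs below, which keeps the
    // matrix symmetric and tends to balance nonzeros across the chunks.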
    // Build the permutation matrix's row map from the vector distribution.
    Teuchos::Array<gno_t> myRows;
    for (size_t i = 0; i < nrows; i++) {
      if (VecMine(i)) myRows.push_back(i);
    }

    Tpetra::global_size_t dummy =
        Teuchos::OrdinalTraits<Tpetra::global_size_t>::invalid();
    Teuchos::RCP<const map_t> permMap =
        rcp(new map_t(dummy, myRows(), 0, comm));
    permMatrix = rcp(new matrix_t(permMap, 1));

    Teuchos::Array<gno_t> cols(1);
    Teuchos::Array<scalar_t> vals(1);
    vals[0] = Teuchos::ScalarTraits<scalar_t>::one();
    for (size_t i = 0; i < permMap->getLocalNumElements(); i++) {
      gno_t gid = permMap->getGlobalElement(i);
      cols[0] = permuteIndex[gid];
      permMatrix->insertGlobalValues(gid, cols(), vals());
    }
    permMatrix->fillComplete(permMap, permMap);
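
    // permMatrix has a single 1 per row, at column permuteIndex[gid].  Applying
    // it with Teuchos::TRANS maps a vector from the original ordering to the
    // permuted ordering; NO_TRANS maps it back.  This is the matrix returned by
    // getPermutationMatrix() and used by LowerTriangularBlockOperator below.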
    // Recount nonzeros per locally owned (original) row, keeping only entries
    // that fall in the lower triangle of the (possibly permuted) matrix.
    nnzPerRow.assign(nMyRows, 0);

    for (auto it = localNZ.begin(); it != localNZ.end(); it++) {
      gno_t I = (sortByDegree ? permuteIndex[it->first.first]
                              : it->first.first);
      gno_t J = (sortByDegree ? permuteIndex[it->first.second]
                              : it->first.second);
      if (J <= I)
        nnzPerRow[it->first.first - myFirstRow]++;
    }
    Teuchos::gatherv<int, gno_t>(nnzPerRow.getRawPtr(), nMyRows,
                                 globalRowBuf.getRawPtr(),
                                 rcvcnt.getRawPtr(), disp.getRawPtr(),
                                 0, *comm);
    // Release the gather scratch arrays by swapping with empty ones.
    Teuchos::Array<int>().swap(rcvcnt);
    Teuchos::Array<int>().swap(disp);
    // Sum each rank's nonzero count nnz into the global total gNnz.
    Teuchos::reduceAll(*comm, Teuchos::REDUCE_SUM, 1, &nnz, &gNnz);
    chunkCuts.resize(nChunks + 1, 0);

    // Greedily advance the row index I until the running nonzero total reaches
    // the target for each chunk cut.
    size_t target = gNnz / np;
    size_t targetRunningTotal = 0;
    size_t currentRunningTotal = 0;
    gno_t I = 0;
    for (int chunkCnt = 0; chunkCnt < nChunks; chunkCnt++) {
      targetRunningTotal = (target * (chunkCnt + 1));
      currentRunningTotal = 0;
      while (I < static_cast<gno_t>(nrows)) {
        size_t nextNnz = (sortByDegree ? globalRowBuf[sortedOrder[I]]
                                       : globalRowBuf[I]);
        if (currentRunningTotal + nextNnz <= targetRunningTotal) {
          currentRunningTotal += nextNnz;
          I++;
        }
        else break;
      }
      chunkCuts[chunkCnt + 1] = I;
    }

    // The last cut always covers all remaining rows.
    chunkCuts[nChunks] = static_cast<gno_t>(nrows);
    Teuchos::Array<gno_t>().swap(globalRowBuf);

    Teuchos::broadcast<int, gno_t>(*comm, 0, chunkCuts(0, nChunks + 1));
    chunksComputed = true;
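
    // After the broadcast every rank knows the same cuts:  chunk k owns the
    // rows (and columns) in [chunkCuts[k], chunkCuts[k+1]).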
    // Pack (row, column, value) triples into host-side send buffers, together
    // with the destination rank of each entry.
    Kokkos::View<gno_t *, Kokkos::HostSpace> iOut("iOut", localNZ.size());
    Kokkos::View<gno_t *, Kokkos::HostSpace> jOut("jOut", localNZ.size());
    Kokkos::View<scalar_t *, Kokkos::HostSpace> vOut("vOut", localNZ.size());
    Teuchos::Array<int> pOut(localNZ.size());
    size_t sendCnt = 0;
    for (auto it = localNZ.begin(); it != localNZ.end(); it++) {
      iOut[sendCnt] = (sortByDegree ? permuteIndex[it->first.first]
                                    : it->first.first);
      jOut[sendCnt] = (sortByDegree ? permuteIndex[it->first.second]
                                    : it->first.second);

      // Only entries in the lower triangle are kept and sent.
      if (jOut[sendCnt] <= iOut[sendCnt]) {
        vOut[sendCnt] = it->second;
        pOut[sendCnt] = procFromChunks(iOut[sendCnt], jOut[sendCnt]);
        sendCnt++;
      }
    }
    // The nonzeros now live in the send buffers; free the map and, if used,
    // the permutation index.
    LocalNZmap_t().swap(localNZ);
    if (sortByDegree) Teuchos::Array<gno_t>().swap(permuteIndex);
    // Build a Distributor plan from the destination ranks, then allocate
    // receive buffers sized to the number of incoming entries.
    Tpetra::Distributor plan(comm);
    size_t nrecvs = plan.createFromSends(pOut(0, sendCnt));

    Kokkos::View<gno_t *, Kokkos::HostSpace> iIn("iIn", nrecvs);
    Kokkos::View<gno_t *, Kokkos::HostSpace> jIn("jIn", nrecvs);
    Kokkos::View<scalar_t *, Kokkos::HostSpace> vIn("vIn", nrecvs);
    auto sendIndices = std::make_pair(static_cast<size_t>(0), sendCnt);
    plan.doPostsAndWaits(Kokkos::subview(iOut, sendIndices), 1, iIn);
    plan.doPostsAndWaits(Kokkos::subview(jOut, sendIndices), 1, jIn);
    plan.doPostsAndWaits(Kokkos::subview(vOut, sendIndices), 1, vIn);
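
    // Each doPostsAndWaits call ships one packet per retained nonzero to the
    // rank chosen in pOut: first the row IDs, then the column IDs, then the
    // values.  The nrecvs incoming entries land in the corresponding *In buffers.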
    // Unpack the received triples back into the local nonzero map.
    for (size_t n = 0; n < nrecvs; n++) {
      NZindex_t nz(iIn[n], jIn[n]);
      localNZ[nz] = vIn[n];
    }

    redistributed = true;
  }
  Teuchos::RCP<matrix_t> getPermutationMatrix() const { return permMatrix; }
 private:
  // The initial 1D linear (row-block) distribution used before redistribution.
  Distribution1DLinear<gno_t, scalar_t> initialDist;

  bool sortByDegree;                  // permute rows/columns by decreasing degree
  Teuchos::RCP<matrix_t> permMatrix;  // permutation matrix (when sortByDegree)
  bool redistributed;                 // nonzeros have been moved to chunk owners
  bool chunksComputed;                // chunkCuts are valid

  int nChunks;           // number of row/column chunks; np = nChunks*(nChunks+1)/2
  double nChunksPerRow;  // nChunks / nrows, used to estimate a chunk from a row ID
  Teuchos::Array<gno_t> chunkCuts;
  // Find the chunk index m such that chunkCuts[m] <= I < chunkCuts[m+1]:
  // start from an estimate based on the average chunk size, then adjust.
  int findIdxInChunks(gno_t I) {
    int m = I * nChunksPerRow;
    while (I < chunkCuts[m]) m--;
    while (I >= chunkCuts[m + 1]) m++;
    return m;
  }
  int procFromChunks(gno_t I, gno_t J) {
    int m = findIdxInChunks(I);
    int n = findIdxInChunks(J);
    int p = m * (m + 1) / 2 + n;
    return p;
  }
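
  // Chunk row m of the lower triangle holds m+1 chunk columns, and the chunk
  // rows above it account for 1 + 2 + ... + m = m*(m+1)/2 ranks, so block
  // (m, n) belongs to rank m*(m+1)/2 + n.  With np = 6 (nChunks = 3):  rank 0
  // owns block (0,0); ranks 1-2 own (1,0)-(1,1); ranks 3-5 own (2,0)-(2,2).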
};

template <typename scalar_t,
          class Node = ::Tpetra::Details::DefaultTypes::node_type>
class LowerTriangularBlockOperator :
  public Tpetra::Operator<scalar_t, Tpetra::Map<>::local_ordinal_type,
                          Tpetra::Map<>::global_ordinal_type, Node>
{
 public:
  using lno_t = Tpetra::Map<>::local_ordinal_type;
  using gno_t = Tpetra::Map<>::global_ordinal_type;
  using dist_t = Tpetra::DistributionLowerTriangularBlock<gno_t, scalar_t>;
  LowerTriangularBlockOperator(
      const Teuchos::RCP<const matrix_t> &lowerTriangularMatrix_,
      const dist_t &dist)
    : lowerTriangularMatrix(lowerTriangularMatrix_)
    , permMatrix(dist.getPermutationMatrix())
  {
    // The operator represents a symmetric matrix, so domain and range must match.
    TEUCHOS_TEST_FOR_EXCEPTION(
        !lowerTriangularMatrix->getRangeMap()->isSameAs(
            *lowerTriangularMatrix->getDomainMap()),
        std::logic_error,
        "The Domain and Range maps of the LowerTriangularBlock matrix "
        "must be the same.");
    // Extract the diagonal (stored on the row map) and export it to the range
    // map; ADD accumulates any off-process contributions.
    vector_t diagByRowMap(lowerTriangularMatrix->getRowMap());
    lowerTriangularMatrix->getLocalDiagCopy(diagByRowMap);
    diag = Teuchos::rcp(new vector_t(lowerTriangularMatrix->getRangeMap()));

    Tpetra::Export<lno_t, gno_t, Node> exporter(lowerTriangularMatrix->getRowMap(),
                                                lowerTriangularMatrix->getRangeMap());
    diag->doExport(diagByRowMap, exporter, Tpetra::ADD);
  }
  void apply(const mvector_t &x, mvector_t &y, Teuchos::ETransp mode,
             scalar_t alpha, scalar_t beta) const
  {
    scalar_t ZERO = Teuchos::ScalarTraits<scalar_t>::zero();
    scalar_t ONE = Teuchos::ScalarTraits<scalar_t>::one();
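
    // The stored matrix L is the lower triangle (including the diagonal) of a
    // symmetric matrix, so the full operator is applied in three steps:
    //   y  = beta*y + alpha*L*x     (lower triangle)
    //   y +=          alpha*L^T*x   (upper triangle, via the transpose)
    //   y -=          alpha*D.*x    (the diagonal was added twice above)
    // i.e.  y = beta*y + alpha*(L + L^T - D)*x.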
    if (permMatrix == Teuchos::null) {
      lowerTriangularMatrix->apply(x, y, Teuchos::NO_TRANS, alpha, beta);
      lowerTriangularMatrix->apply(x, y, Teuchos::TRANS, alpha, ONE);
      y.elementWiseMultiply(-alpha, *diag, x, ONE);
    }
    else {
      // Permute the input (and, if needed, the output) into the sorted
      // ordering, apply the operator there, then permute the result back.
      vector_t xtmp(x.getMap(), x.getNumVectors());
      vector_t ytmp(y.getMap(), y.getNumVectors());

      permMatrix->apply(x, xtmp, Teuchos::TRANS);
      if (beta != ZERO) permMatrix->apply(y, ytmp, Teuchos::TRANS);

      lowerTriangularMatrix->apply(xtmp, ytmp, Teuchos::NO_TRANS, alpha, beta);
      lowerTriangularMatrix->apply(xtmp, ytmp, Teuchos::TRANS, alpha, ONE);
      ytmp.elementWiseMultiply(-alpha, *diag, xtmp, ONE);

      permMatrix->apply(ytmp, y, Teuchos::NO_TRANS);
    }
  }
  Teuchos::RCP<const map_t> getDomainMap() const {
    return lowerTriangularMatrix->getDomainMap();
  }
  Teuchos::RCP<const map_t> getRangeMap() const {
    return lowerTriangularMatrix->getRangeMap();
  }
  bool hasTransposeApply() const { return true; }
 private:
  const Teuchos::RCP<const matrix_t> lowerTriangularMatrix;
  const Teuchos::RCP<const matrix_t> permMatrix;
  Teuchos::RCP<vector_t> diag;
};

#endif