Zoltan2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Zoltan2_PartitioningSolution.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // Zoltan2: A package of combinatorial algorithms for scientific computing
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Karen Devine (kddevin@sandia.gov)
39 // Erik Boman (egboman@sandia.gov)
40 // Siva Rajamanickam (srajama@sandia.gov)
41 //
42 // ***********************************************************************
43 //
44 // @HEADER
45 
50 #ifndef _ZOLTAN2_PARTITIONINGSOLUTION_HPP_
51 #define _ZOLTAN2_PARTITIONINGSOLUTION_HPP_
52 
53 namespace Zoltan2 {
54 template <typename Adapter>
56 }
57 
58 #include <Zoltan2_Environment.hpp>
59 #include <Zoltan2_Solution.hpp>
60 #include <Zoltan2_GreedyMWM.hpp>
61 #include <Zoltan2_Algorithm.hpp>
63 #include <cmath>
64 #include <algorithm>
65 #include <vector>
66 #include <limits>
67 
68 #ifdef _MSC_VER
69 #define NOMINMAX
70 #include <windows.h>
71 #endif
72 
73 
74 namespace Zoltan2 {
75 
90 template <typename Adapter>
91  class PartitioningSolution : public Solution
92 {
93 public:
94 
95 #ifndef DOXYGEN_SHOULD_SKIP_THIS
96  typedef typename Adapter::gno_t gno_t;
97  typedef typename Adapter::scalar_t scalar_t;
98  typedef typename Adapter::lno_t lno_t;
99  typedef typename Adapter::part_t part_t;
100  typedef typename Adapter::user_t user_t;
101 #endif
102 
120  PartitioningSolution( const RCP<const Environment> &env,
121  const RCP<const Comm<int> > &comm,
122  int nUserWeights,
123  const RCP<Algorithm<Adapter> > &algorithm = Teuchos::null);
124 
154  PartitioningSolution(const RCP<const Environment> &env,
155  const RCP<const Comm<int> > &comm,
156  int nUserWeights, ArrayView<ArrayRCP<part_t> > reqPartIds,
157  ArrayView<ArrayRCP<scalar_t> > reqPartSizes,
158  const RCP<Algorithm<Adapter> > &algorithm = Teuchos::null);
159 
161  // Information that the algorithm may wish to query.
162 
165  size_t getTargetGlobalNumberOfParts() const { return nGlobalParts_; }
166 
169  size_t getActualGlobalNumberOfParts() const { return nGlobalPartsSolution_; }
170 
173  size_t getLocalNumberOfParts() const { return nLocalParts_; }
174 
182  scalar_t getLocalFractionOfPart() const { return localFraction_; }
183 
193  bool oneToOnePartDistribution() const { return onePartPerProc_; }
194 
214  const int *getPartDistribution() const {
215  if (partDist_.size() > 0) return &partDist_[0];
216  else return NULL;
217  }
218 
235  const part_t *getProcDistribution() const {
236  if (procDist_.size() > 0) return &procDist_[0];
237  else return NULL;
238  }
239 
243  int getNumberOfCriteria() const { return nWeightsPerObj_; }
244 
245 
252  bool criteriaHasUniformPartSizes(int idx) const { return pSizeUniform_[idx];}
253 
266  scalar_t getCriteriaPartSize(int idx, part_t part) const {
267  if (pSizeUniform_[idx])
268  return 1.0 / nGlobalParts_;
269  else if (pCompactIndex_[idx].size())
270  return pSize_[idx][pCompactIndex_[idx][part]];
271  else
272  return pSize_[idx][part];
273  }
274 
288  bool criteriaHaveSamePartSizes(int c1, int c2) const;
289 
291  // Method used by the algorithm to set results.
292 
319  void setParts(ArrayRCP<part_t> &partList);
320 
322 
334  void RemapParts();
335 
337  /* Return the weight of objects staying with a given remap.
338  * If remap is NULL, compute weight of objects staying with given partition
339  */
340  long measure_stays(part_t *remap, int *idx, part_t *adj, long *wgt,
341  part_t nrhs, part_t nlhs)
342  {
343  long staying = 0;
344  for (part_t i = 0; i < nrhs; i++) {
345  part_t k = (remap ? remap[i] : i);
346  for (part_t j = idx[k]; j < idx[k+1]; j++) {
347  if (i == (adj[j]-nlhs)) {
348  staying += wgt[j];
349  break;
350  }
351  }
352  }
353  return staying;
354  }
355 
357  // Results that may be queried by the user, by migration methods,
358  // or by metric calculation methods.
359  // We return raw pointers so users don't have to learn about our
360  // pointer wrappers.
361 
364  inline const RCP<const Comm<int> > &getCommunicator() const { return comm_;}
365 
368  inline const RCP<const Environment> &getEnvironment() const { return env_;}
369 
372  const part_t *getPartListView() const {
373  if (parts_.size() > 0) return parts_.getRawPtr();
374  else return NULL;
375  }
376 
381  const int *getProcListView() const {
382  if (procs_.size() > 0) return procs_.getRawPtr();
383  else return NULL;
384  }
385 
388  virtual bool isPartitioningTreeBinary() const
389  {
390  if (this->algorithm_ == Teuchos::null)
391  throw std::logic_error("no partitioning algorithm has been run yet");
392  return this->algorithm_->isPartitioningTreeBinary();
393  }
394 
397  void getPartitionTree(part_t & numTreeVerts,
398  std::vector<part_t> & permPartNums,
399  std::vector<part_t> & splitRangeBeg,
400  std::vector<part_t> & splitRangeEnd,
401  std::vector<part_t> & treeVertParents) const {
402 
403  part_t numParts = static_cast<part_t>(getTargetGlobalNumberOfParts());
404 
405  if (this->algorithm_ == Teuchos::null)
406  throw std::logic_error("no partitioning algorithm has been run yet");
407  this->algorithm_->getPartitionTree(
408  numParts, // may want to change how this is passed through
409  numTreeVerts,
410  permPartNums,
411  splitRangeBeg,
412  splitRangeEnd,
413  treeVertParents);
414  }
415 
418  std::vector<Zoltan2::coordinateModelPartBox> &
420  {
421  return this->algorithm_->getPartBoxesView();
422  }
423 
425  // when a point lies on a part boundary, the lowest part
426  // number on that boundary is returned.
427  // Note that not all partitioning algorithms will support
428  // this method.
429  //
430  // \param dim : the number of dimensions specified for the point in space
431  // \param point : the coordinates of the point in space; array of size dim
432  // \return the part number of a part overlapping the given point
433  part_t pointAssign(int dim, scalar_t *point) const
434  {
435  part_t p;
436  try {
437  if (this->algorithm_ == Teuchos::null)
438  throw std::logic_error("no partitioning algorithm has been run yet");
439 
440  p = this->algorithm_->pointAssign(dim, point);
441  }
443  return p;
444  }
445 
447  // This method allocates memory for the return argument, but does not
448  // control that memory. The user is responsible for freeing the
449  // memory.
450  //
451  // \param dim : (in) the number of dimensions specified for the box
452  // \param lower : (in) the coordinates of the lower corner of the box;
453  // array of size dim
454  // \param upper : (in) the coordinates of the upper corner of the box;
455  // array of size dim
456  // \param nPartsFound : (out) the number of parts overlapping the box
457  // \param partsFound : (out) array of parts overlapping the box
458  void boxAssign(int dim, scalar_t *lower, scalar_t *upper,
459  size_t &nPartsFound, part_t **partsFound) const
460  {
461  try {
462  if (this->algorithm_ == Teuchos::null)
463  throw std::logic_error("no partitioning algorithm has been run yet");
464 
465  this->algorithm_->boxAssign(dim, lower, upper, nPartsFound, partsFound);
466  }
468  }
469 
470 
473  void getCommunicationGraph(ArrayRCP <part_t> &comXAdj,
474  ArrayRCP <part_t> &comAdj) const
475  {
476  try {
477  if (this->algorithm_ == Teuchos::null)
478  throw std::logic_error("no partitioning algorithm has been run yet");
479 
480  this->algorithm_->getCommunicationGraph(this, comXAdj, comAdj);
481  }
483  }
484 
502  void getPartsForProc(int procId, double &numParts, part_t &partMin,
503  part_t &partMax) const
504  {
505  env_->localInputAssertion(__FILE__, __LINE__, "invalid process id",
506  procId >= 0 && procId < comm_->getSize(), BASIC_ASSERTION);
507 
508  procToPartsMap(procId, numParts, partMin, partMax);
509  }
510 
522  void getProcsForPart(part_t partId, part_t &procMin, part_t &procMax) const
523  {
524  env_->localInputAssertion(__FILE__, __LINE__, "invalid part id",
525  partId >= 0 && partId < nGlobalParts_, BASIC_ASSERTION);
526 
527  partToProcsMap(partId, procMin, procMax);
528  }
529 
530 private:
531  void partToProc(bool doCheck, bool haveNumLocalParts, bool haveNumGlobalParts,
532  int numLocalParts, int numGlobalParts);
533 
534  void procToPartsMap(int procId, double &numParts, part_t &partMin,
535  part_t &partMax) const;
536 
537  void partToProcsMap(part_t partId, int &procMin, int &procMax) const;
538 
539  void setPartDistribution();
540 
541  void setPartSizes(ArrayView<ArrayRCP<part_t> > reqPartIds,
542  ArrayView<ArrayRCP<scalar_t> > reqPartSizes);
543 
544  void computePartSizes(int widx, ArrayView<part_t> ids,
545  ArrayView<scalar_t> sizes);
546 
547  void broadcastPartSizes(int widx);
548 
549 
550  RCP<const Environment> env_; // has application communicator
551  const RCP<const Comm<int> > comm_; // the problem communicator
552 
553  //part box boundaries as a result of geometric partitioning algorithm.
554  RCP<std::vector<Zoltan2::coordinateModelPartBox> > partBoxes;
555 
556  part_t nGlobalParts_;// target global number of parts
557  part_t nLocalParts_; // number of parts to be on this process
558 
559  scalar_t localFraction_; // approx fraction of a part on this process
560  int nWeightsPerObj_; // if user has no weights, this is 1 TODO: WHY???
561 
562  // If process p is to be assigned part p for all p, then onePartPerProc_
563  // is true. Otherwise it is false, and either procDist_ or partDist_
564  // describes the allocation of parts to processes.
565  //
566  // If parts are never split across processes, then procDist_ is defined
567  // as follows:
568  //
569  // partId = procDist_[procId]
570  // partIdNext = procDist_[procId+1]
571  // globalNumberOfParts = procDist_[numProcs]
572  //
573  // meaning that the parts assigned to process procId range from
574  // [partId, partIdNext). If partIdNext is the same as partId, then
575  // process procId has no parts.
576  //
577  // If the number parts is less than the number of processes, and the
578  // user did not specify "num_local_parts" for each of the processes, then
579  // parts are split across processes, and partDist_ is defined rather than
580  // procDist_.
581  //
582  // procId = partDist_[partId]
583  // procIdNext = partDist_[partId+1]
584  // globalNumberOfProcs = partDist_[numParts]
585  //
586  // which implies that the part partId is shared by processes in the
587  // the range [procId, procIdNext).
588  //
589  // We use std::vector so we can use upper_bound algorithm
590 
591  bool onePartPerProc_; // either this is true...
592  std::vector<int> partDist_; // or this is defined ...
593  std::vector<part_t> procDist_; // or this is defined.
594  bool procDistEquallySpread_; // if procDist_ is used and
595  // #parts > #procs and
596  // num_local_parts is not specified,
597  // parts are evenly distributed to procs
598 
599  // In order to minimize the storage required for part sizes, we
600  // have three different representations.
601  //
602  // If the part sizes for weight index w are all the same, then:
603  // pSizeUniform_[w] = true
604  // pCompactIndex_[w].size() = 0
605  // pSize_[w].size() = 0
606  //
607  // and the size for part p is 1.0 / nparts.
608  //
609  // If part sizes differ for each part in weight index w, but there
610  // are no more than 64 distinct sizes:
611  // pSizeUniform_[w] = false
612  // pCompactIndex_[w].size() = number of parts
613  // pSize_[w].size() = number of different sizes
614  //
615  // and the size for part p is pSize_[pCompactIndex_[p]].
616  //
617  // If part sizes differ for each part in weight index w, and there
618  // are more than 64 distinct sizes:
619  // pSizeUniform_[w] = false
620  // pCompactIndex_[w].size() = 0
621  // pSize_[w].size() = nparts
622  //
623  // and the size for part p is pSize_[p].
624  //
625  // NOTE: If we expect to have similar cases, i.e. a long list of scalars
626  // where it is highly possible that just a few unique values appear,
627  // then we may want to make this a class. The advantage is that we
628  // save a long list of 1-byte indices instead of a long list of scalars.
629 
630  ArrayRCP<bool> pSizeUniform_;
631  ArrayRCP<ArrayRCP<unsigned char> > pCompactIndex_;
632  ArrayRCP<ArrayRCP<scalar_t> > pSize_;
633 
635  // The algorithm sets these values upon completion.
636 
637  ArrayRCP<part_t> parts_; // part number assigned to localid[i]
638 
639  bool haveSolution_;
640 
641  part_t nGlobalPartsSolution_; // global number of parts in solution
642 
644  // The solution calculates this from the part assignments,
645  // unless onePartPerProc_.
646 
647  ArrayRCP<int> procs_; // process rank assigned to localid[i]
648 
650  // Algorithm used to compute the solution;
651  // needed for post-processing with pointAssign or getCommunicationGraph
652  const RCP<Algorithm<Adapter> > algorithm_; //
653 };
654 
656 // Definitions
658 
659 template <typename Adapter>
661  const RCP<const Environment> &env,
662  const RCP<const Comm<int> > &comm,
663  int nUserWeights,
664  const RCP<Algorithm<Adapter> > &algorithm)
665  : env_(env), comm_(comm),
666  partBoxes(),
667  nGlobalParts_(0), nLocalParts_(0),
668  localFraction_(0), nWeightsPerObj_(),
669  onePartPerProc_(false), partDist_(), procDist_(),
670  procDistEquallySpread_(false),
671  pSizeUniform_(), pCompactIndex_(), pSize_(),
672  parts_(), haveSolution_(false), nGlobalPartsSolution_(0),
673  procs_(), algorithm_(algorithm)
674 {
675  nWeightsPerObj_ = (nUserWeights ? nUserWeights : 1); // TODO: WHY?? WHY NOT ZERO?
676 
677  setPartDistribution();
678 
679  // We must call setPartSizes() because part sizes may have
680  // been provided by the user on other processes.
681 
682  ArrayRCP<part_t> *noIds = new ArrayRCP<part_t> [nWeightsPerObj_];
683  ArrayRCP<scalar_t> *noSizes = new ArrayRCP<scalar_t> [nWeightsPerObj_];
684  ArrayRCP<ArrayRCP<part_t> > ids(noIds, 0, nWeightsPerObj_, true);
685  ArrayRCP<ArrayRCP<scalar_t> > sizes(noSizes, 0, nWeightsPerObj_, true);
686 
687  setPartSizes(ids.view(0, nWeightsPerObj_), sizes.view(0, nWeightsPerObj_));
688 
689  env_->memory("After construction of solution");
690 }
691 
692 template <typename Adapter>
694  const RCP<const Environment> &env,
695  const RCP<const Comm<int> > &comm,
696  int nUserWeights,
697  ArrayView<ArrayRCP<part_t> > reqPartIds,
698  ArrayView<ArrayRCP<scalar_t> > reqPartSizes,
699  const RCP<Algorithm<Adapter> > &algorithm)
700  : env_(env), comm_(comm),
701  partBoxes(),
702  nGlobalParts_(0), nLocalParts_(0),
703  localFraction_(0), nWeightsPerObj_(),
704  onePartPerProc_(false), partDist_(), procDist_(),
705  procDistEquallySpread_(false),
706  pSizeUniform_(), pCompactIndex_(), pSize_(),
707  parts_(), haveSolution_(false), nGlobalPartsSolution_(0),
708  procs_(), algorithm_(algorithm)
709 {
710  nWeightsPerObj_ = (nUserWeights ? nUserWeights : 1); // TODO: WHY?? WHY NOT ZERO?
711 
712  setPartDistribution();
713 
714  setPartSizes(reqPartIds, reqPartSizes);
715 
716  env_->memory("After construction of solution");
717 }
718 
719 template <typename Adapter>
721 {
722  // Did the caller define num_global_parts and/or num_local_parts?
723 
724  const ParameterList &pl = env_->getParameters();
725  size_t haveGlobalNumParts=0, haveLocalNumParts=0;
726  int numLocal=0, numGlobal=0;
727 
728  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("num_global_parts");
729 
730  if (pe){
731  haveGlobalNumParts = 1;
732  nGlobalParts_ = part_t(pe->getValue(&nGlobalParts_));
733  numGlobal = nGlobalParts_;
734  }
735 
736  pe = pl.getEntryPtr("num_local_parts");
737 
738  if (pe){
739  haveLocalNumParts = 1;
740  nLocalParts_ = part_t(pe->getValue(&nLocalParts_));
741  numLocal = nLocalParts_;
742  }
743 
744  try{
745  // Sets onePartPerProc_, partDist_, and procDist_
746 
747  partToProc(true, haveLocalNumParts, haveGlobalNumParts,
748  numLocal, numGlobal);
749  }
751 
752  int nprocs = comm_->getSize();
753  int rank = comm_->getRank();
754 
755  if (onePartPerProc_){
756  nGlobalParts_ = nprocs;
757  nLocalParts_ = 1;
758  }
759  else if (partDist_.size() > 0){ // more procs than parts
760  nGlobalParts_ = partDist_.size() - 1;
761  int pstart = partDist_[0];
762  for (part_t i=1; i <= nGlobalParts_; i++){
763  int pend = partDist_[i];
764  if (rank >= pstart && rank < pend){
765  int numOwners = pend - pstart;
766  nLocalParts_ = 1;
767  localFraction_ = 1.0 / numOwners;
768  break;
769  }
770  pstart = pend;
771  }
772  }
773  else if (procDist_.size() > 0){ // more parts than procs
774  nGlobalParts_ = procDist_[nprocs];
775  nLocalParts_ = procDist_[rank+1] - procDist_[rank];
776  }
777  else {
778  throw std::logic_error("partToProc error");
779  }
780 
781 }
782 
783 template <typename Adapter>
784  void PartitioningSolution<Adapter>::setPartSizes(
785  ArrayView<ArrayRCP<part_t> > ids, ArrayView<ArrayRCP<scalar_t> > sizes)
786 {
787  int widx = nWeightsPerObj_;
788  bool fail=false;
789 
790  size_t *countBuf = new size_t [widx*2];
791  ArrayRCP<size_t> counts(countBuf, 0, widx*2, true);
792 
793  fail = ((ids.size() != widx) || (sizes.size() != widx));
794 
795  for (int w=0; !fail && w < widx; w++){
796  counts[w] = ids[w].size();
797  if (ids[w].size() != sizes[w].size()) fail=true;
798  }
799 
800  env_->globalBugAssertion(__FILE__, __LINE__, "bad argument arrays", fail==0,
801  COMPLEX_ASSERTION, comm_);
802 
803  // Are all part sizes the same? This is the common case.
804 
805  ArrayRCP<scalar_t> *emptySizes= new ArrayRCP<scalar_t> [widx];
806  pSize_ = arcp(emptySizes, 0, widx);
807 
808  ArrayRCP<unsigned char> *emptyIndices= new ArrayRCP<unsigned char> [widx];
809  pCompactIndex_ = arcp(emptyIndices, 0, widx);
810 
811  bool *info = new bool [widx];
812  pSizeUniform_ = arcp(info, 0, widx);
813  for (int w=0; w < widx; w++)
814  pSizeUniform_[w] = true;
815 
816  if (nGlobalParts_ == 1){
817  return; // there's only one part in the whole problem
818  }
819 
820  size_t *ptr1 = counts.getRawPtr();
821  size_t *ptr2 = counts.getRawPtr() + widx;
822 
823  try{
824  reduceAll<int, size_t>(*comm_, Teuchos::REDUCE_MAX, widx, ptr1, ptr2);
825  }
826  Z2_THROW_OUTSIDE_ERROR(*env_);
827 
828  bool zero = true;
829 
830  for (int w=0; w < widx; w++)
831  if (counts[widx+w] > 0){
832  zero = false;
833  pSizeUniform_[w] = false;
834  }
835 
836  if (zero) // Part sizes for all criteria are uniform.
837  return;
838 
839  // Compute the part sizes for criteria for which part sizes were
840  // supplied. Normalize for each criteria so part sizes sum to one.
841 
842  int nprocs = comm_->getSize();
843  int rank = comm_->getRank();
844 
845  for (int w=0; w < widx; w++){
846  if (pSizeUniform_[w]) continue;
847 
848  // Send all ids and sizes to one process.
849  // (There is no simple gather method in Teuchos.)
850 
851  part_t length = ids[w].size();
852  part_t *allLength = new part_t [nprocs];
853  Teuchos::gatherAll<int, part_t>(*comm_, 1, &length,
854  nprocs, allLength);
855 
856  if (rank == 0){
857  int total = 0;
858  for (int i=0; i < nprocs; i++)
859  total += allLength[i];
860 
861  part_t *partNums = new part_t [total];
862  scalar_t *partSizes = new scalar_t [total];
863 
864  ArrayView<part_t> idArray(partNums, total);
865  ArrayView<scalar_t> sizeArray(partSizes, total);
866 
867  if (length > 0){
868  for (int i=0; i < length; i++){
869  *partNums++ = ids[w][i];
870  *partSizes++ = sizes[w][i];
871  }
872  }
873 
874  for (int p=1; p < nprocs; p++){
875  if (allLength[p] > 0){
876  Teuchos::receive<int, part_t>(*comm_, p,
877  allLength[p], partNums);
878  Teuchos::receive<int, scalar_t>(*comm_, p,
879  allLength[p], partSizes);
880  partNums += allLength[p];
881  partSizes += allLength[p];
882  }
883  }
884 
885  delete [] allLength;
886 
887  try{
888  computePartSizes(w, idArray, sizeArray);
889  }
891 
892  delete [] idArray.getRawPtr();
893  delete [] sizeArray.getRawPtr();
894  }
895  else{
896  delete [] allLength;
897  if (length > 0){
898  Teuchos::send<int, part_t>(*comm_, length, ids[w].getRawPtr(), 0);
899  Teuchos::send<int, scalar_t>(*comm_, length, sizes[w].getRawPtr(), 0);
900  }
901  }
902 
903  broadcastPartSizes(w);
904  }
905 }
906 
907 template <typename Adapter>
908  void PartitioningSolution<Adapter>::broadcastPartSizes(int widx)
909 {
910  env_->localBugAssertion(__FILE__, __LINE__, "preallocations",
911  pSize_.size()>widx &&
912  pSizeUniform_.size()>widx && pCompactIndex_.size()>widx,
914 
915  int rank = comm_->getRank();
916  int nprocs = comm_->getSize();
917  part_t nparts = nGlobalParts_;
918 
919  if (nprocs < 2)
920  return;
921 
922  char flag=0;
923 
924  if (rank == 0){
925  if (pSizeUniform_[widx] == true)
926  flag = 1;
927  else if (pCompactIndex_[widx].size() > 0)
928  flag = 2;
929  else
930  flag = 3;
931  }
932 
933  try{
934  Teuchos::broadcast<int, char>(*comm_, 0, 1, &flag);
935  }
936  Z2_THROW_OUTSIDE_ERROR(*env_);
937 
938  if (flag == 1){
939  if (rank > 0)
940  pSizeUniform_[widx] = true;
941 
942  return;
943  }
944 
945  if (flag == 2){
946 
947  // broadcast the indices into the size list
948 
949  unsigned char *idxbuf = NULL;
950 
951  if (rank > 0){
952  idxbuf = new unsigned char [nparts];
953  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, idxbuf);
954  }
955  else{
956  env_->localBugAssertion(__FILE__, __LINE__, "index list size",
957  pCompactIndex_[widx].size() == nparts, COMPLEX_ASSERTION);
958  idxbuf = pCompactIndex_[widx].getRawPtr();
959  }
960 
961  try{
962  // broadcast of unsigned char is not supported
963  Teuchos::broadcast<int, char>(*comm_, 0, nparts,
964  reinterpret_cast<char *>(idxbuf));
965  }
966  Z2_THROW_OUTSIDE_ERROR(*env_);
967 
968  if (rank > 0)
969  pCompactIndex_[widx] = arcp(idxbuf, 0, nparts, true);
970 
971  // broadcast the list of different part sizes
972 
973  unsigned char maxIdx=0;
974  for (part_t p=0; p < nparts; p++)
975  if (idxbuf[p] > maxIdx) maxIdx = idxbuf[p];
976 
977  int numSizes = maxIdx + 1;
978 
979  scalar_t *sizeList = NULL;
980 
981  if (rank > 0){
982  sizeList = new scalar_t [numSizes];
983  env_->localMemoryAssertion(__FILE__, __LINE__, numSizes, sizeList);
984  }
985  else{
986  env_->localBugAssertion(__FILE__, __LINE__, "wrong number of sizes",
987  numSizes == pSize_[widx].size(), COMPLEX_ASSERTION);
988 
989  sizeList = pSize_[widx].getRawPtr();
990  }
991 
992  try{
993  Teuchos::broadcast<int, scalar_t>(*comm_, 0, numSizes, sizeList);
994  }
995  Z2_THROW_OUTSIDE_ERROR(*env_);
996 
997  if (rank > 0)
998  pSize_[widx] = arcp(sizeList, 0, numSizes, true);
999 
1000  return;
1001  }
1002 
1003  if (flag == 3){
1004 
1005  // broadcast the size of each part
1006 
1007  scalar_t *sizeList = NULL;
1008 
1009  if (rank > 0){
1010  sizeList = new scalar_t [nparts];
1011  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, sizeList);
1012  }
1013  else{
1014  env_->localBugAssertion(__FILE__, __LINE__, "wrong number of sizes",
1015  nparts == pSize_[widx].size(), COMPLEX_ASSERTION);
1016 
1017  sizeList = pSize_[widx].getRawPtr();
1018  }
1019 
1020  try{
1021  Teuchos::broadcast<int, scalar_t >(*comm_, 0, nparts, sizeList);
1022  }
1023  Z2_THROW_OUTSIDE_ERROR(*env_);
1024 
1025  if (rank > 0)
1026  pSize_[widx] = arcp(sizeList, 0, nparts);
1027 
1028  return;
1029  }
1030 }
1031 
1032 template <typename Adapter>
1033  void PartitioningSolution<Adapter>::computePartSizes(int widx,
1034  ArrayView<part_t> ids, ArrayView<scalar_t> sizes)
1035 {
1036  int len = ids.size();
1037 
1038  if (len == 0){
1039  pSizeUniform_[widx] = true;
1040  return;
1041  }
1042 
1043  env_->localBugAssertion(__FILE__, __LINE__, "bad array sizes",
1044  len>0 && sizes.size()==len, COMPLEX_ASSERTION);
1045 
1046  env_->localBugAssertion(__FILE__, __LINE__, "bad index",
1047  widx>=0 && widx<nWeightsPerObj_, COMPLEX_ASSERTION);
1048 
1049  env_->localBugAssertion(__FILE__, __LINE__, "preallocations",
1050  pSize_.size()>widx &&
1051  pSizeUniform_.size()>widx && pCompactIndex_.size()>widx,
1053 
1054  // Check ids and sizes and find min, max and average sizes.
1055  // If sizes are very close to uniform, call them uniform parts.
1056 
1057  part_t nparts = nGlobalParts_;
1058  unsigned char *buf = new unsigned char [nparts];
1059  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, buf);
1060  memset(buf, 0, nparts);
1061  ArrayRCP<unsigned char> partIdx(buf, 0, nparts, true);
1062 
1063  scalar_t epsilon = 10e-5 / nparts;
1064  scalar_t min=sizes[0], max=sizes[0], sum=0;
1065 
1066  for (int i=0; i < len; i++){
1067  part_t id = ids[i];
1068  scalar_t size = sizes[i];
1069 
1070  env_->localInputAssertion(__FILE__, __LINE__, "invalid part id",
1071  id>=0 && id<nparts, BASIC_ASSERTION);
1072 
1073  env_->localInputAssertion(__FILE__, __LINE__, "invalid part size", size>=0,
1074  BASIC_ASSERTION);
1075 
1076  // TODO: we could allow users to specify multiple sizes for the same
1077  // part if we add a parameter that says what we are to do with them:
1078  // add them or take the max.
1079 
1080  env_->localInputAssertion(__FILE__, __LINE__,
1081  "multiple sizes provided for one part", partIdx[id]==0, BASIC_ASSERTION);
1082 
1083  partIdx[id] = 1; // mark that we have a size for this part
1084 
1085  if (size < min) min = size;
1086  if (size > max) max = size;
1087  sum += size;
1088  }
1089 
1090  if (sum == 0){
1091 
1092  // User has given us a list of parts of size 0, we'll set
1093  // the rest to them to equally sized parts.
1094 
1095  scalar_t *allSizes = new scalar_t [2];
1096  env_->localMemoryAssertion(__FILE__, __LINE__, 2, allSizes);
1097 
1098  ArrayRCP<scalar_t> sizeArray(allSizes, 0, 2, true);
1099 
1100  int numNonZero = nparts - len;
1101 
1102  allSizes[0] = 0.0;
1103  allSizes[1] = 1.0 / numNonZero;
1104 
1105  for (part_t p=0; p < nparts; p++)
1106  buf[p] = 1; // index to default part size
1107 
1108  for (int i=0; i < len; i++)
1109  buf[ids[i]] = 0; // index to part size zero
1110 
1111  pSize_[widx] = sizeArray;
1112  pCompactIndex_[widx] = partIdx;
1113 
1114  return;
1115  }
1116 
1117  if (max - min <= epsilon){
1118  pSizeUniform_[widx] = true;
1119  return;
1120  }
1121 
1122  // A size for parts that were not specified:
1123  scalar_t avg = sum / nparts;
1124 
1125  // We are going to merge part sizes that are very close. This takes
1126  // computation time now, but can save considerably in the storage of
1127  // all part sizes on each process. For example, a common case may
1128  // be some parts are size 1 and all the rest are size 2.
1129 
1130  scalar_t *tmp = new scalar_t [len];
1131  env_->localMemoryAssertion(__FILE__, __LINE__, len, tmp);
1132  memcpy(tmp, sizes.getRawPtr(), sizeof(scalar_t) * len);
1133  ArrayRCP<scalar_t> partSizes(tmp, 0, len, true);
1134 
1135  std::sort(partSizes.begin(), partSizes.end());
1136 
1137  // create a list of sizes that are unique within epsilon
1138 
1139  Array<scalar_t> nextUniqueSize;
1140  nextUniqueSize.push_back(partSizes[len-1]); // largest
1141  scalar_t curr = partSizes[len-1];
1142  int avgIndex = len;
1143  bool haveAvg = false;
1144  if (curr - avg <= epsilon)
1145  avgIndex = 0;
1146 
1147  for (int i=len-2; i >= 0; i--){
1148  scalar_t val = partSizes[i];
1149  if (curr - val > epsilon){
1150  nextUniqueSize.push_back(val); // the highest in the group
1151  curr = val;
1152  if (avgIndex==len && val > avg && val - avg <= epsilon){
1153  // the average would be in this group
1154  avgIndex = nextUniqueSize.size() - 1;
1155  haveAvg = true;
1156  }
1157  }
1158  }
1159 
1160  partSizes.clear();
1161 
1162  size_t numSizes = nextUniqueSize.size();
1163  int sizeArrayLen = numSizes;
1164 
1165  if (numSizes < 64){
1166  // We can store the size for every part in a compact way.
1167 
1168  // Create a list of all sizes in increasing order
1169 
1170  if (!haveAvg) sizeArrayLen++; // need to include average
1171 
1172  scalar_t *allSizes = new scalar_t [sizeArrayLen];
1173  env_->localMemoryAssertion(__FILE__, __LINE__, sizeArrayLen, allSizes);
1174  ArrayRCP<scalar_t> sizeArray(allSizes, 0, sizeArrayLen, true);
1175 
1176  int newAvgIndex = sizeArrayLen;
1177 
1178  for (int i=numSizes-1, idx=0; i >= 0; i--){
1179 
1180  if (newAvgIndex == sizeArrayLen){
1181 
1182  if (haveAvg && i==avgIndex)
1183  newAvgIndex = idx;
1184 
1185  else if (!haveAvg && avg < nextUniqueSize[i]){
1186  newAvgIndex = idx;
1187  allSizes[idx++] = avg;
1188  }
1189  }
1190 
1191  allSizes[idx++] = nextUniqueSize[i];
1192  }
1193 
1194  env_->localBugAssertion(__FILE__, __LINE__, "finding average in list",
1195  newAvgIndex < sizeArrayLen, COMPLEX_ASSERTION);
1196 
1197  for (int i=0; i < nparts; i++){
1198  buf[i] = newAvgIndex; // index to default part size
1199  }
1200 
1201  sum = (nparts - len) * allSizes[newAvgIndex];
1202 
1203  for (int i=0; i < len; i++){
1204  int id = ids[i];
1205  scalar_t size = sizes[i];
1206  int index;
1207 
1208  // Find the first size greater than or equal to this size.
1209 
1210  if (size < avg && avg - size <= epsilon)
1211  index = newAvgIndex;
1212  else{
1213  typename ArrayRCP<scalar_t>::iterator found =
1214  std::lower_bound(sizeArray.begin(), sizeArray.end(), size);
1215 
1216  env_->localBugAssertion(__FILE__, __LINE__, "size array",
1217  found != sizeArray.end(), COMPLEX_ASSERTION);
1218 
1219  index = found - sizeArray.begin();
1220  }
1221 
1222  buf[id] = index;
1223  sum += allSizes[index];
1224  }
1225 
1226  for (int i=0; i < sizeArrayLen; i++){
1227  sizeArray[i] /= sum;
1228  }
1229 
1230  pCompactIndex_[widx] = partIdx;
1231  pSize_[widx] = sizeArray;
1232  }
1233  else{
1234  // To have access to part sizes, we must store nparts scalar_ts on
1235  // every process. We expect this is a rare case.
1236 
1237  tmp = new scalar_t [nparts];
1238  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, tmp);
1239 
1240  sum += ((nparts - len) * avg);
1241 
1242  for (int i=0; i < nparts; i++){
1243  tmp[i] = avg/sum;
1244  }
1245 
1246  for (int i=0; i < len; i++){
1247  tmp[ids[i]] = sizes[i]/sum;
1248  }
1249 
1250  pSize_[widx] = arcp(tmp, 0, nparts);
1251  }
1252 }
1253 
1254 template <typename Adapter>
1255  void PartitioningSolution<Adapter>::setParts(ArrayRCP<part_t> &partList)
1256 {
1257  env_->debug(DETAILED_STATUS, "Entering setParts");
1258 
1259  size_t len = partList.size();
1260 
1261  // Find the actual number of parts in the solution, which can
1262  // be more or less than the nGlobalParts_ target.
1263  // (We may want to compute the imbalance of a given solution with
1264  // respect to a desired solution. This solution may have more or
1265  // fewer parts that the desired solution.)
1266 
1267  part_t lMax = -1;
1268  part_t lMin = (len > 0 ? std::numeric_limits<part_t>::max() : 0);
1269  part_t gMax, gMin;
1270 
1271  for (size_t i = 0; i < len; i++) {
1272  if (partList[i] < lMin) lMin = partList[i];
1273  if (partList[i] > lMax) lMax = partList[i];
1274  }
1275  Teuchos::reduceAll<int, part_t>(*comm_, Teuchos::REDUCE_MIN, 1, &lMin, &gMin);
1276  Teuchos::reduceAll<int, part_t>(*comm_, Teuchos::REDUCE_MAX, 1, &lMax, &gMax);
1277 
1278  nGlobalPartsSolution_ = gMax - gMin + 1;
1279  parts_ = partList;
1280 
1281  // Now determine which process gets each object, if not one-to-one.
1282 
1283  if (!onePartPerProc_){
1284 
1285  int *procs = new int [len];
1286  env_->localMemoryAssertion(__FILE__, __LINE__, len, procs);
1287  procs_ = arcp<int>(procs, 0, len);
1288 
1289  if (len > 0) {
1290  part_t *parts = partList.getRawPtr();
1291 
1292  if (procDist_.size() > 0){ // parts are not split across procs
1293 
1294  int procId;
1295  for (size_t i=0; i < len; i++){
1296  partToProcsMap(parts[i], procs[i], procId);
1297  }
1298  }
1299  else{ // harder - we need to split the parts across multiple procs
1300 
1301  lno_t *partCounter = new lno_t [nGlobalPartsSolution_];
1302  env_->localMemoryAssertion(__FILE__, __LINE__, nGlobalPartsSolution_,
1303  partCounter);
1304 
1305  int numProcs = comm_->getSize();
1306 
1307  //MD NOTE: there was no initialization for partCounter.
1308  //I added the line below, correct me if I am wrong.
1309  memset(partCounter, 0, sizeof(lno_t) * nGlobalPartsSolution_);
1310 
1311  for (typename ArrayRCP<part_t>::size_type i=0; i < partList.size(); i++)
1312  partCounter[parts[i]]++;
1313 
1314  lno_t *procCounter = new lno_t [numProcs];
1315  env_->localMemoryAssertion(__FILE__, __LINE__, numProcs, procCounter);
1316 
1317  int proc1;
1318  int proc2 = partDist_[0];
1319 
1320  for (part_t part=1; part < nGlobalParts_; part++){
1321  proc1 = proc2;
1322  proc2 = partDist_[part+1];
1323  int numprocs = proc2 - proc1;
1324 
1325  double dNum = partCounter[part];
1326  double dProcs = numprocs;
1327 
1328  //std::cout << "dNum:" << dNum << " dProcs:" << dProcs << std::endl;
1329  double each = floor(dNum/dProcs);
1330  double extra = fmod(dNum,dProcs);
1331 
1332  for (int proc=proc1, i=0; proc<proc2; proc++, i++){
1333  if (i < extra)
1334  procCounter[proc] = lno_t(each) + 1;
1335  else
1336  procCounter[proc] = lno_t(each);
1337  }
1338  }
1339 
1340  delete [] partCounter;
1341 
1342  for (typename ArrayRCP<part_t>::size_type i=0; i < partList.size(); i++){
1343  if (partList[i] >= nGlobalParts_){
1344  // Solution has more parts that targeted. These
1345  // objects just remain on this process.
1346  procs[i] = comm_->getRank();
1347  continue;
1348  }
1349  part_t partNum = parts[i];
1350  proc1 = partDist_[partNum];
1351  proc2 = partDist_[partNum + 1];
1352 
1353  int proc;
1354  for (proc=proc1; proc < proc2; proc++){
1355  if (procCounter[proc] > 0){
1356  procs[i] = proc;
1357  procCounter[proc]--;
1358  break;
1359  }
1360  }
1361  env_->localBugAssertion(__FILE__, __LINE__, "part to proc",
1362  proc < proc2, COMPLEX_ASSERTION);
1363  }
1364 
1365  delete [] procCounter;
1366  }
1367  }
1368  }
1369 
1370  // Now that parts_ info is back on home process, remap the parts.
1371  // TODO: The parts will be inconsistent with the proc assignments after
1372  // TODO: remapping. This problem will go away after we separate process
1373  // TODO: mapping from setParts. But for MueLu's use case, the part
1374  // TODO: remapping is all that matters; they do not use the process mapping.
1375  bool doRemap = false;
1376  const Teuchos::ParameterEntry *pe =
1377  env_->getParameters().getEntryPtr("remap_parts");
1378  if (pe) doRemap = pe->getValue(&doRemap);
1379  if (doRemap) RemapParts();
1380 
1381  haveSolution_ = true;
1382 
1383  env_->memory("After Solution has processed algorithm's answer");
1384  env_->debug(DETAILED_STATUS, "Exiting setParts");
1385 }
1386 
1387 
1388 template <typename Adapter>
1390  double &numParts, part_t &partMin, part_t &partMax) const
1391 {
1392  if (onePartPerProc_){
1393  numParts = 1.0;
1394  partMin = partMax = procId;
1395  }
1396  else if (procDist_.size() > 0){
1397  partMin = procDist_[procId];
1398  partMax = procDist_[procId+1] - 1;
1399  numParts = procDist_[procId+1] - partMin;
1400  }
1401  else{
1402  // find the first p such that partDist_[p] > procId
1403 
1404  std::vector<int>::const_iterator entry;
1405  entry = std::upper_bound(partDist_.begin(), partDist_.end(), procId);
1406 
1407  size_t partIdx = entry - partDist_.begin();
1408  int numProcs = partDist_[partIdx] - partDist_[partIdx-1];
1409  partMin = partMax = int(partIdx) - 1;
1410  numParts = 1.0 / numProcs;
1411  }
1412 }
1413 
1414 template <typename Adapter>
1415  void PartitioningSolution<Adapter>::partToProcsMap(part_t partId,
1416  int &procMin, int &procMax) const
1417 {
1418  if (partId >= nGlobalParts_){
1419  // setParts() may be given an initial solution which uses a
1420  // different number of parts than the desired solution. It is
1421  // still a solution. We keep it on this process.
1422  procMin = procMax = comm_->getRank();
1423  }
1424  else if (onePartPerProc_){
1425  procMin = procMax = int(partId);
1426  }
1427  else if (procDist_.size() > 0){
1428  if (procDistEquallySpread_) {
1429  // Avoid binary search.
1430  double fProcs = comm_->getSize();
1431  double fParts = nGlobalParts_;
1432  double each = fParts / fProcs;
1433  procMin = int(partId / each);
1434  while (procDist_[procMin] > partId) procMin--;
1435  while (procDist_[procMin+1] <= partId) procMin++;
1436  procMax = procMin;
1437  }
1438  else {
1439  // find the first p such that procDist_[p] > partId.
1440  // For now, do a binary search.
1441 
1442  typename std::vector<part_t>::const_iterator entry;
1443  entry = std::upper_bound(procDist_.begin(), procDist_.end(), partId);
1444 
1445  size_t procIdx = entry - procDist_.begin();
1446  procMin = procMax = int(procIdx) - 1;
1447  }
1448  }
1449  else{
1450  procMin = partDist_[partId];
1451  procMax = partDist_[partId+1] - 1;
1452  }
1453 }
1454 
1455 template <typename Adapter>
1457  int c1, int c2) const
1458 {
1459  if (c1 < 0 || c1 >= nWeightsPerObj_ || c2 < 0 || c2 >= nWeightsPerObj_ )
1460  throw std::logic_error("criteriaHaveSamePartSizes error");
1461 
1462  bool theSame = false;
1463 
1464  if (c1 == c2)
1465  theSame = true;
1466 
1467  else if (pSizeUniform_[c1] == true && pSizeUniform_[c2] == true)
1468  theSame = true;
1469 
1470  else if (pCompactIndex_[c1].size() == pCompactIndex_[c2].size()){
1471  theSame = true;
1472  bool useIndex = pCompactIndex_[c1].size() > 0;
1473  if (useIndex){
1474  for (part_t p=0; theSame && p < nGlobalParts_; p++)
1475  if (pSize_[c1][pCompactIndex_[c1][p]] !=
1476  pSize_[c2][pCompactIndex_[c2][p]])
1477  theSame = false;
1478  }
1479  else{
1480  for (part_t p=0; theSame && p < nGlobalParts_; p++)
1481  if (pSize_[c1][p] != pSize_[c2][p])
1482  theSame = false;
1483  }
1484  }
1485 
1486  return theSame;
1487 }
1488 
1504 template <typename Adapter>
1506  bool doCheck, bool haveNumLocalParts, bool haveNumGlobalParts,
1507  int numLocalParts, int numGlobalParts)
1508 {
1509 #ifdef _MSC_VER
1510  typedef SSIZE_T ssize_t;
1511 #endif
1512  int nprocs = comm_->getSize();
1513  ssize_t reducevals[4];
1514  ssize_t sumHaveGlobal=0, sumHaveLocal=0;
1515  ssize_t sumGlobal=0, sumLocal=0;
1516  ssize_t maxGlobal=0, maxLocal=0;
1517  ssize_t vals[4] = {haveNumGlobalParts, haveNumLocalParts,
1518  numGlobalParts, numLocalParts};
1519 
1520  partDist_.clear();
1521  procDist_.clear();
1522 
1523  if (doCheck){
1524 
1525  try{
1526  reduceAll<int, ssize_t>(*comm_, Teuchos::REDUCE_SUM, 4, vals, reducevals);
1527  }
1528  Z2_THROW_OUTSIDE_ERROR(*env_);
1529 
1530  sumHaveGlobal = reducevals[0];
1531  sumHaveLocal = reducevals[1];
1532  sumGlobal = reducevals[2];
1533  sumLocal = reducevals[3];
1534 
1535  env_->localInputAssertion(__FILE__, __LINE__,
1536  "Either all procs specify num_global/local_parts or none do",
1537  (sumHaveGlobal == 0 || sumHaveGlobal == nprocs) &&
1538  (sumHaveLocal == 0 || sumHaveLocal == nprocs),
1539  BASIC_ASSERTION);
1540  }
1541  else{
1542  if (haveNumLocalParts)
1543  sumLocal = numLocalParts * nprocs;
1544  if (haveNumGlobalParts)
1545  sumGlobal = numGlobalParts * nprocs;
1546 
1547  sumHaveGlobal = haveNumGlobalParts ? nprocs : 0;
1548  sumHaveLocal = haveNumLocalParts ? nprocs : 0;
1549 
1550  maxLocal = numLocalParts;
1551  maxGlobal = numGlobalParts;
1552  }
1553 
1554  if (!haveNumLocalParts && !haveNumGlobalParts){
1555  onePartPerProc_ = true; // default if user did not specify
1556  return;
1557  }
1558 
1559  if (haveNumGlobalParts){
1560  if (doCheck){
1561  vals[0] = numGlobalParts;
1562  vals[1] = numLocalParts;
1563  try{
1564  reduceAll<int, ssize_t>(
1565  *comm_, Teuchos::REDUCE_MAX, 2, vals, reducevals);
1566  }
1567  Z2_THROW_OUTSIDE_ERROR(*env_);
1568 
1569  maxGlobal = reducevals[0];
1570  maxLocal = reducevals[1];
1571 
1572  env_->localInputAssertion(__FILE__, __LINE__,
1573  "Value for num_global_parts is different on different processes.",
1574  maxGlobal * nprocs == sumGlobal, BASIC_ASSERTION);
1575  }
1576 
1577  if (sumLocal){
1578  env_->localInputAssertion(__FILE__, __LINE__,
1579  "Sum of num_local_parts does not equal requested num_global_parts",
1580  sumLocal == numGlobalParts, BASIC_ASSERTION);
1581 
1582  if (sumLocal == nprocs && maxLocal == 1){
1583  onePartPerProc_ = true; // user specified one part per proc
1584  return;
1585  }
1586  }
1587  else{
1588  if (maxGlobal == nprocs){
1589  onePartPerProc_ = true; // user specified num parts is num procs
1590  return;
1591  }
1592  }
1593  }
1594 
1595  // If we are here, we do not have #parts == #procs.
1596 
1597  if (sumHaveLocal == nprocs){
1598  //
1599  // We will go by the number of local parts specified.
1600  //
1601 
1602  try{
1603  procDist_.resize(nprocs+1);
1604  }
1605  catch (std::exception &e){
1606  throw(std::bad_alloc());
1607  }
1608 
1609  part_t *procArray = &procDist_[0];
1610 
1611  try{
1612  part_t tmp = part_t(numLocalParts);
1613  gatherAll<int, part_t>(*comm_, 1, &tmp, nprocs, procArray + 1);
1614  }
1615  Z2_THROW_OUTSIDE_ERROR(*env_);
1616 
1617  procArray[0] = 0;
1618 
1619  for (int proc=0; proc < nprocs; proc++)
1620  procArray[proc+1] += procArray[proc];
1621  }
1622  else{
1623  //
1624  // We will allocate global number of parts to the processes.
1625  //
1626  double fParts = numGlobalParts;
1627  double fProcs = nprocs;
1628 
1629  if (fParts < fProcs){
1630 
1631  try{
1632  partDist_.resize(size_t(fParts+1));
1633  }
1634  catch (std::exception &e){
1635  throw(std::bad_alloc());
1636  }
1637 
1638  int *partArray = &partDist_[0];
1639 
1640  double each = floor(fProcs / fParts);
1641  double extra = fmod(fProcs, fParts);
1642  partDist_[0] = 0;
1643 
1644  for (part_t part=0; part < numGlobalParts; part++){
1645  int numOwners = int(each + ((part<extra) ? 1 : 0));
1646  partArray[part+1] = partArray[part] + numOwners;
1647  }
1648 
1649  env_->globalBugAssertion(__FILE__, __LINE__, "#parts != #procs",
1650  partDist_[numGlobalParts] == nprocs, COMPLEX_ASSERTION, comm_);
1651  }
1652  else if (fParts > fProcs){
1653 
1654  // User did not specify local number of parts per proc;
1655  // Distribute the parts evenly among the procs.
1656 
1657  procDistEquallySpread_ = true;
1658 
1659  try{
1660  procDist_.resize(size_t(fProcs+1));
1661  }
1662  catch (std::exception &e){
1663  throw(std::bad_alloc());
1664  }
1665 
1666  part_t *procArray = &procDist_[0];
1667 
1668  double each = floor(fParts / fProcs);
1669  double extra = fmod(fParts, fProcs);
1670  procArray[0] = 0;
1671 
1672  for (int proc=0; proc < nprocs; proc++){
1673  part_t numParts = part_t(each + ((proc<extra) ? 1 : 0));
1674  procArray[proc+1] = procArray[proc] + numParts;
1675  }
1676 
1677  env_->globalBugAssertion(__FILE__, __LINE__, "#parts != #procs",
1678  procDist_[nprocs] == numGlobalParts, COMPLEX_ASSERTION, comm_);
1679  }
1680  else{
1681  env_->globalBugAssertion(__FILE__, __LINE__,
1682  "should never get here", 1, COMPLEX_ASSERTION, comm_);
1683  }
1684  }
1685 }
1686 
1688 // Remap a new part assignment vector for maximum overlap with an input
1689 // part assignment.
1690 //
1691 // Assumptions for this version:
1692 // input part assignment == processor rank for every local object.
1693 // assuming nGlobalParts_ <= num ranks
1694 // TODO: Write a version that takes the input part number as input;
1695 // this change requires input parts in adapters to be provided in
1696 // the Adapter.
1697 // TODO: For repartitioning, compare to old remapping results; see Zoltan1.
1698 
1699 template <typename Adapter>
1701 {
1702  size_t len = parts_.size();
1703 
1704  part_t me = comm_->getRank();
1705  int np = comm_->getSize();
1706 
1707  if (np < nGlobalParts_) {
1708  if (me == 0)
1709  std::cout << "Remapping not yet supported for "
1710  << "num_global_parts " << nGlobalParts_
1711  << " > num procs " << np << std::endl;
1712  return;
1713  }
1714  // Build edges of a bipartite graph with np + nGlobalParts_ vertices,
1715  // and edges between a process vtx and any parts to which that process'
1716  // objects are assigned.
1717  // Weight edge[parts_[i]] by the number of objects that are going from
1718  // this rank to parts_[i].
1719  // We use a std::map, assuming the number of unique parts in the parts_ array
1720  // is small to keep the binary search efficient.
1721  // TODO We use the count of objects to move; should change to SIZE of objects
1722  // to move; need SIZE function in Adapter.
1723 
1724  std::map<part_t, long> edges;
1725  long lstaying = 0; // Total num of local objects staying if we keep the
1726  // current mapping. TODO: change to SIZE of local objs
1727  long gstaying = 0; // Total num of objects staying in the current partition
1728 
1729  for (size_t i = 0; i < len; i++) {
1730  edges[parts_[i]]++; // TODO Use obj size instead of count
1731  if (parts_[i] == me) lstaying++; // TODO Use obj size instead of count
1732  }
1733 
1734  Teuchos::reduceAll<int, long>(*comm_, Teuchos::REDUCE_SUM, 1,
1735  &lstaying, &gstaying);
1736 //TODO if (gstaying == Adapter::getGlobalNumObjs()) return; // Nothing to do
1737 
1738  part_t *remap = NULL;
1739 
1740  int nedges = edges.size();
1741 
1742  // Gather the graph to rank 0.
1743  part_t tnVtx = np + nGlobalParts_; // total # vertices
1744  int *idx = NULL; // Pointer index into graph adjacencies
1745  int *sizes = NULL; // nedges per rank
1746  if (me == 0) {
1747  idx = new int[tnVtx+1];
1748  sizes = new int[np];
1749  sizes[0] = nedges;
1750  }
1751  if (np > 1)
1752  Teuchos::gather<int, int>(&nedges, 1, sizes, 1, 0, *comm_);
1753 
1754  // prefix sum to build the idx array
1755  if (me == 0) {
1756  idx[0] = 0;
1757  for (int i = 0; i < np; i++)
1758  idx[i+1] = idx[i] + sizes[i];
1759  }
1760 
1761  // prepare to send edges
1762  int cnt = 0;
1763  part_t *bufv = NULL;
1764  long *bufw = NULL;
1765  if (nedges) {
1766  bufv = new part_t[nedges];
1767  bufw = new long[nedges];
1768  // Create buffer with edges (me, part[i]) and weight edges[parts_[i]].
1769  for (typename std::map<part_t, long>::iterator it = edges.begin();
1770  it != edges.end(); it++) {
1771  bufv[cnt] = it->first; // target part
1772  bufw[cnt] = it->second; // weight
1773  cnt++;
1774  }
1775  }
1776 
1777  // Prepare to receive edges on rank 0
1778  part_t *adj = NULL;
1779  long *wgt = NULL;
1780  if (me == 0) {
1781 //SYM adj = new part_t[2*idx[np]]; // need 2x space to symmetrize later
1782 //SYM wgt = new long[2*idx[np]]; // need 2x space to symmetrize later
1783  adj = new part_t[idx[np]];
1784  wgt = new long[idx[np]];
1785  }
1786 
1787  Teuchos::gatherv<int, part_t>(bufv, cnt, adj, sizes, idx, 0, *comm_);
1788  Teuchos::gatherv<int, long>(bufw, cnt, wgt, sizes, idx, 0, *comm_);
1789  delete [] bufv;
1790  delete [] bufw;
1791 
1792  // Now have constructed graph on rank 0.
1793  // Call the matching algorithm
1794 
1795  int doRemap;
1796  if (me == 0) {
1797  // We have the "LHS" vertices of the bipartite graph; need to create
1798  // "RHS" vertices.
1799  for (int i = 0; i < idx[np]; i++) {
1800  adj[i] += np; // New RHS vertex number; offset by num LHS vertices
1801  }
1802 
1803  // Build idx for RHS vertices
1804  for (part_t i = np; i < tnVtx; i++) {
1805  idx[i+1] = idx[i]; // No edges for RHS vertices
1806  }
1807 
1808 #ifdef KDDKDD_DEBUG
1809  std::cout << "IDX ";
1810  for (part_t i = 0; i <= tnVtx; i++) std::cout << idx[i] << " ";
1811  std::cout << std::endl;
1812 
1813  std::cout << "ADJ ";
1814  for (part_t i = 0; i < idx[tnVtx]; i++) std::cout << adj[i] << " ";
1815  std::cout << std::endl;
1816 
1817  std::cout << "WGT ";
1818  for (part_t i = 0; i < idx[tnVtx]; i++) std::cout << wgt[i] << " ";
1819  std::cout << std::endl;
1820 #endif
1821 
1822  // Perform matching on the graph
1823  part_t *match = new part_t[tnVtx];
1824  for (part_t i = 0; i < tnVtx; i++) match[i] = i;
1825  part_t nmatches =
1826  Zoltan2::GreedyMWM<part_t, long>(idx, adj, wgt, tnVtx, match);
1827 
1828 #ifdef KDDKDD_DEBUG
1829  std::cout << "After matching: " << nmatches << " found" << std::endl;
1830  for (part_t i = 0; i < tnVtx; i++)
1831  std::cout << "match[" << i << "] = " << match[i]
1832  << ((match[i] != i &&
1833  (i < np && match[i] != i+np))
1834  ? " *" : " ")
1835  << std::endl;
1836 #endif
1837 
1838  // See whether there were nontrivial changes in the matching.
1839  bool nontrivial = false;
1840  if (nmatches) {
1841  for (part_t i = 0; i < np; i++) {
1842  if ((match[i] != i) && (match[i] != (i+np))) {
1843  nontrivial = true;
1844  break;
1845  }
1846  }
1847  }
1848 
1849  // Process the matches
1850  if (nontrivial) {
1851  remap = new part_t[nGlobalParts_];
1852  for (part_t i = 0; i < nGlobalParts_; i++) remap[i] = -1;
1853 
1854  bool *used = new bool[np];
1855  for (part_t i = 0; i < np; i++) used[i] = false;
1856 
1857  // First, process all matched parts
1858  for (part_t i = 0; i < nGlobalParts_; i++) {
1859  part_t tmp = i + np;
1860  if (match[tmp] != tmp) {
1861  remap[i] = match[tmp];
1862  used[match[tmp]] = true;
1863  }
1864  }
1865 
1866  // Second, process unmatched parts; keep same part number if possible
1867  for (part_t i = 0; i < nGlobalParts_; i++) {
1868  if (remap[i] > -1) continue;
1869  if (!used[i]) {
1870  remap[i] = i;
1871  used[i] = true;
1872  }
1873  }
1874 
1875  // Third, process unmatched parts; give them the next unused part
1876  for (part_t i = 0, uidx = 0; i < nGlobalParts_; i++) {
1877  if (remap[i] > -1) continue;
1878  while (used[uidx]) uidx++;
1879  remap[i] = uidx;
1880  used[uidx] = true;
1881  }
1882  delete [] used;
1883  }
1884  delete [] match;
1885 
1886 #ifdef KDDKDD_DEBUG
1887  std::cout << "Remap vector: ";
1888  for (part_t i = 0; i < nGlobalParts_; i++) std::cout << remap[i] << " ";
1889  std::cout << std::endl;
1890 #endif
1891 
1892  long newgstaying = measure_stays(remap, idx, adj, wgt,
1893  nGlobalParts_, np);
1894  doRemap = (newgstaying > gstaying);
1895  std::cout << "gstaying " << gstaying << " measure(input) "
1896  << measure_stays(NULL, idx, adj, wgt, nGlobalParts_, np)
1897  << " newgstaying " << newgstaying
1898  << " nontrivial " << nontrivial
1899  << " doRemap " << doRemap << std::endl;
1900  }
1901  delete [] idx;
1902  delete [] sizes;
1903  delete [] adj;
1904  delete [] wgt;
1905 
1906  Teuchos::broadcast<int, int>(*comm_, 0, 1, &doRemap);
1907 
1908  if (doRemap) {
1909  if (me != 0) remap = new part_t[nGlobalParts_];
1910  Teuchos::broadcast<int, part_t>(*comm_, 0, nGlobalParts_, remap);
1911  for (size_t i = 0; i < len; i++) {
1912  parts_[i] = remap[parts_[i]];
1913  }
1914  }
1915 
1916  delete [] remap; // TODO May want to keep for repartitioning as in Zoltan
1917 }
1918 
1919 
1920 } // namespace Zoltan2
1921 
1922 #endif
const int * getProcListView() const
Returns the process list corresponding to the global ID list.
scalar_t getCriteriaPartSize(int idx, part_t part) const
Get the size for a given weight index and a given part.
Defines the Solution base class.
fast typical checks for valid arguments
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
part_t pointAssign(int dim, scalar_t *point) const
Return the part overlapping a given point in space;.
void RemapParts()
Remap a new partition for maximum overlap with an input partition.
Just a placeholder for now.
more involved, like validate a graph
tuple id
Definition: xml2dox.py:189
size_t getLocalNumberOfParts() const
Returns the number of parts to be assigned to this process.
virtual bool isPartitioningTreeBinary() const
calculate if partition tree is binary.
void getPartsForProc(int procId, double &numParts, part_t &partMin, part_t &partMax) const
Get the parts belonging to a process.
void getProcsForPart(part_t partId, part_t &procMin, part_t &procMax) const
Get the processes containing a part.
sub-steps, each method&#39;s entry and exit
bool oneToOnePartDistribution() const
Is the part-to-process distribution is one-to-one.
SparseMatrixAdapter_t::part_t part_t
long measure_stays(part_t *remap, int *idx, part_t *adj, long *wgt, part_t nrhs, part_t nlhs)
scalar_t getLocalFractionOfPart() const
If parts are divided across processes, return the fraction of a part on this process.
dictionary vals
Definition: xml2dox.py:186
A PartitioningSolution is a solution to a partitioning problem.
const RCP< const Comm< int > > & getCommunicator() const
Return the communicator associated with the solution.
void boxAssign(int dim, scalar_t *lower, scalar_t *upper, size_t &nPartsFound, part_t **partsFound) const
Return an array of all parts overlapping a given box in space.
const part_t * getPartListView() const
Returns the part list corresponding to the global ID list.
PartitioningSolution(const RCP< const Environment > &env, const RCP< const Comm< int > > &comm, int nUserWeights, const RCP< Algorithm< Adapter > > &algorithm=Teuchos::null)
Constructor when part sizes are not supplied.
Algorithm defines the base class for all algorithms.
void getPartitionTree(part_t &numTreeVerts, std::vector< part_t > &permPartNums, std::vector< part_t > &splitRangeBeg, std::vector< part_t > &splitRangeEnd, std::vector< part_t > &treeVertParents) const
get the partition tree - fill the relevant arrays
#define epsilon
Greedy Maximal Weight Matching.
static const std::string fail
void setParts(ArrayRCP< part_t > &partList)
The algorithm uses setParts to set the solution.
std::vector< Zoltan2::coordinateModelPartBox > & getPartBoxesView() const
returns the part box boundary list.
Defines the Environment class.
const part_t * getProcDistribution() const
Return a distribution by process.
void getCommunicationGraph(ArrayRCP< part_t > &comXAdj, ArrayRCP< part_t > &comAdj) const
returns communication graph resulting from geometric partitioning.
int getNumberOfCriteria() const
Get the number of criteria (object weights)
const int * getPartDistribution() const
Return a distribution by part.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
size_t getActualGlobalNumberOfParts() const
Returns the actual global number of parts provided in setParts().
bool criteriaHasUniformPartSizes(int idx) const
Determine if balancing criteria has uniform part sizes. (User can specify differing part sizes...
bool criteriaHaveSamePartSizes(int c1, int c2) const
Return true if the two weight indices have the same part size information.
size_t getTargetGlobalNumberOfParts() const
Returns the global number of parts desired in the solution.
const RCP< const Environment > & getEnvironment() const
Return the environment associated with the solution.
Zoltan2::BasicUserTypes< zscalar_t, zlno_t, zgno_t > user_t
Definition: Metric.cpp:74