Zoltan2
Zoltan2_PartitioningSolution.hpp
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // Zoltan2: A package of combinatorial algorithms for scientific computing
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Karen Devine (kddevin@sandia.gov)
39 // Erik Boman (egboman@sandia.gov)
40 // Siva Rajamanickam (srajama@sandia.gov)
41 //
42 // ***********************************************************************
43 //
44 // @HEADER
45 
50 #ifndef _ZOLTAN2_PARTITIONINGSOLUTION_HPP_
51 #define _ZOLTAN2_PARTITIONINGSOLUTION_HPP_
52 
53 namespace Zoltan2 {
54 template <typename Adapter>
55 class PartitioningSolution;
56 }
57 
58 #include <Zoltan2_Environment.hpp>
59 #include <Zoltan2_Solution.hpp>
60 #include <Zoltan2_GreedyMWM.hpp>
61 #include <Zoltan2_Algorithm.hpp>
63 #include <cmath>
64 #include <algorithm>
65 #include <vector>
66 #include <limits>
67 #include <sstream>
68 
69 #ifdef _MSC_VER
70 #define NOMINMAX
71 #include <windows.h>
72 #endif
73 
74 
75 namespace Zoltan2 {
76 
91 template <typename Adapter>
92  class PartitioningSolution : public Solution
93 {
94 public:
95 
96 #ifndef DOXYGEN_SHOULD_SKIP_THIS
97  typedef typename Adapter::gno_t gno_t;
98  typedef typename Adapter::scalar_t scalar_t;
99  typedef typename Adapter::lno_t lno_t;
100  typedef typename Adapter::part_t part_t;
101  typedef typename Adapter::user_t user_t;
102 #endif
103 
121  PartitioningSolution( const RCP<const Environment> &env,
122  const RCP<const Comm<int> > &comm,
123  int nUserWeights,
124  const RCP<Algorithm<Adapter> > &algorithm = Teuchos::null);
125 
155  PartitioningSolution(const RCP<const Environment> &env,
156  const RCP<const Comm<int> > &comm,
157  int nUserWeights, ArrayView<ArrayRCP<part_t> > reqPartIds,
158  ArrayView<ArrayRCP<scalar_t> > reqPartSizes,
159  const RCP<Algorithm<Adapter> > &algorithm = Teuchos::null);
160 
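 // Illustrative usage sketch (not part of this header): a problem class
 // that already holds `env`, `comm`, and an `algorithm` object might
 // create a solution for objects carrying one weight like this.
 //
 //   int nUserWeights = 1;
 //   RCP<PartitioningSolution<Adapter> > soln =
 //     rcp(new PartitioningSolution<Adapter>(env, comm, nUserWeights,
 //                                           algorithm));
 //   // The algorithm records its result later via soln->setParts(...).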
162  // Information that the algorithm may wish to query.
163 
166  size_t getTargetGlobalNumberOfParts() const { return nGlobalParts_; }
167 
170  size_t getActualGlobalNumberOfParts() const { return nGlobalPartsSolution_; }
171 
174  size_t getLocalNumberOfParts() const { return nLocalParts_; }
175 
183  scalar_t getLocalFractionOfPart() const { return localFraction_; }
184 
194  bool oneToOnePartDistribution() const { return onePartPerProc_; }
195 
215  const int *getPartDistribution() const {
216  if (partDist_.size() > 0) return &partDist_[0];
217  else return NULL;
218  }
219 
236  const part_t *getProcDistribution() const {
237  if (procDist_.size() > 0) return &procDist_[0];
238  else return NULL;
239  }
240 
244  int getNumberOfCriteria() const { return nWeightsPerObj_; }
245 
246 
253  bool criteriaHasUniformPartSizes(int idx) const { return pSizeUniform_[idx];}
254 
267  scalar_t getCriteriaPartSize(int idx, part_t part) const {
268  if (pSizeUniform_[idx])
269  return 1.0 / nGlobalParts_;
270  else if (pCompactIndex_[idx].size())
271  return pSize_[idx][pCompactIndex_[idx][part]];
272  else
273  return pSize_[idx][part];
274  }
275 
289  bool criteriaHaveSamePartSizes(int c1, int c2) const;
290 
292  // Method used by the algorithm to set results.
293 
320  void setParts(ArrayRCP<part_t> &partList);
321 
323 
335  void RemapParts();
336 
338  /* Return the weight of objects staying with a given remap.
339  * If remap is NULL, compute weight of objects staying with given partition
340  */
341  long measure_stays(part_t *remap, int *idx, part_t *adj, long *wgt,
342  part_t nrhs, part_t nlhs)
343  {
344  long staying = 0;
345  for (part_t i = 0; i < nrhs; i++) {
346  part_t k = (remap ? remap[i] : i);
347  for (part_t j = idx[k]; j < idx[k+1]; j++) {
348  if (i == (adj[j]-nlhs)) {
349  staying += wgt[j];
350  break;
351  }
352  }
353  }
354  return staying;
355  }
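 // Illustrative example of the CSR arrays measure_stays() expects
 // (values are made up): two left-hand-side vertices (ranks, nlhs=2)
 // and two right-hand-side vertices (parts, nrhs=2).
 //
 //   int    idx[] = {0, 2, 3};  // rank 0 owns edges [0,2), rank 1 owns [2,3)
 //   part_t adj[] = {2, 3, 3};  // target parts, offset by nlhs
 //   long   wgt[] = {5, 3, 4};  // weight (object count) per edge
 //   // With remap == NULL, part 0 stays on rank 0 (weight 5) and part 1
 //   // stays on rank 1 (weight 4), so measure_stays returns 9.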
356 
358  // Results that may be queried by the user, by migration methods,
359  // or by metric calculation methods.
360  // We return raw pointers so users don't have to learn about our
361  // pointer wrappers.
362 
365  inline const RCP<const Comm<int> > &getCommunicator() const { return comm_;}
366 
369  inline const RCP<const Environment> &getEnvironment() const { return env_;}
370 
373  const part_t *getPartListView() const {
374  if (parts_.size() > 0) return parts_.getRawPtr();
375  else return NULL;
376  }
377 
382  const int *getProcListView() const {
383  if (procs_.size() > 0) return procs_.getRawPtr();
384  else return NULL;
385  }
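 // Illustrative usage sketch (not part of this header): reading back the
 // assignments after the problem is solved; `numLocalIds` is assumed to
 // be the number of local objects in the caller's input adapter.
 //
 //   const part_t *parts = solution.getPartListView();
 //   const int *procs = solution.getProcListView();
 //   for (size_t i = 0; i < numLocalIds; i++) {
 //     part_t p = parts[i];                      // part of local object i
 //     int owner = (procs ? procs[i] : int(p));  // proc view may be NULL,
 //   }                                           // e.g. one part per process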
386 
389  virtual bool isPartitioningTreeBinary() const
390  {
391  if (this->algorithm_ == Teuchos::null)
392  throw std::logic_error("no partitioning algorithm has been run yet");
393  return this->algorithm_->isPartitioningTreeBinary();
394  }
395 
398  void getPartitionTree(part_t & numTreeVerts,
399  std::vector<part_t> & permPartNums,
400  std::vector<part_t> & splitRangeBeg,
401  std::vector<part_t> & splitRangeEnd,
402  std::vector<part_t> & treeVertParents) const {
403 
404  part_t numParts = static_cast<part_t>(getTargetGlobalNumberOfParts());
405 
406  if (this->algorithm_ == Teuchos::null)
407  throw std::logic_error("no partitioning algorithm has been run yet");
408  this->algorithm_->getPartitionTree(
409  numParts, // may want to change how this is passed through
410  numTreeVerts,
411  permPartNums,
412  splitRangeBeg,
413  splitRangeEnd,
414  treeVertParents);
415  }
416 
419  std::vector<Zoltan2::coordinateModelPartBox> &
420  getPartBoxesView() const
421  {
422  return this->algorithm_->getPartBoxesView();
423  }
424 
426  // when a point lies on a part boundary, the lowest part
427  // number on that boundary is returned.
428  // Note that not all partitioning algorithms will support
429  // this method.
430  //
431  // \param dim : the number of dimensions specified for the point in space
432  // \param point : the coordinates of the point in space; array of size dim
433  // \return the part number of a part overlapping the given point
434  part_t pointAssign(int dim, scalar_t *point) const
435  {
436  part_t p;
437  try {
438  if (this->algorithm_ == Teuchos::null)
439  throw std::logic_error("no partitioning algorithm has been run yet");
440 
441  p = this->algorithm_->pointAssign(dim, point);
442  }
444  return p;
445  }
446 
448  // This method allocates memory for the return argument, but does not
449  // control that memory. The user is responsible for freeing the
450  // memory.
451  //
452  // \param dim : (in) the number of dimensions specified for the box
453  // \param lower : (in) the coordinates of the lower corner of the box;
454  // array of size dim
455  // \param upper : (in) the coordinates of the upper corner of the box;
456  // array of size dim
457  // \param nPartsFound : (out) the number of parts overlapping the box
458  // \param partsFound : (out) array of parts overlapping the box
459  void boxAssign(int dim, scalar_t *lower, scalar_t *upper,
460  size_t &nPartsFound, part_t **partsFound) const
461  {
462  try {
463  if (this->algorithm_ == Teuchos::null)
464  throw std::logic_error("no partitioning algorithm has been run yet");
465 
466  this->algorithm_->boxAssign(dim, lower, upper, nPartsFound, partsFound);
467  }
468  Z2_FORWARD_EXCEPTIONS;
469  }
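 // Illustrative usage sketch of the convention described above; names
 // other than boxAssign are assumptions. The caller releases the part
 // list that the method allocates.
 //
 //   size_t nFound = 0;
 //   part_t *found = NULL;
 //   scalar_t lo[2] = {0.0, 0.0};
 //   scalar_t hi[2] = {1.0, 1.0};
 //   solution.boxAssign(2, lo, hi, nFound, &found);
 //   // ... use found[0] .. found[nFound-1] ...
 //   delete [] found;   // assuming the usual new[] allocation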
470 
471 
474  void getCommunicationGraph(ArrayRCP <part_t> &comXAdj,
475  ArrayRCP <part_t> &comAdj) const
476  {
477  try {
478  if (this->algorithm_ == Teuchos::null)
479  throw std::logic_error("no partitioning algorithm has been run yet");
480 
481  this->algorithm_->getCommunicationGraph(this, comXAdj, comAdj);
482  }
483  Z2_FORWARD_EXCEPTIONS;
484  }
485 
503  void getPartsForProc(int procId, double &numParts, part_t &partMin,
504  part_t &partMax) const
505  {
506  env_->localInputAssertion(__FILE__, __LINE__, "invalid process id",
507  procId >= 0 && procId < comm_->getSize(), BASIC_ASSERTION);
508 
509  procToPartsMap(procId, numParts, partMin, partMax);
510  }
511 
523  void getProcsForPart(part_t partId, part_t &procMin, part_t &procMax) const
524  {
525  env_->localInputAssertion(__FILE__, __LINE__, "invalid part id",
526  partId >= 0 && partId < nGlobalParts_, BASIC_ASSERTION);
527 
528  partToProcsMap(partId, procMin, procMax);
529  }
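 // Illustrative usage sketch (variable names are assumptions): querying
 // the part-to-process maps established at construction.
 //
 //   double numParts; part_t firstPart, lastPart;
 //   solution.getPartsForProc(comm->getRank(), numParts, firstPart, lastPart);
 //   // numParts can be a fraction less than one if this rank shares a part.
 //
 //   part_t firstProc, lastProc;
 //   solution.getProcsForPart(0, firstProc, lastProc);
 //   // part 0 resides on processes firstProc through lastProc.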
530 
531 private:
532  void partToProc(bool doCheck, bool haveNumLocalParts, bool haveNumGlobalParts,
533  int numLocalParts, int numGlobalParts);
534 
535  void procToPartsMap(int procId, double &numParts, part_t &partMin,
536  part_t &partMax) const;
537 
538  void partToProcsMap(part_t partId, int &procMin, int &procMax) const;
539 
540  void setPartDistribution();
541 
542  void setPartSizes(ArrayView<ArrayRCP<part_t> > reqPartIds,
543  ArrayView<ArrayRCP<scalar_t> > reqPartSizes);
544 
545  void computePartSizes(int widx, ArrayView<part_t> ids,
546  ArrayView<scalar_t> sizes);
547 
548  void broadcastPartSizes(int widx);
549 
550 
551  RCP<const Environment> env_; // has application communicator
552  const RCP<const Comm<int> > comm_; // the problem communicator
553 
554  //part box boundaries as a result of geometric partitioning algorithm.
555  RCP<std::vector<Zoltan2::coordinateModelPartBox> > partBoxes;
556 
557  part_t nGlobalParts_;// target global number of parts
558  part_t nLocalParts_; // number of parts to be on this process
559 
560  scalar_t localFraction_; // approx fraction of a part on this process
561  int nWeightsPerObj_; // if user has no weights, this is 1 TODO: WHY???
562 
563  // If process p is to be assigned part p for all p, then onePartPerProc_
564  // is true. Otherwise it is false, and either procDist_ or partDist_
565  // describes the allocation of parts to processes.
566  //
567  // If parts are never split across processes, then procDist_ is defined
568  // as follows:
569  //
570  // partId = procDist_[procId]
571  // partIdNext = procDist_[procId+1]
572  // globalNumberOfParts = procDist_[numProcs]
573  //
574  // meaning that the parts assigned to process procId range from
575  // [partId, partIdNext). If partIdNext is the same as partId, then
576  // process procId has no parts.
577  //
578  // If the number of parts is less than the number of processes, and the
579  // user did not specify "num_local_parts" for each of the processes, then
580  // parts are split across processes, and partDist_ is defined rather than
581  // procDist_.
582  //
583  // procId = partDist_[partId]
584  // procIdNext = partDist_[partId+1]
585  // globalNumberOfProcs = partDist_[numParts]
586  //
587  // which implies that the part partId is shared by processes in the
588  // the range [procId, procIdNext).
589  //
590  // We use std::vector so we can use upper_bound algorithm
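 //
 // Illustrative example (values are made up): with 4 processes and
 // 8 parts, procDist_ = {0, 2, 4, 6, 8}, so process 1 owns parts
 // [2, 4). With 4 processes and 3 parts, partDist_ = {0, 2, 3, 4},
 // so part 0 is shared by processes [0, 2) and each of those
 // processes has localFraction_ = 0.5 of it.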
591 
592  bool onePartPerProc_; // either this is true...
593  std::vector<int> partDist_; // or this is defined ...
594  std::vector<part_t> procDist_; // or this is defined.
595  bool procDistEquallySpread_; // if procDist_ is used and
596  // #parts > #procs and
597  // num_local_parts is not specified,
598  // parts are evenly distributed to procs
599 
600  // In order to minimize the storage required for part sizes, we
601  // have three different representations.
602  //
603  // If the part sizes for weight index w are all the same, then:
604  // pSizeUniform_[w] = true
605  // pCompactIndex_[w].size() = 0
606  // pSize_[w].size() = 0
607  //
608  // and the size for part p is 1.0 / nparts.
609  //
610  // If part sizes differ for each part in weight index w, but there
611  // are no more than 64 distinct sizes:
612  // pSizeUniform_[w] = false
613  // pCompactIndex_[w].size() = number of parts
614  // pSize_[w].size() = number of different sizes
615  //
616  // and the size for part p is pSize_[w][pCompactIndex_[w][p]].
617  //
618  // If part sizes differ for each part in weight index w, and there
619  // are more than 64 distinct sizes:
620  // pSizeUniform_[w] = false
621  // pCompactIndex_[w].size() = 0
622  // pSize_[w].size() = nparts
623  //
624  // and the size for part p is pSize_[w][p].
625  //
626  // NOTE: If we expect to have similar cases, i.e. a long list of scalars
627  // where it is highly possible that just a few unique values appear,
628  // then we may want to make this a class. The advantage is that we
629  // save a long list of 1-byte indices instead of a long list of scalars.
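 //
 // Illustrative example (values are made up): with nparts = 4 and
 // requested sizes {0.1, 0.4, 0.4, 0.1} for weight index w, one
 // possible compact encoding is pSizeUniform_[w] = false,
 // pSize_[w] = {0.1, 0.4} and pCompactIndex_[w] = {0, 1, 1, 0}.
 // Uniform sizes are stored only as pSizeUniform_[w] = true.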
630 
631  ArrayRCP<bool> pSizeUniform_;
632  ArrayRCP<ArrayRCP<unsigned char> > pCompactIndex_;
633  ArrayRCP<ArrayRCP<scalar_t> > pSize_;
634 
636  // The algorithm sets these values upon completion.
637 
638  ArrayRCP<part_t> parts_; // part number assigned to localid[i]
639 
640  bool haveSolution_;
641 
642  part_t nGlobalPartsSolution_; // global number of parts in solution
643 
645  // The solution calculates this from the part assignments,
646  // unless onePartPerProc_.
647 
648  ArrayRCP<int> procs_; // process rank assigned to localid[i]
649 
651  // Algorithm used to compute the solution;
652  // needed for post-processing with pointAssign or getCommunicationGraph
653  const RCP<Algorithm<Adapter> > algorithm_; //
654 };
655 
657 // Definitions
659 
660 template <typename Adapter>
661  PartitioningSolution<Adapter>::PartitioningSolution(
662  const RCP<const Environment> &env,
663  const RCP<const Comm<int> > &comm,
664  int nUserWeights,
665  const RCP<Algorithm<Adapter> > &algorithm)
666  : env_(env), comm_(comm),
667  partBoxes(),
668  nGlobalParts_(0), nLocalParts_(0),
669  localFraction_(0), nWeightsPerObj_(),
670  onePartPerProc_(false), partDist_(), procDist_(),
671  procDistEquallySpread_(false),
672  pSizeUniform_(), pCompactIndex_(), pSize_(),
673  parts_(), haveSolution_(false), nGlobalPartsSolution_(0),
674  procs_(), algorithm_(algorithm)
675 {
676  nWeightsPerObj_ = (nUserWeights ? nUserWeights : 1); // TODO: WHY?? WHY NOT ZERO?
677 
678  setPartDistribution();
679 
680  // We must call setPartSizes() because part sizes may have
681  // been provided by the user on other processes.
682 
683  ArrayRCP<part_t> *noIds = new ArrayRCP<part_t> [nWeightsPerObj_];
684  ArrayRCP<scalar_t> *noSizes = new ArrayRCP<scalar_t> [nWeightsPerObj_];
685  ArrayRCP<ArrayRCP<part_t> > ids(noIds, 0, nWeightsPerObj_, true);
686  ArrayRCP<ArrayRCP<scalar_t> > sizes(noSizes, 0, nWeightsPerObj_, true);
687 
688  setPartSizes(ids.view(0, nWeightsPerObj_), sizes.view(0, nWeightsPerObj_));
689 
690  env_->memory("After construction of solution");
691 }
692 
693 template <typename Adapter>
694  PartitioningSolution<Adapter>::PartitioningSolution(
695  const RCP<const Environment> &env,
696  const RCP<const Comm<int> > &comm,
697  int nUserWeights,
698  ArrayView<ArrayRCP<part_t> > reqPartIds,
699  ArrayView<ArrayRCP<scalar_t> > reqPartSizes,
700  const RCP<Algorithm<Adapter> > &algorithm)
701  : env_(env), comm_(comm),
702  partBoxes(),
703  nGlobalParts_(0), nLocalParts_(0),
704  localFraction_(0), nWeightsPerObj_(),
705  onePartPerProc_(false), partDist_(), procDist_(),
706  procDistEquallySpread_(false),
707  pSizeUniform_(), pCompactIndex_(), pSize_(),
708  parts_(), haveSolution_(false), nGlobalPartsSolution_(0),
709  procs_(), algorithm_(algorithm)
710 {
711  nWeightsPerObj_ = (nUserWeights ? nUserWeights : 1); // TODO: WHY?? WHY NOT ZERO?
712 
713  setPartDistribution();
714 
715  setPartSizes(reqPartIds, reqPartSizes);
716 
717  env_->memory("After construction of solution");
718 }
719 
720 template <typename Adapter>
721  void PartitioningSolution<Adapter>::setPartDistribution()
722 {
723  // Did the caller define num_global_parts and/or num_local_parts?
724 
725  const ParameterList &pl = env_->getParameters();
726  size_t haveGlobalNumParts=0, haveLocalNumParts=0;
727  int numLocal=0, numGlobal=0;
728 
729  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("num_global_parts");
730 
731  if (pe){
732  haveGlobalNumParts = 1;
733  nGlobalParts_ = part_t(pe->getValue(&nGlobalParts_));
734  numGlobal = nGlobalParts_;
735  }
736 
737  pe = pl.getEntryPtr("num_local_parts");
738 
739  if (pe){
740  haveLocalNumParts = 1;
741  nLocalParts_ = part_t(pe->getValue(&nLocalParts_));
742  numLocal = nLocalParts_;
743  }
744 
745  try{
746  // Sets onePartPerProc_, partDist_, and procDist_
747 
748  partToProc(true, haveLocalNumParts, haveGlobalNumParts,
749  numLocal, numGlobal);
750  }
751  Z2_FORWARD_EXCEPTIONS;
752 
753  int nprocs = comm_->getSize();
754  int rank = comm_->getRank();
755 
756  if (onePartPerProc_){
757  nGlobalParts_ = nprocs;
758  nLocalParts_ = 1;
759  }
760  else if (partDist_.size() > 0){ // more procs than parts
761  nGlobalParts_ = partDist_.size() - 1;
762  int pstart = partDist_[0];
763  for (part_t i=1; i <= nGlobalParts_; i++){
764  int pend = partDist_[i];
765  if (rank >= pstart && rank < pend){
766  int numOwners = pend - pstart;
767  nLocalParts_ = 1;
768  localFraction_ = 1.0 / numOwners;
769  break;
770  }
771  pstart = pend;
772  }
773  }
774  else if (procDist_.size() > 0){ // more parts than procs
775  nGlobalParts_ = procDist_[nprocs];
776  nLocalParts_ = procDist_[rank+1] - procDist_[rank];
777  }
778  else {
779  throw std::logic_error("partToProc error");
780  }
781 
782 }
783 
784 template <typename Adapter>
785  void PartitioningSolution<Adapter>::setPartSizes(
786  ArrayView<ArrayRCP<part_t> > ids, ArrayView<ArrayRCP<scalar_t> > sizes)
787 {
788  int widx = nWeightsPerObj_;
789  bool fail=false;
790 
791  size_t *countBuf = new size_t [widx*2];
792  ArrayRCP<size_t> counts(countBuf, 0, widx*2, true);
793 
794  fail = ((ids.size() != widx) || (sizes.size() != widx));
795 
796  for (int w=0; !fail && w < widx; w++){
797  counts[w] = ids[w].size();
798  if (ids[w].size() != sizes[w].size()) fail=true;
799  }
800 
801  env_->globalBugAssertion(__FILE__, __LINE__, "bad argument arrays", fail==0,
802  COMPLEX_ASSERTION, comm_);
803 
804  // Are all part sizes the same? This is the common case.
805 
806  ArrayRCP<scalar_t> *emptySizes= new ArrayRCP<scalar_t> [widx];
807  pSize_ = arcp(emptySizes, 0, widx);
808 
809  ArrayRCP<unsigned char> *emptyIndices= new ArrayRCP<unsigned char> [widx];
810  pCompactIndex_ = arcp(emptyIndices, 0, widx);
811 
812  bool *info = new bool [widx];
813  pSizeUniform_ = arcp(info, 0, widx);
814  for (int w=0; w < widx; w++)
815  pSizeUniform_[w] = true;
816 
817  if (nGlobalParts_ == 1){
818  return; // there's only one part in the whole problem
819  }
820 
821  size_t *ptr1 = counts.getRawPtr();
822  size_t *ptr2 = counts.getRawPtr() + widx;
823 
824  try{
825  reduceAll<int, size_t>(*comm_, Teuchos::REDUCE_MAX, widx, ptr1, ptr2);
826  }
827  Z2_THROW_OUTSIDE_ERROR(*env_);
828 
829  bool zero = true;
830 
831  for (int w=0; w < widx; w++)
832  if (counts[widx+w] > 0){
833  zero = false;
834  pSizeUniform_[w] = false;
835  }
836 
837  if (zero) // Part sizes for all criteria are uniform.
838  return;
839 
840  // Compute the part sizes for criteria for which part sizes were
841  // supplied. Normalize for each criteria so part sizes sum to one.
842 
843  int nprocs = comm_->getSize();
844  int rank = comm_->getRank();
845 
846  for (int w=0; w < widx; w++){
847  if (pSizeUniform_[w]) continue;
848 
849  // Send all ids and sizes to one process.
850  // (There is no simple gather method in Teuchos.)
851 
852  part_t length = ids[w].size();
853  part_t *allLength = new part_t [nprocs];
854  Teuchos::gatherAll<int, part_t>(*comm_, 1, &length,
855  nprocs, allLength);
856 
857  if (rank == 0){
858  int total = 0;
859  for (int i=0; i < nprocs; i++)
860  total += allLength[i];
861 
862  part_t *partNums = new part_t [total];
863  scalar_t *partSizes = new scalar_t [total];
864 
865  ArrayView<part_t> idArray(partNums, total);
866  ArrayView<scalar_t> sizeArray(partSizes, total);
867 
868  if (length > 0){
869  for (int i=0; i < length; i++){
870  *partNums++ = ids[w][i];
871  *partSizes++ = sizes[w][i];
872  }
873  }
874 
875  for (int p=1; p < nprocs; p++){
876  if (allLength[p] > 0){
877  Teuchos::receive<int, part_t>(*comm_, p,
878  allLength[p], partNums);
879  Teuchos::receive<int, scalar_t>(*comm_, p,
880  allLength[p], partSizes);
881  partNums += allLength[p];
882  partSizes += allLength[p];
883  }
884  }
885 
886  delete [] allLength;
887 
888  try{
889  computePartSizes(w, idArray, sizeArray);
890  }
891  Z2_FORWARD_EXCEPTIONS;
892 
893  delete [] idArray.getRawPtr();
894  delete [] sizeArray.getRawPtr();
895  }
896  else{
897  delete [] allLength;
898  if (length > 0){
899  Teuchos::send<int, part_t>(*comm_, length, ids[w].getRawPtr(), 0);
900  Teuchos::send<int, scalar_t>(*comm_, length, sizes[w].getRawPtr(), 0);
901  }
902  }
903 
904  broadcastPartSizes(w);
905  }
906 }
907 
908 template <typename Adapter>
909  void PartitioningSolution<Adapter>::broadcastPartSizes(int widx)
910 {
911  env_->localBugAssertion(__FILE__, __LINE__, "preallocations",
912  pSize_.size()>widx &&
913  pSizeUniform_.size()>widx && pCompactIndex_.size()>widx,
914  COMPLEX_ASSERTION);
915 
916  int rank = comm_->getRank();
917  int nprocs = comm_->getSize();
918  part_t nparts = nGlobalParts_;
919 
920  if (nprocs < 2)
921  return;
922 
923  char flag=0;
924 
925  if (rank == 0){
926  if (pSizeUniform_[widx] == true)
927  flag = 1;
928  else if (pCompactIndex_[widx].size() > 0)
929  flag = 2;
930  else
931  flag = 3;
932  }
933 
934  try{
935  Teuchos::broadcast<int, char>(*comm_, 0, 1, &flag);
936  }
937  Z2_THROW_OUTSIDE_ERROR(*env_);
938 
939  if (flag == 1){
940  if (rank > 0)
941  pSizeUniform_[widx] = true;
942 
943  return;
944  }
945 
946  if (flag == 2){
947 
948  // broadcast the indices into the size list
949 
950  unsigned char *idxbuf = NULL;
951 
952  if (rank > 0){
953  idxbuf = new unsigned char [nparts];
954  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, idxbuf);
955  }
956  else{
957  env_->localBugAssertion(__FILE__, __LINE__, "index list size",
958  pCompactIndex_[widx].size() == nparts, COMPLEX_ASSERTION);
959  idxbuf = pCompactIndex_[widx].getRawPtr();
960  }
961 
962  try{
963  // broadcast of unsigned char is not supported
964  Teuchos::broadcast<int, char>(*comm_, 0, nparts,
965  reinterpret_cast<char *>(idxbuf));
966  }
967  Z2_THROW_OUTSIDE_ERROR(*env_);
968 
969  if (rank > 0)
970  pCompactIndex_[widx] = arcp(idxbuf, 0, nparts, true);
971 
972  // broadcast the list of different part sizes
973 
974  unsigned char maxIdx=0;
975  for (part_t p=0; p < nparts; p++)
976  if (idxbuf[p] > maxIdx) maxIdx = idxbuf[p];
977 
978  int numSizes = maxIdx + 1;
979 
980  scalar_t *sizeList = NULL;
981 
982  if (rank > 0){
983  sizeList = new scalar_t [numSizes];
984  env_->localMemoryAssertion(__FILE__, __LINE__, numSizes, sizeList);
985  }
986  else{
987  env_->localBugAssertion(__FILE__, __LINE__, "wrong number of sizes",
988  numSizes == pSize_[widx].size(), COMPLEX_ASSERTION);
989 
990  sizeList = pSize_[widx].getRawPtr();
991  }
992 
993  try{
994  Teuchos::broadcast<int, scalar_t>(*comm_, 0, numSizes, sizeList);
995  }
996  Z2_THROW_OUTSIDE_ERROR(*env_);
997 
998  if (rank > 0)
999  pSize_[widx] = arcp(sizeList, 0, numSizes, true);
1000 
1001  return;
1002  }
1003 
1004  if (flag == 3){
1005 
1006  // broadcast the size of each part
1007 
1008  scalar_t *sizeList = NULL;
1009 
1010  if (rank > 0){
1011  sizeList = new scalar_t [nparts];
1012  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, sizeList);
1013  }
1014  else{
1015  env_->localBugAssertion(__FILE__, __LINE__, "wrong number of sizes",
1016  nparts == pSize_[widx].size(), COMPLEX_ASSERTION);
1017 
1018  sizeList = pSize_[widx].getRawPtr();
1019  }
1020 
1021  try{
1022  Teuchos::broadcast<int, scalar_t >(*comm_, 0, nparts, sizeList);
1023  }
1024  Z2_THROW_OUTSIDE_ERROR(*env_);
1025 
1026  if (rank > 0)
1027  pSize_[widx] = arcp(sizeList, 0, nparts);
1028 
1029  return;
1030  }
1031 }
1032 
1033 template <typename Adapter>
1034  void PartitioningSolution<Adapter>::computePartSizes(int widx,
1035  ArrayView<part_t> ids, ArrayView<scalar_t> sizes)
1036 {
1037  int len = ids.size();
1038 
1039  if (len == 0){
1040  pSizeUniform_[widx] = true;
1041  return;
1042  }
1043 
1044  env_->localBugAssertion(__FILE__, __LINE__, "bad array sizes",
1045  len>0 && sizes.size()==len, COMPLEX_ASSERTION);
1046 
1047  env_->localBugAssertion(__FILE__, __LINE__, "bad index",
1048  widx>=0 && widx<nWeightsPerObj_, COMPLEX_ASSERTION);
1049 
1050  env_->localBugAssertion(__FILE__, __LINE__, "preallocations",
1051  pSize_.size()>widx &&
1052  pSizeUniform_.size()>widx && pCompactIndex_.size()>widx,
1053  COMPLEX_ASSERTION);
1054 
1055  // Check ids and sizes and find min, max and average sizes.
1056  // If sizes are very close to uniform, call them uniform parts.
1057 
1058  part_t nparts = nGlobalParts_;
1059  unsigned char *buf = new unsigned char [nparts];
1060  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, buf);
1061  memset(buf, 0, nparts);
1062  ArrayRCP<unsigned char> partIdx(buf, 0, nparts, true);
1063 
1064  scalar_t epsilon = 10e-5 / nparts;
1065  scalar_t min=sizes[0], max=sizes[0], sum=0;
1066 
1067  for (int i=0; i < len; i++){
1068  part_t id = ids[i];
1069  scalar_t size = sizes[i];
1070 
1071  env_->localInputAssertion(__FILE__, __LINE__, "invalid part id",
1072  id>=0 && id<nparts, BASIC_ASSERTION);
1073 
1074  env_->localInputAssertion(__FILE__, __LINE__, "invalid part size", size>=0,
1075  BASIC_ASSERTION);
1076 
1077  // TODO: we could allow users to specify multiple sizes for the same
1078  // part if we add a parameter that says what we are to do with them:
1079  // add them or take the max.
1080 
1081  env_->localInputAssertion(__FILE__, __LINE__,
1082  "multiple sizes provided for one part", partIdx[id]==0, BASIC_ASSERTION);
1083 
1084  partIdx[id] = 1; // mark that we have a size for this part
1085 
1086  if (size < min) min = size;
1087  if (size > max) max = size;
1088  sum += size;
1089  }
1090 
1091  if (sum == 0){
1092 
1093  // The user has given us a list of parts of size 0; we'll set
1094  // the rest of the parts to equal sizes.
1095 
1096  scalar_t *allSizes = new scalar_t [2];
1097  env_->localMemoryAssertion(__FILE__, __LINE__, 2, allSizes);
1098 
1099  ArrayRCP<scalar_t> sizeArray(allSizes, 0, 2, true);
1100 
1101  int numNonZero = nparts - len;
1102 
1103  allSizes[0] = 0.0;
1104  allSizes[1] = 1.0 / numNonZero;
1105 
1106  for (part_t p=0; p < nparts; p++)
1107  buf[p] = 1; // index to default part size
1108 
1109  for (int i=0; i < len; i++)
1110  buf[ids[i]] = 0; // index to part size zero
1111 
1112  pSize_[widx] = sizeArray;
1113  pCompactIndex_[widx] = partIdx;
1114 
1115  return;
1116  }
1117 
1118  if (max - min <= epsilon){
1119  pSizeUniform_[widx] = true;
1120  return;
1121  }
1122 
1123  // A size for parts that were not specified:
1124  scalar_t avg = sum / nparts;
1125 
1126  // We are going to merge part sizes that are very close. This takes
1127  // computation time now, but can save considerably in the storage of
1128  // all part sizes on each process. For example, a common case may
1129  // be some parts are size 1 and all the rest are size 2.
1130 
1131  scalar_t *tmp = new scalar_t [len];
1132  env_->localMemoryAssertion(__FILE__, __LINE__, len, tmp);
1133  memcpy(tmp, sizes.getRawPtr(), sizeof(scalar_t) * len);
1134  ArrayRCP<scalar_t> partSizes(tmp, 0, len, true);
1135 
1136  std::sort(partSizes.begin(), partSizes.end());
1137 
1138  // create a list of sizes that are unique within epsilon
1139 
1140  Array<scalar_t> nextUniqueSize;
1141  nextUniqueSize.push_back(partSizes[len-1]); // largest
1142  scalar_t curr = partSizes[len-1];
1143  int avgIndex = len;
1144  bool haveAvg = false;
1145  if (curr - avg <= epsilon)
1146  avgIndex = 0;
1147 
1148  for (int i=len-2; i >= 0; i--){
1149  scalar_t val = partSizes[i];
1150  if (curr - val > epsilon){
1151  nextUniqueSize.push_back(val); // the highest in the group
1152  curr = val;
1153  if (avgIndex==len && val > avg && val - avg <= epsilon){
1154  // the average would be in this group
1155  avgIndex = nextUniqueSize.size() - 1;
1156  haveAvg = true;
1157  }
1158  }
1159  }
1160 
1161  partSizes.clear();
1162 
1163  size_t numSizes = nextUniqueSize.size();
1164  int sizeArrayLen = numSizes;
1165 
1166  if (numSizes < 64){
1167  // We can store the size for every part in a compact way.
1168 
1169  // Create a list of all sizes in increasing order
1170 
1171  if (!haveAvg) sizeArrayLen++; // need to include average
1172 
1173  scalar_t *allSizes = new scalar_t [sizeArrayLen];
1174  env_->localMemoryAssertion(__FILE__, __LINE__, sizeArrayLen, allSizes);
1175  ArrayRCP<scalar_t> sizeArray(allSizes, 0, sizeArrayLen, true);
1176 
1177  int newAvgIndex = sizeArrayLen;
1178 
1179  for (int i=numSizes-1, idx=0; i >= 0; i--){
1180 
1181  if (newAvgIndex == sizeArrayLen){
1182 
1183  if (haveAvg && i==avgIndex)
1184  newAvgIndex = idx;
1185 
1186  else if (!haveAvg && avg < nextUniqueSize[i]){
1187  newAvgIndex = idx;
1188  allSizes[idx++] = avg;
1189  }
1190  }
1191 
1192  allSizes[idx++] = nextUniqueSize[i];
1193  }
1194 
1195  env_->localBugAssertion(__FILE__, __LINE__, "finding average in list",
1196  newAvgIndex < sizeArrayLen, COMPLEX_ASSERTION);
1197 
1198  for (int i=0; i < nparts; i++){
1199  buf[i] = newAvgIndex; // index to default part size
1200  }
1201 
1202  sum = (nparts - len) * allSizes[newAvgIndex];
1203 
1204  for (int i=0; i < len; i++){
1205  int id = ids[i];
1206  scalar_t size = sizes[i];
1207  int index;
1208 
1209  // Find the first size greater than or equal to this size.
1210 
1211  if (size < avg && avg - size <= epsilon)
1212  index = newAvgIndex;
1213  else{
1214  typename ArrayRCP<scalar_t>::iterator found =
1215  std::lower_bound(sizeArray.begin(), sizeArray.end(), size);
1216 
1217  env_->localBugAssertion(__FILE__, __LINE__, "size array",
1218  found != sizeArray.end(), COMPLEX_ASSERTION);
1219 
1220  index = found - sizeArray.begin();
1221  }
1222 
1223  buf[id] = index;
1224  sum += allSizes[index];
1225  }
1226 
1227  for (int i=0; i < sizeArrayLen; i++){
1228  sizeArray[i] /= sum;
1229  }
1230 
1231  pCompactIndex_[widx] = partIdx;
1232  pSize_[widx] = sizeArray;
1233  }
1234  else{
1235  // To have access to part sizes, we must store nparts scalar_ts on
1236  // every process. We expect this is a rare case.
1237 
1238  tmp = new scalar_t [nparts];
1239  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, tmp);
1240 
1241  sum += ((nparts - len) * avg);
1242 
1243  for (int i=0; i < nparts; i++){
1244  tmp[i] = avg/sum;
1245  }
1246 
1247  for (int i=0; i < len; i++){
1248  tmp[ids[i]] = sizes[i]/sum;
1249  }
1250 
1251  pSize_[widx] = arcp(tmp, 0, nparts);
1252  }
1253 }
1254 
1255 template <typename Adapter>
1256  void PartitioningSolution<Adapter>::setParts(ArrayRCP<part_t> &partList)
1257 {
1258  env_->debug(DETAILED_STATUS, "Entering setParts");
1259 
1260  size_t len = partList.size();
1261 
1262  // Find the actual number of parts in the solution, which can
1263  // be more or less than the nGlobalParts_ target.
1264  // (We may want to compute the imbalance of a given solution with
1265  // respect to a desired solution. This solution may have more or
1266  // fewer parts than the desired solution.)
1267 
1268  part_t lMax = -1;
1269  part_t lMin = (len > 0 ? std::numeric_limits<part_t>::max() : 0);
1270  part_t gMax, gMin;
1271 
1272  for (size_t i = 0; i < len; i++) {
1273  if (partList[i] < lMin) lMin = partList[i];
1274  if (partList[i] > lMax) lMax = partList[i];
1275  }
1276  Teuchos::reduceAll<int, part_t>(*comm_, Teuchos::REDUCE_MIN, 1, &lMin, &gMin);
1277  Teuchos::reduceAll<int, part_t>(*comm_, Teuchos::REDUCE_MAX, 1, &lMax, &gMax);
1278 
1279  nGlobalPartsSolution_ = gMax - gMin + 1;
1280  parts_ = partList;
1281 
1282  // Now determine which process gets each object, if not one-to-one.
1283 
1284  if (!onePartPerProc_){
1285 
1286  int *procs = new int [len];
1287  env_->localMemoryAssertion(__FILE__, __LINE__, len, procs);
1288  procs_ = arcp<int>(procs, 0, len);
1289 
1290  if (len > 0) {
1291  part_t *parts = partList.getRawPtr();
1292 
1293  if (procDist_.size() > 0){ // parts are not split across procs
1294 
1295  int procId;
1296  for (size_t i=0; i < len; i++){
1297  partToProcsMap(parts[i], procs[i], procId);
1298  }
1299  }
1300  else{ // harder - we need to split the parts across multiple procs
1301 
1302  lno_t *partCounter = new lno_t [nGlobalPartsSolution_];
1303  env_->localMemoryAssertion(__FILE__, __LINE__, nGlobalPartsSolution_,
1304  partCounter);
1305 
1306  int numProcs = comm_->getSize();
1307 
1308  //MD NOTE: there was no initialization for partCounter.
1309  //I added the line below, correct me if I am wrong.
1310  memset(partCounter, 0, sizeof(lno_t) * nGlobalPartsSolution_);
1311 
1312  for (typename ArrayRCP<part_t>::size_type i=0; i < partList.size(); i++)
1313  partCounter[parts[i]]++;
1314 
1315  lno_t *procCounter = new lno_t [numProcs];
1316  env_->localMemoryAssertion(__FILE__, __LINE__, numProcs, procCounter);
1317 
1318  int proc1;
1319  int proc2 = partDist_[0];
1320 
1321  for (part_t part=1; part < nGlobalParts_; part++){
1322  proc1 = proc2;
1323  proc2 = partDist_[part+1];
1324  int numprocs = proc2 - proc1;
1325 
1326  double dNum = partCounter[part];
1327  double dProcs = numprocs;
1328 
1329  //std::cout << "dNum:" << dNum << " dProcs:" << dProcs << std::endl;
1330  double each = floor(dNum/dProcs);
1331  double extra = fmod(dNum,dProcs);
1332 
1333  for (int proc=proc1, i=0; proc<proc2; proc++, i++){
1334  if (i < extra)
1335  procCounter[proc] = lno_t(each) + 1;
1336  else
1337  procCounter[proc] = lno_t(each);
1338  }
1339  }
1340 
1341  delete [] partCounter;
1342 
1343  for (typename ArrayRCP<part_t>::size_type i=0; i < partList.size(); i++){
1344  if (partList[i] >= nGlobalParts_){
1345  // Solution has more parts than targeted. These
1346  // objects just remain on this process.
1347  procs[i] = comm_->getRank();
1348  continue;
1349  }
1350  part_t partNum = parts[i];
1351  proc1 = partDist_[partNum];
1352  proc2 = partDist_[partNum + 1];
1353 
1354  int proc;
1355  for (proc=proc1; proc < proc2; proc++){
1356  if (procCounter[proc] > 0){
1357  procs[i] = proc;
1358  procCounter[proc]--;
1359  break;
1360  }
1361  }
1362  env_->localBugAssertion(__FILE__, __LINE__, "part to proc",
1363  proc < proc2, COMPLEX_ASSERTION);
1364  }
1365 
1366  delete [] procCounter;
1367  }
1368  }
1369  }
1370 
1371  // Now that parts_ info is back on home process, remap the parts.
1372  // TODO: The parts will be inconsistent with the proc assignments after
1373  // TODO: remapping. This problem will go away after we separate process
1374  // TODO: mapping from setParts. But for MueLu's use case, the part
1375  // TODO: remapping is all that matters; they do not use the process mapping.
1376  bool doRemap = false;
1377  const Teuchos::ParameterEntry *pe =
1378  env_->getParameters().getEntryPtr("remap_parts");
1379  if (pe) doRemap = pe->getValue(&doRemap);
1380  if (doRemap) RemapParts();
1381 
1382  haveSolution_ = true;
1383 
1384  env_->memory("After Solution has processed algorithm's answer");
1385  env_->debug(DETAILED_STATUS, "Exiting setParts");
1386 }
1387 
1388 
1389 template <typename Adapter>
1390  void PartitioningSolution<Adapter>::procToPartsMap(int procId,
1391  double &numParts, part_t &partMin, part_t &partMax) const
1392 {
1393  if (onePartPerProc_){
1394  numParts = 1.0;
1395  partMin = partMax = procId;
1396  }
1397  else if (procDist_.size() > 0){
1398  partMin = procDist_[procId];
1399  partMax = procDist_[procId+1] - 1;
1400  numParts = procDist_[procId+1] - partMin;
1401  }
1402  else{
1403  // find the first p such that partDist_[p] > procId
1404 
1405  std::vector<int>::const_iterator entry;
1406  entry = std::upper_bound(partDist_.begin(), partDist_.end(), procId);
1407 
1408  size_t partIdx = entry - partDist_.begin();
1409  int numProcs = partDist_[partIdx] - partDist_[partIdx-1];
1410  partMin = partMax = int(partIdx) - 1;
1411  numParts = 1.0 / numProcs;
1412  }
1413 }
1414 
1415 template <typename Adapter>
1416  void PartitioningSolution<Adapter>::partToProcsMap(part_t partId,
1417  int &procMin, int &procMax) const
1418 {
1419  if (partId >= nGlobalParts_){
1420  // setParts() may be given an initial solution which uses a
1421  // different number of parts than the desired solution. It is
1422  // still a solution. We keep it on this process.
1423  procMin = procMax = comm_->getRank();
1424  }
1425  else if (onePartPerProc_){
1426  procMin = procMax = int(partId);
1427  }
1428  else if (procDist_.size() > 0){
1429  if (procDistEquallySpread_) {
1430  // Avoid binary search.
1431  double fProcs = comm_->getSize();
1432  double fParts = nGlobalParts_;
1433  double each = fParts / fProcs;
1434  procMin = int(partId / each);
1435  while (procDist_[procMin] > partId) procMin--;
1436  while (procDist_[procMin+1] <= partId) procMin++;
1437  procMax = procMin;
1438  }
1439  else {
1440  // find the first p such that procDist_[p] > partId.
1441  // For now, do a binary search.
1442 
1443  typename std::vector<part_t>::const_iterator entry;
1444  entry = std::upper_bound(procDist_.begin(), procDist_.end(), partId);
1445 
1446  size_t procIdx = entry - procDist_.begin();
1447  procMin = procMax = int(procIdx) - 1;
1448  }
1449  }
1450  else{
1451  procMin = partDist_[partId];
1452  procMax = partDist_[partId+1] - 1;
1453  }
1454 }
1455 
1456 template <typename Adapter>
1457  bool PartitioningSolution<Adapter>::criteriaHaveSamePartSizes(
1458  int c1, int c2) const
1459 {
1460  if (c1 < 0 || c1 >= nWeightsPerObj_ || c2 < 0 || c2 >= nWeightsPerObj_ )
1461  throw std::logic_error("criteriaHaveSamePartSizes error");
1462 
1463  bool theSame = false;
1464 
1465  if (c1 == c2)
1466  theSame = true;
1467 
1468  else if (pSizeUniform_[c1] == true && pSizeUniform_[c2] == true)
1469  theSame = true;
1470 
1471  else if (pCompactIndex_[c1].size() == pCompactIndex_[c2].size()){
1472  theSame = true;
1473  bool useIndex = pCompactIndex_[c1].size() > 0;
1474  if (useIndex){
1475  for (part_t p=0; theSame && p < nGlobalParts_; p++)
1476  if (pSize_[c1][pCompactIndex_[c1][p]] !=
1477  pSize_[c2][pCompactIndex_[c2][p]])
1478  theSame = false;
1479  }
1480  else{
1481  for (part_t p=0; theSame && p < nGlobalParts_; p++)
1482  if (pSize_[c1][p] != pSize_[c2][p])
1483  theSame = false;
1484  }
1485  }
1486 
1487  return theSame;
1488 }
1489 
1505 template <typename Adapter>
1506  void PartitioningSolution<Adapter>::partToProc(
1507  bool doCheck, bool haveNumLocalParts, bool haveNumGlobalParts,
1508  int numLocalParts, int numGlobalParts)
1509 {
1510 #ifdef _MSC_VER
1511  typedef SSIZE_T ssize_t;
1512 #endif
1513  int nprocs = comm_->getSize();
1514  ssize_t reducevals[4];
1515  ssize_t sumHaveGlobal=0, sumHaveLocal=0;
1516  ssize_t sumGlobal=0, sumLocal=0;
1517  ssize_t maxGlobal=0, maxLocal=0;
1518  ssize_t vals[4] = {haveNumGlobalParts, haveNumLocalParts,
1519  numGlobalParts, numLocalParts};
1520 
1521  partDist_.clear();
1522  procDist_.clear();
1523 
1524  if (doCheck){
1525 
1526  try{
1527  reduceAll<int, ssize_t>(*comm_, Teuchos::REDUCE_SUM, 4, vals, reducevals);
1528  }
1529  Z2_THROW_OUTSIDE_ERROR(*env_);
1530 
1531  sumHaveGlobal = reducevals[0];
1532  sumHaveLocal = reducevals[1];
1533  sumGlobal = reducevals[2];
1534  sumLocal = reducevals[3];
1535 
1536  env_->localInputAssertion(__FILE__, __LINE__,
1537  "Either all procs specify num_global/local_parts or none do",
1538  (sumHaveGlobal == 0 || sumHaveGlobal == nprocs) &&
1539  (sumHaveLocal == 0 || sumHaveLocal == nprocs),
1540  BASIC_ASSERTION);
1541  }
1542  else{
1543  if (haveNumLocalParts)
1544  sumLocal = numLocalParts * nprocs;
1545  if (haveNumGlobalParts)
1546  sumGlobal = numGlobalParts * nprocs;
1547 
1548  sumHaveGlobal = haveNumGlobalParts ? nprocs : 0;
1549  sumHaveLocal = haveNumLocalParts ? nprocs : 0;
1550 
1551  maxLocal = numLocalParts;
1552  maxGlobal = numGlobalParts;
1553  }
1554 
1555  if (!haveNumLocalParts && !haveNumGlobalParts){
1556  onePartPerProc_ = true; // default if user did not specify
1557  return;
1558  }
1559 
1560  if (haveNumGlobalParts){
1561  if (doCheck){
1562  vals[0] = numGlobalParts;
1563  vals[1] = numLocalParts;
1564  try{
1565  reduceAll<int, ssize_t>(
1566  *comm_, Teuchos::REDUCE_MAX, 2, vals, reducevals);
1567  }
1568  Z2_THROW_OUTSIDE_ERROR(*env_);
1569 
1570  maxGlobal = reducevals[0];
1571  maxLocal = reducevals[1];
1572 
1573  env_->localInputAssertion(__FILE__, __LINE__,
1574  "Value for num_global_parts is different on different processes.",
1575  maxGlobal * nprocs == sumGlobal, BASIC_ASSERTION);
1576  }
1577 
1578  if (sumLocal){
1579  env_->localInputAssertion(__FILE__, __LINE__,
1580  "Sum of num_local_parts does not equal requested num_global_parts",
1581  sumLocal == numGlobalParts, BASIC_ASSERTION);
1582 
1583  if (sumLocal == nprocs && maxLocal == 1){
1584  onePartPerProc_ = true; // user specified one part per proc
1585  return;
1586  }
1587  }
1588  else{
1589  if (maxGlobal == nprocs){
1590  onePartPerProc_ = true; // user specified num parts is num procs
1591  return;
1592  }
1593  }
1594  }
1595 
1596  // If we are here, we do not have #parts == #procs.
1597 
1598  if (sumHaveLocal == nprocs){
1599  //
1600  // We will go by the number of local parts specified.
1601  //
1602 
1603  try{
1604  procDist_.resize(nprocs+1);
1605  }
1606  catch (std::exception &){
1607  throw(std::bad_alloc());
1608  }
1609 
1610  part_t *procArray = &procDist_[0];
1611 
1612  try{
1613  part_t tmp = part_t(numLocalParts);
1614  gatherAll<int, part_t>(*comm_, 1, &tmp, nprocs, procArray + 1);
1615  }
1616  Z2_THROW_OUTSIDE_ERROR(*env_);
1617 
1618  procArray[0] = 0;
1619 
1620  for (int proc=0; proc < nprocs; proc++)
1621  procArray[proc+1] += procArray[proc];
1622  }
1623  else{
1624  //
1625  // We will allocate global number of parts to the processes.
1626  //
1627  double fParts = numGlobalParts;
1628  double fProcs = nprocs;
1629 
1630  if (fParts < fProcs){
1631 
1632  try{
1633  partDist_.resize(size_t(fParts+1));
1634  }
1635  catch (std::exception &){
1636  throw(std::bad_alloc());
1637  }
1638 
1639  int *partArray = &partDist_[0];
1640 
1641  double each = floor(fProcs / fParts);
1642  double extra = fmod(fProcs, fParts);
1643  partDist_[0] = 0;
1644 
1645  for (part_t part=0; part < numGlobalParts; part++){
1646  int numOwners = int(each + ((part<extra) ? 1 : 0));
1647  partArray[part+1] = partArray[part] + numOwners;
1648  }
1649 
1650  env_->globalBugAssertion(__FILE__, __LINE__, "#parts != #procs",
1651  partDist_[numGlobalParts] == nprocs, COMPLEX_ASSERTION, comm_);
1652  }
1653  else if (fParts > fProcs){
1654 
1655  // User did not specify local number of parts per proc;
1656  // Distribute the parts evenly among the procs.
1657 
1658  procDistEquallySpread_ = true;
1659 
1660  try{
1661  procDist_.resize(size_t(fProcs+1));
1662  }
1663  catch (std::exception &){
1664  throw(std::bad_alloc());
1665  }
1666 
1667  part_t *procArray = &procDist_[0];
1668 
1669  double each = floor(fParts / fProcs);
1670  double extra = fmod(fParts, fProcs);
1671  procArray[0] = 0;
1672 
1673  for (int proc=0; proc < nprocs; proc++){
1674  part_t numParts = part_t(each + ((proc<extra) ? 1 : 0));
1675  procArray[proc+1] = procArray[proc] + numParts;
1676  }
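 // Worked example (made-up sizes): 10 parts over 4 processes gives
 // each = 2 and extra = 2, so procDist_ = {0, 3, 6, 8, 10}; the two
 // extra parts go to the first two processes.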
1677 
1678  env_->globalBugAssertion(__FILE__, __LINE__, "#parts != #procs",
1679  procDist_[nprocs] == numGlobalParts, COMPLEX_ASSERTION, comm_);
1680  }
1681  else{
1682  env_->globalBugAssertion(__FILE__, __LINE__,
1683  "should never get here", 1, COMPLEX_ASSERTION, comm_);
1684  }
1685  }
1686 }
1687 
1689 // Remap a new part assignment vector for maximum overlap with an input
1690 // part assignment.
1691 //
1692 // Assumptions for this version:
1693 // input part assignment == processor rank for every local object.
1694 // assuming nGlobalParts_ <= num ranks
1695 // TODO: Write a version that takes the input part number as input;
1696 // this change requires input parts in adapters to be provided in
1697 // the Adapter.
1698 // TODO: For repartitioning, compare to old remapping results; see Zoltan1.
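 // Illustrative example (values are made up): with 2 ranks and 2 parts,
 // suppose rank 0's objects mostly land in new part 1 and rank 1's in
 // new part 0. The bipartite graph has rank vertices {0,1} and part
 // vertices {2,3} (parts offset by np=2); the heavy edges are (0,3) and
 // (1,2), so the matching yields remap = {1, 0}. Relabeling parts through
 // remap lets most objects keep their current owning rank.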
1699 
1700 template <typename Adapter>
1701 void PartitioningSolution<Adapter>::RemapParts()
1702 {
1703  size_t len = parts_.size();
1704 
1705  part_t me = comm_->getRank();
1706  int np = comm_->getSize();
1707 
1708  if (np < nGlobalParts_) {
1709  if (me == 0) {
1710  std::ostringstream msg;
1711  msg << "Remapping not yet supported for "
1712  << "num_global_parts " << nGlobalParts_
1713  << " > num procs " << np << std::endl;
1714  env_->debug(DETAILED_STATUS, msg.str());
1715  }
1716  return;
1717  }
1718  // Build edges of a bipartite graph with np + nGlobalParts_ vertices,
1719  // and edges between a process vtx and any parts to which that process'
1720  // objects are assigned.
1721  // Weight edge[parts_[i]] by the number of objects that are going from
1722  // this rank to parts_[i].
1723  // We use a std::map, assuming the number of unique parts in the parts_ array
1724  // is small to keep the binary search efficient.
1725  // TODO We use the count of objects to move; should change to SIZE of objects
1726  // to move; need SIZE function in Adapter.
1727 
1728  std::map<part_t, long> edges;
1729  long lstaying = 0; // Total num of local objects staying if we keep the
1730  // current mapping. TODO: change to SIZE of local objs
1731  long gstaying = 0; // Total num of objects staying in the current partition
1732 
1733  for (size_t i = 0; i < len; i++) {
1734  edges[parts_[i]]++; // TODO Use obj size instead of count
1735  if (parts_[i] == me) lstaying++; // TODO Use obj size instead of count
1736  }
1737 
1738  Teuchos::reduceAll<int, long>(*comm_, Teuchos::REDUCE_SUM, 1,
1739  &lstaying, &gstaying);
1740 //TODO if (gstaying == Adapter::getGlobalNumObjs()) return; // Nothing to do
1741 
1742  part_t *remap = NULL;
1743 
1744  int nedges = edges.size();
1745 
1746  // Gather the graph to rank 0.
1747  part_t tnVtx = np + nGlobalParts_; // total # vertices
1748  int *idx = NULL; // Pointer index into graph adjacencies
1749  int *sizes = NULL; // nedges per rank
1750  if (me == 0) {
1751  idx = new int[tnVtx+1];
1752  sizes = new int[np];
1753  sizes[0] = nedges;
1754  }
1755  if (np > 1)
1756  Teuchos::gather<int, int>(&nedges, 1, sizes, 1, 0, *comm_);
1757 
1758  // prefix sum to build the idx array
1759  if (me == 0) {
1760  idx[0] = 0;
1761  for (int i = 0; i < np; i++)
1762  idx[i+1] = idx[i] + sizes[i];
1763  }
1764 
1765  // prepare to send edges
1766  int cnt = 0;
1767  part_t *bufv = NULL;
1768  long *bufw = NULL;
1769  if (nedges) {
1770  bufv = new part_t[nedges];
1771  bufw = new long[nedges];
1772  // Create buffer with edges (me, part[i]) and weight edges[parts_[i]].
1773  for (typename std::map<part_t, long>::iterator it = edges.begin();
1774  it != edges.end(); it++) {
1775  bufv[cnt] = it->first; // target part
1776  bufw[cnt] = it->second; // weight
1777  cnt++;
1778  }
1779  }
1780 
1781  // Prepare to receive edges on rank 0
1782  part_t *adj = NULL;
1783  long *wgt = NULL;
1784  if (me == 0) {
1785 //SYM adj = new part_t[2*idx[np]]; // need 2x space to symmetrize later
1786 //SYM wgt = new long[2*idx[np]]; // need 2x space to symmetrize later
1787  adj = new part_t[idx[np]];
1788  wgt = new long[idx[np]];
1789  }
1790 
1791  Teuchos::gatherv<int, part_t>(bufv, cnt, adj, sizes, idx, 0, *comm_);
1792  Teuchos::gatherv<int, long>(bufw, cnt, wgt, sizes, idx, 0, *comm_);
1793  delete [] bufv;
1794  delete [] bufw;
1795 
1796  // Now have constructed graph on rank 0.
1797  // Call the matching algorithm
1798 
1799  int doRemap;
1800  if (me == 0) {
1801  // We have the "LHS" vertices of the bipartite graph; need to create
1802  // "RHS" vertices.
1803  for (int i = 0; i < idx[np]; i++) {
1804  adj[i] += np; // New RHS vertex number; offset by num LHS vertices
1805  }
1806 
1807  // Build idx for RHS vertices
1808  for (part_t i = np; i < tnVtx; i++) {
1809  idx[i+1] = idx[i]; // No edges for RHS vertices
1810  }
1811 
1812 #ifdef KDDKDD_DEBUG
1813  std::cout << "IDX ";
1814  for (part_t i = 0; i <= tnVtx; i++) std::cout << idx[i] << " ";
1815  std::cout << std::endl;
1816 
1817  std::cout << "ADJ ";
1818  for (part_t i = 0; i < idx[tnVtx]; i++) std::cout << adj[i] << " ";
1819  std::cout << std::endl;
1820 
1821  std::cout << "WGT ";
1822  for (part_t i = 0; i < idx[tnVtx]; i++) std::cout << wgt[i] << " ";
1823  std::cout << std::endl;
1824 #endif
1825 
1826  // Perform matching on the graph
1827  part_t *match = new part_t[tnVtx];
1828  for (part_t i = 0; i < tnVtx; i++) match[i] = i;
1829  part_t nmatches =
1830  Zoltan2::GreedyMWM<part_t, long>(idx, adj, wgt, tnVtx, match);
1831 
1832 #ifdef KDDKDD_DEBUG
1833  std::cout << "After matching: " << nmatches << " found" << std::endl;
1834  for (part_t i = 0; i < tnVtx; i++)
1835  std::cout << "match[" << i << "] = " << match[i]
1836  << ((match[i] != i &&
1837  (i < np && match[i] != i+np))
1838  ? " *" : " ")
1839  << std::endl;
1840 #endif
1841 
1842  // See whether there were nontrivial changes in the matching.
1843  bool nontrivial = false;
1844  if (nmatches) {
1845  for (part_t i = 0; i < np; i++) {
1846  if ((match[i] != i) && (match[i] != (i+np))) {
1847  nontrivial = true;
1848  break;
1849  }
1850  }
1851  }
1852 
1853  // Process the matches
1854  if (nontrivial) {
1855  remap = new part_t[nGlobalParts_];
1856  for (part_t i = 0; i < nGlobalParts_; i++) remap[i] = -1;
1857 
1858  bool *used = new bool[np];
1859  for (part_t i = 0; i < np; i++) used[i] = false;
1860 
1861  // First, process all matched parts
1862  for (part_t i = 0; i < nGlobalParts_; i++) {
1863  part_t tmp = i + np;
1864  if (match[tmp] != tmp) {
1865  remap[i] = match[tmp];
1866  used[match[tmp]] = true;
1867  }
1868  }
1869 
1870  // Second, process unmatched parts; keep same part number if possible
1871  for (part_t i = 0; i < nGlobalParts_; i++) {
1872  if (remap[i] > -1) continue;
1873  if (!used[i]) {
1874  remap[i] = i;
1875  used[i] = true;
1876  }
1877  }
1878 
1879  // Third, process unmatched parts; give them the next unused part
1880  for (part_t i = 0, uidx = 0; i < nGlobalParts_; i++) {
1881  if (remap[i] > -1) continue;
1882  while (used[uidx]) uidx++;
1883  remap[i] = uidx;
1884  used[uidx] = true;
1885  }
1886  delete [] used;
1887  }
1888  delete [] match;
1889 
1890 #ifdef KDDKDD_DEBUG
1891  std::cout << "Remap vector: ";
1892  for (part_t i = 0; i < nGlobalParts_; i++) std::cout << remap[i] << " ";
1893  std::cout << std::endl;
1894 #endif
1895 
1896  long newgstaying = measure_stays(remap, idx, adj, wgt,
1897  nGlobalParts_, np);
1898  doRemap = (newgstaying > gstaying);
1899  std::ostringstream msg;
1900  msg << "gstaying " << gstaying << " measure(input) "
1901  << measure_stays(NULL, idx, adj, wgt, nGlobalParts_, np)
1902  << " newgstaying " << newgstaying
1903  << " nontrivial " << nontrivial
1904  << " doRemap " << doRemap << std::endl;
1905  env_->debug(DETAILED_STATUS, msg.str());
1906  }
1907  delete [] idx;
1908  delete [] sizes;
1909  delete [] adj;
1910  delete [] wgt;
1911 
1912  Teuchos::broadcast<int, int>(*comm_, 0, 1, &doRemap);
1913 
1914  if (doRemap) {
1915  if (me != 0) remap = new part_t[nGlobalParts_];
1916  Teuchos::broadcast<int, part_t>(*comm_, 0, nGlobalParts_, remap);
1917  for (size_t i = 0; i < len; i++) {
1918  parts_[i] = remap[parts_[i]];
1919  }
1920  }
1921 
1922  delete [] remap; // TODO May want to keep for repartitioning as in Zoltan
1923 }
1924 
1925 
1926 } // namespace Zoltan2
1927 
1928 #endif