Zoltan2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Zoltan2_PartitioningSolution.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Zoltan2: A package of combinatorial algorithms for scientific computing
4 //
5 // Copyright 2012 NTESS and the Zoltan2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
14 #ifndef _ZOLTAN2_PARTITIONINGSOLUTION_HPP_
15 #define _ZOLTAN2_PARTITIONINGSOLUTION_HPP_
16 
17 namespace Zoltan2 {
18 template <typename Adapter>
20 }
21 
22 #include <Zoltan2_Environment.hpp>
23 #include <Zoltan2_Solution.hpp>
24 #include <Zoltan2_GreedyMWM.hpp>
25 #include <Zoltan2_Algorithm.hpp>
27 #include <cmath>
28 #include <algorithm>
29 #include <vector>
30 #include <limits>
31 #include <sstream>
32 
33 #ifdef _MSC_VER
34 #define NOMINMAX
35 #include <windows.h>
36 #endif
37 
38 
39 namespace Zoltan2 {
40 
55 template <typename Adapter>
56  class PartitioningSolution : public Solution
57 {
58 public:
59 
60 #ifndef DOXYGEN_SHOULD_SKIP_THIS
61  typedef typename Adapter::gno_t gno_t;
62  typedef typename Adapter::scalar_t scalar_t;
63  typedef typename Adapter::lno_t lno_t;
64  typedef typename Adapter::part_t part_t;
65  typedef typename Adapter::user_t user_t;
66 #endif
67 
85  PartitioningSolution( const RCP<const Environment> &env,
86  const RCP<const Comm<int> > &comm,
87  int nUserWeights,
88  const RCP<Algorithm<Adapter> > &algorithm = Teuchos::null);
89 
119  PartitioningSolution(const RCP<const Environment> &env,
120  const RCP<const Comm<int> > &comm,
121  int nUserWeights, ArrayView<ArrayRCP<part_t> > reqPartIds,
122  ArrayView<ArrayRCP<scalar_t> > reqPartSizes,
123  const RCP<Algorithm<Adapter> > &algorithm = Teuchos::null);
124 
126  // Information that the algorithm may wish to query.
127 
130  size_t getTargetGlobalNumberOfParts() const { return nGlobalParts_; }
131 
134  size_t getActualGlobalNumberOfParts() const { return nGlobalPartsSolution_; }
135 
138  size_t getLocalNumberOfParts() const { return nLocalParts_; }
139 
147  scalar_t getLocalFractionOfPart() const { return localFraction_; }
148 
158  bool oneToOnePartDistribution() const { return onePartPerProc_; }
159 
179  const int *getPartDistribution() const {
180  if (partDist_.size() > 0) return &partDist_[0];
181  else return NULL;
182  }
183 
200  const part_t *getProcDistribution() const {
201  if (procDist_.size() > 0) return &procDist_[0];
202  else return NULL;
203  }
204 
208  int getNumberOfCriteria() const { return nWeightsPerObj_; }
209 
210 
217  bool criteriaHasUniformPartSizes(int idx) const { return pSizeUniform_[idx];}
218 
231  scalar_t getCriteriaPartSize(int idx, part_t part) const {
232  if (pSizeUniform_[idx])
233  return 1.0 / nGlobalParts_;
234  else if (pCompactIndex_[idx].size())
235  return pSize_[idx][pCompactIndex_[idx][part]];
236  else
237  return pSize_[idx][part];
238  }
239 
253  bool criteriaHaveSamePartSizes(int c1, int c2) const;
254 
256  // Method used by the algorithm to set results.
257 
284  void setParts(ArrayRCP<part_t> &partList);
285 
287 
299  void RemapParts();
300 
302  /* Return the weight of objects staying with a given remap.
303  * If remap is NULL, compute weight of objects staying with given partition
304  */
305  long measure_stays(part_t *remap, int *idx, part_t *adj, long *wgt,
306  part_t nrhs, part_t nlhs)
307  {
308  long staying = 0;
309  for (part_t i = 0; i < nrhs; i++) {
310  part_t k = (remap ? remap[i] : i);
311  for (part_t j = idx[k]; j < idx[k+1]; j++) {
312  if (i == (adj[j]-nlhs)) {
313  staying += wgt[j];
314  break;
315  }
316  }
317  }
318  return staying;
319  }
320 
322  // Results that may be queried by the user, by migration methods,
323  // or by metric calculation methods.
324  // We return raw pointers so users don't have to learn about our
325  // pointer wrappers.
326 
329  inline const RCP<const Comm<int> > &getCommunicator() const { return comm_;}
330 
333  inline const RCP<const Environment> &getEnvironment() const { return env_;}
334 
337  const part_t *getPartListView() const {
338  if (parts_.size() > 0) return parts_.getRawPtr();
339  else return NULL;
340  }
341 
346  const int *getProcListView() const {
347  if (procs_.size() > 0) return procs_.getRawPtr();
348  else return NULL;
349  }
350 
353  virtual bool isPartitioningTreeBinary() const
354  {
355  if (this->algorithm_ == Teuchos::null)
356  throw std::logic_error("no partitioning algorithm has been run yet");
357  return this->algorithm_->isPartitioningTreeBinary();
358  }
359 
362  void getPartitionTree(part_t & numTreeVerts,
363  std::vector<part_t> & permPartNums,
364  std::vector<part_t> & splitRangeBeg,
365  std::vector<part_t> & splitRangeEnd,
366  std::vector<part_t> & treeVertParents) const {
367 
368  part_t numParts = static_cast<part_t>(getTargetGlobalNumberOfParts());
369 
370  if (this->algorithm_ == Teuchos::null)
371  throw std::logic_error("no partitioning algorithm has been run yet");
372  this->algorithm_->getPartitionTree(
373  numParts, // may want to change how this is passed through
374  numTreeVerts,
375  permPartNums,
376  splitRangeBeg,
377  splitRangeEnd,
378  treeVertParents);
379  }
380 
383  std::vector<Zoltan2::coordinateModelPartBox> &
385  {
386  return this->algorithm_->getPartBoxesView();
387  }
388 
390  // when a point lies on a part boundary, the lowest part
391  // number on that boundary is returned.
392  // Note that not all partitioning algorithms will support
393  // this method.
394  //
395  // \param dim : the number of dimensions specified for the point in space
396  // \param point : the coordinates of the point in space; array of size dim
397  // \return the part number of a part overlapping the given point
398  part_t pointAssign(int dim, scalar_t *point) const
399  {
400  part_t p;
401  try {
402  if (this->algorithm_ == Teuchos::null)
403  throw std::logic_error("no partitioning algorithm has been run yet");
404 
405  p = this->algorithm_->pointAssign(dim, point);
406  }
408  return p;
409  }
410 
412  // This method allocates memory for the return argument, but does not
413  // control that memory. The user is responsible for freeing the
414  // memory.
415  //
416  // \param dim : (in) the number of dimensions specified for the box
417  // \param lower : (in) the coordinates of the lower corner of the box;
418  // array of size dim
419  // \param upper : (in) the coordinates of the upper corner of the box;
420  // array of size dim
421  // \param nPartsFound : (out) the number of parts overlapping the box
422  // \param partsFound : (out) array of parts overlapping the box
423  void boxAssign(int dim, scalar_t *lower, scalar_t *upper,
424  size_t &nPartsFound, part_t **partsFound) const
425  {
426  try {
427  if (this->algorithm_ == Teuchos::null)
428  throw std::logic_error("no partitioning algorithm has been run yet");
429 
430  this->algorithm_->boxAssign(dim, lower, upper, nPartsFound, partsFound);
431  }
433  }
434 
435 
438  void getCommunicationGraph(ArrayRCP <part_t> &comXAdj,
439  ArrayRCP <part_t> &comAdj) const
440  {
441  try {
442  if (this->algorithm_ == Teuchos::null)
443  throw std::logic_error("no partitioning algorithm has been run yet");
444 
445  this->algorithm_->getCommunicationGraph(this, comXAdj, comAdj);
446  }
448  }
449 
467  void getPartsForProc(int procId, double &numParts, part_t &partMin,
468  part_t &partMax) const
469  {
470  env_->localInputAssertion(__FILE__, __LINE__, "invalid process id",
471  procId >= 0 && procId < comm_->getSize(), BASIC_ASSERTION);
472 
473  procToPartsMap(procId, numParts, partMin, partMax);
474  }
475 
487  void getProcsForPart(part_t partId, part_t &procMin, part_t &procMax) const
488  {
489  env_->localInputAssertion(__FILE__, __LINE__, "invalid part id",
490  partId >= 0 && partId < nGlobalParts_, BASIC_ASSERTION);
491 
492  partToProcsMap(partId, procMin, procMax);
493  }
494 
495 private:
496  void partToProc(bool doCheck, bool haveNumLocalParts, bool haveNumGlobalParts,
497  int numLocalParts, int numGlobalParts);
498 
499  void procToPartsMap(int procId, double &numParts, part_t &partMin,
500  part_t &partMax) const;
501 
502  void partToProcsMap(part_t partId, int &procMin, int &procMax) const;
503 
504  void setPartDistribution();
505 
506  void setPartSizes(ArrayView<ArrayRCP<part_t> > reqPartIds,
507  ArrayView<ArrayRCP<scalar_t> > reqPartSizes);
508 
509  void computePartSizes(int widx, ArrayView<part_t> ids,
510  ArrayView<scalar_t> sizes);
511 
512  void broadcastPartSizes(int widx);
513 
514 
515  RCP<const Environment> env_; // has application communicator
516  const RCP<const Comm<int> > comm_; // the problem communicator
517 
518  //part box boundaries as a result of geometric partitioning algorithm.
519  RCP<std::vector<Zoltan2::coordinateModelPartBox> > partBoxes;
520 
521  part_t nGlobalParts_;// target global number of parts
522  part_t nLocalParts_; // number of parts to be on this process
523 
524  scalar_t localFraction_; // approx fraction of a part on this process
525  int nWeightsPerObj_; // if user has no weights, this is 1 TODO: WHY???
526 
527  // If process p is to be assigned part p for all p, then onePartPerProc_
528  // is true. Otherwise it is false, and either procDist_ or partDist_
529  // describes the allocation of parts to processes.
530  //
531  // If parts are never split across processes, then procDist_ is defined
532  // as follows:
533  //
534  // partId = procDist_[procId]
535  // partIdNext = procDist_[procId+1]
536  // globalNumberOfParts = procDist_[numProcs]
537  //
538  // meaning that the parts assigned to process procId range from
539  // [partId, partIdNext). If partIdNext is the same as partId, then
540  // process procId has no parts.
541  //
542  // If the number parts is less than the number of processes, and the
543  // user did not specify "num_local_parts" for each of the processes, then
544  // parts are split across processes, and partDist_ is defined rather than
545  // procDist_.
546  //
547  // procId = partDist_[partId]
548  // procIdNext = partDist_[partId+1]
549  // globalNumberOfProcs = partDist_[numParts]
550  //
551  // which implies that the part partId is shared by processes in the
552  // the range [procId, procIdNext).
553  //
554  // We use std::vector so we can use upper_bound algorithm
555 
556  bool onePartPerProc_; // either this is true...
557  std::vector<int> partDist_; // or this is defined ...
558  std::vector<part_t> procDist_; // or this is defined.
559  bool procDistEquallySpread_; // if procDist_ is used and
560  // #parts > #procs and
561  // num_local_parts is not specified,
562  // parts are evenly distributed to procs
563 
564  // In order to minimize the storage required for part sizes, we
565  // have three different representations.
566  //
567  // If the part sizes for weight index w are all the same, then:
568  // pSizeUniform_[w] = true
569  // pCompactIndex_[w].size() = 0
570  // pSize_[w].size() = 0
571  //
572  // and the size for part p is 1.0 / nparts.
573  //
574  // If part sizes differ for each part in weight index w, but there
575  // are no more than 64 distinct sizes:
576  // pSizeUniform_[w] = false
577  // pCompactIndex_[w].size() = number of parts
578  // pSize_[w].size() = number of different sizes
579  //
580  // and the size for part p is pSize_[pCompactIndex_[p]].
581  //
582  // If part sizes differ for each part in weight index w, and there
583  // are more than 64 distinct sizes:
584  // pSizeUniform_[w] = false
585  // pCompactIndex_[w].size() = 0
586  // pSize_[w].size() = nparts
587  //
588  // and the size for part p is pSize_[p].
589  //
590  // NOTE: If we expect to have similar cases, i.e. a long list of scalars
591  // where it is highly possible that just a few unique values appear,
592  // then we may want to make this a class. The advantage is that we
593  // save a long list of 1-byte indices instead of a long list of scalars.
594 
595  ArrayRCP<bool> pSizeUniform_;
596  ArrayRCP<ArrayRCP<unsigned char> > pCompactIndex_;
597  ArrayRCP<ArrayRCP<scalar_t> > pSize_;
598 
600  // The algorithm sets these values upon completion.
601 
602  ArrayRCP<part_t> parts_; // part number assigned to localid[i]
603 
604  bool haveSolution_;
605 
606  part_t nGlobalPartsSolution_; // global number of parts in solution
607 
609  // The solution calculates this from the part assignments,
610  // unless onePartPerProc_.
611 
612  ArrayRCP<int> procs_; // process rank assigned to localid[i]
613 
615  // Algorithm used to compute the solution;
616  // needed for post-processing with pointAssign or getCommunicationGraph
617  const RCP<Algorithm<Adapter> > algorithm_; //
618 };
619 
621 // Definitions
623 
624 template <typename Adapter>
626  const RCP<const Environment> &env,
627  const RCP<const Comm<int> > &comm,
628  int nUserWeights,
629  const RCP<Algorithm<Adapter> > &algorithm)
630  : env_(env), comm_(comm),
631  partBoxes(),
632  nGlobalParts_(0), nLocalParts_(0),
633  localFraction_(0), nWeightsPerObj_(),
634  onePartPerProc_(false), partDist_(), procDist_(),
635  procDistEquallySpread_(false),
636  pSizeUniform_(), pCompactIndex_(), pSize_(),
637  parts_(), haveSolution_(false), nGlobalPartsSolution_(0),
638  procs_(), algorithm_(algorithm)
639 {
640  nWeightsPerObj_ = (nUserWeights ? nUserWeights : 1); // TODO: WHY?? WHY NOT ZERO?
641 
642  setPartDistribution();
643 
644  // We must call setPartSizes() because part sizes may have
645  // been provided by the user on other processes.
646 
647  ArrayRCP<part_t> *noIds = new ArrayRCP<part_t> [nWeightsPerObj_];
648  ArrayRCP<scalar_t> *noSizes = new ArrayRCP<scalar_t> [nWeightsPerObj_];
649  ArrayRCP<ArrayRCP<part_t> > ids(noIds, 0, nWeightsPerObj_, true);
650  ArrayRCP<ArrayRCP<scalar_t> > sizes(noSizes, 0, nWeightsPerObj_, true);
651 
652  setPartSizes(ids.view(0, nWeightsPerObj_), sizes.view(0, nWeightsPerObj_));
653 
654  env_->memory("After construction of solution");
655 }
656 
657 template <typename Adapter>
659  const RCP<const Environment> &env,
660  const RCP<const Comm<int> > &comm,
661  int nUserWeights,
662  ArrayView<ArrayRCP<part_t> > reqPartIds,
663  ArrayView<ArrayRCP<scalar_t> > reqPartSizes,
664  const RCP<Algorithm<Adapter> > &algorithm)
665  : env_(env), comm_(comm),
666  partBoxes(),
667  nGlobalParts_(0), nLocalParts_(0),
668  localFraction_(0), nWeightsPerObj_(),
669  onePartPerProc_(false), partDist_(), procDist_(),
670  procDistEquallySpread_(false),
671  pSizeUniform_(), pCompactIndex_(), pSize_(),
672  parts_(), haveSolution_(false), nGlobalPartsSolution_(0),
673  procs_(), algorithm_(algorithm)
674 {
675  nWeightsPerObj_ = (nUserWeights ? nUserWeights : 1); // TODO: WHY?? WHY NOT ZERO?
676 
677  setPartDistribution();
678 
679  setPartSizes(reqPartIds, reqPartSizes);
680 
681  env_->memory("After construction of solution");
682 }
683 
684 template <typename Adapter>
686 {
687  // Did the caller define num_global_parts and/or num_local_parts?
688 
689  const ParameterList &pl = env_->getParameters();
690  size_t haveGlobalNumParts=0, haveLocalNumParts=0;
691  int numLocal=0, numGlobal=0;
692 
693  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("num_global_parts");
694 
695  if (pe){
696  haveGlobalNumParts = 1;
697  nGlobalParts_ = part_t(pe->getValue(&nGlobalParts_));
698  numGlobal = nGlobalParts_;
699  }
700 
701  pe = pl.getEntryPtr("num_local_parts");
702 
703  if (pe){
704  haveLocalNumParts = 1;
705  nLocalParts_ = part_t(pe->getValue(&nLocalParts_));
706  numLocal = nLocalParts_;
707  }
708 
709  try{
710  // Sets onePartPerProc_, partDist_, and procDist_
711 
712  partToProc(true, haveLocalNumParts, haveGlobalNumParts,
713  numLocal, numGlobal);
714  }
716 
717  int nprocs = comm_->getSize();
718  int rank = comm_->getRank();
719 
720  if (onePartPerProc_){
721  nGlobalParts_ = nprocs;
722  nLocalParts_ = 1;
723  }
724  else if (partDist_.size() > 0){ // more procs than parts
725  nGlobalParts_ = partDist_.size() - 1;
726  int pstart = partDist_[0];
727  for (part_t i=1; i <= nGlobalParts_; i++){
728  int pend = partDist_[i];
729  if (rank >= pstart && rank < pend){
730  int numOwners = pend - pstart;
731  nLocalParts_ = 1;
732  localFraction_ = 1.0 / numOwners;
733  break;
734  }
735  pstart = pend;
736  }
737  }
738  else if (procDist_.size() > 0){ // more parts than procs
739  nGlobalParts_ = procDist_[nprocs];
740  nLocalParts_ = procDist_[rank+1] - procDist_[rank];
741  }
742  else {
743  throw std::logic_error("partToProc error");
744  }
745 
746 }
747 
748 template <typename Adapter>
749  void PartitioningSolution<Adapter>::setPartSizes(
750  ArrayView<ArrayRCP<part_t> > ids, ArrayView<ArrayRCP<scalar_t> > sizes)
751 {
752  int widx = nWeightsPerObj_;
753  bool fail=false;
754 
755  size_t *countBuf = new size_t [widx*2];
756  ArrayRCP<size_t> counts(countBuf, 0, widx*2, true);
757 
758  fail = ((ids.size() != widx) || (sizes.size() != widx));
759 
760  for (int w=0; !fail && w < widx; w++){
761  counts[w] = ids[w].size();
762  if (ids[w].size() != sizes[w].size()) fail=true;
763  }
764 
765  env_->globalBugAssertion(__FILE__, __LINE__, "bad argument arrays", fail==0,
766  COMPLEX_ASSERTION, comm_);
767 
768  // Are all part sizes the same? This is the common case.
769 
770  ArrayRCP<scalar_t> *emptySizes= new ArrayRCP<scalar_t> [widx];
771  pSize_ = arcp(emptySizes, 0, widx);
772 
773  ArrayRCP<unsigned char> *emptyIndices= new ArrayRCP<unsigned char> [widx];
774  pCompactIndex_ = arcp(emptyIndices, 0, widx);
775 
776  bool *info = new bool [widx];
777  pSizeUniform_ = arcp(info, 0, widx);
778  for (int w=0; w < widx; w++)
779  pSizeUniform_[w] = true;
780 
781  if (nGlobalParts_ == 1){
782  return; // there's only one part in the whole problem
783  }
784 
785  size_t *ptr1 = counts.getRawPtr();
786  size_t *ptr2 = counts.getRawPtr() + widx;
787 
788  try{
789  reduceAll<int, size_t>(*comm_, Teuchos::REDUCE_MAX, widx, ptr1, ptr2);
790  }
791  Z2_THROW_OUTSIDE_ERROR(*env_);
792 
793  bool zero = true;
794 
795  for (int w=0; w < widx; w++)
796  if (counts[widx+w] > 0){
797  zero = false;
798  pSizeUniform_[w] = false;
799  }
800 
801  if (zero) // Part sizes for all criteria are uniform.
802  return;
803 
804  // Compute the part sizes for criteria for which part sizes were
805  // supplied. Normalize for each criteria so part sizes sum to one.
806 
807  int nprocs = comm_->getSize();
808  int rank = comm_->getRank();
809 
810  for (int w=0; w < widx; w++){
811  if (pSizeUniform_[w]) continue;
812 
813  // Send all ids and sizes to one process.
814  // (There is no simple gather method in Teuchos.)
815 
816  part_t length = ids[w].size();
817  part_t *allLength = new part_t [nprocs];
818  Teuchos::gatherAll<int, part_t>(*comm_, 1, &length,
819  nprocs, allLength);
820 
821  if (rank == 0){
822  int total = 0;
823  for (int i=0; i < nprocs; i++)
824  total += allLength[i];
825 
826  part_t *partNums = new part_t [total];
827  scalar_t *partSizes = new scalar_t [total];
828 
829  ArrayView<part_t> idArray(partNums, total);
830  ArrayView<scalar_t> sizeArray(partSizes, total);
831 
832  if (length > 0){
833  for (int i=0; i < length; i++){
834  *partNums++ = ids[w][i];
835  *partSizes++ = sizes[w][i];
836  }
837  }
838 
839  for (int p=1; p < nprocs; p++){
840  if (allLength[p] > 0){
841  Teuchos::receive<int, part_t>(*comm_, p,
842  allLength[p], partNums);
843  Teuchos::receive<int, scalar_t>(*comm_, p,
844  allLength[p], partSizes);
845  partNums += allLength[p];
846  partSizes += allLength[p];
847  }
848  }
849 
850  delete [] allLength;
851 
852  try{
853  computePartSizes(w, idArray, sizeArray);
854  }
856 
857  delete [] idArray.getRawPtr();
858  delete [] sizeArray.getRawPtr();
859  }
860  else{
861  delete [] allLength;
862  if (length > 0){
863  Teuchos::send<int, part_t>(*comm_, length, ids[w].getRawPtr(), 0);
864  Teuchos::send<int, scalar_t>(*comm_, length, sizes[w].getRawPtr(), 0);
865  }
866  }
867 
868  broadcastPartSizes(w);
869  }
870 }
871 
872 template <typename Adapter>
873  void PartitioningSolution<Adapter>::broadcastPartSizes(int widx)
874 {
875  env_->localBugAssertion(__FILE__, __LINE__, "preallocations",
876  pSize_.size()>widx &&
877  pSizeUniform_.size()>widx && pCompactIndex_.size()>widx,
879 
880  int rank = comm_->getRank();
881  int nprocs = comm_->getSize();
882  part_t nparts = nGlobalParts_;
883 
884  if (nprocs < 2)
885  return;
886 
887  char flag=0;
888 
889  if (rank == 0){
890  if (pSizeUniform_[widx] == true)
891  flag = 1;
892  else if (pCompactIndex_[widx].size() > 0)
893  flag = 2;
894  else
895  flag = 3;
896  }
897 
898  try{
899  Teuchos::broadcast<int, char>(*comm_, 0, 1, &flag);
900  }
901  Z2_THROW_OUTSIDE_ERROR(*env_);
902 
903  if (flag == 1){
904  if (rank > 0)
905  pSizeUniform_[widx] = true;
906 
907  return;
908  }
909 
910  if (flag == 2){
911 
912  // broadcast the indices into the size list
913 
914  unsigned char *idxbuf = NULL;
915 
916  if (rank > 0){
917  idxbuf = new unsigned char [nparts];
918  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, idxbuf);
919  }
920  else{
921  env_->localBugAssertion(__FILE__, __LINE__, "index list size",
922  pCompactIndex_[widx].size() == nparts, COMPLEX_ASSERTION);
923  idxbuf = pCompactIndex_[widx].getRawPtr();
924  }
925 
926  try{
927  // broadcast of unsigned char is not supported
928  Teuchos::broadcast<int, char>(*comm_, 0, nparts,
929  reinterpret_cast<char *>(idxbuf));
930  }
931  Z2_THROW_OUTSIDE_ERROR(*env_);
932 
933  if (rank > 0)
934  pCompactIndex_[widx] = arcp(idxbuf, 0, nparts, true);
935 
936  // broadcast the list of different part sizes
937 
938  unsigned char maxIdx=0;
939  for (part_t p=0; p < nparts; p++)
940  if (idxbuf[p] > maxIdx) maxIdx = idxbuf[p];
941 
942  int numSizes = maxIdx + 1;
943 
944  scalar_t *sizeList = NULL;
945 
946  if (rank > 0){
947  sizeList = new scalar_t [numSizes];
948  env_->localMemoryAssertion(__FILE__, __LINE__, numSizes, sizeList);
949  }
950  else{
951  env_->localBugAssertion(__FILE__, __LINE__, "wrong number of sizes",
952  numSizes == pSize_[widx].size(), COMPLEX_ASSERTION);
953 
954  sizeList = pSize_[widx].getRawPtr();
955  }
956 
957  try{
958  Teuchos::broadcast<int, scalar_t>(*comm_, 0, numSizes, sizeList);
959  }
960  Z2_THROW_OUTSIDE_ERROR(*env_);
961 
962  if (rank > 0)
963  pSize_[widx] = arcp(sizeList, 0, numSizes, true);
964 
965  return;
966  }
967 
968  if (flag == 3){
969 
970  // broadcast the size of each part
971 
972  scalar_t *sizeList = NULL;
973 
974  if (rank > 0){
975  sizeList = new scalar_t [nparts];
976  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, sizeList);
977  }
978  else{
979  env_->localBugAssertion(__FILE__, __LINE__, "wrong number of sizes",
980  nparts == pSize_[widx].size(), COMPLEX_ASSERTION);
981 
982  sizeList = pSize_[widx].getRawPtr();
983  }
984 
985  try{
986  Teuchos::broadcast<int, scalar_t >(*comm_, 0, nparts, sizeList);
987  }
988  Z2_THROW_OUTSIDE_ERROR(*env_);
989 
990  if (rank > 0)
991  pSize_[widx] = arcp(sizeList, 0, nparts);
992 
993  return;
994  }
995 }
996 
997 template <typename Adapter>
998  void PartitioningSolution<Adapter>::computePartSizes(int widx,
999  ArrayView<part_t> ids, ArrayView<scalar_t> sizes)
1000 {
1001  int len = ids.size();
1002 
1003  if (len == 0){
1004  pSizeUniform_[widx] = true;
1005  return;
1006  }
1007 
1008  env_->localBugAssertion(__FILE__, __LINE__, "bad array sizes",
1009  len>0 && sizes.size()==len, COMPLEX_ASSERTION);
1010 
1011  env_->localBugAssertion(__FILE__, __LINE__, "bad index",
1012  widx>=0 && widx<nWeightsPerObj_, COMPLEX_ASSERTION);
1013 
1014  env_->localBugAssertion(__FILE__, __LINE__, "preallocations",
1015  pSize_.size()>widx &&
1016  pSizeUniform_.size()>widx && pCompactIndex_.size()>widx,
1018 
1019  // Check ids and sizes and find min, max and average sizes.
1020  // If sizes are very close to uniform, call them uniform parts.
1021 
1022  part_t nparts = nGlobalParts_;
1023  unsigned char *buf = new unsigned char [nparts];
1024  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, buf);
1025  memset(buf, 0, nparts);
1026  ArrayRCP<unsigned char> partIdx(buf, 0, nparts, true);
1027 
1028  scalar_t epsilon = 10e-5 / nparts;
1029  scalar_t min=sizes[0], max=sizes[0], sum=0;
1030 
1031  for (int i=0; i < len; i++){
1032  part_t id = ids[i];
1033  scalar_t size = sizes[i];
1034 
1035  env_->localInputAssertion(__FILE__, __LINE__, "invalid part id",
1036  id>=0 && id<nparts, BASIC_ASSERTION);
1037 
1038  env_->localInputAssertion(__FILE__, __LINE__, "invalid part size", size>=0,
1039  BASIC_ASSERTION);
1040 
1041  // TODO: we could allow users to specify multiple sizes for the same
1042  // part if we add a parameter that says what we are to do with them:
1043  // add them or take the max.
1044 
1045  env_->localInputAssertion(__FILE__, __LINE__,
1046  "multiple sizes provided for one part", partIdx[id]==0, BASIC_ASSERTION);
1047 
1048  partIdx[id] = 1; // mark that we have a size for this part
1049 
1050  if (size < min) min = size;
1051  if (size > max) max = size;
1052  sum += size;
1053  }
1054 
1055  if (sum == 0){
1056 
1057  // User has given us a list of parts of size 0, we'll set
1058  // the rest to them to equally sized parts.
1059 
1060  scalar_t *allSizes = new scalar_t [2];
1061  env_->localMemoryAssertion(__FILE__, __LINE__, 2, allSizes);
1062 
1063  ArrayRCP<scalar_t> sizeArray(allSizes, 0, 2, true);
1064 
1065  int numNonZero = nparts - len;
1066 
1067  allSizes[0] = 0.0;
1068  allSizes[1] = 1.0 / numNonZero;
1069 
1070  for (part_t p=0; p < nparts; p++)
1071  buf[p] = 1; // index to default part size
1072 
1073  for (int i=0; i < len; i++)
1074  buf[ids[i]] = 0; // index to part size zero
1075 
1076  pSize_[widx] = sizeArray;
1077  pCompactIndex_[widx] = partIdx;
1078 
1079  return;
1080  }
1081 
1082  if (max - min <= epsilon){
1083  pSizeUniform_[widx] = true;
1084  return;
1085  }
1086 
1087  // A size for parts that were not specified:
1088  scalar_t avg = sum / nparts;
1089 
1090  // We are going to merge part sizes that are very close. This takes
1091  // computation time now, but can save considerably in the storage of
1092  // all part sizes on each process. For example, a common case may
1093  // be some parts are size 1 and all the rest are size 2.
1094 
1095  scalar_t *tmp = new scalar_t [len];
1096  env_->localMemoryAssertion(__FILE__, __LINE__, len, tmp);
1097  memcpy(tmp, sizes.getRawPtr(), sizeof(scalar_t) * len);
1098  ArrayRCP<scalar_t> partSizes(tmp, 0, len, true);
1099 
1100  std::sort(partSizes.begin(), partSizes.end());
1101 
1102  // create a list of sizes that are unique within epsilon
1103 
1104  Array<scalar_t> nextUniqueSize;
1105  nextUniqueSize.push_back(partSizes[len-1]); // largest
1106  scalar_t curr = partSizes[len-1];
1107  int avgIndex = len;
1108  bool haveAvg = false;
1109  if (curr - avg <= epsilon)
1110  avgIndex = 0;
1111 
1112  for (int i=len-2; i >= 0; i--){
1113  scalar_t val = partSizes[i];
1114  if (curr - val > epsilon){
1115  nextUniqueSize.push_back(val); // the highest in the group
1116  curr = val;
1117  if (avgIndex==len && val > avg && val - avg <= epsilon){
1118  // the average would be in this group
1119  avgIndex = nextUniqueSize.size() - 1;
1120  haveAvg = true;
1121  }
1122  }
1123  }
1124 
1125  partSizes.clear();
1126 
1127  size_t numSizes = nextUniqueSize.size();
1128  int sizeArrayLen = numSizes;
1129 
1130  if (numSizes < 64){
1131  // We can store the size for every part in a compact way.
1132 
1133  // Create a list of all sizes in increasing order
1134 
1135  if (!haveAvg) sizeArrayLen++; // need to include average
1136 
1137  scalar_t *allSizes = new scalar_t [sizeArrayLen];
1138  env_->localMemoryAssertion(__FILE__, __LINE__, sizeArrayLen, allSizes);
1139  ArrayRCP<scalar_t> sizeArray(allSizes, 0, sizeArrayLen, true);
1140 
1141  int newAvgIndex = sizeArrayLen;
1142 
1143  for (int i=numSizes-1, idx=0; i >= 0; i--){
1144 
1145  if (newAvgIndex == sizeArrayLen){
1146 
1147  if (haveAvg && i==avgIndex)
1148  newAvgIndex = idx;
1149 
1150  else if (!haveAvg && avg < nextUniqueSize[i]){
1151  newAvgIndex = idx;
1152  allSizes[idx++] = avg;
1153  }
1154  }
1155 
1156  allSizes[idx++] = nextUniqueSize[i];
1157  }
1158 
1159  env_->localBugAssertion(__FILE__, __LINE__, "finding average in list",
1160  newAvgIndex < sizeArrayLen, COMPLEX_ASSERTION);
1161 
1162  for (int i=0; i < nparts; i++){
1163  buf[i] = newAvgIndex; // index to default part size
1164  }
1165 
1166  sum = (nparts - len) * allSizes[newAvgIndex];
1167 
1168  for (int i=0; i < len; i++){
1169  int id = ids[i];
1170  scalar_t size = sizes[i];
1171  int index;
1172 
1173  // Find the first size greater than or equal to this size.
1174 
1175  if (size < avg && avg - size <= epsilon)
1176  index = newAvgIndex;
1177  else{
1178  typename ArrayRCP<scalar_t>::iterator found =
1179  std::lower_bound(sizeArray.begin(), sizeArray.end(), size);
1180 
1181  env_->localBugAssertion(__FILE__, __LINE__, "size array",
1182  found != sizeArray.end(), COMPLEX_ASSERTION);
1183 
1184  index = found - sizeArray.begin();
1185  }
1186 
1187  buf[id] = index;
1188  sum += allSizes[index];
1189  }
1190 
1191  for (int i=0; i < sizeArrayLen; i++){
1192  sizeArray[i] /= sum;
1193  }
1194 
1195  pCompactIndex_[widx] = partIdx;
1196  pSize_[widx] = sizeArray;
1197  }
1198  else{
1199  // To have access to part sizes, we must store nparts scalar_ts on
1200  // every process. We expect this is a rare case.
1201 
1202  tmp = new scalar_t [nparts];
1203  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, tmp);
1204 
1205  sum += ((nparts - len) * avg);
1206 
1207  for (int i=0; i < nparts; i++){
1208  tmp[i] = avg/sum;
1209  }
1210 
1211  for (int i=0; i < len; i++){
1212  tmp[ids[i]] = sizes[i]/sum;
1213  }
1214 
1215  pSize_[widx] = arcp(tmp, 0, nparts);
1216  }
1217 }
1218 
1219 template <typename Adapter>
1220  void PartitioningSolution<Adapter>::setParts(ArrayRCP<part_t> &partList)
1221 {
1222  env_->debug(DETAILED_STATUS, "Entering setParts");
1223 
1224  size_t len = partList.size();
1225 
1226  // Find the actual number of parts in the solution, which can
1227  // be more or less than the nGlobalParts_ target.
1228  // (We may want to compute the imbalance of a given solution with
1229  // respect to a desired solution. This solution may have more or
1230  // fewer parts that the desired solution.)
1231 
1232  part_t lMax = -1;
1233  part_t lMin = (len > 0 ? std::numeric_limits<part_t>::max() : 0);
1234  part_t gMax, gMin;
1235 
1236  for (size_t i = 0; i < len; i++) {
1237  if (partList[i] < lMin) lMin = partList[i];
1238  if (partList[i] > lMax) lMax = partList[i];
1239  }
1240  Teuchos::reduceAll<int, part_t>(*comm_, Teuchos::REDUCE_MIN, 1, &lMin, &gMin);
1241  Teuchos::reduceAll<int, part_t>(*comm_, Teuchos::REDUCE_MAX, 1, &lMax, &gMax);
1242 
1243  nGlobalPartsSolution_ = gMax - gMin + 1;
1244  parts_ = partList;
1245 
1246  // Now determine which process gets each object, if not one-to-one.
1247 
1248  if (!onePartPerProc_){
1249 
1250  int *procs = new int [len];
1251  env_->localMemoryAssertion(__FILE__, __LINE__, len, procs);
1252  procs_ = arcp<int>(procs, 0, len);
1253 
1254  if (len > 0) {
1255  part_t *parts = partList.getRawPtr();
1256 
1257  if (procDist_.size() > 0){ // parts are not split across procs
1258 
1259  int procId;
1260  for (size_t i=0; i < len; i++){
1261  partToProcsMap(parts[i], procs[i], procId);
1262  }
1263  }
1264  else{ // harder - we need to split the parts across multiple procs
1265 
1266  lno_t *partCounter = new lno_t [nGlobalPartsSolution_];
1267  env_->localMemoryAssertion(__FILE__, __LINE__, nGlobalPartsSolution_,
1268  partCounter);
1269 
1270  int numProcs = comm_->getSize();
1271 
1272  //MD NOTE: there was no initialization for partCounter.
1273  //I added the line below, correct me if I am wrong.
1274  memset(partCounter, 0, sizeof(lno_t) * nGlobalPartsSolution_);
1275 
1276  for (typename ArrayRCP<part_t>::size_type i=0; i < partList.size(); i++)
1277  partCounter[parts[i]]++;
1278 
1279  lno_t *procCounter = new lno_t [numProcs];
1280  env_->localMemoryAssertion(__FILE__, __LINE__, numProcs, procCounter);
1281 
1282  int proc1;
1283  int proc2 = partDist_[0];
1284 
1285  for (part_t part=1; part < nGlobalParts_; part++){
1286  proc1 = proc2;
1287  proc2 = partDist_[part+1];
1288  int numprocs = proc2 - proc1;
1289 
1290  double dNum = partCounter[part];
1291  double dProcs = numprocs;
1292 
1293  //std::cout << "dNum:" << dNum << " dProcs:" << dProcs << std::endl;
1294  double each = floor(dNum/dProcs);
1295  double extra = fmod(dNum,dProcs);
1296 
1297  for (int proc=proc1, i=0; proc<proc2; proc++, i++){
1298  if (i < extra)
1299  procCounter[proc] = lno_t(each) + 1;
1300  else
1301  procCounter[proc] = lno_t(each);
1302  }
1303  }
1304 
1305  delete [] partCounter;
1306 
1307  for (typename ArrayRCP<part_t>::size_type i=0; i < partList.size(); i++){
1308  if (partList[i] >= nGlobalParts_){
1309  // Solution has more parts that targeted. These
1310  // objects just remain on this process.
1311  procs[i] = comm_->getRank();
1312  continue;
1313  }
1314  part_t partNum = parts[i];
1315  proc1 = partDist_[partNum];
1316  proc2 = partDist_[partNum + 1];
1317 
1318  int proc;
1319  for (proc=proc1; proc < proc2; proc++){
1320  if (procCounter[proc] > 0){
1321  procs[i] = proc;
1322  procCounter[proc]--;
1323  break;
1324  }
1325  }
1326  env_->localBugAssertion(__FILE__, __LINE__, "part to proc",
1327  proc < proc2, COMPLEX_ASSERTION);
1328  }
1329 
1330  delete [] procCounter;
1331  }
1332  }
1333  }
1334 
1335  // Now that parts_ info is back on home process, remap the parts.
1336  // TODO: The parts will be inconsistent with the proc assignments after
1337  // TODO: remapping. This problem will go away after we separate process
1338  // TODO: mapping from setParts. But for MueLu's use case, the part
1339  // TODO: remapping is all that matters; they do not use the process mapping.
1340  bool doRemap = false;
1341  const Teuchos::ParameterEntry *pe =
1342  env_->getParameters().getEntryPtr("remap_parts");
1343  if (pe) doRemap = pe->getValue(&doRemap);
1344  if (doRemap) RemapParts();
1345 
1346  haveSolution_ = true;
1347 
1348  env_->memory("After Solution has processed algorithm's answer");
1349  env_->debug(DETAILED_STATUS, "Exiting setParts");
1350 }
1351 
1352 
1353 template <typename Adapter>
1355  double &numParts, part_t &partMin, part_t &partMax) const
1356 {
1357  if (onePartPerProc_){
1358  numParts = 1.0;
1359  partMin = partMax = procId;
1360  }
1361  else if (procDist_.size() > 0){
1362  partMin = procDist_[procId];
1363  partMax = procDist_[procId+1] - 1;
1364  numParts = procDist_[procId+1] - partMin;
1365  }
1366  else{
1367  // find the first p such that partDist_[p] > procId
1368 
1369  std::vector<int>::const_iterator entry;
1370  entry = std::upper_bound(partDist_.begin(), partDist_.end(), procId);
1371 
1372  size_t partIdx = entry - partDist_.begin();
1373  int numProcs = partDist_[partIdx] - partDist_[partIdx-1];
1374  partMin = partMax = int(partIdx) - 1;
1375  numParts = 1.0 / numProcs;
1376  }
1377 }
1378 
1379 template <typename Adapter>
1380  void PartitioningSolution<Adapter>::partToProcsMap(part_t partId,
1381  int &procMin, int &procMax) const
1382 {
1383  if (partId >= nGlobalParts_){
1384  // setParts() may be given an initial solution which uses a
1385  // different number of parts than the desired solution. It is
1386  // still a solution. We keep it on this process.
1387  procMin = procMax = comm_->getRank();
1388  }
1389  else if (onePartPerProc_){
1390  procMin = procMax = int(partId);
1391  }
1392  else if (procDist_.size() > 0){
1393  if (procDistEquallySpread_) {
1394  // Avoid binary search.
1395  double fProcs = comm_->getSize();
1396  double fParts = nGlobalParts_;
1397  double each = fParts / fProcs;
1398  procMin = int(partId / each);
1399  while (procDist_[procMin] > partId) procMin--;
1400  while (procDist_[procMin+1] <= partId) procMin++;
1401  procMax = procMin;
1402  }
1403  else {
1404  // find the first p such that procDist_[p] > partId.
1405  // For now, do a binary search.
1406 
1407  typename std::vector<part_t>::const_iterator entry;
1408  entry = std::upper_bound(procDist_.begin(), procDist_.end(), partId);
1409 
1410  size_t procIdx = entry - procDist_.begin();
1411  procMin = procMax = int(procIdx) - 1;
1412  }
1413  }
1414  else{
1415  procMin = partDist_[partId];
1416  procMax = partDist_[partId+1] - 1;
1417  }
1418 }
1419 
1420 template <typename Adapter>
1422  int c1, int c2) const
1423 {
1424  if (c1 < 0 || c1 >= nWeightsPerObj_ || c2 < 0 || c2 >= nWeightsPerObj_ )
1425  throw std::logic_error("criteriaHaveSamePartSizes error");
1426 
1427  bool theSame = false;
1428 
1429  if (c1 == c2)
1430  theSame = true;
1431 
1432  else if (pSizeUniform_[c1] == true && pSizeUniform_[c2] == true)
1433  theSame = true;
1434 
1435  else if (pCompactIndex_[c1].size() == pCompactIndex_[c2].size()){
1436  theSame = true;
1437  bool useIndex = pCompactIndex_[c1].size() > 0;
1438  if (useIndex){
1439  for (part_t p=0; theSame && p < nGlobalParts_; p++)
1440  if (pSize_[c1][pCompactIndex_[c1][p]] !=
1441  pSize_[c2][pCompactIndex_[c2][p]])
1442  theSame = false;
1443  }
1444  else{
1445  for (part_t p=0; theSame && p < nGlobalParts_; p++)
1446  if (pSize_[c1][p] != pSize_[c2][p])
1447  theSame = false;
1448  }
1449  }
1450 
1451  return theSame;
1452 }
1453 
1469 template <typename Adapter>
1471  bool doCheck, bool haveNumLocalParts, bool haveNumGlobalParts,
1472  int numLocalParts, int numGlobalParts)
1473 {
1474 #ifdef _MSC_VER
1475  typedef SSIZE_T ssize_t;
1476 #endif
1477  int nprocs = comm_->getSize();
1478  ssize_t reducevals[4];
1479  ssize_t sumHaveGlobal=0, sumHaveLocal=0;
1480  ssize_t sumGlobal=0, sumLocal=0;
1481  ssize_t maxGlobal=0, maxLocal=0;
1482  ssize_t vals[4] = {haveNumGlobalParts, haveNumLocalParts,
1483  numGlobalParts, numLocalParts};
1484 
1485  partDist_.clear();
1486  procDist_.clear();
1487 
1488  if (doCheck){
1489 
1490  try{
1491  reduceAll<int, ssize_t>(*comm_, Teuchos::REDUCE_SUM, 4, vals, reducevals);
1492  }
1493  Z2_THROW_OUTSIDE_ERROR(*env_);
1494 
1495  sumHaveGlobal = reducevals[0];
1496  sumHaveLocal = reducevals[1];
1497  sumGlobal = reducevals[2];
1498  sumLocal = reducevals[3];
1499 
1500  env_->localInputAssertion(__FILE__, __LINE__,
1501  "Either all procs specify num_global/local_parts or none do",
1502  (sumHaveGlobal == 0 || sumHaveGlobal == nprocs) &&
1503  (sumHaveLocal == 0 || sumHaveLocal == nprocs),
1504  BASIC_ASSERTION);
1505  }
1506  else{
1507  if (haveNumLocalParts)
1508  sumLocal = numLocalParts * nprocs;
1509  if (haveNumGlobalParts)
1510  sumGlobal = numGlobalParts * nprocs;
1511 
1512  sumHaveGlobal = haveNumGlobalParts ? nprocs : 0;
1513  sumHaveLocal = haveNumLocalParts ? nprocs : 0;
1514 
1515  maxLocal = numLocalParts;
1516  maxGlobal = numGlobalParts;
1517  }
1518 
1519  if (!haveNumLocalParts && !haveNumGlobalParts){
1520  onePartPerProc_ = true; // default if user did not specify
1521  return;
1522  }
1523 
1524  if (haveNumGlobalParts){
1525  if (doCheck){
1526  vals[0] = numGlobalParts;
1527  vals[1] = numLocalParts;
1528  try{
1529  reduceAll<int, ssize_t>(
1530  *comm_, Teuchos::REDUCE_MAX, 2, vals, reducevals);
1531  }
1532  Z2_THROW_OUTSIDE_ERROR(*env_);
1533 
1534  maxGlobal = reducevals[0];
1535  maxLocal = reducevals[1];
1536 
1537  env_->localInputAssertion(__FILE__, __LINE__,
1538  "Value for num_global_parts is different on different processes.",
1539  maxGlobal * nprocs == sumGlobal, BASIC_ASSERTION);
1540  }
1541 
1542  if (sumLocal){
1543  env_->localInputAssertion(__FILE__, __LINE__,
1544  "Sum of num_local_parts does not equal requested num_global_parts",
1545  sumLocal == numGlobalParts, BASIC_ASSERTION);
1546 
1547  if (sumLocal == nprocs && maxLocal == 1){
1548  onePartPerProc_ = true; // user specified one part per proc
1549  return;
1550  }
1551  }
1552  else{
1553  if (maxGlobal == nprocs){
1554  onePartPerProc_ = true; // user specified num parts is num procs
1555  return;
1556  }
1557  }
1558  }
1559 
1560  // If we are here, we do not have #parts == #procs.
1561 
1562  if (sumHaveLocal == nprocs){
1563  //
1564  // We will go by the number of local parts specified.
1565  //
1566 
1567  try{
1568  procDist_.resize(nprocs+1);
1569  }
1570  catch (std::exception &){
1571  throw(std::bad_alloc());
1572  }
1573 
1574  part_t *procArray = &procDist_[0];
1575 
1576  try{
1577  part_t tmp = part_t(numLocalParts);
1578  gatherAll<int, part_t>(*comm_, 1, &tmp, nprocs, procArray + 1);
1579  }
1580  Z2_THROW_OUTSIDE_ERROR(*env_);
1581 
1582  procArray[0] = 0;
1583 
1584  for (int proc=0; proc < nprocs; proc++)
1585  procArray[proc+1] += procArray[proc];
1586  }
1587  else{
1588  //
1589  // We will allocate global number of parts to the processes.
1590  //
1591  double fParts = numGlobalParts;
1592  double fProcs = nprocs;
1593 
1594  if (fParts < fProcs){
1595 
1596  try{
1597  partDist_.resize(size_t(fParts+1));
1598  }
1599  catch (std::exception &){
1600  throw(std::bad_alloc());
1601  }
1602 
1603  int *partArray = &partDist_[0];
1604 
1605  double each = floor(fProcs / fParts);
1606  double extra = fmod(fProcs, fParts);
1607  partDist_[0] = 0;
1608 
1609  for (part_t part=0; part < numGlobalParts; part++){
1610  int numOwners = int(each + ((part<extra) ? 1 : 0));
1611  partArray[part+1] = partArray[part] + numOwners;
1612  }
1613 
1614  env_->globalBugAssertion(__FILE__, __LINE__, "#parts != #procs",
1615  partDist_[numGlobalParts] == nprocs, COMPLEX_ASSERTION, comm_);
1616  }
1617  else if (fParts > fProcs){
1618 
1619  // User did not specify local number of parts per proc;
1620  // Distribute the parts evenly among the procs.
1621 
1622  procDistEquallySpread_ = true;
1623 
1624  try{
1625  procDist_.resize(size_t(fProcs+1));
1626  }
1627  catch (std::exception &){
1628  throw(std::bad_alloc());
1629  }
1630 
1631  part_t *procArray = &procDist_[0];
1632 
1633  double each = floor(fParts / fProcs);
1634  double extra = fmod(fParts, fProcs);
1635  procArray[0] = 0;
1636 
1637  for (int proc=0; proc < nprocs; proc++){
1638  part_t numParts = part_t(each + ((proc<extra) ? 1 : 0));
1639  procArray[proc+1] = procArray[proc] + numParts;
1640  }
1641 
1642  env_->globalBugAssertion(__FILE__, __LINE__, "#parts != #procs",
1643  procDist_[nprocs] == numGlobalParts, COMPLEX_ASSERTION, comm_);
1644  }
1645  else{
1646  env_->globalBugAssertion(__FILE__, __LINE__,
1647  "should never get here", 1, COMPLEX_ASSERTION, comm_);
1648  }
1649  }
1650 }
1651 
1653 // Remap a new part assignment vector for maximum overlap with an input
1654 // part assignment.
1655 //
1656 // Assumptions for this version:
1657 // input part assignment == processor rank for every local object.
1658 // assuming nGlobalParts_ <= num ranks
1659 // TODO: Write a version that takes the input part number as input;
1660 // this change requires input parts in adapters to be provided in
1661 // the Adapter.
1662 // TODO: For repartitioning, compare to old remapping results; see Zoltan1.
1663 
1664 template <typename Adapter>
1666 {
1667  size_t len = parts_.size();
1668 
1669  part_t me = comm_->getRank();
1670  int np = comm_->getSize();
1671 
1672  if (np < nGlobalParts_) {
1673  if (me == 0) {
1674  std::ostringstream msg;
1675  msg << "Remapping not yet supported for "
1676  << "num_global_parts " << nGlobalParts_
1677  << " > num procs " << np << std::endl;
1678  env_->debug(DETAILED_STATUS, msg.str());
1679  }
1680  return;
1681  }
1682  // Build edges of a bipartite graph with np + nGlobalParts_ vertices,
1683  // and edges between a process vtx and any parts to which that process'
1684  // objects are assigned.
1685  // Weight edge[parts_[i]] by the number of objects that are going from
1686  // this rank to parts_[i].
1687  // We use a std::map, assuming the number of unique parts in the parts_ array
1688  // is small to keep the binary search efficient.
1689  // TODO We use the count of objects to move; should change to SIZE of objects
1690  // to move; need SIZE function in Adapter.
1691 
1692  std::map<part_t, long> edges;
1693  long lstaying = 0; // Total num of local objects staying if we keep the
1694  // current mapping. TODO: change to SIZE of local objs
1695  long gstaying = 0; // Total num of objects staying in the current partition
1696 
1697  for (size_t i = 0; i < len; i++) {
1698  edges[parts_[i]]++; // TODO Use obj size instead of count
1699  if (parts_[i] == me) lstaying++; // TODO Use obj size instead of count
1700  }
1701 
1702  Teuchos::reduceAll<int, long>(*comm_, Teuchos::REDUCE_SUM, 1,
1703  &lstaying, &gstaying);
1704 //TODO if (gstaying == Adapter::getGlobalNumObjs()) return; // Nothing to do
1705 
1706  part_t *remap = NULL;
1707 
1708  int nedges = edges.size();
1709 
1710  // Gather the graph to rank 0.
1711  part_t tnVtx = np + nGlobalParts_; // total # vertices
1712  int *idx = NULL; // Pointer index into graph adjacencies
1713  int *sizes = NULL; // nedges per rank
1714  if (me == 0) {
1715  idx = new int[tnVtx+1];
1716  sizes = new int[np];
1717  sizes[0] = nedges;
1718  }
1719  if (np > 1)
1720  Teuchos::gather<int, int>(&nedges, 1, sizes, 1, 0, *comm_);
1721 
1722  // prefix sum to build the idx array
1723  if (me == 0) {
1724  idx[0] = 0;
1725  for (int i = 0; i < np; i++)
1726  idx[i+1] = idx[i] + sizes[i];
1727  }
1728 
1729  // prepare to send edges
1730  int cnt = 0;
1731  part_t *bufv = NULL;
1732  long *bufw = NULL;
1733  if (nedges) {
1734  bufv = new part_t[nedges];
1735  bufw = new long[nedges];
1736  // Create buffer with edges (me, part[i]) and weight edges[parts_[i]].
1737  for (typename std::map<part_t, long>::iterator it = edges.begin();
1738  it != edges.end(); it++) {
1739  bufv[cnt] = it->first; // target part
1740  bufw[cnt] = it->second; // weight
1741  cnt++;
1742  }
1743  }
1744 
1745  // Prepare to receive edges on rank 0
1746  part_t *adj = NULL;
1747  long *wgt = NULL;
1748  if (me == 0) {
1749 //SYM adj = new part_t[2*idx[np]]; // need 2x space to symmetrize later
1750 //SYM wgt = new long[2*idx[np]]; // need 2x space to symmetrize later
1751  adj = new part_t[idx[np]];
1752  wgt = new long[idx[np]];
1753  }
1754 
1755  Teuchos::gatherv<int, part_t>(bufv, cnt, adj, sizes, idx, 0, *comm_);
1756  Teuchos::gatherv<int, long>(bufw, cnt, wgt, sizes, idx, 0, *comm_);
1757  delete [] bufv;
1758  delete [] bufw;
1759 
1760  // Now have constructed graph on rank 0.
1761  // Call the matching algorithm
1762 
1763  int doRemap;
1764  if (me == 0) {
1765  // We have the "LHS" vertices of the bipartite graph; need to create
1766  // "RHS" vertices.
1767  for (int i = 0; i < idx[np]; i++) {
1768  adj[i] += np; // New RHS vertex number; offset by num LHS vertices
1769  }
1770 
1771  // Build idx for RHS vertices
1772  for (part_t i = np; i < tnVtx; i++) {
1773  idx[i+1] = idx[i]; // No edges for RHS vertices
1774  }
1775 
1776 #ifdef KDDKDD_DEBUG
1777  std::cout << "IDX ";
1778  for (part_t i = 0; i <= tnVtx; i++) std::cout << idx[i] << " ";
1779  std::cout << std::endl;
1780 
1781  std::cout << "ADJ ";
1782  for (part_t i = 0; i < idx[tnVtx]; i++) std::cout << adj[i] << " ";
1783  std::cout << std::endl;
1784 
1785  std::cout << "WGT ";
1786  for (part_t i = 0; i < idx[tnVtx]; i++) std::cout << wgt[i] << " ";
1787  std::cout << std::endl;
1788 #endif
1789 
1790  // Perform matching on the graph
1791  part_t *match = new part_t[tnVtx];
1792  for (part_t i = 0; i < tnVtx; i++) match[i] = i;
1793  part_t nmatches =
1794  Zoltan2::GreedyMWM<part_t, long>(idx, adj, wgt, tnVtx, match);
1795 
1796 #ifdef KDDKDD_DEBUG
1797  std::cout << "After matching: " << nmatches << " found" << std::endl;
1798  for (part_t i = 0; i < tnVtx; i++)
1799  std::cout << "match[" << i << "] = " << match[i]
1800  << ((match[i] != i &&
1801  (i < np && match[i] != i+np))
1802  ? " *" : " ")
1803  << std::endl;
1804 #endif
1805 
1806  // See whether there were nontrivial changes in the matching.
1807  bool nontrivial = false;
1808  if (nmatches) {
1809  for (part_t i = 0; i < np; i++) {
1810  if ((match[i] != i) && (match[i] != (i+np))) {
1811  nontrivial = true;
1812  break;
1813  }
1814  }
1815  }
1816 
1817  // Process the matches
1818  if (nontrivial) {
1819  remap = new part_t[nGlobalParts_];
1820  for (part_t i = 0; i < nGlobalParts_; i++) remap[i] = -1;
1821 
1822  bool *used = new bool[np];
1823  for (part_t i = 0; i < np; i++) used[i] = false;
1824 
1825  // First, process all matched parts
1826  for (part_t i = 0; i < nGlobalParts_; i++) {
1827  part_t tmp = i + np;
1828  if (match[tmp] != tmp) {
1829  remap[i] = match[tmp];
1830  used[match[tmp]] = true;
1831  }
1832  }
1833 
1834  // Second, process unmatched parts; keep same part number if possible
1835  for (part_t i = 0; i < nGlobalParts_; i++) {
1836  if (remap[i] > -1) continue;
1837  if (!used[i]) {
1838  remap[i] = i;
1839  used[i] = true;
1840  }
1841  }
1842 
1843  // Third, process unmatched parts; give them the next unused part
1844  for (part_t i = 0, uidx = 0; i < nGlobalParts_; i++) {
1845  if (remap[i] > -1) continue;
1846  while (used[uidx]) uidx++;
1847  remap[i] = uidx;
1848  used[uidx] = true;
1849  }
1850  delete [] used;
1851  }
1852  delete [] match;
1853 
1854 #ifdef KDDKDD_DEBUG
1855  std::cout << "Remap vector: ";
1856  for (part_t i = 0; i < nGlobalParts_; i++) std::cout << remap[i] << " ";
1857  std::cout << std::endl;
1858 #endif
1859 
1860  long newgstaying = measure_stays(remap, idx, adj, wgt,
1861  nGlobalParts_, np);
1862  doRemap = (newgstaying > gstaying);
1863  std::ostringstream msg;
1864  msg << "gstaying " << gstaying << " measure(input) "
1865  << measure_stays(NULL, idx, adj, wgt, nGlobalParts_, np)
1866  << " newgstaying " << newgstaying
1867  << " nontrivial " << nontrivial
1868  << " doRemap " << doRemap << std::endl;
1869  env_->debug(DETAILED_STATUS, msg.str());
1870  }
1871  delete [] idx;
1872  delete [] sizes;
1873  delete [] adj;
1874  delete [] wgt;
1875 
1876  Teuchos::broadcast<int, int>(*comm_, 0, 1, &doRemap);
1877 
1878  if (doRemap) {
1879  if (me != 0) remap = new part_t[nGlobalParts_];
1880  Teuchos::broadcast<int, part_t>(*comm_, 0, nGlobalParts_, remap);
1881  for (size_t i = 0; i < len; i++) {
1882  parts_[i] = remap[parts_[i]];
1883  }
1884  }
1885 
1886  delete [] remap; // TODO May want to keep for repartitioning as in Zoltan
1887 }
1888 
1889 
1890 } // namespace Zoltan2
1891 
1892 #endif
const int * getProcListView() const
Returns the process list corresponding to the global ID list.
scalar_t getCriteriaPartSize(int idx, part_t part) const
Get the size for a given weight index and a given part.
Defines the Solution base class.
fast typical checks for valid arguments
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
part_t pointAssign(int dim, scalar_t *point) const
Return the part overlapping a given point in space;.
void RemapParts()
Remap a new partition for maximum overlap with an input partition.
Just a placeholder for now.
more involved, like validate a graph
map_t::global_ordinal_type gno_t
Definition: mapRemotes.cpp:27
size_t getLocalNumberOfParts() const
Returns the number of parts to be assigned to this process.
virtual bool isPartitioningTreeBinary() const
calculate if partition tree is binary.
void getPartsForProc(int procId, double &numParts, part_t &partMin, part_t &partMax) const
Get the parts belonging to a process.
void getProcsForPart(part_t partId, part_t &procMin, part_t &procMax) const
Get the processes containing a part.
sub-steps, each method&#39;s entry and exit
bool oneToOnePartDistribution() const
Is the part-to-process distribution is one-to-one.
SparseMatrixAdapter_t::part_t part_t
#define epsilon
Definition: nd.cpp:47
long measure_stays(part_t *remap, int *idx, part_t *adj, long *wgt, part_t nrhs, part_t nlhs)
scalar_t getLocalFractionOfPart() const
If parts are divided across processes, return the fraction of a part on this process.
A PartitioningSolution is a solution to a partitioning problem.
const RCP< const Comm< int > > & getCommunicator() const
Return the communicator associated with the solution.
void boxAssign(int dim, scalar_t *lower, scalar_t *upper, size_t &nPartsFound, part_t **partsFound) const
Return an array of all parts overlapping a given box in space.
const part_t * getPartListView() const
Returns the part list corresponding to the global ID list.
PartitioningSolution(const RCP< const Environment > &env, const RCP< const Comm< int > > &comm, int nUserWeights, const RCP< Algorithm< Adapter > > &algorithm=Teuchos::null)
Constructor when part sizes are not supplied.
Algorithm defines the base class for all algorithms.
map_t::local_ordinal_type lno_t
Definition: mapRemotes.cpp:26
void getPartitionTree(part_t &numTreeVerts, std::vector< part_t > &permPartNums, std::vector< part_t > &splitRangeBeg, std::vector< part_t > &splitRangeEnd, std::vector< part_t > &treeVertParents) const
get the partition tree - fill the relevant arrays
Greedy Maximal Weight Matching.
void setParts(ArrayRCP< part_t > &partList)
The algorithm uses setParts to set the solution.
static const std::string fail
std::vector< Zoltan2::coordinateModelPartBox > & getPartBoxesView() const
returns the part box boundary list.
Defines the Environment class.
const part_t * getProcDistribution() const
Return a distribution by process.
void getCommunicationGraph(ArrayRCP< part_t > &comXAdj, ArrayRCP< part_t > &comAdj) const
returns communication graph resulting from geometric partitioning.
int getNumberOfCriteria() const
Get the number of criteria (object weights)
const int * getPartDistribution() const
Return a distribution by part.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
size_t getActualGlobalNumberOfParts() const
Returns the actual global number of parts provided in setParts().
bool criteriaHasUniformPartSizes(int idx) const
Determine if balancing criteria has uniform part sizes. (User can specify differing part sizes...
bool criteriaHaveSamePartSizes(int c1, int c2) const
Return true if the two weight indices have the same part size information.
size_t getTargetGlobalNumberOfParts() const
Returns the global number of parts desired in the solution.
const RCP< const Environment > & getEnvironment() const
Return the environment associated with the solution.
Zoltan2::BasicUserTypes< zscalar_t, zlno_t, zgno_t > user_t
Definition: Metric.cpp:39