Zoltan2_AlgMultiJagged.hpp
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // Zoltan2: A package of combinatorial algorithms for scientific computing
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Karen Devine (kddevin@sandia.gov)
39 // Erik Boman (egboman@sandia.gov)
40 // Siva Rajamanickam (srajama@sandia.gov)
41 //
42 // ***********************************************************************
43 //
44 // @HEADER
49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
51 
54 #include <Zoltan2_Parameters.hpp>
55 #include <Zoltan2_Algorithm.hpp>
57 #include <Teuchos_StandardParameterEntryValidators.hpp>
58 
59 #include <Tpetra_Distributor.hpp>
60 #include <Teuchos_ParameterList.hpp>
62 #include <new> // ::operator new[]
63 #include <algorithm> // std::sort
64 #include <Zoltan2_Util.hpp>
65 #include <vector>
66 
67 #if defined(__cplusplus) && __cplusplus >= 201103L
68 #include <unordered_map>
69 #else
70 #include <Teuchos_Hashtable.hpp>
71 #endif // C++11 is enabled
72 
73 #ifdef ZOLTAN2_USEZOLTANCOMM
74 #ifdef HAVE_ZOLTAN2_MPI
75 #define ENABLE_ZOLTAN_MIGRATION
76 #include "zoltan_comm_cpp.h"
77 #include "zoltan_types.h" // for error codes
78 #endif
79 #endif
80 
81 #ifdef HAVE_ZOLTAN2_OMP
82 #include <omp.h>
83 #endif
84 
85 #define LEAST_SIGNIFICANCE 0.0001
86 #define SIGNIFICANCE_MUL 1000
87 
88 //if the (last-dimension reduce-all count) x (the MPI world size) is
89 //estimated to be bigger than this number, then migration will be forced
90 //in earlier iterations.
91 #define FUTURE_REDUCEALL_CUTOFF 1500000
92 //if parts right before last dimension are estimated to have less than
93 //MIN_WORK_LAST_DIM many coords, migration will be forced in earlier iterations.
94 #define MIN_WORK_LAST_DIM 1000
95 
96 
97 
98 
99 #define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x))
100 //imbalance calculation. Wreal / Wexpected - 1
101 #define imbalanceOf(Wachieved, totalW, expectedRatio) \
102  (Wachieved) / ((totalW) * (expectedRatio)) - 1
103 #define imbalanceOf2(Wachieved, wExpected) \
104  (Wachieved) / (wExpected) - 1
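// Editorial note (not part of the original header): a minimal illustration of
// the two imbalance macros above. With an achieved part weight of 55, a total
// weight of 200, and an expected ratio of 0.25 (uniform parts), the expected
// part weight is 200 * 0.25 = 50, so:
//
//   double imb  = imbalanceOf(55.0, 200.0, 0.25); // 55/50 - 1 = 0.10
//   double imb2 = imbalanceOf2(55.0, 50.0);       // same result
//
// A value of 0.10 means the part is 10% heavier than its target.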
105 
106 
107 #define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp;
108 
109 
110 namespace Teuchos{
111 
116 template <typename Ordinal, typename T>
117 class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
118 {
119 private:
120  Ordinal size;
121  T _EPSILON;
122 
123 public:
126  Zoltan2_BoxBoundaries ():size(0), _EPSILON (std::numeric_limits<T>::epsilon()){}
127 
134  Zoltan2_BoxBoundaries (Ordinal s_):
135  size(s_), _EPSILON (std::numeric_limits<T>::epsilon()){}
136 
139  void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
140  {
141  for (Ordinal i=0; i < count; i++){
142  if (Z2_ABS(inBuffer[i]) > _EPSILON){
143  inoutBuffer[i] = inBuffer[i];
144  }
145  }
146  }
147 };
148 } // namespace Teuchos
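// Editorial note (not part of the original header): a sketch of how a custom
// Teuchos::ValueTypeReductionOp such as Zoltan2_BoxBoundaries above is
// typically applied. Entries whose magnitude exceeds machine epsilon overwrite
// the running result, so each rank's meaningful box extents survive the
// reduction. The names comm, n, localExtents, and globalExtents are
// assumptions for illustration only:
//
//   Teuchos::Zoltan2_BoxBoundaries<int, double> boxOp(n);
//   Teuchos::reduceAll<int, double>(*comm, boxOp, n, localExtents, globalExtents);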
149 
150 namespace Zoltan2{
151 
155 template <typename T>
156 T *allocMemory(size_t size){
157  if (size > 0){
158  T * a = new T[size];
159  if (a == NULL) {
160  throw "cannot allocate memory";
161  }
162  return a;
163  }
164  else {
165  return NULL;
166  }
167 }
168 
172 template <typename T>
173 void freeArray(T *&array){
174  if(array != NULL){
175  delete [] array;
176  array = NULL;
177  }
178 }
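// Editorial note (not part of the original header): a minimal usage sketch for
// the helper pair above. allocMemory returns NULL for a zero-size request, and
// freeArray both deletes the array and NULLs the pointer, so a repeated call
// is harmless:
//
//   double *work = Zoltan2::allocMemory<double>(100);
//   // ... use work[0..99] ...
//   Zoltan2::freeArray<double>(work); // work is NULL afterwards
//   Zoltan2::freeArray<double>(work); // no-op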
179 
180 
188 template <typename IT, typename CT, typename WT>
189 struct uMultiSortItem
190 {
191 public:
192  //TODO: Why volatile?
193  //no idea, another Intel compiler failure.
194  volatile IT index;
195  volatile CT count;
196  //unsigned int val;
197  volatile WT *val;
198  volatile WT _EPSILON;
199 
200  uMultiSortItem(){
201  this->index = 0;
202  this->count = 0;
203  this->val = NULL;
204  this->_EPSILON = std::numeric_limits<WT>::epsilon();
205  }
206 
207 
208  uMultiSortItem(IT index_ ,CT count_, WT *vals_){
209  this->index = index_;
210  this->count = count_;
211  this->val = vals_;
212  this->_EPSILON = std::numeric_limits<WT>::epsilon();
213  }
214 
215  uMultiSortItem( const uMultiSortItem<IT,CT,WT>& other ){
216  this->index = other.index;
217  this->count = other.count;
218  this->val = other.val;
219  this->_EPSILON = other._EPSILON;
220  }
221 
222  ~uMultiSortItem(){
223  //freeArray<WT>(this->val);
224  }
225 
226  void set(IT index_ ,CT count_, WT *vals_){
227  this->index = index_;
228  this->count = count_;
229  this->val = vals_;
230  }
231 
232 
233  uMultiSortItem<IT,CT,WT> operator=(const uMultiSortItem<IT,CT,WT>& other){
234  this->index = other.index;
235  this->count = other.count;
236  this->val = other.val;
237  return *(this);
238  }
239 
240  bool operator<(const uMultiSortItem<IT,CT,WT>& other) const{
241  assert (this->count == other.count);
242  for(CT i = 0; i < this->count; ++i){
243  //if the values are equal go to next one.
244  if (ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
245  continue;
246  }
247  //if next value is smaller return true;
248  if(this->val[i] < other.val[i]){
249  return true;
250  }
251  //if next value is bigger return false;
252  else {
253  return false;
254  }
255  }
256  //if they are totally equal.
257  return this->index < other.index;
258  }
259  bool operator>(const uMultiSortItem<IT,CT,WT>& other) const{
260  assert (this->count == other.count);
261  for(CT i = 0; i < this->count; ++i){
262  //if the values are equal go to next one.
263  if (ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
264  continue;
265  }
266  //if next value is bigger return true;
267  if(this->val[i] > other.val[i]){
268  return true;
269  }
270  //if next value is smaller return false;
271  else //(this->val[i] > other.val[i])
272  {
273  return false;
274  }
275  }
276  //if they are totally equal.
277  return this->index > other.index;
278  }
279 };// uMultiSortItem
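// Editorial note (not part of the original header): because uMultiSortItem
// defines operator< (and <algorithm> is included above), an array of items can
// be ordered with std::sort; ties in the leading value fall through to later
// values, and fully equal items are ordered by index. A sketch:
//
//   typedef Zoltan2::uMultiSortItem<int, int, double> item_t;
//   double v0[2] = {1.0, 2.0}, v1[2] = {1.0, 1.0};
//   item_t items[2];                 // default ctor sets _EPSILON
//   items[0].set(0, 2, v0);
//   items[1].set(1, 2, v1);
//   std::sort(items, items + 2);     // items[0].index == 1: (1.0, 1.0) first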
280 
284 template <class IT, class WT>
285 struct uSortItem
286 {
287  IT id;
288  //unsigned int val;
289  WT val;
290 };// uSortItem;
291 
295 template <class IT, class WT>
296 void uqsort(IT n, uSortItem<IT, WT> * arr)
297 {
298 
299  int NSTACK = 50;
300  int M = 7;
301  IT i, ir=n, j, k, l=1;
302  IT jstack=0, istack[50];
303  WT aval;
304  uSortItem<IT,WT> a, temp;
305 
306  --arr;
307  for (;;)
308  {
309  if (ir-l < M)
310  {
311  for (j=l+1;j<=ir;j++)
312  {
313  a=arr[j];
314  aval = a.val;
315  for (i=j-1;i>=1;i--)
316  {
317  if (arr[i].val <= aval)
318  break;
319  arr[i+1] = arr[i];
320  }
321  arr[i+1]=a;
322  }
323  if (jstack == 0)
324  break;
325  ir=istack[jstack--];
326  l=istack[jstack--];
327  }
328  else
329  {
330  k=(l+ir) >> 1;
331 
332  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[k],arr[l+1], temp)
333  if (arr[l+1].val > arr[ir].val)
334  {
335  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[ir],temp)
336  }
337  if (arr[l].val > arr[ir].val)
338  {
339  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l],arr[ir],temp)
340  }
341  if (arr[l+1].val > arr[l].val)
342  {
343  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[l],temp)
344  }
345  i=l+1;
346  j=ir;
347  a=arr[l];
348  aval = a.val;
349  for (;;)
350  {
351  do i++; while (arr[i].val < aval);
352  do j--; while (arr[j].val > aval);
353  if (j < i) break;
354  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[i],arr[j],temp);
355  }
356  arr[l]=arr[j];
357  arr[j]=a;
358  jstack += 2;
359  if (jstack > NSTACK){
360  std::cout << "uqsort: NSTACK too small in sort." << std::endl;
361  exit(1);
362  }
363  if (ir-i+1 >= j-l)
364  {
365  istack[jstack]=ir;
366  istack[jstack-1]=i;
367  ir=j-1;
368  }
369  else
370  {
371  istack[jstack]=j-1;
372  istack[jstack-1]=l;
373  l=i;
374  }
375  }
376  }
377 }
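// Editorial note (not part of the original header): a small usage sketch for
// uqsort, which sorts in place by increasing val and carries the id along:
//
//   Zoltan2::uSortItem<int, double> arr[3];
//   arr[0].id = 0; arr[0].val = 2.5;
//   arr[1].id = 1; arr[1].val = 0.5;
//   arr[2].id = 2; arr[2].val = 1.5;
//   Zoltan2::uqsort<int, double>(3, arr);
//   // arr ids are now 1, 2, 0 (vals 0.5, 1.5, 2.5)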
378 
379 template <class IT, class WT, class SIGN>
380 struct uSignedSortItem
381 {
382  IT id;
383  //unsigned int val;
384  WT val;
385  SIGN signbit; // 1 means positive, 0 means negative.
386  bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
387  /*if I am negative, the other is positive*/
388  if (this->signbit < rhs.signbit){
389  return true;
390  }
391  /*if both have the same sign*/
392  else if (this->signbit == rhs.signbit){
393 
394  if (this->val < rhs.val){//if my value is smaller,
395  return this->signbit;//then if we both are positive return true.
396  //if we both are negative, return false.
397  }
398  else if (this->val > rhs.val){//if my value is larger,
399  return !this->signbit; //then if we both are positive return false.
400  //if we both are negative, return true.
401  }
402  else { //if both are equal.
403  return false;
404  }
405  }
406  else {
407  /*if I am positive, the other is negative*/
408  return false;
409  }
410 
411  }
412  bool operator>(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
413  /*if I am positive, the other is negative*/
414  if (this->signbit > rhs.signbit){
415  return true;
416  }
417  /*if both have the same sign*/
418  else if (this->signbit == rhs.signbit){
419 
420  if (this->val < rhs.val){//if my value is smaller,
421  return !this->signbit;//then if we both are positive return false.
422  //if we both are negative, return true.
423  }
424  else if (this->val > rhs.val){//if my value is larger,
425  return this->signbit; //then if we both are positive return true.
426  //if we both are negative, return false.
427  }
428  else { // if they are equal
429  return false;
430  }
431  }
432  else {
433  /*if I am negative, the other is positive*/
434  return false;
435  }
436  }
437  bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs){
438  return !(*this > rhs);}
439  bool operator>=(const uSignedSortItem<IT, WT, SIGN>& rhs){
440  return !(*this < rhs);}
441 };
442 
446 template <class IT, class WT, class SIGN>
447 void uqSignsort(IT n, uSignedSortItem<IT, WT, SIGN> * arr){
448 
449  IT NSTACK = 50;
450  IT M = 7;
451  IT i, ir=n, j, k, l=1;
452  IT jstack=0, istack[50];
453  uSignedSortItem<IT,WT,SIGN> a, temp;
454 
455  --arr;
456  for (;;)
457  {
458  if (ir < M + l)
459  {
460  for (j=l+1;j<=ir;j++)
461  {
462  a=arr[j];
463  for (i=j-1;i>=1;i--)
464  {
465  if (arr[i] <= a)
466  {
467  break;
468  }
469  arr[i+1] = arr[i];
470  }
471  arr[i+1]=a;
472  }
473  if (jstack == 0)
474  break;
475  ir=istack[jstack--];
476  l=istack[jstack--];
477  }
478  else
479  {
480  k=(l+ir) >> 1;
481  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[k],arr[l+1], temp)
482  if (arr[l+1] > arr[ir])
483  {
484  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[ir],temp)
485  }
486  if (arr[l] > arr[ir])
487  {
488  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l],arr[ir],temp)
489  }
490  if (arr[l+1] > arr[l])
491  {
492  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[l],temp)
493  }
494  i=l+1;
495  j=ir;
496  a=arr[l];
497  for (;;)
498  {
499  do i++; while (arr[i] < a);
500  do j--; while (arr[j] > a);
501  if (j < i) break;
502  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[i],arr[j],temp);
503  }
504  arr[l]=arr[j];
505  arr[j]=a;
506  jstack += 2;
507  if (jstack > NSTACK){
508  std::cout << "uqsort: NSTACK too small in sort." << std::endl;
509  exit(1);
510  }
511  if (ir+l+1 >= j+i)
512  {
513  istack[jstack]=ir;
514  istack[jstack-1]=i;
515  ir=j-1;
516  }
517  else
518  {
519  istack[jstack]=j-1;
520  istack[jstack-1]=l;
521  l=i;
522  }
523  }
524  }
525 }
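// Editorial note (not part of the original header): uqSignsort orders
// uSignedSortItems with the comparison operators above; val holds a magnitude
// and signbit the sign (1 positive, 0 negative), so the result is ascending in
// signed value. A sketch:
//
//   Zoltan2::uSignedSortItem<int, double, char> arr[3];
//   arr[0].id = 0; arr[0].val = 2.0; arr[0].signbit = 1; // +2.0
//   arr[1].id = 1; arr[1].val = 3.0; arr[1].signbit = 0; // -3.0
//   arr[2].id = 2; arr[2].val = 1.0; arr[2].signbit = 0; // -1.0
//   Zoltan2::uqSignsort<int, double, char>(3, arr);
//   // arr ids are now 1, 2, 0 (-3.0, -1.0, +2.0)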
526 
530 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
531  typename mj_part_t>
532 class AlgMJ
533 {
534 private:
535  typedef coordinateModelPartBox<mj_scalar_t, mj_part_t> mj_partBox_t;
536  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
537 
538  RCP<const Environment> mj_env; //the environment object
539  RCP<const Comm<int> > mj_problemComm; //initial comm object
540 
541  double imbalance_tolerance; //input imbalance tolerance.
542  mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
543  int recursion_depth; //the number of steps that partitioning will be solved in.
544  int coord_dim, num_weights_per_coord; //coordinate dim and # of weights per coord
545 
546  size_t initial_num_loc_coords; //initial num local coords.
547  global_size_t initial_num_glob_coords; //initial num global coords.
548 
549  mj_lno_t num_local_coords; //number of local coords.
550  mj_gno_t num_global_coords; //number of global coords.
551 
552  mj_scalar_t **mj_coordinates; //two dimension coordinate array
553  mj_scalar_t **mj_weights; //two dimension weight array
554  bool *mj_uniform_parts; //if the target parts are uniform
555  mj_scalar_t **mj_part_sizes; //target part weight sizes.
556  bool *mj_uniform_weights; //if the coordinates have uniform weights.
557 
558  ArrayView<const mj_gno_t> mj_gnos; //global ids of the coordinates, comes from the input
559  size_t num_global_parts; //the targeted number of parts
560 
561  mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
562  mj_gno_t *current_mj_gnos; //current global ids of the coordinates, might change during migration.
563  int *owner_of_coordinate; //the actual processor owner of the coordinate, to track after migrations.
564 
565  mj_lno_t *coordinate_permutations; //permutation of coordinates, for partitioning.
566  mj_lno_t *new_coordinate_permutations; //permutation work array.
567  mj_part_t *assigned_part_ids; //the part ids assigned to coordinates.
568 
569  mj_lno_t *part_xadj; //beginning and end of each part.
570  mj_lno_t *new_part_xadj; // work array for beginning and end of each part.
571 
572  //get mj specific parameters.
573  bool distribute_points_on_cut_lines; //if partitioning can distribute points on the same coordinate to different parts.
574  mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
575 
576  bool mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
577  int mj_user_recursion_depth; //the recursion depth value provided by user.
578  bool mj_keep_part_boxes; //if the boxes need to be kept.
579 
580  int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
581  int migration_type; // when doing the migration, 0 will aim for perfect load balance,
582  //1 will aim for a minimized number of messages with possibly worse load balance
583  double minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
584  int num_threads; //num threads
585 
586  mj_part_t total_num_cut ; //total number of cuts that will be made
587  mj_part_t total_num_part; //total number of parts that will be obtained
588 
589  mj_part_t max_num_part_along_dim ; //maximum part count along a dimension.
590  mj_part_t max_num_cut_along_dim; //maximum cut count along a dimension.
591  size_t max_num_total_part_along_dim; //maximum part+cut count along a dimension.
592 
593  mj_part_t total_dim_num_reduce_all; //estimated number of reduceAlls that will be done.
594  mj_part_t last_dim_num_part; //max no of parts that might occur
595  //during the partition before the
596  //last partitioning dimension.
597 
598  RCP<Comm<int> > comm; //comm object than can be altered during execution
599  float fEpsilon; //epsilon for float
600  mj_scalar_t sEpsilon; //epsilon for mj_scalar_t
601 
602  mj_scalar_t maxScalar_t; //max possible scalar
603  mj_scalar_t minScalar_t; //min scalar
604 
605  mj_scalar_t *all_cut_coordinates;
606  mj_scalar_t *max_min_coords;
607  mj_scalar_t *process_cut_line_weight_to_put_left; //how much weight an MPI rank should put on the left side of each cutline
608  mj_scalar_t **thread_cut_line_weight_to_put_left; //what percentage of the weight each thread in an MPI rank should put on the left side of each cutline
609 
610  // work array to manipulate coordinate of cutlines in different iterations.
611  //necessary because previous cut line information is used for determining
612  //the next cutline information. therefore, cannot update the cut work array
613  //until all cutlines are determined.
614  mj_scalar_t *cut_coordinates_work_array;
615 
616  //cumulative part weight array.
617  mj_scalar_t *target_part_weights;
618 
619  mj_scalar_t *cut_upper_bound_coordinates ; //upper bound coordinate of a cut line
620  mj_scalar_t *cut_lower_bound_coordinates ; //lower bound coordinate of a cut line
621  mj_scalar_t *cut_lower_bound_weights ; //lower bound weight of a cut line
622  mj_scalar_t *cut_upper_bound_weights ; //upper bound weight of a cut line
623 
624  mj_scalar_t *process_local_min_max_coord_total_weight ; //combined array to exchange the min and max coordinate, and total weight of part.
625  mj_scalar_t *global_min_max_coord_total_weight ;//global combined array with the results for min, max and total weight.
626 
627  //isDone is used to determine if a cutline is determined already.
628  //If a cut line is already determined, the next iterations will skip this cut line.
629  bool *is_cut_line_determined;
630  //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part
631  //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
632  mj_part_t *my_incomplete_cut_count;
633  //local part weights of each thread.
634  double **thread_part_weights;
635  //the work manipulation array for part weights.
636  double **thread_part_weight_work;
637 
638  //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
639  mj_scalar_t **thread_cut_left_closest_point;
640  //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
641  mj_scalar_t **thread_cut_right_closest_point;
642 
643  //to store how many points in each part a thread has.
644  mj_lno_t **thread_point_counts;
645 
646  mj_scalar_t *process_rectilinear_cut_weight;
647  mj_scalar_t *global_rectilinear_cut_weight;
648 
649  //for faster communication, concatenation of
650  //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
651  //leftClosest distances sized P-1, since P-1 cut lines
652  //rightClosest distances size P-1, since P-1 cut lines.
653  mj_scalar_t *total_part_weight_left_right_closests ;
654  mj_scalar_t *global_total_part_weight_left_right_closests;
655 
656  RCP<mj_partBoxVector_t> kept_boxes; // vector of all boxes for all parts;
657  // constructed only if
658  // mj_keep_part_boxes == true
659  RCP<mj_partBox_t> global_box;
660  int myRank, myActualRank; //processor rank, and initial rank
661 
662  bool divide_to_prime_first;
663 
664  /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
665  * the input. part_no_array takes
666  * precedence if both are provided.
667  * Depending on these parameters, total cut/part number,
668  * maximum part/cut number along a dimension, estimated number of reduceAlls,
669  * and the number of parts before the last dimension is calculated.
670  * */
671  void set_part_specifications();
672 
673  /* \brief Tries to determine the part number for current dimension,
674  * by trying to make the partitioning as square as possible.
675  * \param num_total_future how many more partitionings are required.
676  * \param root how many more recursion depth is left.
677  */
678  inline mj_part_t get_part_count(
679  mj_part_t num_total_future,
680  double root);
681 
682  /* \brief Allocates the all required memory for the mj partitioning algorithm.
683  *
684  */
685  void allocate_set_work_memory();
686 
687  /* \brief for part communication we keep track of the box boundaries.
688  * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
689  * This function initializes a single box with all global min and max coordinates.
690  * \param initial_partitioning_boxes the input and output vector for boxes.
691  */
692  void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
693 
694  /* \brief compute global bounding box: min/max coords of global domain */
695  void compute_global_box();
696 
697  /* \brief Function returns how many parts that will be obtained after this dimension partitioning.
698  * It sets how many parts each current part will be partitioned into in this dimension to num_partitioning_in_current_dim vector,
699  * sets how many total future parts each obtained part will be partitioned into in next_future_num_parts_in_parts vector,
700  * If part boxes are kept, then initializes the output_part_boxes from their ancestors.
701  *
702  * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
703  * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
704  * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
705  * \param future_num_parts: output, max number of future parts that will be obtained from a single
706  * \param current_num_parts: input, how many parts are there currently.
707  * \param current_iteration: input, current dimension iteration number.
708  * \param input_part_boxes: input, if boxes are kept, current boxes.
709  * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
710  */
711  mj_part_t update_part_num_arrays(
712  std::vector<mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
713  std::vector<mj_part_t> *future_num_part_in_parts,
714  std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
715  mj_part_t &future_num_parts,
716  mj_part_t current_num_parts,
717  int current_iteration,
718  RCP<mj_partBoxVector_t> input_part_boxes,
719  RCP<mj_partBoxVector_t> output_part_boxes,
720  mj_part_t atomic_part_count);
721 
733  void mj_get_local_min_max_coord_totW(
734  mj_lno_t coordinate_begin_index,
735  mj_lno_t coordinate_end_index,
736  mj_lno_t *mj_current_coordinate_permutations,
737  mj_scalar_t *mj_current_dim_coords,
738  mj_scalar_t &min_coordinate,
739  mj_scalar_t &max_coordinate,
740  mj_scalar_t &total_weight);
741 
749  void mj_get_global_min_max_coord_totW(
750  mj_part_t current_concurrent_num_parts,
751  mj_scalar_t *local_min_max_total,
752  mj_scalar_t *global_min_max_total);
753 
772  void mj_get_initial_cut_coords_target_weights(
773  mj_scalar_t min_coord,
774  mj_scalar_t max_coord,
775  mj_part_t num_cuts/*p-1*/ ,
776  mj_scalar_t global_weight,
777  mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
778  mj_scalar_t *target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,
779 
780  std::vector <mj_part_t> *future_num_part_in_parts, //the vector that holds how many more parts each current part will be divided into
781  std::vector <mj_part_t> *next_future_num_parts_in_parts,
782  mj_part_t concurrent_current_part,
783  mj_part_t obtained_part_index);
784 
797  void set_initial_coordinate_parts(
798  mj_scalar_t &max_coordinate,
799  mj_scalar_t &min_coordinate,
800  mj_part_t &concurrent_current_part_index,
801  mj_lno_t coordinate_begin_index,
802  mj_lno_t coordinate_end_index,
803  mj_lno_t *mj_current_coordinate_permutations,
804  mj_scalar_t *mj_current_dim_coords,
805  mj_part_t *mj_part_ids,
806  mj_part_t &partition_count);
807 
818  void mj_1D_part(
819  mj_scalar_t *mj_current_dim_coords,
820  double imbalanceTolerance,
821  mj_part_t current_work_part,
822  mj_part_t current_concurrent_num_parts,
823  mj_scalar_t *current_cut_coordinates,
824  mj_part_t total_incomplete_cut_count,
825  std::vector <mj_part_t> &num_partitioning_in_current_dim);
826 
846  void mj_1D_part_get_thread_part_weights(
847  size_t total_part_count,
848  mj_part_t num_cuts,
849  mj_scalar_t max_coord,
850  mj_scalar_t min_coord,
851  mj_lno_t coordinate_begin_index,
852  mj_lno_t coordinate_end_index,
853  mj_scalar_t *mj_current_dim_coords,
854  mj_scalar_t *temp_current_cut_coords,
855  bool *current_cut_status,
856  double *my_current_part_weights,
857  mj_scalar_t *my_current_left_closest,
858  mj_scalar_t *my_current_right_closest);
859 
867  void mj_accumulate_thread_results(
868  const std::vector <mj_part_t> &num_partitioning_in_current_dim,
869  mj_part_t current_work_part,
870  mj_part_t current_concurrent_num_parts);
871 
902  void mj_get_new_cut_coordinates(
903  const size_t &num_total_part,
904  const mj_part_t &num_cuts,
905  const mj_scalar_t &max_coordinate,
906  const mj_scalar_t &min_coordinate,
907  const mj_scalar_t &global_total_weight,
908  const double &used_imbalance_tolerance,
909  mj_scalar_t * current_global_part_weights,
910  const mj_scalar_t * current_local_part_weights,
911  const mj_scalar_t *current_part_target_weights,
912  bool *current_cut_line_determined,
913  mj_scalar_t *current_cut_coordinates,
914  mj_scalar_t *current_cut_upper_bounds,
915  mj_scalar_t *current_cut_lower_bounds,
916  mj_scalar_t *current_global_left_closest_points,
917  mj_scalar_t *current_global_right_closest_points,
918  mj_scalar_t * current_cut_lower_bound_weights,
919  mj_scalar_t * current_cut_upper_weights,
920  mj_scalar_t *new_current_cut_coordinates,
921  mj_scalar_t *current_part_cut_line_weight_to_put_left,
922  mj_part_t *rectilinear_cut_count,
923  mj_part_t &my_num_incomplete_cut);
924 
934  void mj_calculate_new_cut_position (
935  mj_scalar_t cut_upper_bound,
936  mj_scalar_t cut_lower_bound,
937  mj_scalar_t cut_upper_weight,
938  mj_scalar_t cut_lower_weight,
939  mj_scalar_t expected_weight,
940  mj_scalar_t &new_cut_position);
941 
952  void mj_create_new_partitions(
953  mj_part_t num_parts,
954  mj_scalar_t *mj_current_dim_coords,
955  mj_scalar_t *current_concurrent_cut_coordinate,
956  mj_lno_t coordinate_begin,
957  mj_lno_t coordinate_end,
958  mj_scalar_t *used_local_cut_line_weight_to_left,
959  double **used_thread_part_weight_work,
960  mj_lno_t *out_part_xadj);
961 
984  bool mj_perform_migration(
985  mj_part_t in_num_parts, //current num parts
986  mj_part_t &out_num_parts, //output num parts.
987  std::vector<mj_part_t> *next_future_num_parts_in_parts,
988  mj_part_t &output_part_begin_index,
989  size_t migration_reduce_all_population,
990  mj_lno_t num_coords_for_last_dim_part,
991  std::string iteration,
992  RCP<mj_partBoxVector_t> &input_part_boxes,
993  RCP<mj_partBoxVector_t> &output_part_boxes);
994 
1004  void get_processor_num_points_in_parts(
1005  mj_part_t num_procs,
1006  mj_part_t num_parts,
1007  mj_gno_t *&num_points_in_all_processor_parts);
1008 
1021  bool mj_check_to_migrate(
1022  size_t migration_reduce_all_population,
1023  mj_lno_t num_coords_for_last_dim_part,
1024  mj_part_t num_procs,
1025  mj_part_t num_parts,
1026  mj_gno_t *num_points_in_all_processor_parts);
1027 
1028 
1046  void mj_migration_part_proc_assignment(
1047  mj_gno_t * num_points_in_all_processor_parts,
1048  mj_part_t num_parts,
1049  mj_part_t num_procs,
1050  mj_lno_t *send_count_to_each_proc,
1051  std::vector<mj_part_t> &processor_ranks_for_subcomm,
1052  std::vector<mj_part_t> *next_future_num_parts_in_parts,
1053  mj_part_t &out_num_part,
1054  std::vector<mj_part_t> &out_part_indices,
1055  mj_part_t &output_part_numbering_begin_index,
1056  int *coordinate_destinations);
1057 
1074  void mj_assign_proc_to_parts(
1075  mj_gno_t * num_points_in_all_processor_parts,
1076  mj_part_t num_parts,
1077  mj_part_t num_procs,
1078  mj_lno_t *send_count_to_each_proc,
1079  std::vector<mj_part_t> &processor_ranks_for_subcomm,
1080  std::vector<mj_part_t> *next_future_num_parts_in_parts,
1081  mj_part_t &out_part_index,
1082  mj_part_t &output_part_numbering_begin_index,
1083  int *coordinate_destinations);
1084 
1095  void assign_send_destinations(
1096  mj_part_t num_parts,
1097  mj_part_t *part_assignment_proc_begin_indices,
1098  mj_part_t *processor_chains_in_parts,
1099  mj_lno_t *send_count_to_each_proc,
1100  int *coordinate_destinations);
1101 
1114  void assign_send_destinations2(
1115  mj_part_t num_parts,
1116  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
1117  int *coordinate_destinations,
1118  mj_part_t &output_part_numbering_begin_index,
1119  std::vector<mj_part_t> *next_future_num_parts_in_parts);
1120 
1137  void mj_assign_parts_to_procs(
1138  mj_gno_t * num_points_in_all_processor_parts,
1139  mj_part_t num_parts,
1140  mj_part_t num_procs,
1141  mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
1142  std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
1143  mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function.
1144  std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
1145  mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
1146  int *coordinate_destinations);
1147 
1160  void mj_migrate_coords(
1161  mj_part_t num_procs,
1162  mj_lno_t &num_new_local_points,
1163  std::string iteration,
1164  int *coordinate_destinations,
1165  mj_part_t num_parts);
1166 
1173  void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);
1174 
1175 
1181  void fill_permutation_array(
1182  mj_part_t output_num_parts,
1183  mj_part_t num_parts);
1184 
1193  void set_final_parts(
1194  mj_part_t current_num_parts,
1195  mj_part_t output_part_begin_index,
1196  RCP<mj_partBoxVector_t> &output_part_boxes,
1197  bool is_data_ever_migrated);
1200  void free_work_memory();
1214  void create_consistent_chunks(
1215  mj_part_t num_parts,
1216  mj_scalar_t *mj_current_dim_coords,
1217  mj_scalar_t *current_concurrent_cut_coordinate,
1218  mj_lno_t coordinate_begin,
1219  mj_lno_t coordinate_end,
1220  mj_scalar_t *used_local_cut_line_weight_to_left,
1221  mj_lno_t *out_part_xadj,
1222  int coordInd, bool longest_dim_part, uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted);
1223 
1228  mj_part_t find_largest_prime_factor(mj_part_t num_parts){
1229  mj_part_t largest_factor = 1;
1230  mj_part_t n = num_parts;
1231  mj_part_t divisor = 2;
1232  while (n > 1){
1233  while (n % divisor == 0){
1234  n = n / divisor;
1235  largest_factor = divisor;
1236  }
1237  ++divisor;
1238  if (divisor * divisor > n){
1239  if (n > 1){
1240  largest_factor = n;
1241  }
1242  break;
1243  }
1244  }
1245  return largest_factor;
1246  }
1247 public:
1248  AlgMJ();
1249 
1278  void multi_jagged_part(
1279  const RCP<const Environment> &env,
1280  RCP<const Comm<int> > &problemComm,
1281 
1282  double imbalance_tolerance,
1283  size_t num_global_parts,
1284  mj_part_t *part_no_array,
1285  int recursion_depth,
1286 
1287  int coord_dim,
1288  mj_lno_t num_local_coords,
1289  mj_gno_t num_global_coords,
1290  const mj_gno_t *initial_mj_gnos,
1291  mj_scalar_t **mj_coordinates,
1292 
1293  int num_weights_per_coord,
1294  bool *mj_uniform_weights,
1295  mj_scalar_t **mj_weights,
1296  bool *mj_uniform_parts,
1297  mj_scalar_t **mj_part_sizes,
1298 
1299  mj_part_t *&result_assigned_part_ids,
1300  mj_gno_t *&result_mj_gnos
1301 
1302  );
1311  void set_partitioning_parameters(
1312  bool distribute_points_on_cut_lines_,
1313  int max_concurrent_part_calculation_,
1314  int check_migrate_avoid_migration_option_,
1315  double minimum_migration_imbalance_, int migration_type_ = 0);
1319  void set_to_keep_part_boxes();
1320 
1323  RCP<mj_partBox_t> get_global_box() const;
1324 
1325  RCP<mj_partBoxVector_t> get_kept_boxes() const;
1326 
1327  RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1328  RCP<mj_partBoxVector_t> &localPartBoxes) const;
1329 
1354  void sequential_task_partitioning(
1355  const RCP<const Environment> &env,
1356  mj_lno_t num_total_coords,
1357  mj_lno_t num_selected_coords,
1358  size_t num_target_part,
1359  int coord_dim,
1360  mj_scalar_t **mj_coordinates,
1361  mj_lno_t *initial_selected_coords_output_permutation,
1362  mj_lno_t *output_xadj,
1363  int recursion_depth,
1364  const mj_part_t *part_no_array,
1365  bool partition_along_longest_dim,
1366  int num_ranks_per_node,
1367  bool divide_to_prime_first_);
1368 
1369 };
1370 
1395 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1396  typename mj_part_t>
1397 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::sequential_task_partitioning(
1398  const RCP<const Environment> &env,
1399  mj_lno_t num_total_coords,
1400  mj_lno_t num_selected_coords,
1401  size_t num_target_part,
1402  int coord_dim_,
1403  mj_scalar_t **mj_coordinates_,
1404  mj_lno_t *inital_adjList_output_adjlist,
1405  mj_lno_t *output_xadj,
1406  int rd,
1407  const mj_part_t *part_no_array_,
1408  bool partition_along_longest_dim,
1409  int num_ranks_per_node,
1410  bool divide_to_prime_first_
1411 ){
1412 
1413 
1414  this->mj_env = env;
1415  const RCP<Comm<int> > commN;
1416  this->mj_problemComm =
1417  Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1418  this->comm =
1419  Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1420  this->myActualRank = this->myRank = 1;
1421 
1422 #ifdef HAVE_ZOLTAN2_OMP
1423  //int actual_num_threads = omp_get_num_threads();
1424  //omp_set_num_threads(1);
1425 #endif
1426 
1427  this->divide_to_prime_first = divide_to_prime_first_;
1428  //weights are uniform for task mapping
1429 
1430  //parts are uniform for task mapping
1431  //as input indices.
1432  this->imbalance_tolerance = 0;
1433  this->num_global_parts = num_target_part;
1434  this->part_no_array = (mj_part_t *)part_no_array_;
1435  this->recursion_depth = rd;
1436 
1437  this->coord_dim = coord_dim_;
1438  this->num_local_coords = num_total_coords;
1439  this->num_global_coords = num_total_coords;
1440  this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
1441 
1444  this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1445 
1446  this->num_weights_per_coord = 0;
1447  bool *tmp_mj_uniform_weights = new bool[1];
1448  this->mj_uniform_weights = tmp_mj_uniform_weights ;
1449  this->mj_uniform_weights[0] = true;
1450 
1451  mj_scalar_t **tmp_mj_weights = new mj_scalar_t *[1];
1452  this->mj_weights = tmp_mj_weights; //will copy the memory to this->mj_weights
1453 
1454  bool *tmp_mj_uniform_parts = new bool[1];
1455  this->mj_uniform_parts = tmp_mj_uniform_parts;
1456  this->mj_uniform_parts[0] = true;
1457 
1458  mj_scalar_t **tmp_mj_part_sizes = new mj_scalar_t * [1];
1459  this->mj_part_sizes = tmp_mj_part_sizes;
1460  this->mj_part_sizes[0] = NULL;
1461 
1462  this->num_threads = 1;
1463  this->set_part_specifications();
1464 
1465  this->allocate_set_work_memory();
1466  //the end of the initial partition is the end of coordinates.
1467  this->part_xadj[0] = static_cast<mj_lno_t>(num_selected_coords);
1468  for(size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
1469  this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
1470  }
1471 
1472  mj_part_t current_num_parts = 1;
1473 
1474  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
1475 
1476  mj_part_t future_num_parts = this->total_num_part;
1477 
1478  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
1479  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
1480  next_future_num_parts_in_parts->push_back(this->num_global_parts);
1481  RCP<mj_partBoxVector_t> t1;
1482  RCP<mj_partBoxVector_t> t2;
1483 
1484 
1485  std::vector <uSignedSortItem<int, mj_scalar_t, char> > coord_dimension_range_sorted(this->coord_dim);
1486  uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted = &(coord_dimension_range_sorted[0]);
1487  std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1488  std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1489 
1490  for (int i = 0; i < this->recursion_depth; ++i){
1491 
1492  //partitioning array. Its size equals the current number of parts, and it
1493  //holds how many parts each current part will be divided into in this dimension's partitioning.
1494  std::vector <mj_part_t> num_partitioning_in_current_dim;
1495 
1496  //number of parts that will be obtained at the end of this partitioning.
1497  //future_num_part_in_parts is as the size of current number of parts.
1498  //holds how many more parts each should be divided in the further
1499  //iterations. this will be used to calculate num_partitioning_in_current_dim,
1500  //as the number of parts that the part will be partitioned
1501  //in the current dimension partitioning.
1502 
1503  //next_future_num_parts_in_parts will be as the size of outnumParts,
1504  //and this will hold how many more parts that each output part
1505  //should be divided. this array will also be used to determine the weight ratios
1506  //of the parts.
1507  //swap the arrays to use iteratively..
1508  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
1509  future_num_part_in_parts = next_future_num_parts_in_parts;
1510  next_future_num_parts_in_parts = tmpPartVect;
1511 
1512  //clear next_future_num_parts_in_parts array as
1513  //getPartitionArrays expects it to be empty.
1514  //it also expects num_partitioning_in_current_dim to be empty as well.
1515  next_future_num_parts_in_parts->clear();
1516 
1517 
1518  //returns the total number of output parts for this dimension partitioning.
1519  mj_part_t output_part_count_in_dimension =
1520  this->update_part_num_arrays(
1521  num_partitioning_in_current_dim,
1522  future_num_part_in_parts,
1523  next_future_num_parts_in_parts,
1524  future_num_parts,
1525  current_num_parts,
1526  i,
1527  t1,
1528  t2, num_ranks_per_node);
1529 
1530  //if the number of obtained parts equal to current number of parts,
1531  //skip this dimension. For example, this happens when 1 is given in the
1532  //input part array, e.g., P=4,5,1,2.
1533  if(output_part_count_in_dimension == current_num_parts) {
1534  tmpPartVect= future_num_part_in_parts;
1535  future_num_part_in_parts = next_future_num_parts_in_parts;
1536  next_future_num_parts_in_parts = tmpPartVect;
1537  continue;
1538  }
1539 
1540  //convert i to string to be used for debugging purposes.
1541  std::string istring = Teuchos::toString<int>(i);
1542 
1543  //allocate memory to hold the indices
1544  //of the parts in the permutation array.
1545  this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
1546 
1547  //the index in outTotalCounts at which the next output will be written.
1548  mj_part_t output_part_index = 0;
1549  //whatever is written to outTotalCounts will be added with previousEnd
1550  //so that the points will be shifted.
1551  mj_part_t output_coordinate_end_index = 0;
1552 
1553  mj_part_t current_work_part = 0;
1554  mj_part_t current_concurrent_num_parts = 1;
1555 
1556  mj_part_t obtained_part_index = 0;
1557 
1558  //get the coordinate axis along which the partitioning will be done.
1559  int coordInd = i % this->coord_dim;
1560  mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
1561 
1562 
1563  //run for all available parts.
1564  for (; current_work_part < current_num_parts;
1565  current_work_part += current_concurrent_num_parts){
1566 
1567 
1568  //current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
1569  //this->max_concurrent_part_calculation);
1570 
1571  mj_part_t actual_work_part_count = 0;
1572  //initialization for 1D partitioning.
1573  //get the min and max coordinates of each part
1574  //together with the part weights of each part.
1575  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
1576  mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
1577 
1578  //if this part won't be partitioned any further,
1579  //don't do any work for this part.
1580  if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
1581  continue;
1582  }
1583  ++actual_work_part_count;
1584  mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
1585  mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
1586 
1587  /*
1588  std::cout << "i:" << i << " j:" << current_work_part + kk
1589  << " coordinate_begin_index:" << coordinate_begin_index
1590  << " coordinate_end_index:" << coordinate_end_index
1591  << " total:" << coordinate_end_index - coordinate_begin_index<< std::endl;
1592  */
1593 
1594 
1595  if(partition_along_longest_dim){
1596 
1597  mj_scalar_t best_weight_coord = 0;
1598  for (int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
1599  mj_scalar_t best_min_coord = 0;
1600  mj_scalar_t best_max_coord = 0;
1601  //MD:same for all coordinates, but I will still use this for now.
1602 
1603  this->mj_get_local_min_max_coord_totW(
1604  coordinate_begin_index,
1605  coordinate_end_index,
1606  this->coordinate_permutations,
1607  this->mj_coordinates[coord_traverse_ind],
1608  best_min_coord, //min coordinate
1609  best_max_coord, //max coordinate
1610  best_weight_coord //total weight);
1611  );
1612 
1613  coord_dim_mins[coord_traverse_ind] = best_min_coord;
1614  coord_dim_maxs[coord_traverse_ind] = best_max_coord;
1615  mj_scalar_t best_range = best_max_coord - best_min_coord;
1616  coord_dimension_range_sorted[coord_traverse_ind].id = coord_traverse_ind;
1617  coord_dimension_range_sorted[coord_traverse_ind].val = best_range;
1618  coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1619  }
1620 
1621 
1622  uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1623  coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;
1624 
1625  /*
1626  for (int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
1627  std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id << " range:" << p_coord_dimension_range_sorted[coord_traverse_ind].val << std::endl;
1628  std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id << " coord_dim_mins:" << coord_dim_mins[p_coord_dimension_range_sorted[coord_traverse_ind].id]<< std::endl;
1629  std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id << " coord_dim_maxs:" << coord_dim_maxs[p_coord_dimension_range_sorted[coord_traverse_ind].id] << std::endl;
1630 
1631  }
1632  */
1633 
1634  mj_current_dim_coords = this->mj_coordinates[coordInd];
1635 
1636  this->process_local_min_max_coord_total_weight[kk] = coord_dim_mins[coordInd];
1637  this->process_local_min_max_coord_total_weight[kk+ current_concurrent_num_parts] = coord_dim_maxs[coordInd];
1638  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] = best_weight_coord;
1639 
1640  }
1641  else{
1642  this->mj_get_local_min_max_coord_totW(
1643  coordinate_begin_index,
1644  coordinate_end_index,
1645  this->coordinate_permutations,
1646  mj_current_dim_coords,
1647  this->process_local_min_max_coord_total_weight[kk], //min coordinate
1648  this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max coordinate
1649  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] //total weight);
1650  );
1651  }
1652  }
1653 
1654  //1D partitioning
1655  if (actual_work_part_count > 0){
1656  //obtain global Min max of the part.
1657  this->mj_get_global_min_max_coord_totW(
1658  current_concurrent_num_parts,
1659  this->process_local_min_max_coord_total_weight,
1660  this->global_min_max_coord_total_weight);
1661 
1662  //represents the total number of cutlines
1663  //whose coordinate should be determined.
1664  mj_part_t total_incomplete_cut_count = 0;
1665 
1666  //Compute weight ratios for parts & cuts:
1667  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
1668  //part0 cut0 part1 cut1 part2 cut2 part3
1669  mj_part_t concurrent_part_cut_shift = 0;
1670  mj_part_t concurrent_part_part_shift = 0;
1671 
1672 
1673  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
1674  mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
1675  mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
1676  current_concurrent_num_parts];
1677  mj_scalar_t global_total_weight =
1678  this->global_min_max_coord_total_weight[kk +
1679  2 * current_concurrent_num_parts];
1680 
1681  mj_part_t concurrent_current_part_index = current_work_part + kk;
1682 
1683  mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
1684 
1685  mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
1686  mj_scalar_t *current_target_part_weights = this->target_part_weights +
1687  concurrent_part_part_shift;
1688  //shift the usedCutCoordinate array as noCuts.
1689  concurrent_part_cut_shift += partition_count - 1;
1690  //shift the partRatio array as noParts.
1691  concurrent_part_part_shift += partition_count;
1692 
1693  //calculate only if part is not empty,
1694  //and part will be further partitioned.
1695  if(partition_count > 1 && min_coordinate <= max_coordinate){
1696 
1697  //increase allDone by the number of cuts of the current
1698  //part's cut line number.
1699  total_incomplete_cut_count += partition_count - 1;
1700  //set the number of cut lines that should be determined
1701  //for this part.
1702  this->my_incomplete_cut_count[kk] = partition_count - 1;
1703 
1704  //get the target weights of the parts.
1705  this->mj_get_initial_cut_coords_target_weights(
1706  min_coordinate,
1707  max_coordinate,
1708  partition_count - 1,
1709  global_total_weight,
1710  usedCutCoordinate,
1711  current_target_part_weights,
1712  future_num_part_in_parts,
1713  next_future_num_parts_in_parts,
1714  concurrent_current_part_index,
1715  obtained_part_index);
1716 
1717  mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
1718  mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
1719 
1720  //get the initial estimated part assignments of the coordinates.
1721  this->set_initial_coordinate_parts(
1722  max_coordinate,
1723  min_coordinate,
1724  concurrent_current_part_index,
1725  coordinate_begin_index, coordinate_end_index,
1726  this->coordinate_permutations,
1727  mj_current_dim_coords,
1728  this->assigned_part_ids,
1729  partition_count);
1730 
1731  }
1732  else {
1733  // e.g., if have fewer coordinates than parts, don't need to do next dim.
1734  this->my_incomplete_cut_count[kk] = 0;
1735  }
1736  obtained_part_index += partition_count;
1737  }
1738 
1739  //used imbalance, it is always 0, as it is difficult to estimate a range.
1740  double used_imbalance = 0;
1741 
1742 
1743  // Determine cut lines for k parts here.
1744  this->mj_1D_part(
1745  mj_current_dim_coords,
1746  used_imbalance,
1747  current_work_part,
1748  current_concurrent_num_parts,
1749  current_cut_coordinates,
1750  total_incomplete_cut_count,
1751  num_partitioning_in_current_dim);
1752  }
1753  else {
1754  obtained_part_index += current_concurrent_num_parts;
1755  }
1756 
1757  //create part chunks
1758  {
1759 
1760  mj_part_t output_array_shift = 0;
1761  mj_part_t cut_shift = 0;
1762  size_t tlr_shift = 0;
1763  size_t partweight_array_shift = 0;
1764 
1765  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
1766  mj_part_t current_concurrent_work_part = current_work_part + kk;
1767  mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
1768 
1769  //if the part is empty, skip the part.
1770  if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
1771  this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
1772 
1773  for(mj_part_t jj = 0; jj < num_parts; ++jj){
1774  this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
1775  }
1776  cut_shift += num_parts - 1;
1777  tlr_shift += (4 *(num_parts - 1) + 1);
1778  output_array_shift += num_parts;
1779  partweight_array_shift += (2 * (num_parts - 1) + 1);
1780  continue;
1781  }
1782 
1783  mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
1784  mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
1785  -1];
1786  mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
1787  mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
1788  cut_shift;
1789 
1790  for(int ii = 0; ii < this->num_threads; ++ii){
1791  this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
1792  }
1793 
1794  if(num_parts > 1){
1795  // Rewrite the indices based on the computed cuts.
1796  this->create_consistent_chunks(
1797  num_parts,
1798  mj_current_dim_coords,
1799  current_concurrent_cut_coordinate,
1800  coordinate_begin,
1801  coordinate_end,
1802  used_local_cut_line_weight_to_left,
1803  this->new_part_xadj + output_part_index + output_array_shift,
1804  coordInd,
1805  partition_along_longest_dim,
1806  p_coord_dimension_range_sorted);
1807  }
1808  else {
1809  //if this part is partitioned into 1 then just copy
1810  //the old values.
1811  mj_lno_t part_size = coordinate_end - coordinate_begin;
1812  *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
1813  memcpy(this->new_coordinate_permutations + coordinate_begin,
1814  this->coordinate_permutations + coordinate_begin,
1815  part_size * sizeof(mj_lno_t));
1816  }
1817 
1818 
1819 
1820  cut_shift += num_parts - 1;
1821  tlr_shift += (4 *(num_parts - 1) + 1);
1822  output_array_shift += num_parts;
1823  partweight_array_shift += (2 * (num_parts - 1) + 1);
1824  }
1825 
1826  //shift cut coordinates so that all cut coordinates are stored.
1827  //current_cut_coordinates += cutShift;
1828 
1829  //getChunks from coordinates partitioned the parts and
1830  //wrote the indices as if there were a single part.
1831  //now we need to shift the beginning indices.
1832  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
1833  mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
1834  for (mj_part_t ii = 0;ii < num_parts ; ++ii){
1835  //shift it by previousCount
1836  this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
1837  if (ii % 2 == 1){
1838  mj_lno_t coordinate_end = this->new_part_xadj[output_part_index+ii];
1839  mj_lno_t coordinate_begin = this->new_part_xadj[output_part_index];
1840 
1841  for (mj_lno_t task_traverse = coordinate_begin; task_traverse < coordinate_end; ++task_traverse){
1842  mj_lno_t l = this->new_coordinate_permutations[task_traverse];
1843  //MARKER: FLIPPED ZORDER BELOW
1844  mj_current_dim_coords[l] = -mj_current_dim_coords[l];
1845  }
1846  }
1847  }
1848  //increase the previous count by current end.
1849  output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
1850  //increase the current out.
1851  output_part_index += num_parts ;
1852  }
1853  }
1854  }
1855  // end of this partitioning dimension
1856 
1857  //set the current num parts for next dim partitioning
1858  current_num_parts = output_part_count_in_dimension;
1859 
1860  //swap the coordinate permutations for the next dimension.
1861  mj_lno_t * tmp = this->coordinate_permutations;
1862  this->coordinate_permutations = this->new_coordinate_permutations;
1863  this->new_coordinate_permutations = tmp;
1864 
1865  freeArray<mj_lno_t>(this->part_xadj);
1866  this->part_xadj = this->new_part_xadj;
1867  this->new_part_xadj = NULL;
1868  }
1869 
1870  for(mj_lno_t i = 0; i < num_total_coords; ++i){
1871  inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
1872  }
1873 
1874  // Return output_xadj in CSR format
1875  output_xadj[0] = 0;
1876  for(size_t i = 0; i < this->num_global_parts ; ++i){
1877  output_xadj[i+1] = this->part_xadj[i];
1878  }
1879 
1880  delete future_num_part_in_parts;
1881  delete next_future_num_parts_in_parts;
1882 
1883  //free the extra memory that we allocated.
1884  freeArray<mj_part_t>(this->assigned_part_ids);
1885  freeArray<mj_gno_t>(this->initial_mj_gnos);
1886  freeArray<mj_gno_t>(this->current_mj_gnos);
1887  freeArray<bool>(tmp_mj_uniform_weights);
1888  freeArray<bool>(tmp_mj_uniform_parts);
1889  freeArray<mj_scalar_t *>(tmp_mj_weights);
1890  freeArray<mj_scalar_t *>(tmp_mj_part_sizes);
1891 
1892  this->free_work_memory();
1893 
1894 #ifdef HAVE_ZOLTAN2_OMP
1895  //omp_set_num_threads(actual_num_threads);
1896 #endif
1897 }
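// Editorial note (not part of the original header): a sketch of consuming the
// CSR-style output written above. After the call, output_xadj[p] and
// output_xadj[p+1] delimit part p, and the permutation array passed as
// inital_adjList_output_adjlist lists the local coordinate indices in part
// order:
//
//   for (size_t p = 0; p < num_target_part; ++p){
//     for (mj_lno_t j = output_xadj[p]; j < output_xadj[p + 1]; ++j){
//       mj_lno_t coord = inital_adjList_output_adjlist[j];
//       // coordinate 'coord' was assigned to part p
//     }
//   }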
1898 
1902 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1903  typename mj_part_t>
1904 AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::AlgMJ():
1905  mj_env(), mj_problemComm(), imbalance_tolerance(0),
1906  part_no_array(NULL), recursion_depth(0), coord_dim(0),
1907  num_weights_per_coord(0), initial_num_loc_coords(0),
1908  initial_num_glob_coords(0),
1909  num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
1910  mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
1911  mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
1912  initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
1913  coordinate_permutations(NULL), new_coordinate_permutations(NULL),
1914  assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
1915  distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
1916  mj_run_as_rcb(false), mj_user_recursion_depth(0), mj_keep_part_boxes(false),
1917  check_migrate_avoid_migration_option(0), migration_type(0), minimum_migration_imbalance(0.30),
1918  num_threads(1), total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1919  max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
1920  last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
1921  all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
1922  thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
1923  target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
1924  cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
1925  process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
1926  is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
1927  thread_part_weights(NULL), thread_part_weight_work(NULL),
1928  thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
1929  thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
1930  global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
1931  global_total_part_weight_left_right_closests(NULL),
1932  kept_boxes(),global_box(),
1933  myRank(0), myActualRank(0), divide_to_prime_first(false)
1934 {
1935  this->fEpsilon = std::numeric_limits<float>::epsilon();
1936  this->sEpsilon = std::numeric_limits<mj_scalar_t>::epsilon() * 100;
1937 
1938  this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
1939  this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();
1940 
1941 }
1942 
1943 
1947 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1948  typename mj_part_t>
1949 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
1950 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::get_global_box() const
1951 {
1952  return this->global_box;
1953 }
1954 
1958 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1959  typename mj_part_t>
1960 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::set_to_keep_part_boxes(){
1961  this->mj_keep_part_boxes = true;
1962 }
1963 
1964 
1965 /* \brief Either the part_no_array or num_global_parts should be provided as
1966  * input. part_no_array takes precedence if both are provided.
1967  * Depending on these parameters, the total cut/part counts,
1968  * the maximum part/cut count along a dimension, the estimated number of
1969  * reduceAlls, and the number of parts before the last dimension
1970  * are calculated.
1971  * */
1972 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1973  typename mj_part_t>
1975 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_part_specifications(){
1976  this->total_num_cut = 0; //total number of cuts
1977  this->total_num_part = 1; //total number of parts
1978  this->max_num_part_along_dim = 0; //maximum part count along a dimension.
1979  this->total_dim_num_reduce_all = 0; //estimated number of reduceAlls to be performed.
1980  this->last_dim_num_part = 1; //maximum number of parts that might occur
1981  //during the partitioning before the
1982  //last partitioning dimension.
1983  this->max_num_cut_along_dim = 0;
1984  this->max_num_total_part_along_dim = 0;
1985 
1986  if (this->part_no_array){
1987  //if user provided part array, traverse the array and set variables.
1988  for (int i = 0; i < this->recursion_depth; ++i){
1989  this->total_dim_num_reduce_all += this->total_num_part;
1990  this->total_num_part *= this->part_no_array[i];
1991  if(this->part_no_array[i] > this->max_num_part_along_dim) {
1992  this->max_num_part_along_dim = this->part_no_array[i];
1993  }
1994  }
1995  this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
1996  this->num_global_parts = this->total_num_part;
1997  } else {
1998  mj_part_t future_num_parts = this->num_global_parts;
1999 
2000  //we need to calculate the part numbers now, to determine the maximum along the dimensions.
2001  for (int i = 0; i < this->recursion_depth; ++i){
2002 
2003  mj_part_t maxNoPartAlongI = this->get_part_count(
2004  future_num_parts, 1.0f / (this->recursion_depth - i));
2005 
2006  if (maxNoPartAlongI > this->max_num_part_along_dim){
2007  this->max_num_part_along_dim = maxNoPartAlongI;
2008  }
2009 
2010  mj_part_t nfutureNumParts = future_num_parts / maxNoPartAlongI;
2011  if (future_num_parts % maxNoPartAlongI){
2012  ++nfutureNumParts;
2013  }
2014  future_num_parts = nfutureNumParts;
2015  }
2016  this->total_num_part = this->num_global_parts;
2017 
2018  if (this->divide_to_prime_first){
2019  this->total_dim_num_reduce_all = this->num_global_parts * 2;
2020  this->last_dim_num_part = this->num_global_parts;
2021  }
2022  else {
2023  //estimate the reduceAll count here.
2024 
2025  //the exact count is a lower bound that is hard to compute in advance,
2026  //so we compute an upper bound instead.
2027  size_t p = 1;
2028 
2029  for (int i = 0; i < this->recursion_depth; ++i){
2030  this->total_dim_num_reduce_all += p;
2031  p *= this->max_num_part_along_dim;
2032  }
2033 
2034  if (p / this->max_num_part_along_dim > this->num_global_parts){
2035  this->last_dim_num_part = this->num_global_parts;
2036  }
2037  else {
2038  this->last_dim_num_part = p / this->max_num_part_along_dim;
2039  }
2040 
2041  }
2042  }
2043 
2044  this->total_num_cut = this->total_num_part - 1;
2045  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2046  this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
2047  //maxPartNo is P, maxCutNo is P-1, maxTotalPartCount is 2P-1
2048 
2049  //refine the concurrent part count, if it is given bigger than the maximum possible part count.
2050  if(this->max_concurrent_part_calculation > this->last_dim_num_part){
2051  if(this->mj_problemComm->getRank() == 0){
2052  std::cerr << "Warning: Concurrent part count (" << this->max_concurrent_part_calculation <<
2053  ") is set larger than the maximum that can be used." <<
2054  " Setting it to: " << this->last_dim_num_part << "." << std::endl;
2055  }
2056  this->max_concurrent_part_calculation = this->last_dim_num_part;
2057  }
2058 
2059 }
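
// Illustrative sketch (not part of the MJ implementation): how the
// bookkeeping above evolves for a hypothetical part_no_array of {2, 3, 4}.
// The reduceAll estimate is the sum of the prefix products of the part counts.
inline void example_set_part_specifications()
{
  const int part_no_array_example[3] = {2, 3, 4};
  size_t total_num_part = 1;
  size_t reduce_all_estimate = 0;
  int max_part_along_dim = 0;
  for (int i = 0; i < 3; ++i){
    reduce_all_estimate += total_num_part;          // 1 + 2 + 6 = 9
    total_num_part *= part_no_array_example[i];     // ends at 2 * 3 * 4 = 24
    if (part_no_array_example[i] > max_part_along_dim)
      max_part_along_dim = part_no_array_example[i]; // ends at 4
  }
  // Derived quantities, matching the formulas above:
  // last_dim_num_part = 24 / 4 = 6, total_num_cut = 24 - 1 = 23,
  // max_num_cut_along_dim = 4 - 1 = 3, max_num_total_part_along_dim = 2*4 - 1 = 7.
}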
2060 /* \brief Tries to determine the part count for the current dimension,
2061  * aiming to make the partitioning as square as possible.
2062  * \param num_total_future how many more parts are required.
2063  * \param root the exponent (1 / remaining recursion depth) used to take the root of num_total_future.
2064  */
2065 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2066  typename mj_part_t>
2067 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_part_count(
2068  mj_part_t num_total_future,
2069  double root)
2070 {
2071  double fp = pow(num_total_future, root);
2072  mj_part_t ip = mj_part_t (fp);
2073  if (fp - ip < this->fEpsilon * 100){
2074  return ip;
2075  }
2076  else {
2077  return ip + 1;
2078  }
2079 }
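
// Worked example for get_part_count (illustrative; the helper below mirrors
// the epsilon-tolerant ceiling above with a fixed tolerance standing in for
// fEpsilon * 100). Exact roots are not bumped up by floating point noise:
// pow(8, 1.0/3) is 2 up to rounding, while pow(10, 0.5) ~= 3.16 rounds up to 4.
inline int example_get_part_count(int num_total_future, double root)
{
  const double tol = 1e-8;                      // stand-in for fEpsilon * 100
  double fp = pow((double)num_total_future, root);
  int ip = (int)fp;
  return (fp - ip < tol) ? ip : ip + 1;
}
// example_get_part_count(8, 1.0/3) == 2; example_get_part_count(10, 0.5) == 4.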
2080 
2081 /* \brief Returns how many parts will be obtained after partitioning along this dimension.
2082  * It fills the num_partitioning_in_current_dim vector with how many parts each current part will be partitioned into in this dimension,
2083  * fills the next_future_num_parts_in_parts vector with how many total future parts each obtained part will be partitioned into,
2084  * and, if part boxes are kept, initializes each output_part_box from its ancestor.
2085  *
2086  * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
2087  * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
2088  * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
2089  * \param future_num_parts: output, max number of future parts that will be obtained from a single part.
2090  * \param current_num_parts: input, how many parts are there currently.
2091  * \param current_iteration: input, current dimension iteration number.
2092  * \param input_part_boxes: input, if boxes are kept, current boxes.
2093  * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
2094  */
2095 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2096  typename mj_part_t>
2097 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::update_part_num_arrays(
2098  std::vector <mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
2099  std::vector<mj_part_t> *future_num_part_in_parts,
2100  std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
2101  mj_part_t &future_num_parts,
2102  mj_part_t current_num_parts,
2103  int current_iteration,
2104  RCP<mj_partBoxVector_t> input_part_boxes,
2105  RCP<mj_partBoxVector_t> output_part_boxes,
2106  mj_part_t atomic_part_count
2107 ){
2108  //how many parts that will be obtained after this dimension.
2109  mj_part_t output_num_parts = 0;
2110  if(this->part_no_array){
2111  //when the partNo array is provided as input,
2112  //each current partition will be partitioned into the same number of parts.
2113  //we don't need to use the future_num_part_in_parts vector in this case.
2114 
2115  mj_part_t p = this->part_no_array[current_iteration];
2116  if (p < 1){
2117  std::cerr << "i:" << current_iteration << " p is given as: " << p << std::endl;
2118  exit(1);
2119  }
2120  if (p == 1){
2121  return current_num_parts;
2122  }
2123 
2124  for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2125  num_partitioning_in_current_dim.push_back(p);
2126  }
2127  //std::cout << "me:" << this->myRank << " current_iteration" << current_iteration <<
2128  //" current_num_parts:" << current_num_parts << std::endl;
2129  //std::cout << "num_partitioning_in_current_dim[0]:" << num_partitioning_in_current_dim[0] << std::endl;
2130  //set the new value of future_num_parts.
2131 
2132  /*
2133  std::cout << "\tfuture_num_parts:" << future_num_parts
2134  << " num_partitioning_in_current_dim[0]:" << num_partitioning_in_current_dim[0]
2135  << future_num_parts/ num_partitioning_in_current_dim[0] << std::endl;
2136  */
2137 
2138  future_num_parts /= num_partitioning_in_current_dim[0];
2139  output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];
2140 
2141  if (this->mj_keep_part_boxes){
2142  for (mj_part_t k = 0; k < current_num_parts; ++k){
2143  //initialize the output boxes from their ancestor.
2144  for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
2145  output_part_boxes->push_back((*input_part_boxes)[k]);
2146  }
2147  }
2148  }
2149 
2150  //set how many more parts each part will be divided into.
2151  //this is obvious when the partNo array is provided as input.
2152  //however, fill it anyway so that weights will be calculated according to this array.
2153  for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
2154  next_future_num_parts_in_parts->push_back(future_num_parts);
2155  }
2156  }
2157  else {
2158  //if partNo array is not provided as input,
2159  //future_num_part_in_parts holds how many parts each part should be divided.
2160  //initially it holds a single number equal to the total number of global parts.
2161 
2162  //calculate the future_num_parts from beginning,
2163  //since each part might be divided into different number of parts.
2164  future_num_parts = 1;
2165 
2166  //std::cout << "i:" << i << std::endl;
2167 
2168  for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2169  //get how many parts a part should be divided.
2170  mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2171 
2172  //get the ideal number of parts that is close to the
2173  //(recursion_depth - current_iteration)th root of future_num_parts_of_part_ii.
2174  mj_part_t num_partitions_in_current_dim =
2175  this->get_part_count(
2176  future_num_parts_of_part_ii,
2177  1.0 / (this->recursion_depth - current_iteration)
2178  );
2179 
2180  if (num_partitions_in_current_dim > this->max_num_part_along_dim){
2181  std::cerr << "ERROR: maxPartNo calculation is wrong. num_partitions_in_current_dim: "
2182  << num_partitions_in_current_dim << " this->max_num_part_along_dim:"
2183  << this->max_num_part_along_dim <<
2184  " this->recursion_depth:" << this->recursion_depth <<
2185  " current_iteration:" << current_iteration <<
2186  " future_num_parts_of_part_ii:" << future_num_parts_of_part_ii <<
2187  " might need to fix max part no calculation for largest_prime_first partitioning" <<
2188  std::endl;
2189  exit(1);
2190  }
2191  //add this number to num_partitioning_in_current_dim vector.
2192  num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2193 
2194  mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2195  if (this->divide_to_prime_first){
2196 
2197  //increase the output number of parts.
2198  output_num_parts += num_partitions_in_current_dim;
2199  if (future_num_parts_of_part_ii == atomic_part_count || future_num_parts_of_part_ii % atomic_part_count != 0){
2200  atomic_part_count = 1;
2201  }
2202 
2203  largest_prime_factor = this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2204 
2205  //we divide into num_partitions_in_current_dim parts, but we adjust the weights based on the largest prime.
2206  //if num_partitions_in_current_dim = 2 and the largest prime = 5 --> we divide into 2 parts with weights 3x and 2x.
2207  //if the largest prime is less than the part count, we use the part count so that we divide uniformly.
2208  if (largest_prime_factor < num_partitions_in_current_dim){
2209  largest_prime_factor = num_partitions_in_current_dim;
2210  }
2211 
2212  //ideal number of future partitions for each part.
2213  mj_part_t ideal_num_future_parts_in_part = (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2214  //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2215  mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2216 
2217  //std::cout << "current num part:" << ii << " largest_prime_factor:" << largest_prime_factor << " To Partition:" << future_num_parts_of_part_ii << " ";
2218  for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2219  //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2220  mj_part_t my_ideal_primescale = ideal_prime_scale;
2221  //left-over weights: the left side is adjusted to 3x, the right side stays at 2x.
2222  if (iii < (largest_prime_factor) % num_partitions_in_current_dim){
2223  ++my_ideal_primescale;
2224  }
2225  //scale by 'x'.
2226  mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part * my_ideal_primescale;
2227 
2228  //if there is a remainder, increase the weight of this part.
2229  if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor){
2230  //if not uniform, add 1 for the extra parts.
2231  ++num_future_parts_for_part_iii;
2232  }
2233 
2234  next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2235 
2236  //if part boxes are stored, initialize the box of the parts as the ancestor.
2237  if (this->mj_keep_part_boxes){
2238  output_part_boxes->push_back((*input_part_boxes)[ii]);
2239  }
2240 
2241  //set future_num_parts to the maximum over the obtained parts.
2242  if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
2243 
2244  }
2245 
2246 
2247  }
2248  else {
2249 
2250  //increase the output number of parts.
2251  output_num_parts += num_partitions_in_current_dim;
2252 
2253 
2254 
2255  if (future_num_parts_of_part_ii == atomic_part_count || future_num_parts_of_part_ii % atomic_part_count != 0){
2256  atomic_part_count = 1;
2257  }
2258  //ideal number of future partitions for each part.
2259  mj_part_t ideal_num_future_parts_in_part = (future_num_parts_of_part_ii / atomic_part_count) / num_partitions_in_current_dim;
2260  for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2261  mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;
2262 
2263  //if there is a remainder, increase the weight of this part.
2264  if (iii < (future_num_parts_of_part_ii / atomic_part_count) % num_partitions_in_current_dim){
2265  //if not uniform, add 1 for the extra parts.
2266  ++num_future_parts_for_part_iii;
2267  }
2268 
2269  next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2270 
2271  //if part boxes are stored, initialize the box of the parts as the ancestor.
2272  if (this->mj_keep_part_boxes){
2273  output_part_boxes->push_back((*input_part_boxes)[ii]);
2274  }
2275 
2276  //set future_num_parts to the maximum over the obtained parts.
2277  if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
2278  }
2279  }
2280  }
2281  }
2282  return output_num_parts;
2283 }
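
// Worked example (illustrative) of the divide_to_prime_first weighting above:
// a part with future_num_parts_of_part_ii = 10 and atomic_part_count = 1 is
// split into num_partitions_in_current_dim = 2 parts. The largest prime
// factor of 10 is 5, so the unit "x" is 10 / 5 = 2 and ideal_prime_scale is
// 5 / 2 = 2; the leftover (5 % 2 == 1) goes to the first obtained part,
// yielding the 3x / 2x split of {6, 4} future parts described above.
inline void example_prime_first_split(int future_parts, int parts_in_dim,
    int largest_prime, int *split /* parts_in_dim entries */)
{
  const int x = future_parts / largest_prime;      // the "1x" unit
  const int scale = largest_prime / parts_in_dim;  // base multiple of x
  for (int i = 0; i < parts_in_dim; ++i){
    int my_scale = scale + (i < largest_prime % parts_in_dim ? 1 : 0);
    split[i] = x * my_scale;  // for (10, 2, 5): split == {6, 4}
  }
}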
2284 
2285 
2286 /* \brief Allocates and initializes the work memory that will be used by MJ.
2287  *
2288  * */
2289 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2290  typename mj_part_t>
2291 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::allocate_set_work_memory(){
2292 
2293  //points to process that initially owns the coordinate.
2294  this->owner_of_coordinate = NULL;
2295 
2296  //Throughout the partitioning execution,
2297  //instead of moving the coordinates, hold a permutation array for parts.
2298  //coordinate_permutations holds the current permutation.
2299  this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2300  //initial configuration, set each pointer-i to i.
2301 #ifdef HAVE_ZOLTAN2_OMP
2302 #pragma omp parallel for
2303 #endif
2304  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
2305  this->coordinate_permutations[i] = i;
2306  }
2307 
2308  //new_coordinate_permutations is the work array that receives the next permutation.
2309  this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2310 
2311  this->assigned_part_ids = NULL;
2312  if(this->num_local_coords > 0){
2313  this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
2314  }
2315 
2316  //a single partition starts at index 0 and ends at numLocalCoords.
2317  //the part_xadj array holds the end points in the coordinate_permutations array
2318  //for each partition. Initially sized 1, with the single element set to numLocalCoords.
2319  this->part_xadj = allocMemory<mj_lno_t>(1);
2320  this->part_xadj[0] = static_cast<mj_lno_t>(this->num_local_coords);//the end of the initial partition is the end of coordinates.
2321  //the end points of the output; this is allocated later.
2322  this->new_part_xadj = NULL;
2323 
2324  // only store this much if cuts are needed to be stored.
2325  //this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->total_num_cut);
2326 
2327 
2328  this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2329 
2330  this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);
2331 
2332  this->process_cut_line_weight_to_put_left = NULL; //the fraction of weight an MPI process should put on the left side of each cut line
2333  this->thread_cut_line_weight_to_put_left = NULL; //the fraction of weight each thread in the MPI process should put on the left side of each cut line
2334  //distribute_points_on_cut_lines = false;
2335  if(this->distribute_points_on_cut_lines){
2336  this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2337  this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
2338  for(int i = 0; i < this->num_threads; ++i){
2339  this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2340  }
2341  this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2342  this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2343  }
2344 
2345 
2346  // work array to manipulate coordinate of cutlines in different iterations.
2347  //necessary because previous cut line information is used for determining
2348  //the next cutline information. therefore, cannot update the cut work array
2349  //until all cutlines are determined.
2350  this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
2351  this->max_concurrent_part_calculation);
2352 
2353 
2354  //cumulative part weight array.
2355  this->target_part_weights = allocMemory<mj_scalar_t>(
2356  this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2357  // the cumulative weight from left to right.
2358 
2359  this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation); //upper bound coordinate of a cut line
2360  this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //lower bound coordinate of a cut line
2361  this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //lower bound weight of a cut line
2362  this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //upper bound weight of a cut line
2363 
2364  this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation); //combined array to exchange the min and max coordinate, and total weight of part.
2365  this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);//global combined array with the results for min, max and total weight.
2366 
2367  //is_cut_line_determined is used to determine if a cutline is determined already.
2368  //If a cut line is already determined, the next iterations will skip this cut line.
2369  this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2370  //my_incomplete_cut_count holds the number of cut lines that have not been finalized for each part.
2371  //when concurrentPartCount > 1, if my_incomplete_cut_count[x] == 0, then no work is done for that part.
2372  this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
2373  //local part weights of each thread.
2374  this->thread_part_weights = allocMemory<double *>(this->num_threads);
2375  //the work/manipulation array for part weights.
2376  this->thread_part_weight_work = allocMemory<double *>(this->num_threads);
2377 
2378  //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
2379  this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2380  //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
2381  this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2382 
2383  //to store how many points in each part a thread has.
2384  this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);
2385 
2386  for(int i = 0; i < this->num_threads; ++i){
2387  //partWeights[i] = allocMemory<mj_scalar_t>(maxTotalPartCount);
2388  this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2389  this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2390  this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2391  this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim);
2392  }
2393  //for faster communication, concatenation of
2394  //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
2395  //leftClosest distances sized P-1, since there are P-1 cut lines
2396  //rightClosest distances sized P-1, since there are P-1 cut lines.
2397  this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2398  this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2399 
2400 
2401  mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
2402  for (int i=0; i < this->coord_dim; i++){
2403  coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2404 #ifdef HAVE_ZOLTAN2_OMP
2405 #pragma omp parallel for
2406 #endif
2407  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2408  coord[i][j] = this->mj_coordinates[i][j];
2409  }
2410  this->mj_coordinates = coord;
2411 
2412 
2413  int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
2414  mj_scalar_t **weights = allocMemory<mj_scalar_t *>(criteria_dim);
2415 
2416  for (int i=0; i < criteria_dim; i++){
2417  weights[i] = NULL;
2418  }
2419  for (int i=0; i < this->num_weights_per_coord; i++){
2420  weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2421 #ifdef HAVE_ZOLTAN2_OMP
2422 #pragma omp parallel for
2423 #endif
2424  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2425  weights[i][j] = this->mj_weights[i][j];
2426 
2427  }
2428  this->mj_weights = weights;
2429  this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
2430 #ifdef HAVE_ZOLTAN2_OMP
2431 #pragma omp parallel for
2432 #endif
2433  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2434  this->current_mj_gnos[j] = this->initial_mj_gnos[j];
2435 
2436  this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);
2437 
2438 #ifdef HAVE_ZOLTAN2_OMP
2439 #pragma omp parallel for
2440 #endif
2441  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2442  this->owner_of_coordinate[j] = this->myActualRank;
2443 }
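
// Sketch (illustrative) of the permutation-array idea set up above: MJ never
// moves coordinates between parts; it reorders index arrays instead. Part k
// owns the index range [part_xadj[k-1], part_xadj[k]) of
// coordinate_permutations, so any per-part scan is an indirect loop:
inline double example_part_scan(const double *coords, const int *perm,
    const int *part_xadj, int k)
{
  double sum = 0;
  const int begin = (k == 0) ? 0 : part_xadj[k - 1];
  for (int i = begin; i < part_xadj[k]; ++i)
    sum += coords[perm[i]];  // indirect access through the permutation
  return sum;
}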
2444 
2445 /* \brief compute the global bounding box
2446  */
2447 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2448  typename mj_part_t>
2449 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box()
2450 {
2451  //local min coords
2452  mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2453  //global min coords
2454  mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2455  //local max coords
2456  mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2457  //global max coords
2458  mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2459 
2460  for (int i = 0; i < this->coord_dim; ++i){
2461  mj_scalar_t localMin = std::numeric_limits<mj_scalar_t>::max();
2462  mj_scalar_t localMax = -localMin;
2463  if (localMax > 0) localMax = 0;
2464 
2465 
2466  for (mj_lno_t j = 0; j < this->num_local_coords; ++j){
2467  if (this->mj_coordinates[i][j] < localMin){
2468  localMin = this->mj_coordinates[i][j];
2469  }
2470  if (this->mj_coordinates[i][j] > localMax){
2471  localMax = this->mj_coordinates[i][j];
2472  }
2473  }
2474  //std::cout << " localMin:" << localMin << std::endl;
2475  //std::cout << " localMax:" << localMax << std::endl;
2476  mins[i] = localMin;
2477  maxs[i] = localMax;
2478 
2479  }
2480  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2481  this->coord_dim, mins, gmins
2482  );
2483 
2484 
2485  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2486  this->coord_dim, maxs, gmaxs
2487  );
2488 
2489 
2490 
2491  //create a single box covering the whole domain.
2492  global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2493  //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2494  freeArray<mj_scalar_t>(mins);
2495  freeArray<mj_scalar_t>(gmins);
2496  freeArray<mj_scalar_t>(maxs);
2497  freeArray<mj_scalar_t>(gmaxs);
2498 }
2499 
2500 /* \brief for part communication we keep track of the box boundaries.
2501  * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
2502  * This function initializes a single box with all global min and max coordinates.
2503  * \param initial_partitioning_boxes the input and output vector for boxes.
2504  */
2505 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2506  typename mj_part_t>
2507 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::init_part_boxes(
2508  RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2509 )
2510 {
2511  mj_partBox_t tmp_box(*global_box);
2512  initial_partitioning_boxes->push_back(tmp_box);
2513 }
2514 
2525 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2526  typename mj_part_t>
2527 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_local_min_max_coord_totW(
2528  mj_lno_t coordinate_begin_index,
2529  mj_lno_t coordinate_end_index,
2530  mj_lno_t *mj_current_coordinate_permutations,
2531  mj_scalar_t *mj_current_dim_coords,
2532  mj_scalar_t &min_coordinate,
2533  mj_scalar_t &max_coordinate,
2534  mj_scalar_t &total_weight){
2535 
2536  //if the part is empty,
2537  //set the min and max coordinates reversed (min as the largest, max as the smallest value).
2538  if(coordinate_begin_index >= coordinate_end_index)
2539  {
2540  min_coordinate = this->maxScalar_t;
2541  max_coordinate = this->minScalar_t;
2542  total_weight = 0;
2543  }
2544  else {
2545  mj_scalar_t my_total_weight = 0;
2546 #ifdef HAVE_ZOLTAN2_OMP
2547 #pragma omp parallel num_threads(this->num_threads)
2548 #endif
2549  {
2550  //if uniform weights are used, then weight is equal to count.
2551  if (this->mj_uniform_weights[0]) {
2552 #ifdef HAVE_ZOLTAN2_OMP
2553 #pragma omp single
2554 #endif
2555  {
2556  my_total_weight = coordinate_end_index - coordinate_begin_index;
2557  }
2558 
2559  }
2560  else {
2561  //if not uniform, then the weights are reduced across the threads.
2562 #ifdef HAVE_ZOLTAN2_OMP
2563 #pragma omp for reduction(+:my_total_weight)
2564 #endif
2565  for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2566  mj_lno_t i = mj_current_coordinate_permutations[ii];
2567  my_total_weight += this->mj_weights[0][i];
2568  }
2569  }
2570 
2571  int my_thread_id = 0;
2572 #ifdef HAVE_ZOLTAN2_OMP
2573  my_thread_id = omp_get_thread_num();
2574 #endif
2575  mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2576  my_thread_min_coord = my_thread_max_coord =
2577  mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2578 
2579 
2580 #ifdef HAVE_ZOLTAN2_OMP
2581 #pragma omp for
2582 #endif
2583  for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2584  mj_lno_t i = mj_current_coordinate_permutations[j];
2585  if(mj_current_dim_coords[i] > my_thread_max_coord)
2586  my_thread_max_coord = mj_current_dim_coords[i];
2587  if(mj_current_dim_coords[i] < my_thread_min_coord)
2588  my_thread_min_coord = mj_current_dim_coords[i];
2589  }
2590  this->max_min_coords[my_thread_id] = my_thread_min_coord;
2591  this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2592 
2593 #ifdef HAVE_ZOLTAN2_OMP
2594 //we need a barrier here, because max_min_coords might not be filled by some of the threads.
2595 #pragma omp barrier
2596 #pragma omp single nowait
2597 #endif
2598  {
2599  min_coordinate = this->max_min_coords[0];
2600  for(int i = 1; i < this->num_threads; ++i){
2601  if(this->max_min_coords[i] < min_coordinate)
2602  min_coordinate = this->max_min_coords[i];
2603  }
2604  }
2605 
2606 #ifdef HAVE_ZOLTAN2_OMP
2607 #pragma omp single nowait
2608 #endif
2609  {
2610  max_coordinate = this->max_min_coords[this->num_threads];
2611  for(int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2612  if(this->max_min_coords[i] > max_coordinate)
2613  max_coordinate = this->max_min_coords[i];
2614  }
2615  }
2616  }
2617  total_weight = my_total_weight;
2618  }
2619 }
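
// Minimal sketch of the per-thread min/max reduction performed above,
// assuming OpenMP 3.1+ (which provides min/max reduction clauses) and n >= 1.
// The function above instead combines thread results manually through the
// shared max_min_coords array, which also works with older OpenMP versions
// and lets one pass produce both extremes and the total weight.
inline void example_threaded_min_max(const double *v, int n,
    double &mn, double &mx)
{
  mn = v[0]; mx = v[0];
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for reduction(min:mn) reduction(max:mx)
#endif
  for (int i = 1; i < n; ++i){
    if (v[i] < mn) mn = v[i];
    if (v[i] > mx) mx = v[i];
  }
}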
2620 
2621 
2629 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2630  typename mj_part_t>
2631 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_global_min_max_coord_totW(
2632  mj_part_t current_concurrent_num_parts,
2633  mj_scalar_t *local_min_max_total,
2634  mj_scalar_t *global_min_max_total){
2635 
2636  //reduce min for the first current_concurrent_num_parts elements,
2637  //reduce max for the next current_concurrent_num_parts elements,
2638  //reduce sum for the last current_concurrent_num_parts elements.
2639  if(this->comm->getSize() > 1){
2640  Teuchos::MultiJaggedCombinedMinMaxTotalReductionOp<int, mj_scalar_t>
2641  reductionOp(
2642  current_concurrent_num_parts,
2643  current_concurrent_num_parts,
2644  current_concurrent_num_parts);
2645  try{
2646  reduceAll<int, mj_scalar_t>(
2647  *(this->comm),
2648  reductionOp,
2649  3 * current_concurrent_num_parts,
2650  local_min_max_total,
2651  global_min_max_total);
2652  }
2653  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2654  }
2655  else {
2656  mj_part_t s = 3 * current_concurrent_num_parts;
2657  for (mj_part_t i = 0; i < s; ++i){
2658  global_min_max_total[i] = local_min_max_total[i];
2659  }
2660  }
2661 }
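
// Sketch (illustrative) of the packed layout reduced above: for k concurrent
// parts, local_min_max_total is laid out as
//   [ min_0 .. min_{k-1} | max_0 .. max_{k-1} | weight_0 .. weight_{k-1} ]
// so a single reduceAll with the custom reduction op can apply MIN, MAX and
// SUM to the three thirds of the buffer in one message.
inline void example_pack_min_max_total(int k, const double *mins,
    const double *maxs, const double *weights, double *packed)
{
  for (int i = 0; i < k; ++i){
    packed[i] = mins[i];             // reduced with MIN
    packed[k + i] = maxs[i];         // reduced with MAX
    packed[2 * k + i] = weights[i];  // reduced with SUM
  }
}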
2662 
2663 
2664 
2683 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2684  typename mj_part_t>
2685 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_initial_cut_coords_target_weights(
2686  mj_scalar_t min_coord,
2687  mj_scalar_t max_coord,
2688  mj_part_t num_cuts/*p-1*/ ,
2689  mj_scalar_t global_weight,
2690  mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
2691  mj_scalar_t *current_target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,
2692 
2693  std::vector <mj_part_t> *future_num_part_in_parts, //the vector that holds how many future parts each current part will be partitioned into.
2694  std::vector <mj_part_t> *next_future_num_parts_in_parts,
2695  mj_part_t concurrent_current_part,
2696  mj_part_t obtained_part_index
2697 ){
2698 
2699  mj_scalar_t coord_range = max_coord - min_coord;
2700  if(this->mj_uniform_parts[0]){
2701  {
2702  mj_part_t cumulative = 0;
2703  //how many total future parts the part will be partitioned into.
2704  mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
2705 
2706 
2707  //how much each part should weigh in ideal case.
2708  mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
2709  /*
2710  std::cout << "total_future_part_count_in_part:" << total_future_part_count_in_part << std::endl;
2711  std::cout << "global_weight:" << global_weight << std::endl;
2712  std::cout << "unit_part_weight" << unit_part_weight << std::endl;
2713  */
2714  for(mj_part_t i = 0; i < num_cuts; ++i){
2715  cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
2716 
2717  /*
2718  std::cout << "obtained_part_index:" << obtained_part_index <<
2719  " (*next_future_num_parts_in_parts)[i + obtained_part_index]:" << (*next_future_num_parts_in_parts)[i + obtained_part_index] <<
2720  " cumulative:" << cumulative << std::endl;
2721  */
2722  //set target part weight.
2723  current_target_part_weights[i] = cumulative * unit_part_weight;
2724  //std::cout <<"i:" << i << " current_target_part_weights:" << current_target_part_weights[i] <<std::endl;
2725  //set initial cut coordinate.
2726 
2727  initial_cut_coords[i] = min_coord + (coord_range * cumulative) / total_future_part_count_in_part;
2728  }
2729  current_target_part_weights[num_cuts] = 1;
2730  }
2731 
2732  //round the target part weights.
2733  if (this->mj_uniform_weights[0]){
2734  for(mj_part_t i = 0; i < num_cuts + 1; ++i){
2735 
2736  current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2737  }
2738  }
2739  }
2740  else {
2741  std::cerr << "MJ does not support non-uniform part weights." << std::endl;
2742  exit(1);
2743  }
2744 }
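
// Worked example (illustrative) of the uniform-part initialization above:
// splitting the range [0, 12] with total weight 12 into 3 parts (2 cuts),
// where each obtained part keeps exactly one future part, places the cuts at
// 4 and 8 with cumulative target weights 4 and 8.
inline void example_initial_cuts(double min_coord, double max_coord,
    double global_weight, int num_cuts, double *cuts, double *targets)
{
  const double range = max_coord - min_coord;
  const int total_future = num_cuts + 1;  // one future part per obtained part
  for (int i = 0; i < num_cuts; ++i){
    cuts[i] = min_coord + range * (i + 1) / total_future;
    targets[i] = global_weight * (i + 1) / total_future;  // cumulative weight
  }
}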
2745 
2746 
2759 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2760  typename mj_part_t>
2761 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_initial_coordinate_parts(
2762  mj_scalar_t &max_coordinate,
2763  mj_scalar_t &min_coordinate,
2764  mj_part_t &/* concurrent_current_part_index */,
2765  mj_lno_t coordinate_begin_index,
2766  mj_lno_t coordinate_end_index,
2767  mj_lno_t *mj_current_coordinate_permutations,
2768  mj_scalar_t *mj_current_dim_coords,
2769  mj_part_t *mj_part_ids,
2770  mj_part_t &partition_count
2771 ){
2772  mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
2773 
2774  //if there is a single point, or if all points share the same coordinate,
2775  //set the initial part to 0 for all.
2776  if(ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
2777 #ifdef HAVE_ZOLTAN2_OMP
2778 #pragma omp parallel for
2779 #endif
2780  for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2781  mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
2782  }
2783  }
2784  else{
2785 
2786  //otherwise estimate an initial part for each coordinate.
2787  //assuming uniform distribution of points.
2788  mj_scalar_t slice = coordinate_range / partition_count;
2789 
2790 #ifdef HAVE_ZOLTAN2_OMP
2791 #pragma omp parallel for
2792 #endif
2793  for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2794 
2795  mj_lno_t iii = mj_current_coordinate_permutations[ii];
2796  mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
2797  mj_part_ids[iii] = 2 * pp;
2798  }
2799  }
2800 }
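
// Sketch (illustrative) of the initial estimate above: assuming a roughly
// uniform point distribution, coordinate c falls into slice
// floor((c - min) / slice_width). The value 2*pp is stored because
// assigned_part_ids encodes "inside part pp" as the even id 2*pp and
// "on cut line c" as the odd id 2*c + 1; mj_1D_part later clamps and
// refines these estimates during the cut-line iterations.
inline int example_initial_part_id(double c, double min_coord,
    double max_coord, int partition_count)
{
  const double slice = (max_coord - min_coord) / partition_count;
  const int pp = (int)((c - min_coord) / slice);
  return 2 * pp;  // an estimate only; refined later
}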
2801 
2802 
2813 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2814  typename mj_part_t>
2815 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part(
2816  mj_scalar_t *mj_current_dim_coords,
2817  double used_imbalance_tolerance,
2818  mj_part_t current_work_part,
2819  mj_part_t current_concurrent_num_parts,
2820  mj_scalar_t *current_cut_coordinates,
2821  mj_part_t total_incomplete_cut_count,
2822  std::vector <mj_part_t> &num_partitioning_in_current_dim
2823 ){
2824 
2825 
2826  mj_part_t rectilinear_cut_count = 0;
2827  mj_scalar_t *temp_cut_coords = current_cut_coordinates;
2828 
2829  Teuchos::MultiJaggedCombinedReductionOp<mj_part_t, mj_scalar_t>
2830  *reductionOp = NULL;
2831  reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
2832  <mj_part_t, mj_scalar_t>(
2833  &num_partitioning_in_current_dim ,
2834  current_work_part ,
2835  current_concurrent_num_parts);
2836 
2837  size_t total_reduction_size = 0;
2838 #ifdef HAVE_ZOLTAN2_OMP
2839 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count) num_threads(this->num_threads)
2840 #endif
2841  {
2842  int me = 0;
2843 #ifdef HAVE_ZOLTAN2_OMP
2844  me = omp_get_thread_num();
2845 #endif
2846  double *my_thread_part_weights = this->thread_part_weights[me];
2847  mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
2848  mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
2849 
2850 #ifdef HAVE_ZOLTAN2_OMP
2851 #pragma omp single
2852 #endif
2853  {
2854  //initialize the lower and upper bounds of the cuts.
2855  mj_part_t next = 0;
2856  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2857 
2858  mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
2859  mj_part_t num_cut_in_dim = num_part_in_dim - 1;
2860  total_reduction_size += (4 * num_cut_in_dim + 1);
2861 
2862  for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
2863  this->is_cut_line_determined[next] = false;
2864  this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i]; //min coordinate
2865  this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts]; //max coordinate
2866 
2867  this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts]; //total weight
2868  this->cut_lower_bound_weights[next] = 0;
2869 
2870  if(this->distribute_points_on_cut_lines){
2871  this->process_cut_line_weight_to_put_left[next] = 0;
2872  }
2873  ++next;
2874  }
2875  }
2876  }
2877 
2878  //no need to have a barrier here;
2879  //pragma omp single has an implicit barrier.
2880 
2881  int iteration = 0;
2882  while (total_incomplete_cut_count != 0){
2883  iteration += 1;
2884  mj_part_t concurrent_cut_shifts = 0;
2885  size_t total_part_shift = 0;
2886 
2887  for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2888  mj_part_t num_parts = -1;
2889  num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2890 
2891  mj_part_t num_cuts = num_parts - 1;
2892  size_t total_part_count = num_parts + size_t (num_cuts) ;
2893  if (this->my_incomplete_cut_count[kk] > 0){
2894 
2895  //although is_cut_line_determined is shared, current_cut_status is a private pointer and the same for all threads.
2896  bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
2897  double *my_current_part_weights = my_thread_part_weights + total_part_shift;
2898  mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
2899  mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
2900 
2901  mj_part_t concurrent_current_part = current_work_part + kk;
2902  mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 : this->part_xadj[concurrent_current_part - 1];
2903  mj_lno_t coordinate_end_index = this->part_xadj[concurrent_current_part];
2904  mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
2905 
2906  mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
2907  mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2908 
2909  // compute part weights using existing cuts
2910  this->mj_1D_part_get_thread_part_weights(
2911  total_part_count,
2912  num_cuts,
2913  max_coord,//globalMinMaxTotal[kk + concurrentPartCount],//maxScalar,
2914  min_coord,//globalMinMaxTotal[kk]//minScalar,
2915  coordinate_begin_index,
2916  coordinate_end_index,
2917  mj_current_dim_coords,
2918  temp_current_cut_coords,
2919  current_cut_status,
2920  my_current_part_weights,
2921  my_current_left_closest,
2922  my_current_right_closest);
2923 
2924  }
2925 
2926  concurrent_cut_shifts += num_cuts;
2927  total_part_shift += total_part_count;
2928  }
2929 
2930  //sum up the results of threads
2931  this->mj_accumulate_thread_results(
2932  num_partitioning_in_current_dim,
2933  current_work_part,
2934  current_concurrent_num_parts);
2935 
2936  //now sum up the results of mpi processors.
2937 #ifdef HAVE_ZOLTAN2_OMP
2938 #pragma omp single
2939 #endif
2940  {
2941  if(this->comm->getSize() > 1){
2942  reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
2943  total_reduction_size,
2944  this->total_part_weight_left_right_closests,
2945  this->global_total_part_weight_left_right_closests);
2946 
2947  }
2948  else {
2949  memcpy(
2950  this->global_total_part_weight_left_right_closests,
2951  this->total_part_weight_left_right_closests,
2952  total_reduction_size * sizeof(mj_scalar_t));
2953  }
2954  }
2955 
2956  //how much cut will be shifted for the next part in the concurrent part calculation.
2957  mj_part_t cut_shift = 0;
2958 
2959  //how much the concatenated array will be shifted for the next part in the concurrent part calculation.
2960  size_t tlr_shift = 0;
2961  for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2962  mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2963  mj_part_t num_cuts = num_parts - 1;
2964  size_t num_total_part = num_parts + size_t (num_cuts) ;
2965 
2966  //if the cuts of this part have already been completed,
2967  //nothing to do for this part.
2968  //just update the shift amount and proceed.
2969  if (this->my_incomplete_cut_count[kk] == 0) {
2970  cut_shift += num_cuts;
2971  tlr_shift += (num_total_part + 2 * num_cuts);
2972  continue;
2973  }
2974 
2975  mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
2976  mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
2977  mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part; //left closest points
2978  mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts; //right closest points
2979  mj_scalar_t *current_global_part_weights = current_global_tlr;
2980  bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
2981 
2982  mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
2983  mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
2984 
2985  mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
2986  mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2987  mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
2988  mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
2989  mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
2990  mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
2991  mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
2992 
2993  mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
2994 
2995  // Now compute the new cut coordinates.
2996  this->mj_get_new_cut_coordinates(
2997  num_total_part,
2998  num_cuts,
2999  max_coordinate,
3000  min_coordinate,
3001  global_total_weight,
3002  used_imbalance_tolerance,
3003  current_global_part_weights,
3004  current_local_part_weights,
3005  current_part_target_weights,
3006  current_cut_line_determined,
3007  temp_cut_coords + cut_shift,
3008  current_cut_upper_bounds,
3009  current_cut_lower_bounds,
3010  current_global_left_closest_points,
3011  current_global_right_closest_points,
3012  current_cut_lower_bound_weights,
3013  current_cut_upper_weights,
3014  this->cut_coordinates_work_array +cut_shift, //new cut coordinates
3015  current_part_cut_line_weight_to_put_left,
3016  &rectilinear_cut_count,
3017  this->my_incomplete_cut_count[kk]);
3018 
3019  cut_shift += num_cuts;
3020  tlr_shift += (num_total_part + 2 * num_cuts);
3021  mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
3022 #ifdef HAVE_ZOLTAN2_OMP
3023 #pragma omp single
3024 #endif
3025  {
3026  total_incomplete_cut_count -= iteration_complete_cut_count;
3027  }
3028 
3029  }
3030  { //This unnecessary bracket works around a compiler bug in NVCC when compiling with OpenMP enabled
3031 #ifdef HAVE_ZOLTAN2_OMP
3032 #pragma omp barrier
3033 #pragma omp single
3034 #endif
3035  {
3036  //swap the cut coordinates for next iteration.
3037  mj_scalar_t *t = temp_cut_coords;
3038  temp_cut_coords = this->cut_coordinates_work_array;
3039  this->cut_coordinates_work_array = t;
3040  }
3041  }
3042  }
3043 
3044  //if (myRank == 0)
3045  //std::cout << "iteration:" << iteration << " partition:" << num_partitioning_in_current_dim[current_work_part] << std::endl;
3046  // Needed only if keep_cuts; otherwise can simply swap array pointers
3047  // cutCoordinates and cutCoordinatesWork.
3048  // (at the first iteration, cutCoordinates == cutCoordinates_tmp).
3049  // computed cuts must be in cutCoordinates.
3050  if (current_cut_coordinates != temp_cut_coords){
3051 #ifdef HAVE_ZOLTAN2_OMP
3052 #pragma omp single
3053 #endif
3054  {
3055  mj_part_t next = 0;
3056  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3057  mj_part_t num_parts = -1;
3058  num_parts = num_partitioning_in_current_dim[current_work_part + i];
3059  mj_part_t num_cuts = num_parts - 1;
3060 
3061  for(mj_part_t ii = 0; ii < num_cuts; ++ii){
3062  current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
3063  }
3064  next += num_cuts;
3065  }
3066  }
3067 
3068 #ifdef HAVE_ZOLTAN2_OMP
3069 #pragma omp single
3070 #endif
3071  {
3072  this->cut_coordinates_work_array = temp_cut_coords;
3073  }
3074  }
3075  }
3076  delete reductionOp;
3077 }
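
// Minimal sequential sketch (illustrative; no OpenMP, MPI, concurrent parts
// or interpolation) of the idea behind mj_1D_part: narrow the bounds of one
// cut until the weight on its left matches the target within a tolerance.
// The real code above refines all incomplete cuts of several concurrent
// parts per pass and uses weighted interpolation instead of plain bisection.
inline double example_weighted_bisection(const double *coords,
    const double *weights, int n, double lo, double hi,
    double target, double tol)
{
  double cut = 0.5 * (lo + hi);
  for (int iter = 0; iter < 50; ++iter){
    double left_weight = 0;
    for (int i = 0; i < n; ++i)
      if (coords[i] < cut) left_weight += weights[i];
    if (left_weight > target + tol) hi = cut;       // too heavy: move left
    else if (left_weight < target - tol) lo = cut;  // too light: move right
    else break;                                     // within tolerance
    cut = 0.5 * (lo + hi);
  }
  return cut;
}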
3078 
3079 
3099 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3100  typename mj_part_t>
3101 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part_get_thread_part_weights(
3102  size_t total_part_count,
3103  mj_part_t num_cuts,
3104  mj_scalar_t max_coord,
3105  mj_scalar_t min_coord,
3106  mj_lno_t coordinate_begin_index,
3107  mj_lno_t coordinate_end_index,
3108  mj_scalar_t *mj_current_dim_coords,
3109  mj_scalar_t *temp_current_cut_coords,
3110  bool * /* current_cut_status */,
3111  double *my_current_part_weights,
3112  mj_scalar_t *my_current_left_closest,
3113  mj_scalar_t *my_current_right_closest){
3114 
3115  // initializations for part weights, left/right closest
3116  for (size_t i = 0; i < total_part_count; ++i){
3117  my_current_part_weights[i] = 0;
3118  }
3119 
3120  //initialize the left and right closest coordinates
3121  //to sentinels just outside the coordinate range.
3122  for(mj_part_t i = 0; i < num_cuts; ++i){
3123  my_current_left_closest[i] = min_coord - 1;
3124  my_current_right_closest[i] = max_coord + 1;
3125  }
3126  //mj_lno_t comparison_count = 0;
3127  mj_scalar_t minus_EPSILON = -this->sEpsilon;
3128 #ifdef HAVE_ZOLTAN2_OMP
3129  //no need for a barrier as all threads use their local memories.
3130  //don't change the static scheduling here, as it is assumed when the new
3131  //partitions are created later.
3132 #pragma omp for
3133 #endif
3134  for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3135  mj_lno_t i = this->coordinate_permutations[ii];
3136 
3137  //the accesses to assigned_part_ids are thread safe
3138  //since each coordinate is assigned to only a single thread.
3139  mj_part_t j = this->assigned_part_ids[i] / 2;
3140 
3141  if(j >= num_cuts){
3142  j = num_cuts - 1;
3143  }
3144 
3145  mj_part_t lower_cut_index = 0;
3146  mj_part_t upper_cut_index = num_cuts - 1;
3147 
3148  mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
3149  bool is_inserted = false;
3150  bool is_on_left_of_cut = false;
3151  bool is_on_right_of_cut = false;
3152  mj_part_t last_compared_part = -1;
3153 
3154  mj_scalar_t coord = mj_current_dim_coords[i];
3155 
3156  while(upper_cut_index >= lower_cut_index)
3157  {
3158  //comparison_count++;
3159  last_compared_part = -1;
3160  is_on_left_of_cut = false;
3161  is_on_right_of_cut = false;
3162  mj_scalar_t cut = temp_current_cut_coords[j];
3163  mj_scalar_t distance_to_cut = coord - cut;
3164  mj_scalar_t abs_distance_to_cut = ZOLTAN2_ABS(distance_to_cut);
3165 
3166  //if it is on the line.
3167  if(abs_distance_to_cut < this->sEpsilon){
3168 
3169  my_current_part_weights[j * 2 + 1] += w;
3170  this->assigned_part_ids[i] = j * 2 + 1;
3171 
3172  //assign left and right closest point to cut as the point is on the cut.
3173  my_current_left_closest[j] = coord;
3174  my_current_right_closest[j] = coord;
3175  //now we need to check if there are other cuts at the same coordinate.
3176  //if there are, then we add the weight of the point to all cuts at that coordinate.
3177  mj_part_t kk = j + 1;
3178  while(kk < num_cuts){
3179  // Needed when cuts shared the same position
3180  distance_to_cut =ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3181  if(distance_to_cut < this->sEpsilon){
3182  my_current_part_weights[2 * kk + 1] += w;
3183  my_current_left_closest[kk] = coord;
3184  my_current_right_closest[kk] = coord;
3185  kk++;
3186  }
3187  else{
3188  //cut is far away.
3189  //just check the left closest point for the next cut.
3190  if(coord - my_current_left_closest[kk] > this->sEpsilon){
3191  my_current_left_closest[kk] = coord;
3192  }
3193  break;
3194  }
3195  }
3196 
3197 
3198  kk = j - 1;
3199  //continue checking for the cuts on the left if they share the same coordinate.
3200  while(kk >= 0){
3201  distance_to_cut =ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3202  if(distance_to_cut < this->sEpsilon){
3203  my_current_part_weights[2 * kk + 1] += w;
3204  //try to write the partId as the leftmost cut.
3205  this->assigned_part_ids[i] = kk * 2 + 1;
3206  my_current_left_closest[kk] = coord;
3207  my_current_right_closest[kk] = coord;
3208  kk--;
3209  }
3210  else{
3211  //if cut is far away on the left of the point.
3212  //then just compare for right closest point.
3213  if(my_current_right_closest[kk] - coord > this->sEpsilon){
3214  my_current_right_closest[kk] = coord;
3215  }
3216  break;
3217  }
3218  }
3219 
3220  is_inserted = true;
3221  break;
3222  }
3223  else {
3224  //if point is on the left of the cut.
3225  if (distance_to_cut < 0) {
3226  bool _break = false;
3227  if(j > 0){
3228  //check the distance to the cut on the left of the current cut.
3229  //if the point is on its right, then we have found the part of the point.
3230  mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
3231  if(distance_to_next_cut > this->sEpsilon){
3232  _break = true;
3233  }
3234  }
3235  //if point is not on the right of the next cut, then
3236  //set the upper bound to this cut.
3237  upper_cut_index = j - 1;
3238  //set the last part, and mark it as on the left of the last part.
3239  is_on_left_of_cut = true;
3240  last_compared_part = j;
3241  if(_break) break;
3242  }
3243  else {
3244  //if point is on the right of the cut.
3245  bool _break = false;
3246  if(j < num_cuts - 1){
3247  //check the distance to the cut on the right of the current cut.
3248  //if the point is on its left, then we have found the part of the point.
3249  mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
3250  if(distance_to_next_cut < minus_EPSILON){
3251  _break = true;
3252  }
3253  }
3254 
3255  //if the point is not on the left of the next cut, then
3256  //set the lower bound to this cut.
3257  lower_cut_index = j + 1;
3258  //set the last part, and mark it as on the right of the last part.
3259  is_on_right_of_cut = true;
3260  last_compared_part = j;
3261  if(_break) break;
3262  }
3263  }
3264 
3265  j = (upper_cut_index + lower_cut_index) / 2;
3266  }
3267  if(!is_inserted){
3268  if(is_on_right_of_cut){
3269 
3270  //add it to the right of the last compared part.
3271  my_current_part_weights[2 * last_compared_part + 2] += w;
3272  this->assigned_part_ids[i] = 2 * last_compared_part + 2;
3273 
3274  //update the right closest point of last compared cut.
3275  if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
3276  my_current_right_closest[last_compared_part] = coord;
3277  }
3278  //update the left closest point of the cut on the right of the last compared cut.
3279  if(last_compared_part+1 < num_cuts){
3280 
3281  if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
3282  my_current_left_closest[last_compared_part + 1] = coord;
3283  }
3284  }
3285 
3286  }
3287  else if(is_on_left_of_cut){
3288 
3289  //add it to the left of the last compared part.
3290  my_current_part_weights[2 * last_compared_part] += w;
3291  this->assigned_part_ids[i] = 2 * last_compared_part;
3292 
3293 
3294  //update the left closest point of last compared cut.
3295  if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
3296  my_current_left_closest[last_compared_part] = coord;
3297  }
3298 
3299  //update the right closest point of the cut on the left of the last compared cut.
3300  if(last_compared_part-1 >= 0){
3301  if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
3302  my_current_right_closest[last_compared_part -1] = coord;
3303  }
3304  }
3305  }
3306  }
3307  }
3308 
3309  // prefix sum computation.
3310  //we need prefix sum for each part to determine cut positions.
3311  for (size_t i = 1; i < total_part_count; ++i){
3312  // check for cuts sharing the same position; all cuts sharing a position
3313  // have the same weight == total weight for all cuts sharing the position.
3314  // don't want to accumulate that total weight more than once.
3315  if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
3316  ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
3317  < this->sEpsilon){
3318  //i % 2 == 0: entry i is the part between cuts i/2 - 1 and i/2.
3319  //if those two cuts share the same coordinate, then
3320  //don't add their common weight up twice.
3321  my_current_part_weights[i] = my_current_part_weights[i-2];
3322  continue;
3323  }
3324  //otherwise do the prefix sum.
3325  my_current_part_weights[i] += my_current_part_weights[i-1];
3326  }
3327 }
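
// Sketch (illustrative) of the 2P-1 weight layout filled above: for P parts
// and P-1 cuts, index 2*p holds the weight strictly inside part p and index
// 2*c+1 holds the weight sitting exactly on cut c. The closing loop turns the
// array into cumulative weights from the left; the real code additionally
// skips cuts that share a coordinate so their common weight is added once.
inline void example_part_weight_prefix_sum(double *w, int total_part_count)
{
  for (int i = 1; i < total_part_count; ++i)
    w[i] += w[i - 1];  // w[i] becomes the total weight left of boundary i
}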
3328 
3329 
3337 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3338  typename mj_part_t>
3339 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_accumulate_thread_results(
3340  const std::vector <mj_part_t> &num_partitioning_in_current_dim,
3341  mj_part_t current_work_part,
3342  mj_part_t current_concurrent_num_parts){
3343 
3344 #ifdef HAVE_ZOLTAN2_OMP
3345  //needs barrier here, as it requires all threads to finish mj_1D_part_get_thread_part_weights
3346  //using parallel region here reduces the performance because of the cache invalidates.
3347 #pragma omp barrier
3348 #pragma omp single
3349 #endif
3350  {
3351  size_t tlr_array_shift = 0;
3352  mj_part_t cut_shift = 0;
3353 
3354  //iterate for all concurrent parts to find the left and right closest points in the process.
3355  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3356 
3357  mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3358  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3359  size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3360 
3361  //iterate for cuts in a single part.
3362  for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
3363  mj_part_t next = tlr_array_shift + ii;
3364  mj_part_t cut_index = cut_shift + ii;
3365  if(this->is_cut_line_determined[cut_index]) continue;
3366  mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
3367  right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
3368 
3369  //find the closest points from left and right for the cut in the process.
3370  for (int j = 1; j < this->num_threads; ++j){
3371  if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
3372  right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
3373  }
3374  if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
3375  left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
3376  }
3377  }
3378  //store the left and right closest points.
3379  this->total_part_weight_left_right_closests[num_total_part_in_part +
3380  next] = left_closest_in_process;
3381  this->total_part_weight_left_right_closests[num_total_part_in_part +
3382  num_cuts_in_part + next] = right_closest_in_process;
3383  }
3384  //set the shift position in the arrays
3385  tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
3386  cut_shift += num_cuts_in_part;
3387  }
3388 
3389  tlr_array_shift = 0;
3390  cut_shift = 0;
3391  size_t total_part_array_shift = 0;
3392 
3393  //iterate for all concurrent parts to find the total weight in the process.
3394  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3395 
3396  mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3397  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3398  size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3399 
3400  for(size_t j = 0; j < num_total_part_in_part; ++j){
3401 
3402  mj_part_t cut_ind = j / 2 + cut_shift;
3403 
3404  //need to check j != num_total_part_in_part - 1,
3405  //which is the same as j/2 != num_cuts_in_part.
3406  //we cannot check it using cut_ind, because of the concurrent part concatenation.
3407  if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind]) continue;
3408  double pwj = 0;
3409  for (int k = 0; k < this->num_threads; ++k){
3410  pwj += this->thread_part_weights[k][total_part_array_shift + j];
3411  }
3412  //size_t jshift = j % total_part_count + i * (total_part_count + 2 * noCuts);
3413  this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
3414  }
3415  cut_shift += num_cuts_in_part;
3416  tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
3417  total_part_array_shift += num_total_part_in_part;
3418  }
3419  }
3420  //the other threads need to wait here,
3421  //but we don't need a pragma omp barrier,
3422  //as omp single already has an implicit barrier.
3423 }
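// ---------------------------------------------------------------------------
// Editorial sketch (not part of the Zoltan2 source): per cut, the single
// thread above takes a max over the thread-local "left closest" coordinates
// and a min over the "right closest" ones (plus a sum of part weights). The
// closest-point reduction alone, serial and with hypothetical names:
static void reduce_closest_points(
    const std::vector<std::vector<double> > &thread_left,  // [thread][cut]
    const std::vector<std::vector<double> > &thread_right, // [thread][cut]
    std::vector<double> &left_closest,                     // output, [cut]
    std::vector<double> &right_closest)                    // output, [cut]
{
  const size_t num_threads = thread_left.size();
  const size_t num_cuts = left_closest.size();
  for (size_t c = 0; c < num_cuts; ++c) {
    left_closest[c] = thread_left[0][c];
    right_closest[c] = thread_right[0][c];
    for (size_t t = 1; t < num_threads; ++t) {
      if (thread_left[t][c] > left_closest[c]) left_closest[c] = thread_left[t][c];
      if (thread_right[t][c] < right_closest[c]) right_closest[c] = thread_right[t][c];
    }
  }
}
// ---------------------------------------------------------------------------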
3424 
3425 
3435 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3436  typename mj_part_t>
3437 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_calculate_new_cut_position (
3438  mj_scalar_t cut_upper_bound,
3439  mj_scalar_t cut_lower_bound,
3440  mj_scalar_t cut_upper_weight,
3441  mj_scalar_t cut_lower_weight,
3442  mj_scalar_t expected_weight,
3443  mj_scalar_t &new_cut_position){
3444 
3445  if(ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3446  new_cut_position = cut_upper_bound; return; //or lower bound; do not fall through and overwrite it below.
3447  }
3448 
3449 
3450  if(ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3451  new_cut_position = cut_lower_bound; return; //avoid dividing by a near-zero weight_range below.
3452  }
3453 
3454  mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3455  mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3456  mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3457 
3458  mj_scalar_t required_shift = (my_weight_diff / weight_range);
3459  int scale_constant = 20;
3460  int shiftint = int(required_shift * scale_constant);
3461  if (shiftint == 0) shiftint = 1;
3462  required_shift = mj_scalar_t (shiftint) / scale_constant;
3463  new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3464 }
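// ---------------------------------------------------------------------------
// Editorial sketch (not part of the Zoltan2 source): the update above is a
// linear interpolation between the bounds whose ratio is snapped to multiples
// of 1/20 and floored away from zero, so an iteration always moves the cut by
// at least one step. For example, with bounds [0, 10], bound weights {2, 10}
// and expected weight 4, the raw ratio is (4-2)/(10-2) = 0.25, which snaps to
// 5/20, giving a new cut at 2.5. Standalone, with hypothetical names:
static double interpolate_cut_position(
    double upper_bound, double lower_bound,
    double upper_weight, double lower_weight,
    double expected_weight)
{
  const int scale_constant = 20; // snap ratios to multiples of 1/scale_constant
  double ratio = (expected_weight - lower_weight) / (upper_weight - lower_weight);
  int snapped = int(ratio * scale_constant);
  if (snapped == 0) snapped = 1; // always move at least one step toward the target
  return (upper_bound - lower_bound) * (double(snapped) / scale_constant) + lower_bound;
}
// ---------------------------------------------------------------------------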
3465 
3466 
3477 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3478  typename mj_part_t>
3479 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_create_new_partitions(
3480  mj_part_t num_parts,
3481  mj_scalar_t * /* mj_current_dim_coords */,
3482  mj_scalar_t *current_concurrent_cut_coordinate,
3483  mj_lno_t coordinate_begin,
3484  mj_lno_t coordinate_end,
3485  mj_scalar_t *used_local_cut_line_weight_to_left,
3486  double **used_thread_part_weight_work,
3487  mj_lno_t *out_part_xadj){
3488 
3489  mj_part_t num_cuts = num_parts - 1;
3490 
3491 #ifdef HAVE_ZOLTAN2_OMP
3492 #pragma omp parallel
3493 #endif
3494  {
3495  int me = 0;
3496 #ifdef HAVE_ZOLTAN2_OMP
3497  me = omp_get_thread_num();
3498 #endif
3499 
3500  mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
3501  mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
3502 
3503  //now if the rectilinear partitioning is allowed we decide how
3504  //much weight each thread should put to left and right.
3505  if (this->distribute_points_on_cut_lines){
3506  my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
3507  // this for-loop assumes the same static scheduling used in the mj_1D_part calculation.
3508 #ifdef HAVE_ZOLTAN2_OMP
3509 #pragma omp for
3510 #endif
3511  for (mj_part_t i = 0; i < num_cuts; ++i){
3512  //the weight to be put on the left of the cut.
3513  mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
3514  for(int ii = 0; ii < this->num_threads; ++ii){
3515  if(left_weight > this->sEpsilon){
3516  //the weight of thread ii on cut.
3517  mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
3518  if(thread_ii_weight_on_cut < left_weight){
3519  //if the remaining left space is bigger than the thread's weight on the cut, put all of it to the left.
3520  this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
3521  }
3522  else {
3523  //if the thread's weight is bigger than the remaining space, then put only that portion.
3524  this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
3525  }
3526  left_weight -= thread_ii_weight_on_cut;
3527  }
3528  else {
3529  this->thread_cut_line_weight_to_put_left[ii][i] = 0;
3530  }
3531  }
3532  }
3533 
3534  if(num_cuts > 0){
3535  //this is a special case. If cut lines share the same coordinate, their weights are equal.
3536  //we need to adjust the ratios so that the shared weight is not put to the left twice.
3537  for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
3538  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
3539  my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
3540  }
3541  my_local_thread_cut_weights_to_put_left[i] = static_cast<long long>((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
3542  / mj_scalar_t(SIGNIFICANCE_MUL);
3543  }
3544  }
3545  }
3546 
3547  for(mj_part_t ii = 0; ii < num_parts; ++ii){
3548  thread_num_points_in_parts[ii] = 0;
3549  }
3550 
3551 
3552 #ifdef HAVE_ZOLTAN2_OMP
3553  //don't change the static schedule; the same static partitioning is relied on later as well.
3554 #pragma omp for
3555 #endif
3556  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3557 
3558  mj_lno_t coordinate_index = this->coordinate_permutations[ii];
3559  mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
3560  mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
3561  mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
3562  if(coordinate_assigned_place % 2 == 1){
3563  //if it is on the cut.
3564  if(this->distribute_points_on_cut_lines
3565  && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
3566  //if the rectilinear partitioning is allowed,
3567  //and the thread has still space to put on the left of the cut
3568  //then thread puts the vertex to left.
3569  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3570  //if putting the vertex to left increased the weight more than expected.
3571  //and if the next cut is on the same coordinate,
3572  //then we need to adjust how much weight next cut puts to its left as well,
3573  //in order to take care of the imbalance.
3574  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
3575  && coordinate_assigned_part < num_cuts - 1
3576  && ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3577  current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3578  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3579  }
3580  ++thread_num_points_in_parts[coordinate_assigned_part];
3581  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3582  }
3583  else{
3584  //if there is no more space on the left, put the coordinate to the right of the cut.
3585  ++coordinate_assigned_part;
3586  //this while loop is necessary when a line is partitioned into more than 2 parts.
3587  while(this->distribute_points_on_cut_lines &&
3588  coordinate_assigned_part < num_cuts){
3589  //traverse all the cut lines sharing the same coordinate.
3590  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
3591  current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
3592  < this->sEpsilon){
3593  //if line has enough space on left, put it there.
3594  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
3595  this->sEpsilon &&
3596  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
3597  ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
3598  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3599  //Again if it put too much on left of the cut,
3600  //update how much the next cut sharing the same coordinate will put to its left.
3601  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
3602  coordinate_assigned_part < num_cuts - 1 &&
3603  ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3604  current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3605  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3606  }
3607  break;
3608  }
3609  }
3610  else {
3611  break;
3612  }
3613  ++coordinate_assigned_part;
3614  }
3615  ++thread_num_points_in_parts[coordinate_assigned_part];
3616  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3617  }
3618  }
3619  else {
3620  //if it is already assigned to a part, then just put it to the corresponding part.
3621  ++thread_num_points_in_parts[coordinate_assigned_part];
3622  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3623  }
3624  }
3625 
3626 
3627 
3628  //now we calculate where each thread will write in the new_coordinate_permutations array.
3629  //first we find out_part_xadj, by marking the begin and end points of each part found.
3630  //the loop below finds the number of points in each part, and writes it to out_part_xadj.
3631 #ifdef HAVE_ZOLTAN2_OMP
3632 #pragma omp for
3633 #endif
3634  for(mj_part_t j = 0; j < num_parts; ++j){
3635  mj_lno_t num_points_in_part_j_upto_thread_i = 0;
3636  for (int i = 0; i < this->num_threads; ++i){
3637  mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
3638  //prefix sum to thread point counts, so that each will have private space to write.
3639  this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
3640  num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
3641 
3642  }
3643  out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
3644  }
3645 
3646  //now we need to do a prefix sum to out_part_xadj[j], to point begin and end of each part.
3647 #ifdef HAVE_ZOLTAN2_OMP
3648 #pragma omp single
3649 #endif
3650  {
3651  //perform prefix sum for num_points in parts.
3652  for(mj_part_t j = 1; j < num_parts; ++j){
3653  out_part_xadj[j] += out_part_xadj[j - 1];
3654  }
3655  }
3656 
3657  //shift the per-thread point counts to obtain the
3658  //beginning index of each thread's private space.
3659  for(mj_part_t j = 1; j < num_parts; ++j){
3660  thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
3661  }
3662 
3663 
3664  //now each thread takes its coordinates and writes their indices to the permutation array,
3665  //using the part indices we calculated.
3666 #ifdef HAVE_ZOLTAN2_OMP
3667 #pragma omp for
3668 #endif
3669  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3670  mj_lno_t i = this->coordinate_permutations[ii];
3671  mj_part_t p = this->assigned_part_ids[i];
3672  this->new_coordinate_permutations[coordinate_begin +
3673  thread_num_points_in_parts[p]++] = i;
3674  }
3675  }
3676 }
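// ---------------------------------------------------------------------------
// Editorial sketch (not part of the Zoltan2 source): setting the cut-line
// weight distribution aside, the permutation build above is a counting sort:
// count the points per part, prefix-sum the counts into part boundaries, then
// scatter each point index into its part's window. A serial version with
// hypothetical names (the real code gives each thread a private window):
static void counting_sort_by_part(
    const std::vector<int> &part_of_point, // part id of each point
    int num_parts,
    std::vector<int> &permutation,         // output: point indices grouped by part
    std::vector<int> &part_xadj)           // output: end offset of each part
{
  const int n = int(part_of_point.size());
  permutation.assign(n, 0);
  part_xadj.assign(num_parts, 0);
  for (int i = 0; i < n; ++i) ++part_xadj[part_of_point[i]];
  // prefix sum: part_xadj[p] becomes the end of part p's window.
  for (int p = 1; p < num_parts; ++p) part_xadj[p] += part_xadj[p - 1];
  // write cursors start at the beginning of each part's window.
  std::vector<int> cursor(num_parts, 0);
  for (int p = 1; p < num_parts; ++p) cursor[p] = part_xadj[p - 1];
  for (int i = 0; i < n; ++i) permutation[cursor[part_of_point[i]]++] = i;
}
// ---------------------------------------------------------------------------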
3677 
3678 
3679 
3708 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3709  typename mj_part_t>
3710 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_new_cut_coordinates(
3711  const size_t &/* num_total_part */,
3712  const mj_part_t &num_cuts,
3713  const mj_scalar_t &max_coordinate,
3714  const mj_scalar_t &min_coordinate,
3715  const mj_scalar_t &global_total_weight,
3716  const double &used_imbalance_tolerance,
3717  mj_scalar_t * current_global_part_weights,
3718  const mj_scalar_t * current_local_part_weights,
3719  const mj_scalar_t *current_part_target_weights,
3720  bool *current_cut_line_determined,
3721  mj_scalar_t *current_cut_coordinates,
3722  mj_scalar_t *current_cut_upper_bounds,
3723  mj_scalar_t *current_cut_lower_bounds,
3724  mj_scalar_t *current_global_left_closest_points,
3725  mj_scalar_t *current_global_right_closest_points,
3726  mj_scalar_t * current_cut_lower_bound_weights,
3727  mj_scalar_t * current_cut_upper_weights,
3728  mj_scalar_t *new_current_cut_coordinates,
3729  mj_scalar_t *current_part_cut_line_weight_to_put_left,
3730  mj_part_t *rectilinear_cut_count,
3731  mj_part_t &my_num_incomplete_cut){
3732 
3733  //seen weight in the part
3734  mj_scalar_t seen_weight_in_part = 0;
3735  //expected weight for part.
3736  mj_scalar_t expected_weight_in_part = 0;
3737  //imbalance for the left and right side of the cut.
3738  double imbalance_on_left = 0, imbalance_on_right = 0;
3739 
3740 
3741 #ifdef HAVE_ZOLTAN2_OMP
3742 #pragma omp for
3743 #endif
3744  for (mj_part_t i = 0; i < num_cuts; i++){
3745  //if the left and right closest points are not set yet,
3746  //set them to the cut itself.
3747  if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3748  current_global_left_closest_points[i] = current_cut_coordinates[i];
3749  if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
3750  current_global_right_closest_points[i] = current_cut_coordinates[i];
3751 
3752  }
3753 #ifdef HAVE_ZOLTAN2_OMP
3754 #pragma omp for
3755 #endif
3756  for (mj_part_t i = 0; i < num_cuts; i++){
3757 
3758  if(this->distribute_points_on_cut_lines){
3759  //init the weight on the cut.
3760  this->global_rectilinear_cut_weight[i] = 0;
3761  this->process_rectilinear_cut_weight[i] = 0;
3762  }
3763  //if already determined at previous iterations,
3764  //then just write the coordinate to new array, and proceed.
3765  if(current_cut_line_determined[i]) {
3766  new_current_cut_coordinates[i] = current_cut_coordinates[i];
3767  continue;
3768  }
3769 
3770  //current weight of the part at the left of the cut line.
3771  seen_weight_in_part = current_global_part_weights[i * 2];
3772 
3773  /*
3774  std::cout << "seen_weight_in_part:" << i << " is "<< seen_weight_in_part <<std::endl;
3775  std::cout << "\tcut:" << current_cut_coordinates[i]
3776  << " current_cut_lower_bounds:" << current_cut_lower_bounds[i]
3777  << " current_cut_upper_bounds:" << current_cut_upper_bounds[i] << std::endl;
3778  */
3779  //expected ratio
3780  expected_weight_in_part = current_part_target_weights[i];
3781  //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
3782  imbalance_on_left = imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
3783  //rightImbalance = imbalanceOf(globalTotalWeight - seenW, globalTotalWeight, 1 - expected);
3784  imbalance_on_right = imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
3785 
3786  bool is_left_imbalance_valid = ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
3787  bool is_right_imbalance_valid = ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
3788 
3789  //if the cut line reaches the desired imbalance.
3790  if(is_left_imbalance_valid && is_right_imbalance_valid){
3791  current_cut_line_determined[i] = true;
3792 #ifdef HAVE_ZOLTAN2_OMP
3793 #pragma omp atomic
3794 #endif
3795  my_num_incomplete_cut -= 1;
3796  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3797  continue;
3798  }
3799  else if(imbalance_on_left < 0){
3800  //if left imbalance < 0 then we need to move the cut to right.
3801 
3802  if(this->distribute_points_on_cut_lines){
3803  //if it is okay to distribute the coordinates sitting on
3804  //the cut line to the left and right,
3805  //then check if we can reach the target weight by including the
3806  //on-cut coordinates in the part.
3807  if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
3808  //if it is, we are done.
3809  current_cut_line_determined[i] = true;
3810 #ifdef HAVE_ZOLTAN2_OMP
3811 #pragma omp atomic
3812 #endif
3813  my_num_incomplete_cut -= 1;
3814 
3815  //then assign everything on the cut to the left of the cut.
3816  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3817 
3818  //for this cut all the weight on cut will be put to left.
3819 
3820  current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
3821  continue;
3822  }
3823  else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
3824 
3825  //if the weight is larger than the expected weight,
3826  //then we need to distribute some points to left, some to right.
3827  current_cut_line_determined[i] = true;
3828 #ifdef HAVE_ZOLTAN2_OMP
3829 #pragma omp atomic
3830 #endif
3831  *rectilinear_cut_count += 1;
3832  //increase the num cuts to be determined with rectilinear partitioning.
3833 
3834 #ifdef HAVE_ZOLTAN2_OMP
3835 #pragma omp atomic
3836 #endif
3837  my_num_incomplete_cut -= 1;
3838  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3839  this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
3840  current_local_part_weights[i * 2];
3841  continue;
3842  }
3843  }
3844  //we need to move further right, so set the lower bound to the current line, and shift it to the closest point from the right.
3845  current_cut_lower_bounds[i] = current_global_right_closest_points[i];
3846  //set the lower bound weight to the weight we have seen.
3847  current_cut_lower_bound_weights[i] = seen_weight_in_part;
3848 
3849  //compare the upper bound with what has been found in the last iteration.
3850  //we try to make stricter bounds for the cut here.
3851  for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
3852  mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3853  mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3854 
3855  if(p_weight >= expected_weight_in_part){
3856  //if a cut on the right has the expected weight, then we found
3857  //our cut position. Set the upper and lower coordinates to this new cut coordinate,
3858  //but we need one more iteration to finalize the cut position,
3859  //as we need to update the part ids.
3860  if(p_weight == expected_weight_in_part){
3861  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3862  current_cut_upper_weights[i] = p_weight;
3863  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3864  current_cut_lower_bound_weights[i] = p_weight;
3865  } else if (p_weight < current_cut_upper_weights[i]){
3866  //if a part weight is larger than my expected weight,
3867  //but lower than my upper bound weight, update upper bound.
3868  current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
3869  current_cut_upper_weights[i] = p_weight;
3870  }
3871  break;
3872  }
3873  //if we come here then p_weight < expected_weight_in_part,
3874  //so compare the expected weight against the line weight.
3875  if(line_weight >= expected_weight_in_part){
3876  //if the weight including the line reaches the expected weight,
3877  //then we need to reach to the balance by distributing coordinates on this line.
3878  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3879  current_cut_upper_weights[i] = line_weight;
3880  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3881  current_cut_lower_bound_weights[i] = p_weight;
3882  break;
3883  }
3884  //if a stricter lower bound is found,
3885  //update the lower bound.
3886  if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
3887  current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
3888  current_cut_lower_bound_weights[i] = p_weight;
3889  }
3890  }
3891 
3892 
3893  mj_scalar_t new_cut_position = 0;
3894  this->mj_calculate_new_cut_position(
3895  current_cut_upper_bounds[i],
3896  current_cut_lower_bounds[i],
3897  current_cut_upper_weights[i],
3898  current_cut_lower_bound_weights[i],
3899  expected_weight_in_part, new_cut_position);
3900 
3901  //if the cut line does not move significantly,
3902  //then finalize the search.
3903  if (ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3904  /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/
3905  ){
3906  current_cut_line_determined[i] = true;
3907 #ifdef HAVE_ZOLTAN2_OMP
3908 #pragma omp atomic
3909 #endif
3910  my_num_incomplete_cut -= 1;
3911 
3912  //set the cut coordinate and proceed.
3913  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3914  } else {
3915  new_current_cut_coordinates [i] = new_cut_position;
3916  }
3917  } else {
3918 
3919  //need to move the cut line to left.
3920  //set upper bound to current line.
3921  current_cut_upper_bounds[i] = current_global_left_closest_points[i];
3922  current_cut_upper_weights[i] = seen_weight_in_part;
3923 
3924  // compare the current cut line weights with previous upper and lower bounds.
3925  for (int ii = i - 1; ii >= 0; --ii){
3926  mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3927  mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3928  if(p_weight <= expected_weight_in_part){
3929  if(p_weight == expected_weight_in_part){
3930  //if the weight of the part is my expected weight
3931  //then we have found the solution.
3932  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3933  current_cut_upper_weights[i] = p_weight;
3934  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3935  current_cut_lower_bound_weights[i] = p_weight;
3936  }
3937  else if (p_weight > current_cut_lower_bound_weights[i]){
3938  //if found weight is bigger than the lower bound
3939  //then update the lower bound.
3940  current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
3941  current_cut_lower_bound_weights[i] = p_weight;
3942 
3943  //at the same time, if weight of line is bigger than the
3944  //expected weight, then update the upper bound as well.
3945  //in this case the balance will be obtained by distributing weights
3946  //on this cut position.
3947  if(line_weight > expected_weight_in_part){
3948  current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
3949  current_cut_upper_weights[i] = line_weight;
3950  }
3951  }
3952  break;
3953  }
3954  //if the weight of the cut on the left is still bigger than my weight,
3955  //and also if the weight is smaller than the current upper weight,
3956  //or if the weight is equal to current upper weight, but on the left of
3957  // the upper weight, then update upper bound.
3958  if (p_weight >= expected_weight_in_part &&
3959  (p_weight < current_cut_upper_weights[i] ||
3960  (p_weight == current_cut_upper_weights[i] &&
3961  current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
3962  )
3963  )
3964  ){
3965  current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
3966  current_cut_upper_weights[i] = p_weight;
3967  }
3968  }
3969  mj_scalar_t new_cut_position = 0;
3970  this->mj_calculate_new_cut_position(
3971  current_cut_upper_bounds[i],
3972  current_cut_lower_bounds[i],
3973  current_cut_upper_weights[i],
3974  current_cut_lower_bound_weights[i],
3975  expected_weight_in_part,
3976  new_cut_position);
3977 
3978  //if the cut line does not move significantly.
3979  if (ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3980  /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/ ){
3981  current_cut_line_determined[i] = true;
3982 #ifdef HAVE_ZOLTAN2_OMP
3983 #pragma omp atomic
3984 #endif
3985  my_num_incomplete_cut -= 1;
3986  //set the cut coordinate and proceed.
3987  new_current_cut_coordinates [ i] = current_cut_coordinates[i];
3988  } else {
3989  new_current_cut_coordinates [ i] = new_cut_position;
3990  }
3991  }
3992  }
3993 
3994  { // This seemingly unnecessary bracket works around an NVCC compiler bug that shows up when OpenMP is also enabled
3995 
3996  //communication to determine the ratios of processors for the distribution
3997  //of coordinates on the cut lines.
3998 #ifdef HAVE_ZOLTAN2_OMP
3999  //no explicit barrier is needed here; the preceding omp for ends with an implicit one.
4000 #pragma omp single
4001 #endif
4002  {
4003  if(*rectilinear_cut_count > 0){
4004 
4005  try{
4006  Teuchos::scan<int,mj_scalar_t>(
4007  *comm, Teuchos::REDUCE_SUM,
4008  num_cuts,
4009  this->process_rectilinear_cut_weight,
4010  this->global_rectilinear_cut_weight
4011  );
4012  }
4013  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
4014 
4015  for (mj_part_t i = 0; i < num_cuts; ++i){
4016  //if this cut line has weight to be distributed.
4017  if(this->global_rectilinear_cut_weight[i] > 0) {
4018  //expected weight to go to left of the cut.
4019  mj_scalar_t expected_part_weight = current_part_target_weights[i];
4020  //the weight that should be put to left of the cut.
4021  mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
4022  //the weight of the cut in the process
4023  mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
4024  //the sum of the cut weights up to this process, including the weight of this process.
4025  mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
4026  //the space on the left side of the cut after all processes up to and including this process
4027  //put their on-cut weights to the left.
4028  mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
4029  //add my weight to this space to find out how much space is left to me.
4030  mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
4031 
4032  /*
4033  std::cout << "expected_part_weight:" << expected_part_weight
4034  << " necessary_weight_on_line_for_left:" << necessary_weight_on_line_for_left
4035  << " my_weight_on_line" << my_weight_on_line
4036  << " weight_on_line_upto_process_inclusive:" << weight_on_line_upto_process_inclusive
4037  << " space_to_put_left:" << space_to_put_left
4038  << " space_left_to_me" << space_left_to_me << std::endl;
4039  */
4040  if(space_left_to_me < 0){
4041  //space_left_to_me is negative: I don't need to put anything to the left.
4042  current_part_cut_line_weight_to_put_left[i] = 0;
4043  }
4044  else if(space_left_to_me >= my_weight_on_line){
4045  //the space left to me is at least this process's weight on the cut,
4046  //so put everything to the left.
4047  current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
4048  //std::cout << "setting current_part_cut_line_weight_to_put_left to my_weight_on_line:" << my_weight_on_line << std::endl;
4049  }
4050  else {
4051  //put only as much weight as the space allows.
4052  current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;
4053 
4054  //std::cout << "setting current_part_cut_line_weight_to_put_left to space_left_to_me:" << space_left_to_me << std::endl;
4055  }
4056 
4057  }
4058  }
4059  *rectilinear_cut_count = 0;
4060  }
4061  }
4062  }
4063 }
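// ---------------------------------------------------------------------------
// Editorial sketch (not part of the Zoltan2 source): the Teuchos::scan above
// hands every rank the inclusive prefix sum of the on-cut weights over ranks
// 0..myRank, after which the decision in the loop is purely local. That local
// rule, extracted with hypothetical names:
static double weight_to_put_left(
    double needed_on_left,    // target weight minus weight already left of the cut
    double my_weight_on_line, // this rank's weight sitting exactly on the cut
    double inclusive_prefix)  // scan result: on-cut weight of ranks 0..me
{
  // space remaining after the ranks up to and including me claim the left side.
  double space_left_to_me = needed_on_left - inclusive_prefix + my_weight_on_line;
  if (space_left_to_me < 0) return 0;                                  // earlier ranks used it all
  if (space_left_to_me >= my_weight_on_line) return my_weight_on_line; // all of mine fits
  return space_left_to_me;                                             // only a portion fits
}
// ---------------------------------------------------------------------------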
4064 
4074 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4075  typename mj_part_t>
4076 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_processor_num_points_in_parts(
4077  mj_part_t num_procs,
4078  mj_part_t num_parts,
4079  mj_gno_t *&num_points_in_all_processor_parts){
4080 
4081  //one block of num_parts counts per processor, plus one extra block for the global totals.
4082  size_t allocation_size = num_parts * (num_procs + 1);
4083 
4084  //this will be output
4085  //holds how many each processor has in each part.
4086  //last portion is the sum of all processor points in each part.
4087 
4088  //allocate memory for the local num coordinates in each part.
4089  mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
4090 
4091 
4092  //this is the portion of the memory which will be used
4093  //at the summation to obtain total number of processors' points in each part.
4094  mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
4095  //this is the portion of the memory where each stores its local number.
4096  //this information is needed by other processors.
4097  mj_gno_t *my_local_point_counts_in_each_part = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
4098 
4099  //initialize the array with 0's.
4100  memset(num_local_points_in_each_part_to_reduce_sum, 0, sizeof(mj_gno_t)*allocation_size);
4101 
4102  //write the number of coordinates in each part.
4103  for (mj_part_t i = 0; i < num_parts; ++i){
4104  mj_lno_t part_begin_index = 0;
4105  if (i > 0){
4106  part_begin_index = this->new_part_xadj[i - 1];
4107  }
4108  mj_lno_t part_end_index = this->new_part_xadj[i];
4109  my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
4110  }
4111 
4112  //copy the local counts into this rank's own portion of the array as well,
4113  //so that the last portion will represent the global num points in each part after the reduction.
4114  memcpy (my_local_point_counts_in_each_part,
4115  my_local_points_to_reduce_sum,
4116  sizeof(mj_gno_t) * (num_parts) );
4117 
4118 
4119  //reduceAll operation.
4120  //the portion that belongs to the processor with index p
4121  //starts at p * num_parts.
4122  //the global number of points in each part is held starting at index num_procs * num_parts.
4123  try{
4124  reduceAll<int, mj_gno_t>(
4125  *(this->comm),
4126  Teuchos::REDUCE_SUM,
4127  allocation_size,
4128  num_local_points_in_each_part_to_reduce_sum,
4129  num_points_in_all_processor_parts);
4130  }
4131  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
4132  freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
4133 }
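// ---------------------------------------------------------------------------
// Editorial sketch (not part of the Zoltan2 source): the reduce buffer is
// laid out as num_procs blocks of num_parts counts plus one extra block that
// every rank also fills with its own counts, so after REDUCE_SUM the extra
// block holds the global per-part totals:
//
//   [ proc 0 counts | proc 1 counts | ... | proc P-1 counts | global totals ]
//
// Reading the reduced result, with hypothetical names:
static long long points_of_proc_in_part(
    const std::vector<long long> &reduced, // size (num_procs + 1) * num_parts
    int num_parts, int proc, int part)
{
  return reduced[size_t(proc) * num_parts + part];
}
static long long global_points_in_part(
    const std::vector<long long> &reduced,
    int num_procs, int num_parts, int part)
{
  return reduced[size_t(num_procs) * num_parts + part]; // the totals block
}
// ---------------------------------------------------------------------------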
4134 
4135 
4136 
4149 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4150  typename mj_part_t>
4151 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_check_to_migrate(
4152  size_t migration_reduce_all_population,
4153  mj_lno_t num_coords_for_last_dim_part,
4154  mj_part_t num_procs,
4155  mj_part_t num_parts,
4156  mj_gno_t *num_points_in_all_processor_parts){
4157 
4158  //if reduce all count and population in the last dim is too high
4159  if (migration_reduce_all_population > FUTURE_REDUCEALL_CUTOFF) return true;
4160  //if the work in a part per processor in the last dim is too low.
4161  if (num_coords_for_last_dim_part < MIN_WORK_LAST_DIM) return true;
4162 
4163  //if migration is to be checked and the imbalance is too high
4164  if (this->check_migrate_avoid_migration_option == 0){
4165  double global_imbalance = 0;
4166  //global shift to reach the sum of coordinate counts in each part.
4167  size_t global_shift = num_procs * num_parts;
4168 
4169  for (mj_part_t ii = 0; ii < num_procs; ++ii){
4170  for (mj_part_t i = 0; i < num_parts; ++i){
4171  double ideal_num = num_points_in_all_processor_parts[global_shift + i]
4172  / double(num_procs);
4173 
4174  global_imbalance += ZOLTAN2_ABS(ideal_num -
4175  num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
4176  }
4177  }
4178  global_imbalance /= num_parts;
4179  global_imbalance /= num_procs;
4180 
4181  /*
4182  if (this->myRank == 0) {
4183  std::cout << "imbalance for next iteration:" << global_imbalance << std::endl;
4184  }
4185  */
4186 
4187  if(global_imbalance <= this->minimum_migration_imbalance){
4188  return false;
4189  }
4190  else {
4191  return true;
4192  }
4193  }
4194  else {
4195  //if migration is forced
4196  return true;
4197  }
4198 }
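// ---------------------------------------------------------------------------
// Editorial sketch (not part of the Zoltan2 source): the migration test above
// averages, over all (processor, part) pairs, the relative deviation of each
// processor's point count from the ideal count total_in_part / num_procs.
// Standalone, with hypothetical names (like the original, it assumes no part
// is globally empty):
static double average_relative_imbalance(
    const std::vector<long long> &counts, // counts[p * num_parts + i]: proc p, part i
    const std::vector<long long> &totals, // totals[i]: global points in part i
    int num_procs, int num_parts)
{
  double imbalance = 0;
  for (int p = 0; p < num_procs; ++p) {
    for (int i = 0; i < num_parts; ++i) {
      double ideal = double(totals[i]) / num_procs;
      double diff = ideal - double(counts[size_t(p) * num_parts + i]);
      imbalance += (diff < 0 ? -diff : diff) / ideal;
    }
  }
  return imbalance / num_parts / num_procs;
}
// ---------------------------------------------------------------------------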
4199 
4200 
4210 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4211  typename mj_part_t>
4212 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations(
4213  mj_part_t num_parts,
4214  mj_part_t *part_assignment_proc_begin_indices,
4215  mj_part_t *processor_chains_in_parts,
4216  mj_lno_t *send_count_to_each_proc,
4217  int *coordinate_destinations){
4218 
4219  for (mj_part_t p = 0; p < num_parts; ++p){
4220  mj_lno_t part_begin = 0;
4221  if (p > 0) part_begin = this->new_part_xadj[p - 1];
4222  mj_lno_t part_end = this->new_part_xadj[p];
4223 
4224  //get the first processor to which the current processor sends its part-p points.
4225  mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
4226  //initialize how many points I have sent to this processor.
4227  mj_lno_t num_total_send = 0;
4228  for (mj_lno_t j=part_begin; j < part_end; j++){
4229  mj_lno_t local_ind = this->new_coordinate_permutations[j];
4230  while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
4231  //then get the next processor to send the points in part p.
4232  num_total_send = 0;
4233  //advance the chain head for part p.
4234  part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
4235  //detach the previous processor from the chain.
4236  processor_chains_in_parts[proc_to_sent] = -1;
4237  //the new chain head is the next processor to send to.
4238  proc_to_sent = part_assignment_proc_begin_indices[p];
4239  }
4240  //write the gno index to corresponding position in sendBuf.
4241  coordinate_destinations[local_ind] = proc_to_sent;
4242  ++num_total_send;
4243  }
4244  }
4245 }
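// ---------------------------------------------------------------------------
// Editorial sketch (not part of the Zoltan2 source): the two arrays above
// form one singly linked list per part: head[p] is the first destination
// rank, next[rank] the one after it, and -1 terminates the chain. The loop
// pops the head whenever the current rank's send quota is filled; a minimal
// model of that pop, with hypothetical names:
static int pop_chain_head(std::vector<int> &head, std::vector<int> &next, int part)
{
  int proc = head[part];   // current destination rank for this part
  head[part] = next[proc]; // advance the head to the next rank in the chain
  next[proc] = -1;         // detach the popped rank
  return head[part];       // the new destination, or -1 if the chain is exhausted
}
// ---------------------------------------------------------------------------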
4246 
4261 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4262  typename mj_part_t>
4263 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_proc_to_parts(
4264  mj_gno_t * num_points_in_all_processor_parts,
4265  mj_part_t num_parts,
4266  mj_part_t num_procs,
4267  mj_lno_t *send_count_to_each_proc,
4268  std::vector<mj_part_t> &processor_ranks_for_subcomm,
4269  std::vector<mj_part_t> *next_future_num_parts_in_parts,
4270  mj_part_t &out_part_index,
4271  mj_part_t &output_part_numbering_begin_index,
4272  int *coordinate_destinations){
4273 
4274 
4275  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4276  mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
4277 
4278  //boolean variable set once this process finds the part it is assigned to.
4279  bool did_i_find_my_group = false;
4280 
4281  mj_part_t num_free_procs = num_procs;
4282  mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
4283 
4284  double max_imbalance_difference = 0;
4285  mj_part_t max_differing_part = 0;
4286 
4287  //find how many processors each part requires.
4288  for (mj_part_t i=0; i < num_parts; i++){
4289 
4290  //scalar portion of the required processors
4291  double scalar_required_proc = num_procs *
4292  (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
4293 
4294  //round it to the closest integer; make sure we have at least one proc.
4295  mj_part_t required_proc = static_cast<mj_part_t> (0.5 + scalar_required_proc);
4296  if (required_proc == 0) required_proc = 1;
4297 
4298  //if assigning the required num procs would create problems for the rest of the parts,
4299  //then assign only {num_free_procs - minimum_num_procs_required_for_rest_of_parts} procs to this part.
4300  if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
4301  required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
4302  }
4303 
4304  //reduce the free processor count
4305  num_free_procs -= required_proc;
4306  //reduce the minimum processor count required for the rest of the parts by 1.
4307  --minimum_num_procs_required_for_rest_of_parts;
4308 
4309  //part (i) is assigned to (required_proc) processors.
4310  num_procs_assigned_to_each_part[i] = required_proc;
4311 
4312  //because of the rounding, some processors might be left unassigned.
4313  //we want to assign those processors to the part with most imbalance.
4314  //find the part with the maximum imbalance here.
4315  double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
4316  if (imbalance_wrt_ideal > max_imbalance_difference){
4317  max_imbalance_difference = imbalance_wrt_ideal;
4318  max_differing_part = i;
4319  }
4320  }
4321 
4322  //assign the extra processors to the part with the maximum imbalance with respect to the ideal.
4323  if (num_free_procs > 0){
4324  num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
4325  }
4326 
4327  //now find what are the best processors with least migration for each part.
4328 
4329  //part_assignment_proc_begin_indices[i] holds the first processor
4330  //in the chain of processors that send their data for part i.
4331  mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
4332  //the next processor send is found in processor_chains_in_parts, in linked list manner.
4333  mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
4334  mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
4335 
4336  //initialize the assignment of each processor.
4337  //this has a linked list implementation.
4338  //the first processor assigned
4339  //to each part is held at part_assignment_proc_begin_indices[part].
4340  //the next processor assigned to that part is located at
4341  //processor_chains_in_parts[part_assignment_proc_begin_indices[part]]; this chain continues
4342  //until the value of -1 is reached.
4343  for (int i = 0; i < num_procs; ++i ){
4344  processor_part_assignments[i] = -1;
4345  processor_chains_in_parts[i] = -1;
4346  }
4347  for (int i = 0; i < num_parts; ++i ){
4348  part_assignment_proc_begin_indices[i] = -1;
4349  }
4350 
4351 
4352  //std::cout << "Before migration: mig type:" << this->migration_type << std::endl;
4353  //Allocate memory for sorting data structure.
4354  uSignedSortItem<mj_part_t, mj_gno_t, char> * sort_item_num_part_points_in_procs = allocMemory <uSignedSortItem<mj_part_t, mj_gno_t, char> > (num_procs);
4355  for(mj_part_t i = 0; i < num_parts; ++i){
4356  //the algorithm tries to minimize the cost of migration,
4357  //by assigning to each part the processors that hold the highest number of its coordinates.
4358  //here we might want to implement a maximum weighted bipartite matching algorithm.
4359  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4360  sort_item_num_part_points_in_procs[ii].id = ii;
4361  //if processor is not assigned yet.
4362  //add its num points to the sort data structure.
4363  if (processor_part_assignments[ii] == -1){
4364  sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4365  sort_item_num_part_points_in_procs[ii].signbit = 1; //indicate that the processor has positive weight.
4366  }
4367  else {
4368  //if the processor is already assigned, insert -nLocal - 1 so that it won't be selected again.
4369  //it would be the same if we simply set it to -1,
4370  //but this provides more information at no extra cost (which is used later).
4371  //sort_item_num_part_points_in_procs[ii].val = -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
4372 
4373  //UPDATE: since the above produces a warning when an unsigned type is used, we added an extra sign bit to the sort item.
4374  //It is 1 for positives, 0 for negatives.
4375  sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4376  sort_item_num_part_points_in_procs[ii].signbit = 0;
4377  }
4378  }
4379  //sort the processors in the part.
4380  uqSignsort<mj_part_t, mj_gno_t,char>(num_procs, sort_item_num_part_points_in_procs);
4381 
4382  /*
4383  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4384  std::cout << "ii:" << ii << " " << sort_item_num_part_points_in_procs[ii].id <<
4385  " " << sort_item_num_part_points_in_procs[ii].val <<
4386  " " << int(sort_item_num_part_points_in_procs[ii].signbit) << std::endl;
4387  }
4388  */
4389 
4390  mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4391  mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4392  mj_gno_t ideal_num_points_in_a_proc =
4393  Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part / double (required_proc_count)));
4394 
4395  //start sending from the least heavy of the assigned processors.
4396  mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4397  mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4398  mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4399 
4400  //find the processors that will be assigned to this part, which are the heaviest
4401  //unassigned processors.
4402  for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4403  mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
4404  //assign processor to part - i.
4405  processor_part_assignments[proc_id] = i;
4406  }
4407 
4408  bool did_change_sign = false;
4409  //if a processor has a negative count (sign bit 0), restore its sign.
4410  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4411  // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
4412  // TODO: SEE BUG 6194
4413  if (sort_item_num_part_points_in_procs[ii].signbit == 0){
4414  did_change_sign = true;
4415  sort_item_num_part_points_in_procs[ii].signbit = 1;
4416  }
4417  else {
4418  break;
4419  }
4420  }
4421  if(did_change_sign){
4422  //re-sort the remaining unassigned processors for this part.
4423  uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4424  }
4425  /*
4426  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4427  std::cout << "after resort ii:" << ii << " " << sort_item_num_part_points_in_procs[ii].id <<
4428  " " << sort_item_num_part_points_in_procs[ii].val <<
4429  " " << int(sort_item_num_part_points_in_procs[ii].signbit ) << std::endl;
4430  }
4431  */
4432 
4433  //check if this processor is one of the procs assigned to this part.
4434  //if it is, then get the group.
4435  if (!did_i_find_my_group){
4436  for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4437 
4438  mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
4439  //add the proc to the group.
4440  processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4441 
4442  if(proc_id_to_assign == this->myRank){
4443  //if the assigned process is me, then I have found my group.
4444  did_i_find_my_group = true;
4445  //set the beginning of part i to my rank.
4446  part_assignment_proc_begin_indices[i] = this->myRank;
4447  processor_chains_in_parts[this->myRank] = -1;
4448 
4449  //set send count to myself to the number of points that I have in part i.
4450  send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val;
4451 
4452  //calculate the shift required for the output_part_numbering_begin_index
4453  for (mj_part_t in = 0; in < i; ++in){
4454  output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4455  }
4456  out_part_index = i;
4457  }
4458  }
4459  //if this was not my group,
4460  //clear the subcommunicator processor array.
4461  if (!did_i_find_my_group){
4462  processor_ranks_for_subcomm.clear();
4463  }
4464  }
4465 
4466  //send the points of the nonassigned processors to the assigned processors.
4467  //starts from the heaviest nonassigned processor.
4468  //TODO: we might want to play with this part; allowing more computational imbalance
4469  //can give better communication balance.
4470  for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4471  mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id;
4472  mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val;
4473 
4474  //we set the number of points to -to_sent - 1 for the assigned processors.
4475  //a negative value should not appear here, as we have already reversed the signs above.
4476 #ifdef MJ_DEBUG
4477  if (num_points_to_sent < 0) {
4478  std::cout << "Migration - processor assignments - for part:" << i << " from proc:" << nonassigned_proc_id << " num_points_to_sent:" << num_points_to_sent << std::endl;
4479  exit(1);
4480  }
4481 #endif
4482 
4483  switch (migration_type){
4484  case 0:
4485  {
4486  //now sends the points to the assigned processors.
4487  while (num_points_to_sent > 0){
4488  //if the processor has enough space.
4489  if (num_points_to_sent <= space_left_in_sent_proc){
4490  //reduce the space left in the processor.
4491  space_left_in_sent_proc -= num_points_to_sent;
4492  //if my rank is the one that is sending the coordinates.
4493  if (this->myRank == nonassigned_proc_id){
4494  //set my sent count to the sent processor.
4495  send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4496  //save the processor in the list (processor_chains_in_parts and part_assignment_proc_begin_indices)
4497  //that the processor will send its point in part-i.
4498  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4499  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4500  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4501  }
4502  num_points_to_sent = 0;
4503  }
4504  else {
4505  //there might be no space left in the processor.
4506  if(space_left_in_sent_proc > 0){
4507  num_points_to_sent -= space_left_in_sent_proc;
4508 
4509  //send as the space left in the processor.
4510  if (this->myRank == nonassigned_proc_id){
4511  //send as much as the space in this case.
4512  send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4513  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4514  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4515  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4516 
4517  }
4518  }
4519  //move on to the next processor to send to.
4520  ++next_proc_to_send_index;
4521 
4522 #ifdef MJ_DEBUG
4523  if(next_proc_to_send_index < num_procs - required_proc_count ){
4524  std::cout << "Migration - processor assignments - for part:"
4525  << i
4526  << " next_proc_to_send_index:" << next_proc_to_send_index
4527  << " num_procs:" << num_procs
4528  << " required_proc_count:" << required_proc_count
4529  << " Error: next_proc_to_send_index < num_procs - required_proc_count" << std::endl;
4530  exit(1);
4531 
4532  }
4533 #endif
4534  //get the id of the next processor to send to.
4535  next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4536  //set the new space in the processor.
4537  space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4538  }
4539  }
4540  }
4541  break;
4542  default:
4543  {
4544  //to minimize messages, we want each processor to send its coordinates to only a single processor.
4545  //we do not respect imbalances here, we send all points to the next processor.
4546  if (this->myRank == nonassigned_proc_id){
4547  //set my sent count to the sent processor.
4548  send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4549  //save the processor in the list (processor_chains_in_parts and part_assignment_proc_begin_indices)
4550  //that the processor will send its point in part-i.
4551  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4552  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4553  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4554  }
4555  num_points_to_sent = 0;
4556  ++next_proc_to_send_index;
4557 
4558  //if we made it to the heaviest processor, we round-robin back to the beginning.
4559  if (next_proc_to_send_index == num_procs){
4560  next_proc_to_send_index = num_procs - required_proc_count;
4561  }
4562  //get the id of the next processor to send to.
4563  next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4564  //set the new space in the processor.
4565  space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4566  }
4567  }
4568  }
4569  }
4570 
4571  /*
4572  for (int i = 0; i < num_procs;++i){
4573  std::cout << "me:" << this->myRank << " to part:" << i << " sends:" << send_count_to_each_proc[i] << std::endl;
4574  }
4575  */
4576 
4577 
4578  this->assign_send_destinations(
4579  num_parts,
4580  part_assignment_proc_begin_indices,
4581  processor_chains_in_parts,
4582  send_count_to_each_proc,
4583  coordinate_destinations);
4584 
4585  freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4586  freeArray<mj_part_t>(processor_chains_in_parts);
4587  freeArray<mj_part_t>(processor_part_assignments);
4588  freeArray<uSignedSortItem<mj_part_t, mj_gno_t, char> > (sort_item_num_part_points_in_procs);
4589  freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4590 
4591 }
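// ---------------------------------------------------------------------------
// Editorial sketch (not part of the Zoltan2 source): the signbit field above
// encodes "logically negative" keys for already-assigned processors without
// negating an unsigned count, so one sort moves them to the front and leaves
// the unassigned processors ordered by their point counts. A comparator
// expressing one plausible version of that order, with hypothetical names:
struct SignedCountItem {
  int id;
  unsigned long long count;
  char positive; // 1: unassigned (key is count), 0: assigned (key is -count - 1)
};
static bool signed_count_item_less(const SignedCountItem &a, const SignedCountItem &b)
{
  if (a.positive != b.positive) return a.positive < b.positive; // negatives first
  if (a.positive) return a.count < b.count; // positives ascend by count
  return a.count > b.count;                 // negatives ascend by -count - 1
}
// ---------------------------------------------------------------------------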
4592 
4593 
4606 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4607  typename mj_part_t>
4608 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations2(
4609  mj_part_t num_parts,
4610  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
4611  int *coordinate_destinations,
4612  mj_part_t &output_part_numbering_begin_index,
4613  std::vector<mj_part_t> *next_future_num_parts_in_parts){
4614 
4615  mj_part_t part_shift_amount = output_part_numbering_begin_index;
4616  mj_part_t previous_processor = -1;
4617  for(mj_part_t i = 0; i < num_parts; ++i){
4618  mj_part_t p = sort_item_part_to_proc_assignment[i].id;
4619  //assigned processors are sorted.
4620  mj_lno_t part_begin_index = 0;
4621  if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4622  mj_lno_t part_end_index = this->new_part_xadj[p];
4623 
4624  mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
4625  if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4626  output_part_numbering_begin_index = part_shift_amount;
4627  }
4628  previous_processor = assigned_proc;
4629  part_shift_amount += (*next_future_num_parts_in_parts)[p];
4630 
4631  for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4632  mj_lno_t localInd = this->new_coordinate_permutations[j];
4633  coordinate_destinations[localInd] = assigned_proc;
4634  }
4635  }
4636 }
4637 
4638 
4655 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4656  typename mj_part_t>
4657 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_parts_to_procs(
4658  mj_gno_t * num_points_in_all_processor_parts,
4659  mj_part_t num_parts,
4660  mj_part_t num_procs,
4661  mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
4662  std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
4663  mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function.
4664  std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
4665  mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
4666  int *coordinate_destinations){
4667  out_num_part = 0;
4668 
4669  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4670  out_part_indices.clear();
4671 
4672  //to sort the parts that are assigned to the processors.
4673  //id is the part number, sort value is the assigned processor id.
4674  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment = allocMemory <uSortItem<mj_part_t, mj_part_t> >(num_parts);
4675  uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_procs);
4676 
4677 
4678  //calculate the optimal number of coordinates that should be assigned to each processor.
4679  mj_lno_t work_each = mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);
4680  //to hold the remaining space, in coordinates, up to the optimal number in each proc.
4681  mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
4682  //initialize left space in each.
4683  for (mj_part_t i = 0; i < num_procs; ++i){
4684  space_in_each_processor[i] = work_each;
4685  }
4686 
4687  //we keep track of how many parts each processor is assigned to.
4688  //because in some weird inputs, it might be possible that some
4689  //processors are not assigned to any part. Using these variables,
4690  //we force each processor to have at least one part.
4691  mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
4692  memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
4693  int empty_proc_count = num_procs;
4694 
4695  //to sort the parts by the number of their coordinates.
4696  //id are the part numbers, sort value is the number of points in each.
4697  uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_parts);
4698 
4699  //initially we will sort the parts according to the number of coordinates they have.
4700  //so that we will start assigning from the part that has the most coordinates.
4701  for (mj_part_t i = 0; i < num_parts; ++i){
4702  sort_item_point_counts_in_parts[i].id = i;
4703  sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
4704  }
4705  //sort parts with increasing order of loads.
4706  uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
4707 
4708 
4709  //assigning parts to the processors
4710  //traverse the parts in decreasing order of load.
4711  //first assign the heaviest part.
4712  for (mj_part_t j = 0; j < num_parts; ++j){
4713  //sorted with increasing order, traverse inverse.
4714  mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
4715  //load of the part
4716  mj_gno_t load = global_num_points_in_parts[i];
4717 
4718  //assigned processors
4719  mj_part_t assigned_proc = -1;
4720  //best processor to fall back to if the part fits nowhere.
4721  mj_part_t best_proc_to_assign = 0;
4722 
4723 
4724  //sort processors with increasing number of points in this part.
4725  for (mj_part_t ii = 0; ii < num_procs; ++ii){
4726  sort_item_num_points_of_proc_in_part_i[ii].id = ii;
4727 
4728  //if there are still enough parts to fill the empty processors, then proceed normally.
4729  //but if the empty processor count is equal to the number of remaining parts, then
4730  //we force part assignments to go only to empty processors.
4731  if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
4732  //how many points processor ii has in part i?
4733  sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4734  }
4735  else {
4736  sort_item_num_points_of_proc_in_part_i[ii].val = -1;
4737  }
4738  }
4739  uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);
4740 
4741  //traverse the processors in decreasing order of their points in this part.
4742  for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
4743  mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
4744  mj_lno_t left_space = space_in_each_processor[ii] - load;
4745  //if there is enough space, assign the part to this processor.
4746  if(left_space >= 0 ){
4747  assigned_proc = ii;
4748  break;
4749  }
4750  //if the space is not enough, remember the best candidate processor.
4751  if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
4752  best_proc_to_assign = ii;
4753  }
4754  }
4755 
4756  //if none had enough space, then assign it to the best candidate processor.
4757  if (assigned_proc == -1){
4758  assigned_proc = best_proc_to_assign;
4759  }
4760 
4761  if (num_parts_proc_assigned[assigned_proc]++ == 0){
4762  --empty_proc_count;
4763  }
4764  space_in_each_processor[assigned_proc] -= load;
4765  //to sort later: part i is assigned to processor assigned_proc.
4766  sort_item_part_to_proc_assignment[j].id = i; //part i
4767  sort_item_part_to_proc_assignment[j].val = assigned_proc; //assigned to processor - assignment.
4768 
4769 
4770  //if the assigned processor is me, increase my part count.
4771  if (assigned_proc == this->myRank){
4772  out_num_part++;//assigned_part_count;
4773  out_part_indices.push_back(i);
4774  }
4775  //increase the send count to that processor by the number of points in that part,
4776  //as everyone sends their coordinates in this part to the processor assigned to it.
4777  send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
4778  }
4779  freeArray<mj_part_t>(num_parts_proc_assigned);
4780  freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
4781  freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
4782  freeArray<mj_lno_t >(space_in_each_processor);
4783 
4784 
4785  //sort assignments with respect to the assigned processors.
4786  uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
4787  //fill sendBuf.
4788 
4789 
4790  this->assign_send_destinations2(
4791  num_parts,
4792  sort_item_part_to_proc_assignment,
4793  coordinate_destinations,
4794  output_part_numbering_begin_index,
4795  next_future_num_parts_in_parts);
4796 
4797  freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
4798 }
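// ---------------------------------------------------------------------------
// Editorial sketch (not part of the original source): the loop above is a
// greedy first-fit-decreasing heuristic. Below is a minimal standalone
// version over plain vectors, with all names invented for exposition; the
// real code additionally prefers the processor that already owns the most
// points of the part, and reserves empty processors when parts become scarce.
#if 0
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<long> loads = {50, 30, 80, 20}; // points per part (toy data)
  std::vector<long> space = {100, 100};       // remaining capacity per proc
  std::vector<int>  owner(loads.size(), -1);  // part -> processor

  // visit parts heaviest-first.
  std::vector<int> order(loads.size());
  for (size_t i = 0; i < order.size(); ++i) order[i] = (int)i;
  std::sort(order.begin(), order.end(),
            [&](int a, int b) { return loads[a] > loads[b]; });

  for (int part : order) {
    int chosen = -1, best = 0;
    for (size_t p = 0; p < space.size(); ++p) {
      if (space[p] >= loads[part]) { chosen = (int)p; break; } // first fit
      if (space[p] > space[best]) best = (int)p; // most space, as fallback
    }
    if (chosen < 0) chosen = best; // nothing fits: least-loaded processor
    owner[part] = chosen;
    space[chosen] -= loads[part];
  }
  for (size_t i = 0; i < owner.size(); ++i)
    std::printf("part %zu -> proc %d\n", i, owner[i]);
  return 0;
}
#endif
// ---------------------------------------------------------------------------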
4799 
4800 
4818 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4819  typename mj_part_t>
4820 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migration_part_proc_assignment(
4821  mj_gno_t * num_points_in_all_processor_parts,
4822  mj_part_t num_parts,
4823  mj_part_t num_procs,
4824  mj_lno_t *send_count_to_each_proc,
4825  std::vector<mj_part_t> &processor_ranks_for_subcomm,
4826  std::vector<mj_part_t> *next_future_num_parts_in_parts,
4827  mj_part_t &out_num_part,
4828  std::vector<mj_part_t> &out_part_indices,
4829  mj_part_t &output_part_numbering_begin_index,
4830  int *coordinate_destinations){
4831 
4832 
4833 
4834  processor_ranks_for_subcomm.clear();
4835  // if (this->num_local_coords > 0)
4836  if (num_procs > num_parts){
4837  //if there are more processors than the number of current parts,
4838  //then processors share the existing parts.
4839  //at the end each processor will have a single part,
4840  //but a part will be shared by a group of processors.
4841  mj_part_t out_part_index = 0;
4842  this->mj_assign_proc_to_parts(
4843  num_points_in_all_processor_parts,
4844  num_parts,
4845  num_procs,
4846  send_count_to_each_proc,
4847  processor_ranks_for_subcomm,
4848  next_future_num_parts_in_parts,
4849  out_part_index,
4850  output_part_numbering_begin_index,
4851  coordinate_destinations
4852  );
4853 
4854  out_num_part = 1;
4855  out_part_indices.clear();
4856  out_part_indices.push_back(out_part_index);
4857  }
4858  else {
4859 
4860  //there are more parts than the processors.
4861  //therefore a processor will be assigned multiple parts,
4862  //the subcommunicators will only have a single processor.
4863  processor_ranks_for_subcomm.push_back(this->myRank);
4864 
4865  //since there are more parts than procs,
4866  //assign multiple parts to processors.
4867  this->mj_assign_parts_to_procs(
4868  num_points_in_all_processor_parts,
4869  num_parts,
4870  num_procs,
4871  send_count_to_each_proc,
4872  next_future_num_parts_in_parts,
4873  out_num_part,
4874  out_part_indices,
4875  output_part_numbering_begin_index,
4876  coordinate_destinations);
4877  }
4878 }
4879 
4892 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4893  typename mj_part_t>
4894 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migrate_coords(
4895  mj_part_t num_procs,
4896  mj_lno_t &num_new_local_points,
4897  std::string iteration,
4898  int *coordinate_destinations,
4899  mj_part_t num_parts)
4900 {
4901 #ifdef ENABLE_ZOLTAN_MIGRATION
4902  if (sizeof(mj_lno_t) <= sizeof(int)) {
4903 
4904  // Cannot use Zoltan_Comm with local ordinals larger than ints.
4905  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
4906  // may overflow.
4907 
4908  ZOLTAN_COMM_OBJ *plan = NULL;
4909  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
4910  int num_incoming_gnos = 0;
4911  int message_tag = 7859;
4912 
4913  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
4914  int ierr = Zoltan_Comm_Create(
4915  &plan,
4916  int(this->num_local_coords),
4917  coordinate_destinations,
4918  mpi_comm,
4919  message_tag,
4920  &num_incoming_gnos);
4921  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4922  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
4923 
4924  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
4925  mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
4926 
4927  //migrate gnos.
4928  message_tag++;
4929  ierr = Zoltan_Comm_Do(
4930  plan,
4931  message_tag,
4932  (char *) this->current_mj_gnos,
4933  sizeof(mj_gno_t),
4934  (char *) incoming_gnos);
4935  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4936 
4937  freeArray<mj_gno_t>(this->current_mj_gnos);
4938  this->current_mj_gnos = incoming_gnos;
4939 
4940 
4941  //migrate coordinates
4942  for (int i = 0; i < this->coord_dim; ++i){
4943  message_tag++;
4944  mj_scalar_t *coord = this->mj_coordinates[i];
4945 
4946  this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4947  ierr = Zoltan_Comm_Do(
4948  plan,
4949  message_tag,
4950  (char *) coord,
4951  sizeof(mj_scalar_t),
4952  (char *) this->mj_coordinates[i]);
4953  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4954  freeArray<mj_scalar_t>(coord);
4955  }
4956 
4957  //migrate weights.
4958  for (int i = 0; i < this->num_weights_per_coord; ++i){
4959  message_tag++;
4960  mj_scalar_t *weight = this->mj_weights[i];
4961 
4962  this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4963  ierr = Zoltan_Comm_Do(
4964  plan,
4965  message_tag,
4966  (char *) weight,
4967  sizeof(mj_scalar_t),
4968  (char *) this->mj_weights[i]);
4969  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4970  freeArray<mj_scalar_t>(weight);
4971  }
4972 
4973 
4974  //migrate owners.
4975  int *coord_own = allocMemory<int>(num_incoming_gnos);
4976  message_tag++;
4977  ierr = Zoltan_Comm_Do(
4978  plan,
4979  message_tag,
4980  (char *) this->owner_of_coordinate,
4981  sizeof(int), (char *) coord_own);
4982  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4983  freeArray<int>(this->owner_of_coordinate);
4984  this->owner_of_coordinate = coord_own;
4985 
4986 
4987  //if num procs is less than num parts,
4988  //we need the part assignment arrays as well, since
4989  //there will be multiple parts per processor.
4990  mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
4991  if(num_procs < num_parts){
4992  message_tag++;
4993  ierr = Zoltan_Comm_Do(
4994  plan,
4995  message_tag,
4996  (char *) this->assigned_part_ids,
4997  sizeof(mj_part_t),
4998  (char *) new_parts);
4999  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5000  }
5001  freeArray<mj_part_t>(this->assigned_part_ids);
5002  this->assigned_part_ids = new_parts;
5003 
5004  ierr = Zoltan_Comm_Destroy(&plan);
5005  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5006  num_new_local_points = num_incoming_gnos;
5007  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
5008  }
5009 
5010  else
5011 
5012 #endif // ENABLE_ZOLTAN_MIGRATION
5013  {
5014  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
5015  Tpetra::Distributor distributor(this->comm);
5016  ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
5017  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
5018  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
5019 
5020  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
5021  {
5022  //migrate gnos.
5023  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
5024  ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5025  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5026  freeArray<mj_gno_t>(this->current_mj_gnos);
5027  this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
5028  memcpy(
5029  this->current_mj_gnos,
5030  received_gnos.getRawPtr(),
5031  num_incoming_gnos * sizeof(mj_gno_t));
5032  }
5033  //migrate coordinates
5034  for (int i = 0; i < this->coord_dim; ++i){
5035 
5036  ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
5037  ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
5038  distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
5039  freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5040  this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5041  memcpy(
5042  this->mj_coordinates[i],
5043  received_coord.getRawPtr(),
5044  num_incoming_gnos * sizeof(mj_scalar_t));
5045  }
5046 
5047  //migrate weights.
5048  for (int i = 0; i < this->num_weights_per_coord; ++i){
5049 
5050  ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
5051  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
5052  distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
5053  freeArray<mj_scalar_t>(this->mj_weights[i]);
5054  this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5055  memcpy(
5056  this->mj_weights[i],
5057  received_weight.getRawPtr(),
5058  num_incoming_gnos * sizeof(mj_scalar_t));
5059  }
5060 
5061  {
5062  //migrate the owners of the coordinates
5063  ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
5064  ArrayRCP<int> received_owners(num_incoming_gnos);
5065  distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
5066  freeArray<int>(this->owner_of_coordinate);
5067  this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
5068  memcpy(
5069  this->owner_of_coordinate,
5070  received_owners.getRawPtr(),
5071  num_incoming_gnos * sizeof(int));
5072  }
5073 
5074  //if num procs is less than num parts,
5075  //we need the part assignment arrays as well, since
5076  //there will be multiple parts per processor.
5077  if(num_procs < num_parts){
5078  ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5079  ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
5080  distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5081  freeArray<mj_part_t>(this->assigned_part_ids);
5082  this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
5083  memcpy(
5084  this->assigned_part_ids,
5085  received_partids.getRawPtr(),
5086  num_incoming_gnos * sizeof(mj_part_t));
5087  }
5088  else {
5089  mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
5090  freeArray<mj_part_t>(this->assigned_part_ids);
5091  this->assigned_part_ids = new_parts;
5092  }
5093  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
5094  num_new_local_points = num_incoming_gnos;
5095 
5096  }
5097 }
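// ---------------------------------------------------------------------------
// Editorial sketch (not part of the original source): the Tpetra path above
// repeats one send/receive/replace pattern per array. Condensed into a
// hypothetical helper (migrate_array is an invented name; the calls mirror
// exactly those used in the function body):
#if 0
template <typename T>
void migrate_array(
    Tpetra::Distributor &distributor, // plan built by createFromSends()
    mj_lno_t num_local,               // current local array length
    mj_lno_t num_incoming,            // array length after migration
    T *&array)                        // replaced by the received data
{
  ArrayView<T> sent(array, num_local);
  ArrayRCP<T> received(num_incoming);
  distributor.doPostsAndWaits<T>(sent, 1, received());
  freeArray<T>(array);
  array = allocMemory<T>(num_incoming);
  memcpy(array, received.getRawPtr(), num_incoming * sizeof(T));
}
// e.g. migrate_array(distributor, old_n, new_n, this->current_mj_gnos);
#endif
// ---------------------------------------------------------------------------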
5098 
5105 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5106  typename mj_part_t>
5107 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm){
5108  mj_part_t group_size = processor_ranks_for_subcomm.size();
5109  mj_part_t *ids = allocMemory<mj_part_t>(group_size);
5110  for(mj_part_t i = 0; i < group_size; ++i) {
5111  ids[i] = processor_ranks_for_subcomm[i];
5112  }
5113  ArrayView<const mj_part_t> idView(ids, group_size);
5114  this->comm = this->comm->createSubcommunicator(idView);
5115  freeArray<mj_part_t>(ids);
5116 }
5117 
5118 
5124 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5125  typename mj_part_t>
5126 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::fill_permutation_array(
5127  mj_part_t output_num_parts,
5128  mj_part_t num_parts){
5129  //if there is a single output part, then simply fill the permutation array.
5130  if (output_num_parts == 1){
5131  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5132  this->new_coordinate_permutations[i] = i;
5133  }
5134  this->new_part_xadj[0] = this->num_local_coords;
5135  }
5136  else {
5137 
5138  //otherwise we need to count how many points there are in each part.
5139  //we allocate num_parts entries here, because the sent part ids range up to num_parts,
5140  //although there are only output_num_parts distinct parts.
5141  mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
5142  //part_shifts holds which new part number an old part number corresponds to.
5143  mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
5144 
5145  memset(num_points_in_parts, 0, sizeof(mj_lno_t) * num_parts);
5146 
5147  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5148  mj_part_t ii = this->assigned_part_ids[i];
5149  ++num_points_in_parts[ii];
5150  }
5151 
5152  //write the end points of the parts.
5153  mj_part_t p = 0;
5154  mj_lno_t prev_index = 0;
5155  for(mj_part_t i = 0; i < num_parts; ++i){
5156  if(num_points_in_parts[i] > 0) {
5157  this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
5158  prev_index += num_points_in_parts[i];
5159  part_shifts[i] = p++;
5160  }
5161  }
5162 
5163  //for the rest of the parts write the end index as end point.
5164  mj_part_t assigned_num_parts = p - 1;
5165  for (;p < num_parts; ++p){
5166  this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
5167  }
5168  for(mj_part_t i = 0; i < output_num_parts; ++i){
5169  num_points_in_parts[i] = this->new_part_xadj[i];
5170  }
5171 
5172  //write the permutation array here.
5173  //get the part of coordinate i and shift it to obtain the new part number;
5174  //place the coordinate at the end of its new part's range.
5175  for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
5176  mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
5177  this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
5178  }
5179 
5180  freeArray<mj_lno_t>(num_points_in_parts);
5181  freeArray<mj_part_t>(part_shifts);
5182  }
5183 }
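// ---------------------------------------------------------------------------
// Editorial sketch (not part of the original source): the permutation fill
// above is a counting sort. The same idea over plain vectors (names invented
// for exposition):
#if 0
#include <vector>

std::vector<int> permutation_by_part(const std::vector<int> &part_of,
                                     int num_parts) {
  std::vector<int> count(num_parts, 0), perm(part_of.size());
  for (int p : part_of) ++count[p];   // histogram of points per part
  for (int p = 1; p < num_parts; ++p) // running sum: end offset of each part
    count[p] += count[p - 1];
  for (int i = (int)part_of.size() - 1; i >= 0; --i)
    perm[--count[part_of[i]]] = i;    // fill each part's range from the back
  return perm;                        // indices grouped by part, stable
}
#endif
// ---------------------------------------------------------------------------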
5184 
5185 
5208 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5209  typename mj_part_t>
5210 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_perform_migration(
5211  mj_part_t input_num_parts, //current number of parts
5212  mj_part_t &output_num_parts, //output number of parts.
5213  std::vector<mj_part_t> *next_future_num_parts_in_parts,
5214  mj_part_t &output_part_begin_index,
5215  size_t migration_reduce_all_population,
5216  mj_lno_t num_coords_for_last_dim_part,
5217  std::string iteration,
5218  RCP<mj_partBoxVector_t> &input_part_boxes,
5219  RCP<mj_partBoxVector_t> &output_part_boxes
5220 )
5221 {
5222  mj_part_t num_procs = this->comm->getSize();
5223  this->myRank = this->comm->getRank();
5224 
5225 
5226  //this array holds how many points each processor has in each part.
5227  //to access how many points processor i has on part j,
5228  //num_points_in_all_processor_parts[i * num_parts + j]
5229  mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
5230 
5231  //get the number of coordinates in each part in each processor.
5232  this->get_processor_num_points_in_parts(
5233  num_procs,
5234  input_num_parts,
5235  num_points_in_all_processor_parts);
5236 
5237 
5238  //check if migration will be performed or not.
5239  if (!this->mj_check_to_migrate(
5240  migration_reduce_all_population,
5241  num_coords_for_last_dim_part,
5242  num_procs,
5243  input_num_parts,
5244  num_points_in_all_processor_parts)){
5245  freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5246  return false;
5247  }
5248 
5249 
5250  mj_lno_t *send_count_to_each_proc = NULL;
5251  int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
5252  send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
5253  for (int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
5254 
5255  std::vector<mj_part_t> processor_ranks_for_subcomm;
5256  std::vector<mj_part_t> out_part_indices;
5257 
5258  //determine which processors are assigned to which parts
5259  this->mj_migration_part_proc_assignment(
5260  num_points_in_all_processor_parts,
5261  input_num_parts,
5262  num_procs,
5263  send_count_to_each_proc,
5264  processor_ranks_for_subcomm,
5265  next_future_num_parts_in_parts,
5266  output_num_parts,
5267  out_part_indices,
5268  output_part_begin_index,
5269  coordinate_destinations);
5270 
5271 
5272 
5273 
5274  freeArray<mj_lno_t>(send_count_to_each_proc);
5275  std::vector <mj_part_t> tmpv;
5276 
5277  std::sort (out_part_indices.begin(), out_part_indices.end());
5278  mj_part_t outP = out_part_indices.size();
5279 
5280  mj_gno_t new_global_num_points = 0;
5281  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
5282 
5283  if (this->mj_keep_part_boxes){
5284  input_part_boxes->clear();
5285  }
5286 
5287  //now we calculate the new values for next_future_num_parts_in_parts.
5288  //same for the part boxes.
5289  for (mj_part_t i = 0; i < outP; ++i){
5290  mj_part_t ind = out_part_indices[i];
5291  new_global_num_points += global_num_points_in_parts[ind];
5292  tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
5293  if (this->mj_keep_part_boxes){
5294  input_part_boxes->push_back((*output_part_boxes)[ind]);
5295  }
5296  }
5297  //swap the input and output part boxes.
5298  if (this->mj_keep_part_boxes){
5299  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5300  input_part_boxes = output_part_boxes;
5301  output_part_boxes = tmpPartBoxes;
5302  }
5303  next_future_num_parts_in_parts->clear();
5304  for (mj_part_t i = 0; i < outP; ++i){
5305  mj_part_t p = tmpv[i];
5306  next_future_num_parts_in_parts->push_back(p);
5307  }
5308 
5309  freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5310 
5311  mj_lno_t num_new_local_points = 0;
5312 
5313 
5314  //perform the actual migration operation here.
5315  this->mj_migrate_coords(
5316  num_procs,
5317  num_new_local_points,
5318  iteration,
5319  coordinate_destinations,
5320  input_num_parts);
5321 
5322 
5323  freeArray<int>(coordinate_destinations);
5324 
5325  if(this->num_local_coords != num_new_local_points){
5326  freeArray<mj_lno_t>(this->new_coordinate_permutations);
5327  freeArray<mj_lno_t>(this->coordinate_permutations);
5328 
5329  this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5330  this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5331  }
5332  this->num_local_coords = num_new_local_points;
5333  this->num_global_coords = new_global_num_points;
5334 
5335 
5336 
5337  //create subcommunicator.
5338  this->create_sub_communicator(processor_ranks_for_subcomm);
5339  processor_ranks_for_subcomm.clear();
5340 
5341  //fill the new permutation arrays.
5342  this->fill_permutation_array(
5343  output_num_parts,
5344  input_num_parts);
5345  return true;
5346 }
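// Editorial summary (added for exposition): the migration step above is,
// in outline:
//   1. gather the global points-per-(processor, part) matrix;
//   2. decide whether migration pays off (mj_check_to_migrate);
//   3. map parts to processors and fill coordinate_destinations;
//   4. move gnos, coordinates, weights, owners (mj_migrate_coords);
//   5. shrink the communicator and rebuild the permutation arrays.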
5347 
5348 
5362 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5363  typename mj_part_t>
5364 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_consistent_chunks(
5365  mj_part_t num_parts,
5366  mj_scalar_t *mj_current_dim_coords,
5367  mj_scalar_t *current_concurrent_cut_coordinate,
5368  mj_lno_t coordinate_begin,
5369  mj_lno_t coordinate_end,
5370  mj_scalar_t *used_local_cut_line_weight_to_left,
5371  mj_lno_t *out_part_xadj,
5372  int coordInd, bool longest_dim_part, uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted){
5373 
5374  //mj_lno_t numCoordsInPart = coordinateEnd - coordinateBegin;
5375  mj_part_t no_cuts = num_parts - 1;
5376 
5377 
5378 
5379  int me = 0;
5380  mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
5381  mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
5382 
5383 
5384  //now if the rectilinear partitioning is allowed we decide how
5385  //much weight each thread should put to left and right.
5386  if (this->distribute_points_on_cut_lines){
5387 
5388  my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
5389  for (mj_part_t i = 0; i < no_cuts; ++i){
5390  //the weight to be put on the left of the cut.
5391  mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
5392  //std::cout << "i:" << i << " left_weight:" << left_weight << std::endl;
5393  for(int ii = 0; ii < this->num_threads; ++ii){
5394  if(left_weight > this->sEpsilon){
5395  //the weight of thread ii on cut.
5396  mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
5397  if(thread_ii_weight_on_cut < left_weight){
5398  this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
5399  }
5400  else {
5401  this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
5402  }
5403  left_weight -= thread_ii_weight_on_cut;
5404  }
5405  else {
5406  this->thread_cut_line_weight_to_put_left[ii][i] = 0;
5407  }
5408  }
5409  }
5410 
5411  if(no_cuts > 0){
5412  //this is a special case. If cutlines share the same coordinate, their weights are equal.
5413  //we need to adjust the ratio for that.
5414  for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
5415  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5416  my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
5417  }
5418  my_local_thread_cut_weights_to_put_left[i] = static_cast<long long>((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
5419  / mj_scalar_t(SIGNIFICANCE_MUL);
5420  }
5421  }
5422  }
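 // Editorial note (added for exposition): with LEAST_SIGNIFICANCE = 0.0001
 // and SIGNIFICANCE_MUL = 1000, the truncation above maps, e.g., a weight of
 // 1.23456789 to
 //   (long long)((1.23456789 + 0.0001) * 1000) / 1000.0
 //     = (long long)(1234.66789) / 1000.0 = 1.234,
 // i.e. weights are rounded down to three decimal digits (after a small
 // epsilon shift) so that nearly-equal cut weights compare consistently.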
5423 
5424  for(mj_part_t ii = 0; ii < num_parts; ++ii){
5425  thread_num_points_in_parts[ii] = 0;
5426  }
5427 
5428  //for this specific case we don't want to distribute the points on the cut position
5429  //randomly, as we need a specific ordering of them. Instead,
5430  //we put the coordinates into sort items and sort them
5431  //using the points' coordinates in the other dimensions, with the index as tie-breaker.
5432 
5433 
5434  //some of the cuts might share the same position.
5435  //in this case, if cut i and cut j share the same position
5436  //cut_map[i] = cut_map[j] = sort item index.
5437  mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5438 
5439 
5440  typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
5441  typedef std::vector< multiSItem > multiSVector;
5442  typedef std::vector<multiSVector> multiS2Vector;
5443 
5444  //to keep track of the memory allocated.
5445  std::vector<mj_scalar_t *>allocated_memory;
5446 
5447  //vectors in which the coordinates on the cuts will be sorted.
5448  multiS2Vector sort_vector_points_on_cut;
5449 
5450  //the number of cuts that have different coordinates.
5451  mj_part_t different_cut_count = 1;
5452  cut_map[0] = 0;
5453 
5454  //now we insert one sort vector for each distinct cut
5455  //position. if multiple cuts are on the same position, they share a sort vector.
5456  multiSVector tmpMultiSVector;
5457  sort_vector_points_on_cut.push_back(tmpMultiSVector);
5458 
5459  for (mj_part_t i = 1; i < no_cuts ; ++i){
5460  //if cuts share the same cut coordinates
5461  //set the cutmap accordingly.
5462  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5463  cut_map[i] = cut_map[i-1];
5464  }
5465  else {
5466  cut_map[i] = different_cut_count++;
5467  multiSVector tmp2MultiSVector;
5468  sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5469  }
5470  }
5471 
5472 
5473  //now the actual part assignment.
5474  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5475 
5476  mj_lno_t i = this->coordinate_permutations[ii];
5477 
5478  mj_part_t pp = this->assigned_part_ids[i];
5479  mj_part_t p = pp / 2;
5480  //if the coordinate is on a cut.
5481  if(pp % 2 == 1 ){
5482  mj_scalar_t *vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5483  allocated_memory.push_back(vals);
5484 
5485  //we insert the coordinates to the sort item here.
5486  int val_ind = 0;
5487 
5488  if (longest_dim_part){
5489  //std::cout << std::endl << std::endl;
5490  for(int dim = this->coord_dim - 2; dim >= 0; --dim){
5491  //uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted
5492  int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
5493  //std::cout << "next_largest_coord_dim: " << next_largest_coord_dim << " ";
5494  vals[val_ind++] = this->mj_coordinates[next_largest_coord_dim][i];
5495  }
5496  }
5497  else {
5498  for(int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5499  vals[val_ind++] = this->mj_coordinates[dim][i];
5500  }
5501  for(int dim = 0; dim < coordInd; ++dim){
5502  vals[val_ind++] = this->mj_coordinates[dim][i];
5503  }
5504  }
5505  multiSItem tempSortItem(i, this->coord_dim -1, vals);
5506  //insert the point into the sort vector pointed to by cut_map[p].
5507  mj_part_t cmap = cut_map[p];
5508  sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5509  }
5510  else {
5511  //if it is not on a cut, assign it directly; no sorting needed.
5512  ++thread_num_points_in_parts[p];
5513  this->assigned_part_ids[i] = p;
5514  }
5515  }
5516 
5517  //sort all the sort vectors.
5518  for (mj_part_t i = 0; i < different_cut_count; ++i){
5519  std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5520  }
5521 
5522  //we do the part assignment for the points on cuts here.
5523  mj_part_t previous_cut_map = cut_map[0];
5524 
5525  //this is how much of the current part's weight the previous part has taken.
5526  //when the target part weight is 1.6 and the part on the left is given 2,
5527  //the left has an extra 0.4, while the right is missing 0.4 from the previous cut.
5528  //this parameter is used to balance this issue.
5529  //in the above example weight_stolen_from_previous_part will be 0.4.
5530  //if the left part target is 2.2 but it is given 2,
5531  //then weight_stolen_from_previous_part will be -0.2.
5532  mj_scalar_t weight_stolen_from_previous_part = 0;
5533  for (mj_part_t p = 0; p < no_cuts; ++p){
5534 
5535  mj_part_t mapped_cut = cut_map[p];
5536 
5537  //if the previous cut's sort vector is finished and this cut does not share it,
5538  //then assign all points left on that cut to the part on its right.
5539  if (previous_cut_map != mapped_cut){
5540  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5541  for (; sort_vector_end >= 0; --sort_vector_end){
5542  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5543  mj_lno_t i = t.index;
5544  ++thread_num_points_in_parts[p];
5545  this->assigned_part_ids[i] = p;
5546  }
5547  sort_vector_points_on_cut[previous_cut_map].clear();
5548  }
5549 
5550  //TODO: MD: I don't remember why I have it in reverse order here.
5551  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5552  //mj_lno_t sort_vector_begin= 0;
5553  //mj_lno_t sort_vector_size = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size();
5554 
5555  //TODO commented for reverse order
5556  for (; sort_vector_end >= 0; --sort_vector_end){
5557  //for (; sort_vector_begin < sort_vector_size; ++sort_vector_begin){
5558  //TODO COMMENTED FOR REVERSE ORDER
5559  multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5560  //multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_begin];
5561  mj_lno_t i = t.index;
5562  mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5563 
5564 
5565  //if part p has enough space for point i, then put point i into part p.
5566  if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5567  my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5568  > this->sEpsilon){
5569 
5570  my_local_thread_cut_weights_to_put_left[p] -= w;
5571  sort_vector_points_on_cut[mapped_cut].pop_back();
5572  ++thread_num_points_in_parts[p];
5573  this->assigned_part_ids[i] = p;
5574  //if putting this weight to the left exhausts the left part's allowance, then
5575  //adjust the space for the next cut using weight_stolen_from_previous_part.
5576  if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5577  if(mapped_cut == cut_map[p + 1] ){
5578  //the next cut shares the same position as this cut;
5579  //special case, as we handle the weight differently here.
5580  if (previous_cut_map != mapped_cut){
5581  weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5582  }
5583  else {
5584  //if the cut before cut p was also at the same position,
5585  //we accumulate the extra weights in this case.
5586  weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5587  }
5588  }
5589  else{
5590  weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5591  }
5592  //end assignment for part p
5593  break;
5594  }
5595  } else {
5596  //if part p does not have enough space for this point
5597  //and there is another cut sharing the same position,
5598  //again adjust the space for the next cut.
5599  if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5600  if (previous_cut_map != mapped_cut){
5601  weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5602  }
5603  else {
5604  weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5605  }
5606  }
5607  else{
5608  weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5609  }
5610  //end assignment for part p
5611  break;
5612  }
5613  }
5614  previous_cut_map = mapped_cut;
5615  }
5616 
5617  //TODO commented for reverse order
5618  //put everything left on the last cut to the last part.
5619  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5620 
5621  //mj_lno_t sort_vector_begin= 0;
5622  //mj_lno_t sort_vector_size = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size();
5623  //TODO commented for reverse order
5624  for (; sort_vector_end >= 0; --sort_vector_end){
5625  //for (; sort_vector_begin < sort_vector_size; ++sort_vector_begin){
5626  //TODO commented for reverse order
5627  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5628  //multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_begin];
5629  mj_lno_t i = t.index;
5630  ++thread_num_points_in_parts[no_cuts];
5631  this->assigned_part_ids[i] = no_cuts;
5632  }
5633  sort_vector_points_on_cut[previous_cut_map].clear();
5634  freeArray<mj_part_t> (cut_map);
5635 
5636  //free the memory allocated for the vertex sort items.
5637  mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5638  for(mj_lno_t i = 0; i < vSize; ++i){
5639  freeArray<mj_scalar_t> (allocated_memory[i]);
5640  }
5641 
5642  //create part_xadj as in the usual case.
5643  for(mj_part_t j = 0; j < num_parts; ++j){
5644  mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5645  for (int i = 0; i < this->num_threads; ++i){
5646  mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5647  this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5648  num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5649 
5650  }
5651  out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
5652  }
5653 
5654  //perform a prefix sum over the number of points in the parts.
5655  for(mj_part_t j = 1; j < num_parts; ++j){
5656  out_part_xadj[j] += out_part_xadj[j - 1];
5657  }
5658 
5659 
5660  //shift the per-thread point counts to obtain the
5661  //beginning index of each thread's private space.
5662  for(mj_part_t j = 1; j < num_parts; ++j){
5663  thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5664  }
5665 
5666  //now each thread takes its coordinates and writes their indices to the permutation array
5667  //using the part index we calculated.
5668  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5669  mj_lno_t i = this->coordinate_permutations[ii];
5670  mj_part_t p = this->assigned_part_ids[i];
5671  this->new_coordinate_permutations[coordinate_begin +
5672  thread_num_points_in_parts[p]++] = i;
5673  }
5674 }
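// ---------------------------------------------------------------------------
// Editorial sketch (not part of the original source): points sitting exactly
// on a cut are ordered by their coordinates in the remaining dimensions,
// with the local index as the final tie-breaker (the role played by
// uMultiSortItem above). The same ordering with the standard library, using
// names invented for exposition:
#if 0
#include <algorithm>
#include <vector>

struct OnCutPoint {
  long index;               // local id of the coordinate
  std::vector<double> keys; // coordinates in the other dimensions
};

void sort_points_on_cut(std::vector<OnCutPoint> &pts) {
  std::sort(pts.begin(), pts.end(),
            [](const OnCutPoint &a, const OnCutPoint &b) {
              if (a.keys != b.keys) return a.keys < b.keys; // lexicographic
              return a.index < b.index;                     // tie-breaker
            });
}
#endif
// ---------------------------------------------------------------------------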
5675 
5676 
5677 
5687 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5688  typename mj_part_t>
5689 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_final_parts(
5690  mj_part_t current_num_parts,
5691  mj_part_t output_part_begin_index,
5692  RCP<mj_partBoxVector_t> &output_part_boxes,
5693  bool is_data_ever_migrated)
5694 {
5695  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Part_Assignment");
5696 
5697 #ifdef HAVE_ZOLTAN2_OMP
5698 #pragma omp parallel for
5699 #endif
5700  for(mj_part_t i = 0; i < current_num_parts;++i){
5701 
5702  mj_lno_t begin = 0;
5703  mj_lno_t end = this->part_xadj[i];
5704 
5705  if(i > 0) begin = this->part_xadj[i -1];
5706  mj_part_t part_to_set_index = i + output_part_begin_index;
5707  if (this->mj_keep_part_boxes){
5708  (*output_part_boxes)[i].setpId(part_to_set_index);
5709  }
5710  for (mj_lno_t ii = begin; ii < end; ++ii){
5711  mj_lno_t k = this->coordinate_permutations[ii];
5712  this->assigned_part_ids[k] = part_to_set_index;
5713  }
5714  }
5715 
5716  //ArrayRCP<const mj_gno_t> gnoList;
5717  if(!is_data_ever_migrated){
5718  //freeArray<mj_gno_t>(this->current_mj_gnos);
5719  //if(this->num_local_coords > 0){
5720  // gnoList = arcpFromArrayView(this->mj_gnos);
5721  //}
5722  }
5723  else {
5724 #ifdef ENABLE_ZOLTAN_MIGRATION
5725  if (sizeof(mj_lno_t) <= sizeof(int)) {
5726 
5727  // Cannot use Zoltan_Comm with local ordinals larger than ints.
5728  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
5729  // may overflow.
5730 
5731  //if data is migrated, then send part numbers to the original owners.
5732  ZOLTAN_COMM_OBJ *plan = NULL;
5733  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
5734 
5735  int incoming = 0;
5736  int message_tag = 7856;
5737 
5738  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating");
5739  int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
5740  this->owner_of_coordinate, mpi_comm, message_tag,
5741  &incoming);
5742  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5743  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating" );
5744 
5745  mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
5746 
5747  message_tag++;
5748  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
5749  ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->current_mj_gnos,
5750  sizeof(mj_gno_t), (char *) incoming_gnos);
5751  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5752 
5753  freeArray<mj_gno_t>(this->current_mj_gnos);
5754  this->current_mj_gnos = incoming_gnos;
5755 
5756  mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
5757 
5758  message_tag++;
5759  ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->assigned_part_ids,
5760  sizeof(mj_part_t), (char *) incoming_partIds);
5761  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5762  freeArray<mj_part_t>(this->assigned_part_ids);
5763  this->assigned_part_ids = incoming_partIds;
5764 
5765  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
5766  ierr = Zoltan_Comm_Destroy(&plan);
5767  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5768 
5769  this->num_local_coords = incoming;
5770  //gnoList = arcp(this->current_mj_gnos, 0, this->num_local_coords, true);
5771  }
5772  else
5773 
5774 #endif // ENABLE_ZOLTAN_MIGRATION
5775  {
5776  //if data is migrated, then send part numbers to the original owners.
5777  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating");
5778  Tpetra::Distributor distributor(this->mj_problemComm);
5779  ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
5780  mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
5781  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating" );
5782 
5783  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
5784  //migrate gnos to actual owners.
5785  ArrayRCP<mj_gno_t> received_gnos(incoming);
5786  ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5787  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5788  freeArray<mj_gno_t>(this->current_mj_gnos);
5789  this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
5790  memcpy( this->current_mj_gnos,
5791  received_gnos.getRawPtr(),
5792  incoming * sizeof(mj_gno_t));
5793 
5794  //migrate part ids to actual owners.
5795  ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5796  ArrayRCP<mj_part_t> received_partids(incoming);
5797  distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5798  freeArray<mj_part_t>(this->assigned_part_ids);
5799  this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
5800  memcpy( this->assigned_part_ids,
5801  received_partids.getRawPtr(),
5802  incoming * sizeof(mj_part_t));
5803 
5804  this->num_local_coords = incoming;
5805  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
5806 
5807  }
5808  }
5809 
5810  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Part_Assignment");
5811 
5812  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
5813 
5814  //ArrayRCP<mj_part_t> partId;
5815  //partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
5816 
5817  if (this->mj_keep_part_boxes){
5818  this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
5819 
5820  }
5821 
5822  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
5823 }
5824 
5827 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5828  typename mj_part_t>
5829 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::free_work_memory(){
5830  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Free");
5831 
5832  for (int i=0; i < this->coord_dim; i++){
5833  freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5834  }
5835  freeArray<mj_scalar_t *>(this->mj_coordinates);
5836 
5837  for (int i=0; i < this->num_weights_per_coord; i++){
5838  freeArray<mj_scalar_t>(this->mj_weights[i]);
5839  }
5840  freeArray<mj_scalar_t *>(this->mj_weights);
5841 
5842  freeArray<int>(this->owner_of_coordinate);
5843 
5844  for(int i = 0; i < this->num_threads; ++i){
5845  freeArray<mj_lno_t>(this->thread_point_counts[i]);
5846  }
5847 
5848  freeArray<mj_lno_t *>(this->thread_point_counts);
5849  freeArray<double *> (this->thread_part_weight_work);
5850 
5851  if(this->distribute_points_on_cut_lines){
5852  freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
5853  for(int i = 0; i < this->num_threads; ++i){
5854  freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
5855  }
5856  freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
5857  freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
5858  freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
5859  }
5860 
5861  freeArray<mj_part_t>(this->my_incomplete_cut_count);
5862 
5863  freeArray<mj_scalar_t>(this->max_min_coords);
5864 
5865  freeArray<mj_lno_t>(this->part_xadj);
5866 
5867  freeArray<mj_lno_t>(this->coordinate_permutations);
5868 
5869  freeArray<mj_lno_t>(this->new_coordinate_permutations);
5870 
5871  freeArray<mj_scalar_t>(this->all_cut_coordinates);
5872 
5873  freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
5874 
5875  freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
5876 
5877  freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
5878 
5879  freeArray<mj_scalar_t>(this->target_part_weights);
5880 
5881  freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
5882 
5883  freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
5884 
5885  freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
5886  freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
5887  freeArray<bool>(this->is_cut_line_determined);
5888  freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
5889  freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
5890 
5891  for(int i = 0; i < this->num_threads; ++i){
5892  freeArray<double>(this->thread_part_weights[i]);
5893  freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
5894  freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
5895  }
5896 
5897  freeArray<double *>(this->thread_part_weights);
5898  freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
5899  freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
5900 
5901  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Free");
5902 }
5903 
5912 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5913  typename mj_part_t>
5914 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_partitioning_parameters(
5915  bool distribute_points_on_cut_lines_,
5916  int max_concurrent_part_calculation_,
5917  int check_migrate_avoid_migration_option_,
5918  double minimum_migration_imbalance_,
5919  int migration_type_ ){
5920  this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
5921  this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
5922  this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
5923  this->minimum_migration_imbalance = minimum_migration_imbalance_;
5924  this->migration_type = migration_type_;
5925 
5926 }
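// Editorial usage sketch (not part of the original source; the values are
// illustrative only):
#if 0
AlgMJ<double, int, long long, int> mj;
mj.set_partitioning_parameters(
    true,  // distribute_points_on_cut_lines_: allow rectilinear splitting
    4,     // max_concurrent_part_calculation_
    1,     // check_migrate_avoid_migration_option_ (>= 0 enables the check)
    0.35,  // minimum_migration_imbalance_
    -1);   // migration_type_
#endif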
5927 
5928 
5929 
5930 
5959 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5960  typename mj_part_t>
5961 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::multi_jagged_part(
5962 
5963  const RCP<const Environment> &env,
5964  RCP<const Comm<int> > &problemComm,
5965 
5966  double imbalance_tolerance_,
5967  size_t num_global_parts_,
5968  mj_part_t *part_no_array_,
5969  int recursion_depth_,
5970 
5971  int coord_dim_,
5972  mj_lno_t num_local_coords_,
5973  mj_gno_t num_global_coords_,
5974  const mj_gno_t *initial_mj_gnos_,
5975  mj_scalar_t **mj_coordinates_,
5976 
5977  int num_weights_per_coord_,
5978  bool *mj_uniform_weights_,
5979  mj_scalar_t **mj_weights_,
5980  bool *mj_uniform_parts_,
5981  mj_scalar_t **mj_part_sizes_,
5982 
5983  mj_part_t *&result_assigned_part_ids_,
5984  mj_gno_t *&result_mj_gnos_
5985 )
5986 {
5987 
5988 
5989 
5990 #ifdef print_debug
5991  if(comm->getRank() == 0){
5992  std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
5993  std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
5994  std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
5995  }
5996 #endif
5997  this->mj_env = env;
5998  this->mj_problemComm = problemComm;
5999  this->myActualRank = this->myRank = this->mj_problemComm->getRank();
6000 
6001  /*
6002  if (0)
6003  {
6004  int a = rand();
6005  this->mj_problemComm->broadcast(0, sizeof(int), (char *) (&a));
6006  std::string istring = "output_" + Teuchos::toString<int>(a) + "_" + Teuchos::toString<int>(myRank) + ".mtx";
6007 
6008  std::ofstream output(istring.c_str());
6009  output << num_local_coords_ << " " << coord_dim_ << std::endl;
6010  for (int j = 0; j < coord_dim_ ; ++j){
6011  for (size_t i = 0; i < num_local_coords_; ++i){
6012  output << mj_coordinates_[j][i] << std::endl;
6013  }
6014 
6015  }
6016  output.close();
6017  }
6018  */
6019 
6020 
6021  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Total");
6022  this->mj_env->debug(3, "In MultiJagged Jagged");
6023 
6024  {
6025  this->imbalance_tolerance = imbalance_tolerance_;
6026  this->num_global_parts = num_global_parts_;
6027  this->part_no_array = part_no_array_;
6028  this->recursion_depth = recursion_depth_;
6029 
6030  this->coord_dim = coord_dim_;
6031  this->num_local_coords = num_local_coords_;
6032  this->num_global_coords = num_global_coords_;
6033  this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
6034  this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_; //will copy the memory to this->current_mj_gnos[j].
6035 
6036  this->num_weights_per_coord = num_weights_per_coord_;
6037  this->mj_uniform_weights = mj_uniform_weights_;
6038  this->mj_weights = mj_weights_; //will copy the memory to this->mj_weights
6039  this->mj_uniform_parts = mj_uniform_parts_;
6040  this->mj_part_sizes = mj_part_sizes_;
6041 
6042  this->num_threads = 1;
6043 #ifdef HAVE_ZOLTAN2_OMP
6044 #pragma omp parallel
6045 
6046  {
6047  this->num_threads = omp_get_num_threads();
6048  }
6049 #endif
6050  }
6051  //this->set_input_data();
6052  this->set_part_specifications();
6053 
6054  this->allocate_set_work_memory();
6055 
6056  //We duplicate the comm as we create subcommunicators during migration.
6057  //We keep the problemComm as it is, while comm changes after each migration.
6058  this->comm = this->mj_problemComm->duplicate();
6059 
6060  //initially there is a single partition
6061  mj_part_t current_num_parts = 1;
6062  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
6063 
6064  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
6065 
6066  mj_part_t output_part_begin_index = 0;
6067  mj_part_t future_num_parts = this->total_num_part;
6068  bool is_data_ever_migrated = false;
6069 
6070  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
6071  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
6072  next_future_num_parts_in_parts->push_back(this->num_global_parts);
6073 
6074  RCP<mj_partBoxVector_t> input_part_boxes(new mj_partBoxVector_t(), true) ;
6075  RCP<mj_partBoxVector_t> output_part_boxes(new mj_partBoxVector_t(), true);
6076 
6077  compute_global_box();
6078  if(this->mj_keep_part_boxes){
6079  this->init_part_boxes(output_part_boxes);
6080  }
6081 
6082  for (int i = 0; i < this->recursion_depth; ++i){
6083  //partitioning array. its size equals the current number of parts, and it
6084  //holds how many parts each part will be split into in the current dimension's partitioning.
6085  std::vector <mj_part_t> num_partitioning_in_current_dim;
6086 
6087  //number of parts that will be obtained at the end of this partitioning.
6088  //future_num_part_in_parts has one entry per current part and
6089  //holds how many more parts each should be divided into in the further
6090  //iterations. this is used to calculate num_partitioning_in_current_dim,
6091  //the number of parts that each part will be partitioned into
6092  //in the current dimension's partitioning.
6093 
6094  //next_future_num_parts_in_parts will have one entry per output part,
6095  //and will hold how many more parts each output part
6096  //should be divided into. this array is also used to determine the weight
6097  //ratios of the parts.
6098  //swap the arrays to use them iteratively.
6099  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
6100  future_num_part_in_parts = next_future_num_parts_in_parts;
6101  next_future_num_parts_in_parts = tmpPartVect;
6102 
6103  //clear next_future_num_parts_in_parts array as
6104  //getPartitionArrays expects it to be empty.
6105  //it also expects num_partitioning_in_current_dim to be empty.
6106  next_future_num_parts_in_parts->clear();
6107 
6108  if(this->mj_keep_part_boxes){
6109  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6110  input_part_boxes = output_part_boxes;
6111  output_part_boxes = tmpPartBoxes;
6112  output_part_boxes->clear();
6113  }
6114 
6115  //returns the total no. of output parts for this dimension partitioning.
6116  mj_part_t output_part_count_in_dimension =
6117  this->update_part_num_arrays(
6118  num_partitioning_in_current_dim,
6119  future_num_part_in_parts,
6120  next_future_num_parts_in_parts,
6121  future_num_parts,
6122  current_num_parts,
6123  i,
6124  input_part_boxes,
6125  output_part_boxes, 1);
6126 
6127  //if the number of obtained parts equals the current number of parts,
6128  //skip this dimension. for example, this happens when a 1 appears
6129  //in the input part array, e.g., P=4,5,1,2.
6130  if(output_part_count_in_dimension == current_num_parts) {
6131  //still need to swap the input and output arrays.
6132  tmpPartVect= future_num_part_in_parts;
6133  future_num_part_in_parts = next_future_num_parts_in_parts;
6134  next_future_num_parts_in_parts = tmpPartVect;
6135 
6136  if(this->mj_keep_part_boxes){
6137  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6138  input_part_boxes = output_part_boxes;
6139  output_part_boxes = tmpPartBoxes;
6140  }
6141  continue;
6142  }
6143 
6144 
6145  //get the coordinate axis along which the partitioning will be done.
6146  int coordInd = i % this->coord_dim;
6147  mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
6148 
6149  //convert i to string to be used for debugging purposes.
6150  std::string istring = Teuchos::toString<int>(i);
6151  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
6152 
6153  //allocate memory for the indices
6154  //of the parts in the permutation array.
6155  this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
6156 
6157  //the index in new_part_xadj where the next value will be written.
6158  mj_part_t output_part_index = 0;
6159  //whatever is written at output_part_index will be shifted by output_coordinate_end_index
6160  //so that the point ranges are offset correctly.
6161  mj_part_t output_coordinate_end_index = 0;
6162 
6163  mj_part_t current_work_part = 0;
6164  mj_part_t current_concurrent_num_parts =
6165  std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
6166 
6167  mj_part_t obtained_part_index = 0;
6168 
6169  //run for all available parts.
6170  for (; current_work_part < current_num_parts;
6171  current_work_part += current_concurrent_num_parts){
6172 
6173  current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
6174  this->max_concurrent_part_calculation);
6175 
6176  mj_part_t actual_work_part_count = 0;
6177  //initialization for 1D partitioning.
6178  //get the min and max coordinates of each part
6179  //together with the total weight of each part.
6180  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
6181  mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
6182 
6183  //if this part won't be partitioned any further,
6184  //don't do any work for this part.
6185  if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
6186  continue;
6187  }
6188  ++actual_work_part_count;
6189  mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
6190  mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
6191 
6192 /*
6193  std::cout << "i:" << i << " j:" << current_work_part + kk
6194  << " coordinate_begin_index:" << coordinate_begin_index
6195  << " coordinate_end_index:" << coordinate_end_index
6196  << " total:" << coordinate_end_index - coordinate_begin_index<< std::endl;
6197  */
6198  this->mj_get_local_min_max_coord_totW(
6199  coordinate_begin_index,
6200  coordinate_end_index,
6201  this->coordinate_permutations,
6202  mj_current_dim_coords,
6203  this->process_local_min_max_coord_total_weight[kk], //min_coordinate
6204  this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max_coordinate
6205  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]); //total_weight
6206 
6207  }
6208 
6209  //1D partitioning
6210  if (actual_work_part_count > 0){
6211  //obtain global Min max of the part.
6212  this->mj_get_global_min_max_coord_totW(
6213  current_concurrent_num_parts,
6214  this->process_local_min_max_coord_total_weight,
6215  this->global_min_max_coord_total_weight);
6216 
6217  //represents the total number of cutlines
6218  //whose coordinate should be determined.
6219  mj_part_t total_incomplete_cut_count = 0;
6220 
6221  //Compute weight ratios for parts & cuts:
6222  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
6223  //part0 cut0 part1 cut1 part2 cut2 part3
6224  mj_part_t concurrent_part_cut_shift = 0;
6225  mj_part_t concurrent_part_part_shift = 0;
6226  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
6227  mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
6228  mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
6229  current_concurrent_num_parts];
6230 
6231  mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
6232  2 * current_concurrent_num_parts];
6233 
6234  mj_part_t concurrent_current_part_index = current_work_part + kk;
6235 
6236  mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
6237 
6238  mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
6239  mj_scalar_t *current_target_part_weights = this->target_part_weights +
6240  concurrent_part_part_shift;
6241  //shift the usedCutCoordinate array by the number of cuts.
6242  concurrent_part_cut_shift += partition_count - 1;
6243  //shift the target part weights array by the number of parts.
6244  concurrent_part_part_shift += partition_count;
6245 
6246 
6247  //calculate only if part is not empty,
6248  //and part will be further partitioned.
6249  if(partition_count > 1 && min_coordinate <= max_coordinate){
6250 
6251  //increase total_incomplete_cut_count by the number of cut lines
6252  //of the current part.
6253  total_incomplete_cut_count += partition_count - 1;
6254  //set the number of cut lines that should be determined
6255  //for this part.
6256  this->my_incomplete_cut_count[kk] = partition_count - 1;
6257 
6258  //get the target weights of the parts.
6259  this->mj_get_initial_cut_coords_target_weights(
6260  min_coordinate,
6261  max_coordinate,
6262  partition_count - 1,
6263  global_total_weight,
6264  usedCutCoordinate,
6265  current_target_part_weights,
6266  future_num_part_in_parts,
6267  next_future_num_parts_in_parts,
6268  concurrent_current_part_index,
6269  obtained_part_index);
6270 
6271  mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
6272  mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
6273 
6274  //get the initial estimated part assignments of the
6275  //coordinates.
6276  this->set_initial_coordinate_parts(
6277  max_coordinate,
6278  min_coordinate,
6279  concurrent_current_part_index,
6280  coordinate_begin_index, coordinate_end_index,
6281  this->coordinate_permutations,
6282  mj_current_dim_coords,
6283  this->assigned_part_ids,
6284  partition_count);
6285  }
6286  else {
6287  // e.g., if we have fewer coordinates than parts, we don't need to do the next dim.
6288  this->my_incomplete_cut_count[kk] = 0;
6289  }
6290  obtained_part_index += partition_count;
6291  }
6292 
6293 
6294 
6295  //the imbalance used; it is always 0, as it is difficult to
6296  //estimate a range.
6297  double used_imbalance = 0;
6298 
6299 
6300  // Determine cut lines for all concurrent parts here.
6301  this->mj_1D_part(
6302  mj_current_dim_coords,
6303  used_imbalance,
6304  current_work_part,
6305  current_concurrent_num_parts,
6306  current_cut_coordinates,
6307  total_incomplete_cut_count,
6308  num_partitioning_in_current_dim);
6309  }
6310 
6311  //create new part chunks
6312  {
6313  mj_part_t output_array_shift = 0;
6314  mj_part_t cut_shift = 0;
6315  size_t tlr_shift = 0;
6316  size_t partweight_array_shift = 0;
6317 
6318  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
6319  mj_part_t current_concurrent_work_part = current_work_part + kk;
6320  mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
6321 
6322  //if the part is empty, skip the part.
6323  if((num_parts != 1 )
6324  &&
6325  this->global_min_max_coord_total_weight[kk] >
6326  this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
6327 
6328  //we still need to write the begin and end points of the
6329  //empty part. simply set them to zero; the array indices will be shifted later.
6330  for(mj_part_t jj = 0; jj < num_parts; ++jj){
6331  this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
6332  }
6333  cut_shift += num_parts - 1;
6334  tlr_shift += (4 *(num_parts - 1) + 1);
6335  output_array_shift += num_parts;
6336  partweight_array_shift += (2 * (num_parts - 1) + 1);
6337  continue;
6338  }
6339 
6340  mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
6341  mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
6342  current_concurrent_work_part -1];
6343  mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
6344  mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
6345  cut_shift;
6346 
6347  //mj_scalar_t *used_tlr_array = this->total_part_weight_left_right_closests + tlr_shift;
6348 
6349  for(int ii = 0; ii < this->num_threads; ++ii){
6350  this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
6351  }
6352 
6353  if(num_parts > 1){
6354  if(this->mj_keep_part_boxes){
6355  //if part boxes are to be stored update the boundaries.
6356  for (mj_part_t j = 0; j < num_parts - 1; ++j){
6357  (*output_part_boxes)[output_array_shift + output_part_index +
6358  j].updateMinMax(current_concurrent_cut_coordinate[j], 1
6359  /*update max*/, coordInd);
6360 
6361  (*output_part_boxes)[output_array_shift + output_part_index + j +
6362  1].updateMinMax(current_concurrent_cut_coordinate[j], 0
6363  /*update min*/, coordInd);
6364  }
6365  }
6366 
6367  // Rewrite the indices based on the computed cuts.
6368  this->mj_create_new_partitions(
6369  num_parts,
6370  mj_current_dim_coords,
6371  current_concurrent_cut_coordinate,
6372  coordinate_begin,
6373  coordinate_end,
6374  used_local_cut_line_weight_to_left,
6375  this->thread_part_weight_work,
6376  this->new_part_xadj + output_part_index + output_array_shift
6377  );
6378 
6379  }
6380  else {
6381  //if this part is partitioned into 1 then just copy
6382  //the old values.
6383  mj_lno_t part_size = coordinate_end - coordinate_begin;
6384  *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
6385  memcpy(
6386  this->new_coordinate_permutations + coordinate_begin,
6387  this->coordinate_permutations + coordinate_begin,
6388  part_size * sizeof(mj_lno_t));
6389  }
6390  cut_shift += num_parts - 1;
6391  tlr_shift += (4 *(num_parts - 1) + 1);
6392  output_array_shift += num_parts;
6393  partweight_array_shift += (2 * (num_parts - 1) + 1);
6394  }
6395 
6396  //shift cut coordinates so that all cut coordinates are stored.
6397  //no shift now because we don't keep the cuts.
6398  //current_cut_coordinates += cut_shift;
6399 
6400  //mj_create_new_partitions partitioned the coordinates and wrote
6401  //the indices as if each were a single part.
6402  //Now we need to shift the beginning indices.
6403  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
6404  mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
6405  for (mj_part_t ii = 0;ii < num_parts ; ++ii){
6406  //shift it by previousCount
6407  this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
6408  }
6409  //increase the previous count by current end.
6410  output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
6411  //advance the output part index.
6412  output_part_index += num_parts ;
6413  }
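 // Illustrative example (not part of the original source): suppose two
 // concurrent parts were split into 2 and 3 parts, mj_create_new_partitions
 // wrote local xadj values [3,5] and [2,4,6], and output_coordinate_end_index
 // was 10. The shift above turns them into [13,15] and [17,19,21], keeping
 // new_part_xadj cumulative across all concurrent parts.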
6414  }
6415  }
6416  // end of this partitioning dimension
6417 
6418 
6419  int current_world_size = this->comm->getSize();
6420  long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
6421 
6422 
6423  bool is_migrated_in_current_dimension = false;
6424 
6425  //we migrate if there are more partitionings to be done after this step,
6426  //if migration is not forced to be avoided,
6427  //and if the operation is not sequential (more than one rank).
6428  if (future_num_parts > 1 &&
6429  this->check_migrate_avoid_migration_option >= 0 &&
6430  current_world_size > 1){
6431 
6432  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
6433  mj_part_t num_parts = output_part_count_in_dimension;
6434  if ( this->mj_perform_migration(
6435  num_parts,
6436  current_num_parts, //output
6437  next_future_num_parts_in_parts, //output
6438  output_part_begin_index,
6439  migration_reduce_all_population,
6440  this->num_global_coords / (future_num_parts * current_num_parts),
6441  istring,
6442  input_part_boxes, output_part_boxes) ) {
6443  is_migrated_in_current_dimension = true;
6444  is_data_ever_migrated = true;
6445  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" +
6446  istring);
6447  //since data is migrated, we reduce the number of reduceAll operations for the last part.
6448  this->total_dim_num_reduce_all /= num_parts;
6449  }
6450  else {
6451  is_migrated_in_current_dimension = false;
6452  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
6453  }
6454  }
6455 
6456  //swap the coordinate permutations for the next dimension.
6457  mj_lno_t * tmp = this->coordinate_permutations;
6458  this->coordinate_permutations = this->new_coordinate_permutations;
6459  this->new_coordinate_permutations = tmp;
6460 
6461  if(!is_migrated_in_current_dimension){
6462  this->total_dim_num_reduce_all -= current_num_parts;
6463  current_num_parts = output_part_count_in_dimension;
6464  }
6465  freeArray<mj_lno_t>(this->part_xadj);
6466  this->part_xadj = this->new_part_xadj;
6467  this->new_part_xadj = NULL;
6468  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
6469  }
6470 
6471  // Partitioning is done
6472  delete future_num_part_in_parts;
6473  delete next_future_num_parts_in_parts;
6474 
6475  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
6477 
6478 
6479  //get the final parts of each initial coordinate
6480  //the results will be written to
6481  //this->assigned_part_ids for gnos given in this->current_mj_gnos
6482  this->set_final_parts(
6483  current_num_parts,
6484  output_part_begin_index,
6485  output_part_boxes,
6486  is_data_ever_migrated);
6487 
6488  result_assigned_part_ids_ = this->assigned_part_ids;
6489  result_mj_gnos_ = this->current_mj_gnos;
6490 
6491  this->free_work_memory();
6492  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Total");
6493  this->mj_env->debug(3, "Out of MultiJagged");
6494 
6495 }
6496 
6497 
6501 template <typename Adapter>
6502 class Zoltan2_AlgMJ : public Algorithm<Adapter>
6503 {
6504 private:
6505 
6506 #ifndef DOXYGEN_SHOULD_SKIP_THIS
6507 
6508  typedef CoordinateModel<typename Adapter::base_adapter_t> coordinateModel_t;
6509 
6510  // For coordinates and weights, MJ needs floats or doubles
6511  // But Adapter can provide other scalars, e.g., ints.
6512  // So have separate scalar_t for MJ and adapter.
6513  typedef typename Adapter::scalar_t adapter_scalar_t;
6514 
6515  // Provide a default type for mj_scalar_t;
6516  typedef float default_mj_scalar_t;
6517 
6518  // If Adapter provided float or double scalar_t, use it (prevents copies).
6519  // Otherwise, use the default type of mj_scalar_t;
6520  typedef typename
6521  std::conditional<
6522  (std::is_same<adapter_scalar_t, float>::value ||
6523  std::is_same<adapter_scalar_t, double>::value),
6524  adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
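 // For example, if Adapter::scalar_t is int, mj_scalar_t falls back to float
 // (default_mj_scalar_t); if it is double, mj_scalar_t is double and the
 // coordinate data can be used without copying.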
6525 
6526  typedef typename Adapter::gno_t mj_gno_t;
6527  typedef typename Adapter::lno_t mj_lno_t;
6528  typedef typename Adapter::node_t mj_node_t;
6529  typedef typename Adapter::part_t mj_part_t;
6530  typedef coordinateModelPartBox mj_partBox_t;
6531  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
6532 #endif
6534 
6535  RCP<const Environment> mj_env; //the environment object
6536  RCP<const Comm<int> > mj_problemComm; //initial comm object
6537  RCP<const coordinateModel_t> mj_coords; //coordinate adapter
6538 
6539  //PARAMETERS
6540  double imbalance_tolerance; //input imbalance tolerance.
6541  size_t num_global_parts; //the targeted number of parts
6542  mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
6543  int recursion_depth; //the number of steps in which the partitioning will be solved.
6544 
6545  int coord_dim; // coordinate dimension.
6546  mj_lno_t num_local_coords; //number of local coords.
6547  mj_gno_t num_global_coords; //number of global coords.
6548  const mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
6549  mj_scalar_t **mj_coordinates; //two dimension coordinate array
6550 
6551  int num_weights_per_coord; // number of weights per coordinate
6552  bool *mj_uniform_weights; //if the coordinates have uniform weights.
6553  mj_scalar_t **mj_weights; //two dimensional weight array
6554  bool *mj_uniform_parts; //if the target parts are uniform
6555  mj_scalar_t **mj_part_sizes; //target part weight sizes.
6556 
6557  bool distribute_points_on_cut_lines; //if partitioning can distribute points with the same coordinate to different parts.
6558  mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
6559  int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
6560  int migration_type; // when migrating, 0 aims for perfect load balance,
6561  //1 for minimized messages
6562  double minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
6563  bool mj_keep_part_boxes; //if the boxes need to be kept.
6564 
6565  int num_threads;
6566 
6567  bool mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
6568  int mj_premigration_option;
6569  int min_coord_per_rank_for_premigration;
6570 
6571  ArrayRCP<mj_part_t> comXAdj_; //communication graph xadj
6572  ArrayRCP<mj_part_t> comAdj_; //communication graph adj.
6573 
6574 
6575  //when we have strided data, it is returned as unstrided data in RCP form.
6576  //we need to hold on to that data during the execution of mj, so that the data is not released.
6577  //coordinate_ArrayRCP_holder will hold that data, and release it when MJ is deleted.
6578  ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;
6579 
6580  void set_up_partitioning_data(
6581  const RCP<PartitioningSolution<Adapter> >&solution);
6582 
6583  void set_input_parameters(const Teuchos::ParameterList &p);
6584 
6585  void free_work_memory();
6586 
6587  RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
6588 
6589  bool mj_premigrate_to_subset(int used_num_ranks, int migration_selection_option,
6590  RCP<const Environment> mj_env_,
6591  RCP<const Comm<int> > mj_problemComm_,
6592  int coord_dim_,
6593  mj_lno_t num_local_coords_,
6594  mj_gno_t num_global_coords_, size_t num_global_parts_,
6595  const mj_gno_t *initial_mj_gnos_,
6596  mj_scalar_t **mj_coordinates_,
6597  int num_weights_per_coord_,
6598  mj_scalar_t **mj_weights_,
6599  //results
6600  RCP<const Comm<int> > &result_problemComm_,
6601  mj_lno_t & result_num_local_coords_,
6602  mj_gno_t * &result_initial_mj_gnos_,
6603  mj_scalar_t ** &result_mj_coordinates_,
6604  mj_scalar_t ** &result_mj_weights_,
6605  int * &result_actual_owner_rank_);
6606 
6607 public:
6608 
6609  Zoltan2_AlgMJ(const RCP<const Environment> &env,
6610  RCP<const Comm<int> > &problemComm,
6611  const RCP<const coordinateModel_t> &coords) :
6612  mj_partitioner(), mj_env(env),
6613  mj_problemComm(problemComm),
6614  mj_coords(coords),
6615  imbalance_tolerance(0),
6616  num_global_parts(1), part_no_array(NULL),
6617  recursion_depth(0),
6618  coord_dim(0),num_local_coords(0), num_global_coords(0),
6619  initial_mj_gnos(NULL), mj_coordinates(NULL),
6620  num_weights_per_coord(0),
6621  mj_uniform_weights(NULL), mj_weights(NULL),
6622  mj_uniform_parts(NULL),
6623  mj_part_sizes(NULL),
6624  distribute_points_on_cut_lines(true),
6625  max_concurrent_part_calculation(1),
6626  check_migrate_avoid_migration_option(0), migration_type(0),
6627  minimum_migration_imbalance(0.30),
6628  mj_keep_part_boxes(false), num_threads(1), mj_run_as_rcb(false),mj_premigration_option(0), min_coord_per_rank_for_premigration(32000),
6629  comXAdj_(), comAdj_(), coordinate_ArrayRCP_holder (NULL)
6630  {}
6631 
6633  if (coordinate_ArrayRCP_holder != NULL){
6634  delete [] this->coordinate_ArrayRCP_holder;
6635  this->coordinate_ArrayRCP_holder = NULL;
6636  }
6637  }
6638 
6641  static void getValidParameters(ParameterList & pl)
6642  {
6643  const bool bUnsorted = true; // this clarifies the flag is for unsorted
6644  RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
6645  Teuchos::rcp( new Zoltan2::IntegerRangeListValidator<int>(bUnsorted) );
6646  pl.set("mj_parts", "0", "list of parts for multiJagged partitioning "
6647  "algorithm. As many as the dimension count.", mj_parts_Validator);
6648 
6649  pl.set("mj_concurrent_part_count", 1, "The number of parts whose cut "
6650  "coordinates will be calculated concurently.", Environment::getAnyIntValidator());
6651 
6652  pl.set("mj_minimum_migration_imbalance", 1.1,
6653  "mj_minimum_migration_imbalance, the minimum imbalance of the "
6654  "processors to avoid migration",
6656 
6657  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
6658  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 2) );
6659  pl.set("mj_migration_option", 1, "Migration option, 0 for decision "
6660  "depending on the imbalance, 1 for forcing migration, 2 for "
6661  "avoiding migration", mj_migration_option_validator);
6662 
6663 
6664 
6665 
6666  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
6667  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1) );
6668  pl.set("mj_migration_type", 0, "Migration type, 0 for migration to minimize the imbalance "
6669  "1 for migration to minimize messages exchanged the migration." ,
6670  mj_migration_option_validator);
6671 
6672  // bool parameter
6673  pl.set("mj_keep_part_boxes", false, "Keep the part boundaries of the "
6674  "geometric partitioning.", Environment::getBoolValidator());
6675 
6676  // bool parameter
6677  pl.set("mj_enable_rcb", false, "Use MJ as RCB.",
6678  Environment::getBoolValidator());
6679 
6680  pl.set("mj_recursion_depth", -1, "Recursion depth for MJ: Must be "
6681  "greater than 0.", Environment::getAnyIntValidator());
6682 
6683  RCP<Teuchos::EnhancedNumberValidator<int>> mj_premigration_option_validator =
6684  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
6685 
6686  pl.set("mj_premigration_option", 0, "Whether to do premigration or not. 0 for no migration "
6687  "x > 0 for migration to consecutive processors, the subset will be 0,x,2x,3x,...subset ranks."
6688  , mj_premigration_option_validator);
6689 
6690  pl.set("mj_premigration_coordinate_count", 32000, "How many coordinate to assign each rank in multijagged after premigration"
6692 
6693  }
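 // Illustrative usage sketch (not part of the original source; assumes a
 // plain Teuchos::ParameterList, with parameter names as registered above):
 //   Teuchos::ParameterList pl;
 //   pl.set("mj_parts", "2,2,2");          // 8 parts over a 3-step recursion
 //   pl.set("mj_concurrent_part_count", 2);
 //   pl.set("mj_keep_part_boxes", true);   // required for pointAssign/boxAssign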
6694 
6701  void partition(const RCP<PartitioningSolution<Adapter> > &solution);
6702 
6703  mj_partBoxVector_t &getPartBoxesView() const
6704  {
6705  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6706  return *pBoxes;
6707  }
6708 
6709  mj_part_t pointAssign(int dim, adapter_scalar_t *point) const;
6710 
6711  void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
6712  size_t &nPartsFound, mj_part_t **partsFound) const;
6713 
6714 
6717  void getCommunicationGraph(
6718  const PartitioningSolution<Adapter> *solution,
6719  ArrayRCP<mj_part_t> &comXAdj,
6720  ArrayRCP<mj_part_t> &comAdj);
6721 };
6722 
6723 
6724 
6725 
6726 template <typename Adapter>
6727 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset( int used_num_ranks,
6728  int /* migration_selection_option */,
6729  RCP<const Environment> mj_env_,
6730  RCP<const Comm<int> > mj_problemComm_,
6731  int coord_dim_,
6732  mj_lno_t num_local_coords_,
6733  mj_gno_t /* num_global_coords_ */, size_t /* num_global_parts_ */,
6734  const mj_gno_t *initial_mj_gnos_,
6735  mj_scalar_t **mj_coordinates_,
6736  int num_weights_per_coord_,
6737  mj_scalar_t **mj_weights_,
6738  //results
6739  RCP<const Comm<int> > &result_problemComm_,
6740  mj_lno_t &result_num_local_coords_,
6741  mj_gno_t * &result_initial_mj_gnos_,
6742  mj_scalar_t ** &result_mj_coordinates_,
6743  mj_scalar_t ** &result_mj_weights_,
6744  int * &result_actual_owner_rank_){
6745  mj_env_->timerStart(MACRO_TIMERS, "MultiJagged - PreMigration DistributorPlanCreating");
6746 
6747 
6748  int myRank = mj_problemComm_->getRank();
6749  int worldSize = mj_problemComm_->getSize();
6750 
6751  mj_part_t groupsize = worldSize / used_num_ranks;
6752 
6753  //std::cout << "used_num_ranks:" << used_num_ranks << " groupsize:" << groupsize << std::endl;
6754 
6755  std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
6756 
6757  mj_part_t i_am_sending_to = 0;
6758  bool am_i_a_receiver = false;
6759 
6760  for(int i = 0; i < used_num_ranks; ++i){
6761  group_begins[i+ 1] = group_begins[i] + groupsize;
6762  if (worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
6763  if (i == used_num_ranks - 1) group_begins[i+ 1] = worldSize;
6764  if (myRank >= group_begins[i] && myRank < group_begins[i + 1]) i_am_sending_to = group_begins[i];
6765  if (myRank == group_begins[i]) am_i_a_receiver= true;
6766  }
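 // Worked example (illustrative): worldSize = 10 and used_num_ranks = 3 give
 // groupsize = 3 with remainder 1, so group_begins = [0, 4, 7, 10]. Ranks
 // 0-3 send to rank 0, ranks 4-6 to rank 4, ranks 7-9 to rank 7, and ranks
 // 0, 4, 7 are the receivers that form the subcommunicator below.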
6767 
6768  ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
6769  result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
6770 
6771 
6772  Tpetra::Distributor distributor(mj_problemComm_);
6773 
6774  std::vector<mj_part_t> coordinate_destinations(num_local_coords_, i_am_sending_to);
6775  ArrayView<const mj_part_t> destinations( &(coordinate_destinations[0]), num_local_coords_);
6776  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6777  result_num_local_coords_ = num_incoming_gnos;
6778  mj_env_->timerStop(MACRO_TIMERS, "MultiJagged - PreMigration DistributorPlanCreating");
6779 
6780  mj_env_->timerStart(MACRO_TIMERS, "MultiJagged - PreMigration DistributorMigration");
6781 
6782  //migrate gnos.
6783  {
6784  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
6785 
6786  ArrayView<const mj_gno_t> sent_gnos(initial_mj_gnos_, num_local_coords_);
6787  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
6788 
6789  result_initial_mj_gnos_ = allocMemory<mj_gno_t>(num_incoming_gnos);
6790  memcpy(
6791  result_initial_mj_gnos_,
6792  received_gnos.getRawPtr(),
6793  num_incoming_gnos * sizeof(mj_gno_t));
6794  }
6795 
6796  //migrate coordinates
6797  result_mj_coordinates_ = allocMemory<mj_scalar_t *>(coord_dim_);
6798  for (int i = 0; i < coord_dim_; ++i){
6799  ArrayView<const mj_scalar_t> sent_coord(mj_coordinates_[i], num_local_coords_);
6800  ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
6801  distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
6802  result_mj_coordinates_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
6803  memcpy(
6804  result_mj_coordinates_[i],
6805  received_coord.getRawPtr(),
6806  num_incoming_gnos * sizeof(mj_scalar_t));
6807  }
6808 
6809  result_mj_weights_ = allocMemory<mj_scalar_t *>(num_weights_per_coord_);
6810  //migrate weights.
6811  for (int i = 0; i < num_weights_per_coord_; ++i){
6812  ArrayView<const mj_scalar_t> sent_weight(mj_weights_[i], num_local_coords_);
6813  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6814  distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
6815  result_mj_weights_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
6816  memcpy(
6817  result_mj_weights_[i],
6818  received_weight.getRawPtr(),
6819  num_incoming_gnos * sizeof(mj_scalar_t));
6820  }
6821 
6822  //migrate the owners of the coordinates
6823  {
6824  std::vector<int> owner_of_coordinate(num_local_coords_, myRank);
6825  ArrayView<int> sent_owners(&(owner_of_coordinate[0]), num_local_coords_);
6826  ArrayRCP<int> received_owners(num_incoming_gnos);
6827  distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
6828  result_actual_owner_rank_ = allocMemory<int>(num_incoming_gnos);
6829  memcpy(
6830  result_actual_owner_rank_,
6831  received_owners.getRawPtr(),
6832  num_incoming_gnos * sizeof(int));
6833  }
6834  mj_env_->timerStop(MACRO_TIMERS, "MultiJagged - PreMigration DistributorMigration");
6835  return am_i_a_receiver;
6836 }
6837 
6838 
6839 
6840 
6841 
6842 
6843 
6853 template <typename Adapter>
6854 void Zoltan2_AlgMJ<Adapter>::partition(
6855  const RCP<PartitioningSolution<Adapter> > &solution
6856 )
6857 {
6858  this->set_up_partitioning_data(solution);
6859  this->set_input_parameters(this->mj_env->getParameters());
6860  if (this->mj_keep_part_boxes){
6861  this->mj_partitioner.set_to_keep_part_boxes();
6862  }
6863  this->mj_partitioner.set_partitioning_parameters(
6864  this->distribute_points_on_cut_lines,
6865  this->max_concurrent_part_calculation,
6866  this->check_migrate_avoid_migration_option,
6867  this->minimum_migration_imbalance, this->migration_type);
6868 
6869 
6870  RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
6871  mj_lno_t result_num_local_coords = this->num_local_coords;
6872  mj_gno_t * result_initial_mj_gnos = NULL;
6873  mj_scalar_t **result_mj_coordinates = this->mj_coordinates;
6874  mj_scalar_t **result_mj_weights = this->mj_weights;
6875  int *result_actual_owner_rank = NULL;
6876  const mj_gno_t * result_initial_mj_gnos_ = this->initial_mj_gnos;
6877 
6878  //TODO: MD 08/2017: Further discussion is required.
6879  //MueLu calls MJ when it has very few coordinates per processor, such as 10.
6880  //For example, it begins with 1K processors and 1K coordinates on each.
6881  //Then with coarsening this reduces to 10 coordinates per processor.
6882  //It calls MJ to repartition these coordinates to 10 parts.
6883  //MJ runs with 1K processors, 10 coordinates on each, and partitions to 10 parts.
6884  //As expected, strong scaling is a problem here, because computation is almost 0,
6885  //while the communication cost of MJ increases linearly.
6886  //The premigration option gathers the coordinates to 10 parts before MJ starts,
6887  //so MJ will run on a smaller subset of the problem.
6888  //Below, I am migrating the coordinates if mj_premigration_option is set,
6889  //the target part count is less than the current rank count, and the average
6890  //number of local coordinates is less than some threshold.
6891  //For example, premigration may not help if 1000 processors are partitioning data
6892  //into 10 parts but each of them already has 1M coordinates.
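 // Illustrative numbers for the check below (not from the source): with
 // current_world_size = 1024, num_global_parts = 10, num_global_coords = 10240
 // and threshold_num_local_coords = 32000, premigration triggers
 // (1024 > 10 and 10240 < 1024 * 32000), and used_num_ranks =
 // int(10240 / 32000.0 + 0.5) = 0 is bumped to 1, so all coordinates are
 // gathered onto a single rank before MJ runs.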
6893  int current_world_size = this->mj_problemComm->getSize();
6894  mj_lno_t threshold_num_local_coords = this->min_coord_per_rank_for_premigration;
6895  bool is_pre_migrated = false;
6896  bool am_i_in_subset = true;
6897  if ( mj_premigration_option > 0 &&
6898  size_t (current_world_size) > this->num_global_parts &&
6899  this->num_global_coords < mj_gno_t (current_world_size * threshold_num_local_coords)){
6900  if (this->mj_keep_part_boxes){
6901  throw std::logic_error("Multijagged: mj_keep_part_boxes and mj_premigration_option are not supported together yet.");
6902  }
6903  is_pre_migrated =true;
6904  int migration_selection_option = mj_premigration_option;
6905  if(migration_selection_option * this->num_global_parts > (size_t) (current_world_size)){
6906  migration_selection_option = current_world_size / this->num_global_parts;
6907  }
6908  int used_num_ranks = int (this->num_global_coords / float (threshold_num_local_coords) + 0.5);
6909  if (used_num_ranks == 0) used_num_ranks = 1;
6910 
6911  am_i_in_subset = this->mj_premigrate_to_subset(
6912  used_num_ranks,
6913  migration_selection_option,
6914  this->mj_env,
6915  this->mj_problemComm,
6916  this->coord_dim,
6917  this->num_local_coords,
6918  this->num_global_coords,
6919  this->num_global_parts,
6920  this->initial_mj_gnos,
6921  this->mj_coordinates,
6922  this->num_weights_per_coord,
6923  this->mj_weights,
6924  //results
6925  result_problemComm,
6926  result_num_local_coords,
6927  result_initial_mj_gnos,
6928  result_mj_coordinates,
6929  result_mj_weights,
6930  result_actual_owner_rank);
6931  result_initial_mj_gnos_ = result_initial_mj_gnos;
6932  }
6933 
6934 
6935 
6936  mj_part_t *result_assigned_part_ids = NULL;
6937  mj_gno_t *result_mj_gnos = NULL;
6938 
6939  if (am_i_in_subset){
6940  this->mj_partitioner.multi_jagged_part(
6941  this->mj_env,
6942  result_problemComm, //this->mj_problemComm,
6943 
6944  this->imbalance_tolerance,
6945  this->num_global_parts,
6946  this->part_no_array,
6947  this->recursion_depth,
6948 
6949  this->coord_dim,
6950  result_num_local_coords, //this->num_local_coords,
6951  this->num_global_coords,
6952  result_initial_mj_gnos_, //this->initial_mj_gnos,
6953  result_mj_coordinates, //this->mj_coordinates,
6954 
6955  this->num_weights_per_coord,
6956  this->mj_uniform_weights,
6957  result_mj_weights, //this->mj_weights,
6958  this->mj_uniform_parts,
6959  this->mj_part_sizes,
6960 
6961  result_assigned_part_ids,
6962  result_mj_gnos
6963  );
6964 
6965  }
6966 
6967  // Reorder results so that they match the order of the input
6968 
6969 #if defined(__cplusplus) && __cplusplus >= 201103L
6970  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
6971  localGidToLid.reserve(result_num_local_coords);
6972  for (mj_lno_t i = 0; i < result_num_local_coords; i++)
6973  localGidToLid[result_initial_mj_gnos_[i]] = i;
6974  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
6975  0, result_num_local_coords, true);
6976 
6977  for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
6978  mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
6979  partId[origLID] = result_assigned_part_ids[i];
6980  }
6981 
6982 #else
6983  Teuchos::Hashtable<mj_gno_t, mj_lno_t>
6984  localGidToLid(result_num_local_coords);
6985  for (mj_lno_t i = 0; i < result_num_local_coords; i++)
6986  localGidToLid.put(result_initial_mj_gnos_[i], i);
6987 
6988  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
6989  0, result_num_local_coords, true);
6990 
6991  for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
6992  mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
6993  partId[origLID] = result_assigned_part_ids[i];
6994  }
6995 
6996 #endif // C++11 is enabled
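 // Example of the reordering above (illustrative): if this rank entered MJ
 // owning gnos {42, 7, 19} in input order, localGidToLid maps 42->0, 7->1,
 // 19->2. If MJ returns result_mj_gnos = {7, 19, 42} with parts {3, 0, 5},
 // then partId = {5, 3, 0}, i.e., part ids in the original input order.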
6997 
6998  delete [] result_mj_gnos;
6999  delete [] result_assigned_part_ids;
7000 
7001 
7002  //now the results are reordered. But if premigration occurred,
7003  //then we need to send these ids to the actual owners again.
7004  if (is_pre_migrated){
7005  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - PostMigration DistributorPlanCreating");
7006  Tpetra::Distributor distributor(this->mj_problemComm);
7007 
7008  ArrayView<const mj_part_t> actual_owner_destinations( result_actual_owner_rank , result_num_local_coords);
7009  mj_lno_t num_incoming_gnos = distributor.createFromSends(actual_owner_destinations);
7010  if (num_incoming_gnos != this->num_local_coords){
7011  throw std::logic_error("Zoltan2 - Multijagged Post Migration - num incoming is not equal to num local coords");
7012  }
7013  mj_env->timerStop(MACRO_TIMERS, "MultiJagged - PostMigration DistributorPlanCreating");
7014  mj_env->timerStart(MACRO_TIMERS, "MultiJagged - PostMigration DistributorMigration");
7015  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
7016  ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
7017  {
7018  ArrayView<const mj_gno_t> sent_gnos(result_initial_mj_gnos_, result_num_local_coords);
7019  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
7020  }
7021  {
7022  ArrayView<mj_part_t> sent_partnos(partId());
7023  distributor.doPostsAndWaits<mj_part_t>(sent_partnos, 1, received_partids());
7024  }
7025  partId = arcp(new mj_part_t[this->num_local_coords],
7026  0, this->num_local_coords, true);
7027 
7028  {
7029 #if defined(__cplusplus) && __cplusplus >= 201103L
7030  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
7031  localGidToLid2.reserve(this->num_local_coords);
7032  for (mj_lno_t i = 0; i < this->num_local_coords; i++)
7033  localGidToLid2[this->initial_mj_gnos[i]] = i;
7034 
7035 
7036  for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
7037  mj_lno_t origLID = localGidToLid2[received_gnos[i]];
7038  partId[origLID] = received_partids[i];
7039  }
7040 
7041 #else
7042  Teuchos::Hashtable<mj_gno_t, mj_lno_t>
7043  localGidToLid2(this->num_local_coords);
7044  for (mj_lno_t i = 0; i < this->num_local_coords; i++)
7045  localGidToLid2.put(this->initial_mj_gnos[i], i);
7046 
7047 
7048  for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
7049  mj_lno_t origLID = localGidToLid2.get(received_gnos[i]);
7050  partId[origLID] = received_partids[i];
7051  }
7052 
7053 #endif // C++11 is enabled
7054 
7055  }
7056 
7057  {
7058  freeArray<mj_gno_t> (result_initial_mj_gnos);
7059  for (int i = 0; i < this->coord_dim; ++i){
7060  freeArray<mj_scalar_t> (result_mj_coordinates[i]);
7061  }
7062  freeArray<mj_scalar_t *> (result_mj_coordinates);
7063 
7064  for (int i = 0; i < this->num_weights_per_coord; ++i){
7065  freeArray<mj_scalar_t> (result_mj_weights[i]);
7066  }
7067  freeArray<mj_scalar_t *> (result_mj_weights);
7068  freeArray<int> (result_actual_owner_rank);
7069  }
7070  mj_env->timerStop(MACRO_TIMERS, "MultiJagged - PostMigration DistributorMigration");
7071 
7072  }
7073 
7074  solution->setParts(partId);
7075  this->free_work_memory();
7076 }
7077 
7078 /* \brief Frees the allocated memory.
7079  * */
7080 template <typename Adapter>
7081 void Zoltan2_AlgMJ<Adapter>::free_work_memory(){
7082  freeArray<mj_scalar_t *>(this->mj_coordinates);
7083  freeArray<mj_scalar_t *>(this->mj_weights);
7084  freeArray<bool>(this->mj_uniform_parts);
7085  freeArray<mj_scalar_t *>(this->mj_part_sizes);
7086  freeArray<bool>(this->mj_uniform_weights);
7087 
7088 }
7089 
7090 /* \brief Sets the partitioning data for the multijagged algorithm.
7091  * */
7092 template <typename Adapter>
7093 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
7094  const RCP<PartitioningSolution<Adapter> > &solution
7095 )
7096 {
7097  this->coord_dim = this->mj_coords->getCoordinateDim();
7098  this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
7099  this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
7100  this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
7101  int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
7102 
7103  // From the Solution we get part information.
7104  // If the part sizes for a given criteria are not uniform,
7105  // then they are values that sum to 1.0.
7106  this->num_global_parts = solution->getTargetGlobalNumberOfParts();
7107  //allocate only the two dimensional pointers.
7108  //raw pointer addresses will be obtained from the multivector.
7109  this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
7110  this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
7111 
7112  //if the partitioning results are to be uniform.
7113  this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
7114  //if uniform part is false in a criteria dimension, this holds the ratios of
7115  //the target part weights.
7116  this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
7117  //if the weights of coordinates are uniform in a criteria dimension.
7118  this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
7119 
7120  typedef StridedData<mj_lno_t, adapter_scalar_t> input_t;
7121  ArrayView<const mj_gno_t> gnos;
7122  ArrayView<input_t> xyz;
7123  ArrayView<input_t> wgts;
7124 
7125 
7126  this->coordinate_ArrayRCP_holder = new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
7127 
7128  this->mj_coords->getCoordinates(gnos, xyz, wgts);
7129  //obtain global ids.
7130  ArrayView<const mj_gno_t> mj_gnos = gnos;
7131  this->initial_mj_gnos = mj_gnos.getRawPtr();
7132 
7133  //extract coordinates from multivector.
7134  for (int dim=0; dim < this->coord_dim; dim++){
7135  ArrayRCP<const mj_scalar_t> ar;
7136  xyz[dim].getInputArray(ar); // will copy if stride != 1 or
7137  // adapter_scalar_t != mj_scalar_t
7138  this->coordinate_ArrayRCP_holder[dim] = ar;
7139 
7140  //multiJagged coordinate values assignment
7141  this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
7142  }
7143 
7144  //if no weights are provided set uniform weight.
7145  if (this->num_weights_per_coord == 0){
7146  this->mj_uniform_weights[0] = true;
7147  this->mj_weights[0] = NULL;
7148  }
7149  else{
7150  //if weights are provided get weights for all weight indices
7151  for (int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
7152  ArrayRCP<const mj_scalar_t> ar;
7153  wgts[wdim].getInputArray(ar); // will copy if stride!=1
7154  // or adapter_scalar_t !=
7155  // mj_scalar_t
7156  this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
7157  this->mj_uniform_weights[wdim] = false;
7158  this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
7159  }
7160  }
7161 
7162  for (int wdim = 0; wdim < criteria_dim; wdim++){
7163  if (solution->criteriaHasUniformPartSizes(wdim)){
7164  this->mj_uniform_parts[wdim] = true;
7165  this->mj_part_sizes[wdim] = NULL;
7166  }
7167  else{
7168  std::cerr << "MJ does not support non-uniform target part weights" << std::endl;
7169  exit(1);
7170  }
7171  }
7172 }
7173 
7174 /* \brief Sets the partitioning parameters for the multijagged algorithm.
7175  * \param pl is the parameter list provided to the zoltan2 call.
7176  * */
7177 template <typename Adapter>
7178 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(const Teuchos::ParameterList &pl){
7179 
7180  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
7181  if (pe){
7182  double tol;
7183  tol = pe->getValue(&tol);
7184  this->imbalance_tolerance = tol - 1.0;
7185  }
7186 
7187  // TODO: Maybe a more relaxed tolerance is needed. RCB uses 10%.
7188  if (this->imbalance_tolerance <= 0)
7189  this->imbalance_tolerance= 10e-4;
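 // Example: a user-supplied imbalance_tolerance of 1.03 (3% allowed imbalance)
 // is stored internally as 0.03; values of 1.0 or below fall back to the
 // 10e-4 default above.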
7190 
7191  //if an input partitioning array is provided.
7192  this->part_no_array = NULL;
7193  //the length of the input partitioning array.
7194  this->recursion_depth = 0;
7195 
7196  if (pl.getPtr<Array <mj_part_t> >("mj_parts")){
7197  this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >("mj_parts")->getRawPtr();
7198  this->recursion_depth = pl.getPtr<Array <mj_part_t> >("mj_parts")->size() - 1;
7199  this->mj_env->debug(2, "mj_parts provided by user");
7200  }
7201 
7202  //get mj specific parameters.
7203  this->distribute_points_on_cut_lines = true;
7204  this->max_concurrent_part_calculation = 1;
7205 
7206  this->mj_run_as_rcb = false;
7207  this->mj_premigration_option = 0;
7208  this->min_coord_per_rank_for_premigration = 32000;
7209 
7210  int mj_user_recursion_depth = -1;
7211  this->mj_keep_part_boxes = false;
7212  this->check_migrate_avoid_migration_option = 0;
7213  this->migration_type = 0;
7214  this->minimum_migration_imbalance = 0.35;
7215 
7216  pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
7217  if (pe){
7218  double imb;
7219  imb = pe->getValue(&imb);
7220  this->minimum_migration_imbalance = imb - 1.0;
7221  }
7222 
7223  pe = pl.getEntryPtr("mj_migration_option");
7224  if (pe){
7225  this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
7226  }else {
7227  this->check_migrate_avoid_migration_option = 0;
7228  }
7229  if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
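 // Note: parameter value 2 ("avoid migration") is stored internally as -1,
 // so the migration step, which requires this option to be >= 0, is skipped.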
7230 
7232  pe = pl.getEntryPtr("mj_migration_type");
7233  if (pe){
7234  this->migration_type = pe->getValue(&this->migration_type);
7235  }else {
7236  this->migration_type = 0;
7237  }
7238  //std::cout << " this->migration_type:" << this->migration_type << std::endl;
7240 
7241  pe = pl.getEntryPtr("mj_concurrent_part_count");
7242  if (pe){
7243  this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
7244  }else {
7245  this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
7246  }
7247 
7248  pe = pl.getEntryPtr("mj_keep_part_boxes");
7249  if (pe){
7250  this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
7251  }else {
7252  this->mj_keep_part_boxes = false; // Set to default value
7253  }
7254 
7255 
7256  // For now, need keep_part_boxes to do pointAssign and boxAssign.
7257  // pe = pl.getEntryPtr("keep_cuts");
7258  // if (pe){
7259  // int tmp = pe->getValue(&tmp);
7260  // if (tmp) this->mj_keep_part_boxes = true;
7261  // }
7262 
7263  //need to keep part boxes if mapping type is geometric.
7264  if (this->mj_keep_part_boxes == false){
7265  pe = pl.getEntryPtr("mapping_type");
7266  if (pe){
7267  int mapping_type = -1;
7268  mapping_type = pe->getValue(&mapping_type);
7269  if (mapping_type == 0){
7270  mj_keep_part_boxes = true;
7271  }
7272  }
7273  }
7274 
7275  //check whether MJ should run in RCB mode.
7276  pe = pl.getEntryPtr("mj_enable_rcb");
7277  if (pe){
7278  this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
7279  }else {
7280  this->mj_run_as_rcb = false; // Set to default value
7281  }
7282 
7283  pe = pl.getEntryPtr("mj_premigration_option");
7284  if (pe){
7285  mj_premigration_option = pe->getValue(&mj_premigration_option);
7286  }else {
7287  mj_premigration_option = 0;
7288  }
7289 
7290  pe = pl.getEntryPtr("mj_premigration_coordinate_count");
7291  if (pe){
7292  min_coord_per_rank_for_premigration = pe->getValue(&min_coord_per_rank_for_premigration);
7293  }else {
7294  min_coord_per_rank_for_premigration = 32000;
7295  }
7296 
7297  pe = pl.getEntryPtr("mj_recursion_depth");
7298  if (pe){
7299  mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
7300  }else {
7301  mj_user_recursion_depth = -1; // Set to invalid value
7302  }
7303 
7304  bool val = false;
7305  pe = pl.getEntryPtr("rectilinear");
7306  if (pe) val = pe->getValue(&val);
7307  if (val){
7308  this->distribute_points_on_cut_lines = false;
7309  } else {
7310  this->distribute_points_on_cut_lines = true;
7311  }
7312 
7313  if (this->mj_run_as_rcb){
7314  mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
7315  }
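 // Example: with num_global_parts = 16 this sets the recursion depth to
 // ceil(log2(16)) = 4, so each recursion step splits parts in two,
 // mimicking recursive coordinate bisection.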
7316  if (this->recursion_depth < 1){
7317  if (mj_user_recursion_depth > 0){
7318  this->recursion_depth = mj_user_recursion_depth;
7319  }
7320  else {
7321  this->recursion_depth = this->coord_dim;
7322  }
7323  }
7324 
7325  this->num_threads = 1;
7326 #ifdef HAVE_ZOLTAN2_OMP
7327 #pragma omp parallel
7328  {
7329  this->num_threads = omp_get_num_threads();
7330  }
7331 #endif
7332 
7333 }
7334 
7336 template <typename Adapter>
7337 void Zoltan2_AlgMJ<Adapter>::boxAssign(
7338  int dim,
7339  adapter_scalar_t *lower,
7340  adapter_scalar_t *upper,
7341  size_t &nPartsFound,
7342  typename Adapter::part_t **partsFound) const
7343 {
7344  // TODO: Implement with cuts rather than boxes to reduce algorithmic
7345  // TODO: complexity. Or at least do a search through the boxes, using
7346  // TODO: p x q x r x ... if possible.
7347 
7348  nPartsFound = 0;
7349  *partsFound = NULL;
7350 
7351  if (this->mj_keep_part_boxes) {
7352 
7353  // Get vector of part boxes
7354  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7355 
7356  size_t nBoxes = (*partBoxes).size();
7357  if (nBoxes == 0) {
7358  throw std::logic_error("no part boxes exist");
7359  }
7360 
7361  // Determine whether the box overlaps the globalBox at all
7362  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7363 
7364  if (globalBox->boxesOverlap(dim, lower, upper)) {
7365 
7366  std::vector<typename Adapter::part_t> partlist;
7367 
7368  // box overlaps the global box; find specific overlapping boxes
7369  for (size_t i = 0; i < nBoxes; i++) {
7370  try {
7371  if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
7372  nPartsFound++;
7373  partlist.push_back((*partBoxes)[i].getpId());
7374 
7375 // std::cout << "Given box (";
7376 // for (int j = 0; j < dim; j++)
7377 // std::cout << lower[j] << " ";
7378 // std::cout << ") x (";
7379 // for (int j = 0; j < dim; j++)
7380 // std::cout << upper[j] << " ";
7381 // std::cout << ") overlaps PartBox "
7382 // << (*partBoxes)[i].getpId() << " (";
7383 // for (int j = 0; j < dim; j++)
7384 // std::cout << (*partBoxes)[i].getlmins()[j] << " ";
7385 // std::cout << ") x (";
7386 // for (int j = 0; j < dim; j++)
7387 // std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
7388 // std::cout << ")" << std::endl;
7389  }
7390  }
7391  Z2_FORWARD_EXCEPTIONS;
7392  }
7393  if (nPartsFound) {
7394  *partsFound = new mj_part_t[nPartsFound];
7395  for (size_t i = 0; i < nPartsFound; i++)
7396  (*partsFound)[i] = partlist[i];
7397  }
7398  }
7399  else {
7400  // Box does not overlap the domain at all. Find the closest part
7401  // Not sure how to perform this operation for MJ without having the
7402  // cuts. With the RCB cuts, the concept of a part extending to
7403  // infinity was natural. With the boxes, it is much more difficult.
7404  // TODO: For now, return information indicating NO OVERLAP.
7405 
7406  }
7407  }
7408  else {
7409  throw std::logic_error("need to use keep_cuts parameter for boxAssign");
7410  }
7411 }
7412 
7414 template <typename Adapter>
7415 typename Adapter::part_t Zoltan2_AlgMJ<Adapter>::pointAssign(
7416  int dim,
7417  adapter_scalar_t *point) const
7418 {
7419 
7420  // TODO: Implement with cuts rather than boxes to reduce algorithmic
7421  // TODO: complexity. Or at least do a search through the boxes, using
7422  // TODO: p x q x r x ... if possible.
7423 
7424  if (this->mj_keep_part_boxes) {
7425  typename Adapter::part_t foundPart = -1;
7426 
7427  // Get vector of part boxes
7428  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7429 
7430  size_t nBoxes = (*partBoxes).size();
7431  if (nBoxes == 0) {
7432  throw std::logic_error("no part boxes exist");
7433  }
7434 
7435  // Determine whether the point is within the global domain
7436  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7437 
7438  if (globalBox->pointInBox(dim, point)) {
7439 
7440  // point is in the global domain; determine in which part it is.
7441  size_t i;
7442  for (i = 0; i < nBoxes; i++) {
7443  try {
7444  if ((*partBoxes)[i].pointInBox(dim, point)) {
7445  foundPart = (*partBoxes)[i].getpId();
7446 // std::cout << "Point (";
7447 // for (int j = 0; j < dim; j++) std::cout << point[j] << " ";
7448 // std::cout << ") found in box " << i << " part " << foundPart
7449 // << std::endl;
7450 // (*partBoxes)[i].print();
7451  break;
7452  }
7453  }
7454  Z2_FORWARD_EXCEPTIONS;
7455  }
7456 
7457  if (i == nBoxes) {
7458  // This error should never occur
7459  std::ostringstream oss;
7460  oss << "Point (";
7461  for (int j = 0; j < dim; j++) oss << point[j] << " ";
7462  oss << ") not found in domain";
7463  throw std::logic_error(oss.str());
7464  }
7465  }
7466 
7467  else {
7468  // Point is outside the global domain.
7469  // Determine to which part it is closest.
7470  // TODO: with cuts, would not need this special case
7471 
7472  typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
7473  size_t closestBox = 0;
7474  coord_t minDistance = std::numeric_limits<coord_t>::max();
7475  coord_t *centroid = new coord_t[dim];
7476  for (size_t i = 0; i < nBoxes; i++) {
7477  (*partBoxes)[i].computeCentroid(centroid);
7478  coord_t sum = 0.;
7479  coord_t diff;
7480  for (int j = 0; j < dim; j++) {
7481  diff = centroid[j] - point[j];
7482  sum += diff * diff;
7483  }
7484  if (sum < minDistance) {
7485  minDistance = sum;
7486  closestBox = i;
7487  }
7488  }
7489  foundPart = (*partBoxes)[closestBox].getpId();
7490  delete [] centroid;
7491  }
7492 
7493  return foundPart;
7494  }
7495  else {
7496  throw std::logic_error("need to use keep_cuts parameter for pointAssign");
7497  }
7498 }
7499 
7500 template <typename Adapter>
7501 void Zoltan2_AlgMJ<Adapter>::getCommunicationGraph(
7502  const PartitioningSolution<Adapter> * /* solution */,
7503  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
7504  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
7505 {
7506  if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
7507  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
7508  mj_part_t ntasks = (*pBoxes).size();
7509  int dim = (*pBoxes)[0].getDim();
7510  GridHash grid(pBoxes, ntasks, dim);
7511  grid.getAdjArrays(comXAdj_, comAdj_);
7512  }
7513  comAdj = comAdj_;
7514  comXAdj = comXAdj_;
7515 }
7516 
7517 
7518 template <typename Adapter>
7519 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
7520 Zoltan2_AlgMJ<Adapter>::getGlobalBoxBoundaries() const
7521 {
7522  return this->mj_partitioner.get_kept_boxes();
7523 }
7524 
7525 
7526 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7527  typename mj_part_t>
7528 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7529 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::get_kept_boxes() const
7530 {
7531  if (this->mj_keep_part_boxes)
7532  return this->kept_boxes;
7533  else
7534  throw std::logic_error("Error: part boxes are not stored.");
7535 }
7536 
7537 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7538  typename mj_part_t>
7539 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7540 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box_boundaries(
7541  RCP<mj_partBoxVector_t> &localPartBoxes
7542 ) const
7543 {
7544  typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
7545  mj_part_t ntasks = this->num_global_parts;
7546  int dim = (*localPartBoxes)[0].getDim();
7547  coord_t *localPartBoundaries = new coord_t[ntasks * 2 *dim];
7548 
7549  memset(localPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
7550 
7551  coord_t *globalPartBoundaries = new coord_t[ntasks * 2 *dim];
7552  memset(globalPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
7553 
7554  coord_t *localPartMins = localPartBoundaries;
7555  coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
7556 
7557  coord_t *globalPartMins = globalPartBoundaries;
7558  coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
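 // Layout of both boundary buffers (illustrative): [ mins: ntasks*dim |
 // maxs: ntasks*dim ], with part pId's bounds at offsets
 // [dim*pId, dim*pId + dim) in each half. Entries this rank does not own
 // stay 0 and are combined into global bounds by the Zoltan2_BoxBoundaries
 // reduction below.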
7559 
7560  mj_part_t boxCount = localPartBoxes->size();
7561  for (mj_part_t i = 0; i < boxCount; ++i){
7562  mj_part_t pId = (*localPartBoxes)[i].getpId();
7563  //std::cout << "me:" << comm->getRank() << " has:" << pId << std::endl;
7564 
7565  coord_t *lmins = (*localPartBoxes)[i].getlmins();
7566  coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
7567 
7568  for (int j = 0; j < dim; ++j){
7569  localPartMins[dim * pId + j] = lmins[j];
7570  localPartMaxs[dim * pId + j] = lmaxs[j];
7571  /*
7572  std::cout << "me:" << comm->getRank() <<
7573  " dim * pId + j:"<< dim * pId + j <<
7574  " localMin:" << localPartMins[dim * pId + j] <<
7575  " localMax:" << localPartMaxs[dim * pId + j] << std::endl;
7576  */
7577  }
7578  }
7579 
7580  Teuchos::Zoltan2_BoxBoundaries<int, coord_t> reductionOp(ntasks * 2 *dim);
7581 
7582  reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
7583  ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
7584  RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
7585  for (mj_part_t i = 0; i < ntasks; ++i){
7586  Zoltan2::coordinateModelPartBox tpb(i, dim, globalPartMins + dim * i,
7587  globalPartMaxs + dim * i);
7588 
7589  /*
7590  for (int j = 0; j < dim; ++j){
7591  std::cout << "me:" << comm->getRank() <<
7592  " dim * pId + j:"<< dim * i + j <<
7593  " globalMin:" << globalPartMins[dim * i + j] <<
7594  " globalMax:" << globalPartMaxs[dim * i + j] << std::endl;
7595  }
7596  */
7597  pB->push_back(tpb);
7598  }
7599  delete []localPartBoundaries;
7600  delete []globalPartBoundaries;
7601  //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
7602  return pB;
7603 }
7604 } // namespace Zoltan2
7605 
7606 #endif